In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
import os
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import shuffle
# Load the competition CSV files from the relative ../input directory.
tr = pd.read_csv('../input/X_train.csv')            # training measurements
te = pd.read_csv('../input/X_test.csv')             # test measurements
target = pd.read_csv('../input/y_train.csv')        # training labels; the 'surface' and 'series_id' columns are used below
ss = pd.read_csv('../input/sample_submission.csv')  # sample submission file



In [2]:
def quaternion_to_euler(x, y, z, w):
    """Convert a single quaternion ``(x, y, z, w)`` into three Euler angles.

    Returns a tuple ``(X, Y, Z)`` of angles in radians. ``X`` and ``Z`` are
    computed with ``math.atan2``; ``Y`` is computed with ``math.asin`` after
    its argument has been clamped to the closed interval [-1, 1].
    """
    import math

    # First angle.
    num_x = 2.0 * (w * x + y * z)
    den_x = 1.0 - 2.0 * (x * x + y * y)
    angle_x = math.atan2(num_x, den_x)

    # Second angle: clamp before asin so slightly out-of-range values
    # (e.g. from a non-normalised quaternion) do not raise ValueError.
    sin_y = 2.0 * (w * y - z * x)
    if sin_y > 1.0:
        sin_y = 1.0
    elif sin_y < -1.0:
        sin_y = -1.0
    angle_y = math.asin(sin_y)

    # Third angle.
    num_z = 2.0 * (w * z + x * y)
    den_z = 1.0 - 2.0 * (y * y + z * z)
    angle_z = math.atan2(num_z, den_z)

    return angle_x, angle_y, angle_z

def fe(actual):
    """Aggregate per-measurement sensor rows into one feature row per series.

    Parameters
    ----------
    actual : pandas.DataFrame
        One row per measurement. Must contain ``series_id`` plus the columns
        ``orientation_{X,Y,Z,W}``, ``angular_velocity_{X,Y,Z}`` and
        ``linear_acceleration_{X,Y,Z}``. The columns ``row_id``,
        ``measurement_number`` and ``group_id`` are ignored if present.
        The frame is NOT modified.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``series_id``. For every input and derived signal it holds
        nine statistics: ``_mean``, ``_min``, ``_max``, ``_std``,
        ``_max_to_min``, ``_mean_abs_change``, ``_mean_change_of_abs_change``,
        ``_abs_max`` and ``_abs_min``.

    Notes
    -----
    The ratio features are plain divisions, so a zero denominator yields
    inf/NaN in the output rather than raising.
    """
    # Work on a copy so the derived columns do not leak into the caller's frame.
    data = actual.copy()

    data['total_angular_velocity'] = (data['angular_velocity_X'] ** 2 + data['angular_velocity_Y'] ** 2 + data['angular_velocity_Z'] ** 2) ** 0.5
    data['total_linear_acceleration'] = (data['linear_acceleration_X'] ** 2 + data['linear_acceleration_Y'] ** 2 + data['linear_acceleration_Z'] ** 2) ** 0.5
    data['acc_vs_vel'] = data['total_linear_acceleration'] / data['total_angular_velocity']

    # Quaternion -> Euler angles, vectorised over all rows. These are the same
    # formulas as quaternion_to_euler (atan2 / clamped asin / atan2), applied
    # to whole arrays instead of one Python call per measurement.
    x = np.asarray(data['orientation_X'], dtype=float)
    y = np.asarray(data['orientation_Y'], dtype=float)
    z = np.asarray(data['orientation_Z'], dtype=float)
    w = np.asarray(data['orientation_W'], dtype=float)
    data['euler_x'] = np.arctan2(2.0 * (w * x + y * z), 1.0 - 2.0 * (x * x + y * y))
    data['euler_y'] = np.arcsin(np.clip(2.0 * (w * y - z * x), -1.0, 1.0))
    data['euler_z'] = np.arctan2(2.0 * (w * z + x * y), 1.0 - 2.0 * (y * y + z * z))

    data['total_angle'] = (data['euler_x'] ** 2 + data['euler_y'] ** 2 + data['euler_z'] ** 2) ** 0.5
    data['angle_vs_acc'] = data['total_angle'] / data['total_linear_acceleration']
    data['angle_vs_vel'] = data['total_angle'] / data['total_angular_velocity']

    def f1(x):
        # Mean second-order change: mean of the differences of the absolute differences.
        return np.mean(np.diff(np.abs(np.diff(x))))

    def f2(x):
        # Mean first-order change: mean absolute difference between consecutive samples.
        return np.mean(np.abs(np.diff(x)))

    skipped = ('row_id', 'series_id', 'measurement_number', 'group_id')
    # Group once and reuse the grouping for every column and statistic.
    grouped = data.groupby('series_id')

    # Collect all feature columns first and build the frame in a single step;
    # inserting columns one by one fragments the DataFrame.
    features = {}
    for col in data.columns:
        if col in skipped:
            continue
        signal = grouped[col]
        col_min = signal.min()
        col_max = signal.max()

        features[col + '_mean'] = signal.mean()
        features[col + '_min'] = col_min
        features[col + '_max'] = col_max
        features[col + '_std'] = signal.std()
        features[col + '_max_to_min'] = col_max / col_min

        # Change. 1st order.
        features[col + '_mean_abs_change'] = signal.apply(f2)

        # Change of Change. 2nd order.
        features[col + '_mean_change_of_abs_change'] = signal.apply(f1)

        features[col + '_abs_max'] = signal.apply(lambda values: np.max(np.abs(values)))
        features[col + '_abs_min'] = signal.apply(lambda values: np.min(np.abs(values)))

    new = pd.DataFrame(features)
    # Callers do reset_index() and merge on 'series_id', so keep the index named.
    new.index.name = 'series_id'
    return new

In [3]:
from IPython.display import HTML
import base64
def create_download_link(df, title = "Download ", filename = "data.csv"):
    """Return an IPython ``HTML`` anchor that downloads ``df`` as a CSV file.

    The frame is serialised with ``to_csv(index=False)``, base64-encoded and
    embedded in a ``data:`` URI. The link text is ``title`` followed by
    ``filename``; ``filename`` is also used for the ``download`` attribute.
    """
    encoded_csv = base64.b64encode(df.to_csv(index=False).encode()).decode()
    link_template = '<a download="{filename}" href="data:text/csv;base64,{payload}" target="_blank">{title}</a>'
    return HTML(link_template.format(payload=encoded_csv, title=title + filename, filename=filename))


In [4]:
# The 'surface' column of y_train.csv, kept as a Series.
train_labels= target['surface']

In [5]:
# Working copies of the raw measurement frames. X_train.csv / X_test.csv were
# already parsed into `tr` / `te` at the top of the notebook, so copy those
# frames instead of reading the same files from disk a second time.
train_csv = tr.copy()
test_csv = te.copy()

In [6]:
# Join the measurement rows with the y_train.csv columns on series_id
# (DataFrame.merge defaults to an inner join).
train_concat = train_csv.merge(target,on='series_id')

In [7]:
# Row count per surface label in y_train.csv, most frequent first.
target['surface'].value_counts()

concrete                  779
soft_pvc                  732
wood                      607
tiled                     514
fine_concrete             363
hard_tiles_large_space    308
soft_tiles                297
carpet                    189
hard_tiles                 21
Name: surface, dtype: int64

In [8]:
# Binary experiment: keep only the measurement rows of two surfaces.
tr_ct = train_concat.copy()
tr_a = tr_ct.loc[tr_ct['surface'].isin(["hard_tiles", "concrete"])]
# (another pair noted here earlier: "soft_pvc", "concrete")
# Drop the label columns, then aggregate the measurements into one feature row per series_id.
tr_a_f = fe(tr_a.drop(columns=['surface','group_id']))
# Re-attach the labels by series_id; later cells index tr_a_f and these labels by the same positions.
tr_a_f_concat = tr_a_f.reset_index().merge(target, on='series_id')
tr_a_target = tr_a_f_concat['surface']

In [9]:
from sklearn.preprocessing import LabelEncoder
# Encode the surface names as integer class labels.
le = LabelEncoder()
tr_a_label  = le.fit_transform(tr_a_target)
# le.inverse_transform(...) maps the integer labels back to surface names.

In [10]:
from sklearn.model_selection import KFold, StratifiedKFold
# 5-fold stratified cross-validation of a random forest on the two-surface subset.
random_forest = RandomForestClassifier(n_estimators = 100, random_state = 50)
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=59)
predicted = np.zeros((152,2))  # NOTE(review): never read in the visible cells
score = []
for times, (trn_idx, val_idx) in enumerate(folds.split(tr_a_f,tr_a_label)):
    random_forest.fit(tr_a_f.iloc[trn_idx],tr_a_label[trn_idx])
    # Mean accuracy on the held-out fold.
    score.append(random_forest.score(tr_a_f.iloc[val_idx],tr_a_label[val_idx]))
    p = random_forest.predict(tr_a_f.iloc[val_idx])  # NOTE(review): result is not used in the visible cells
    # Class balance of the training part and the validation part of this fold.
    print(tr_a_target[trn_idx].value_counts())
    print(tr_a_target[val_idx].value_counts())
# Display the per-fold scores.
score


concrete      623
hard_tiles     16
Name: surface, dtype: int64
concrete      156
hard_tiles      5
Name: surface, dtype: int64
concrete      623
hard_tiles     17
Name: surface, dtype: int64
concrete      156
hard_tiles      4
Name: surface, dtype: int64
concrete      623
hard_tiles     17
Name: surface, dtype: int64
concrete      156
hard_tiles      4
Name: surface, dtype: int64
concrete      623
hard_tiles     17
Name: surface, dtype: int64
concrete      156
hard_tiles      4
Name: surface, dtype: int64
concrete      624
hard_tiles     17
Name: surface, dtype: int64
concrete      155
hard_tiles      4
Name: surface, dtype: int64


[1.0, 0.99375, 1.0, 0.9875, 0.9937106918238994]

In [11]:
# Share of 'concrete' rows in a training fold (623 of 640 in the counts printed above);
# presumably the baseline accuracy of always predicting 'concrete' -- TODO confirm intent.
623/640

0.9734375

In [12]:
from sklearn.model_selection import cross_val_score
# Score the same model on the same features and labels with cv=10 folds.
c = cross_val_score(random_forest,tr_a_f, tr_a_label, cv= 10 )
c

array([1.    , 1.    , 1.    , 1.    , 1.    , 1.    , 0.9875, 0.975 ,
       1.    , 1.    ])

In [13]:
import itertools
# Every unordered pair (r=2) of distinct surface labels found in y_train.csv.
list_combinations = list(itertools.combinations(target["surface"].unique(), r=2))


In [14]:
# Mean cross-validation score per surface pair, keyed "<surface_a>-<surface_b>".
two_surface_scores = {}

In [15]:
from tqdm import tqdm

In [16]:

# Repeat the two-surface experiment for every pair of surfaces and record the
# mean 5-fold score of each pair in two_surface_scores.
for two_surface in tqdm(list_combinations):
    # Keep only the measurement rows that belong to the two surfaces of this pair.
    tr_ct = train_concat.copy()
    tr_a = tr_ct.loc[tr_ct['surface'].isin(two_surface)]
    # Drop the label columns, then aggregate to one feature row per series_id.
    tr_a_f = fe(tr_a.drop(columns=['surface','group_id']))
    # Re-attach the labels by series_id.
    tr_a_f_concat = tr_a_f.reset_index().merge(target, on='series_id')
    tr_a_target = tr_a_f_concat['surface']
    from sklearn.preprocessing import LabelEncoder
    le = LabelEncoder()
    tr_a_label  = le.fit_transform(tr_a_target)
    # le.inverse_transform(...) maps the integer labels back to surface names.
    from sklearn.model_selection import KFold, StratifiedKFold
    random_forest = RandomForestClassifier(n_estimators = 100, random_state = 50)
    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=59)
    predicted = np.zeros((152,2))  # NOTE(review): never read in the visible cells
    score = []
    for times, (trn_idx, val_idx) in enumerate(folds.split(tr_a_f,tr_a_label)):
        random_forest.fit(tr_a_f.iloc[trn_idx],tr_a_label[trn_idx])
        # Mean accuracy on the held-out fold.
        score.append(random_forest.score(tr_a_f.iloc[val_idx],tr_a_label[val_idx]))
        p = random_forest.predict(tr_a_f.iloc[val_idx])  # NOTE(review): result is not used in the visible cells
    # Dictionary key for this pair: the two surface names joined with "-".
    surface_id = two_surface[0]+"-"+two_surface[1]
    two_surface_scores[surface_id] = np.mean(score)

100%|██████████| 36/36 [07:31<00:00,  9.49s/it]


In [17]:
# Display the mean cross-validation score recorded for every surface pair.
two_surface_scores

{'fine_concrete-concrete': 0.9632151502971114,
 'fine_concrete-soft_tiles': 0.9667106698042817,
 'fine_concrete-tiled': 0.9726490520973279,
 'fine_concrete-soft_pvc': 0.9772060430876568,
 'fine_concrete-hard_tiles_large_space': 0.9642216652742969,
 'fine_concrete-carpet': 0.9855197649693063,
 'fine_concrete-hard_tiles': 0.9947368421052631,
 'fine_concrete-wood': 0.9360810498014708,
 'concrete-soft_tiles': 0.9758568588149114,
 'concrete-tiled': 0.9651908718056579,
 'concrete-soft_pvc': 0.9583326852629819,
 'concrete-hard_tiles_large_space': 0.9677669107825959,
 'concrete-carpet': 0.9710910652920963,
 'concrete-hard_tiles': 0.99499213836478,
 'concrete-wood': 0.9567174889816604,
 'soft_tiles-tiled': 0.9679615445953136,
 'soft_tiles-soft_pvc': 0.9805537881465771,
 'soft_tiles-hard_tiles_large_space': 0.9834699453551913,
 'soft_tiles-carpet': 0.9876073883161511,
 'soft_tiles-hard_tiles': 0.9685485347985349,
 'soft_tiles-wood': 0.9724053724053725,
 'tiled-soft_pvc': 0.9727098587899988,
 'ti

In [18]:
# Pair scores as a Series: index = "<surface_a>-<surface_b>" keys, values = mean CV scores.
s = pd.Series(two_surface_scores)


In [19]:
# Ascending order: the pairs with the lowest cross-validation scores come first.
s.sort_values()

fine_concrete-wood                      0.936081
concrete-wood                           0.956717
concrete-soft_pvc                       0.958333
tiled-carpet                            0.960182
fine_concrete-concrete                  0.963215
tiled-wood                              0.963436
fine_concrete-hard_tiles_large_space    0.964222
soft_pvc-wood                           0.964900
concrete-tiled                          0.965191
fine_concrete-soft_tiles                0.966711
concrete-hard_tiles_large_space         0.967767
soft_tiles-tiled                        0.967962
soft_tiles-hard_tiles                   0.968549
concrete-carpet                         0.971091
soft_tiles-wood                         0.972405
fine_concrete-tiled                     0.972649
tiled-soft_pvc                          0.972710
concrete-soft_tiles                     0.975857
fine_concrete-soft_pvc                  0.977206
hard_tiles_large_space-wood             0.978147
soft_pvc-hard_tiles_

In [20]:
# Row count per surface label in y_train.csv, most frequent first.
target['surface'].value_counts()

concrete                  779
soft_pvc                  732
wood                      607
tiled                     514
fine_concrete             363
hard_tiles_large_space    308
soft_tiles                297
carpet                    189
hard_tiles                 21
Name: surface, dtype: int64

In [21]:
def two_surface_cal(two_surface):
    """Cross-validate a random forest that tells two surfaces apart.

    Reads the notebook-level frames ``train_concat`` (measurements joined with
    labels) and ``target`` (labels), and uses ``fe`` to build the features.

    Parameters
    ----------
    two_surface : list-like of str
        Surface labels to keep, e.g. ``['fine_concrete', 'wood']``.

    Returns
    -------
    list of float
        Mean accuracy on each of the 5 stratified folds, in fold order.

    Side effects
    ------------
    Prints the number of series per surface in the selected subset.
    """
    from sklearn.model_selection import StratifiedKFold

    # Boolean .loc selection already returns a new frame, so train_concat
    # does not need to be copied before filtering.
    pair_rows = train_concat.loc[train_concat['surface'].isin(two_surface)]
    pair_features = fe(pair_rows.drop(columns=['surface', 'group_id']))

    # Re-attach the labels by series_id.
    pair_labelled = pair_features.reset_index().merge(target, on='series_id')
    print(pair_labelled['surface'].value_counts())

    pair_labels = LabelEncoder().fit_transform(pair_labelled['surface'])

    random_forest = RandomForestClassifier(n_estimators = 100, random_state = 50)
    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=59)
    score = []
    for trn_idx, val_idx in folds.split(pair_features, pair_labels):
        random_forest.fit(pair_features.iloc[trn_idx], pair_labels[trn_idx])
        # score() already predicts on the validation fold; no separate predict() call is needed.
        score.append(random_forest.score(pair_features.iloc[val_idx], pair_labels[val_idx]))
    return score

In [22]:
# Per-fold scores for the pair with the lowest mean score in the sorted listing above.
two_surface_cal(['fine_concrete','wood'])

wood             607
fine_concrete    363
Name: surface, dtype: int64


[0.9538461538461539,
 0.9128205128205128,
 0.9484536082474226,
 0.9119170984455959,
 0.9533678756476683]