In [1]:
import os
import random
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
import gc
from scipy.stats import zscore

In [2]:
df_Xtrain = pd.read_csv('X_train.csv')
df_ytrain = pd.read_csv('y_train.csv')
df_Xtest = pd.read_csv('X_test.csv')


In [3]:
df_Xtest.head()

Unnamed: 0,row_id,series_id,measurement_number,orientation_X,orientation_Y,orientation_Z,orientation_W,angular_velocity_X,angular_velocity_Y,angular_velocity_Z,linear_acceleration_X,linear_acceleration_Y,linear_acceleration_Z
0,0_0,0,0,-0.025773,-0.98864,-0.14801,0.00335,-0.006524,-0.001071,-0.02739,0.10043,4.2061,-5.5439
1,0_1,0,1,-0.025683,-0.98862,-0.14816,0.003439,-0.11396,0.083987,-0.06059,-0.70889,3.9905,-8.0273
2,0_2,0,2,-0.025617,-0.98861,-0.14826,0.003571,-0.080518,0.11486,-0.037177,1.4571,2.2828,-11.299
3,0_3,0,3,-0.025566,-0.98862,-0.14817,0.003609,0.070067,0.03382,-0.035904,0.71096,1.8582,-12.227
4,0_4,0,4,-0.025548,-0.98866,-0.14792,0.003477,0.15205,-0.029016,-0.015314,3.3996,2.7881,-10.41


In [4]:
df_ytrain.head()

Unnamed: 0,series_id,group_id,surface
0,0,13,fine_concrete
1,1,31,concrete
2,2,20,concrete
3,3,31,concrete
4,4,22,soft_tiles


In [5]:
df_Xtrain.tail()

Unnamed: 0,row_id,series_id,measurement_number,orientation_X,orientation_Y,orientation_Z,orientation_W,angular_velocity_X,angular_velocity_Y,angular_velocity_Z,linear_acceleration_X,linear_acceleration_Y,linear_acceleration_Z
487675,3809_123,3809,123,0.62871,-0.76878,-0.084391,0.081093,0.003167,0.09376,-0.14274,3.2718,2.0115,-9.0063
487676,3809_124,3809,124,0.62884,-0.76868,-0.084365,0.081099,0.014994,0.032637,-0.13238,4.4275,3.0696,-8.1257
487677,3809_125,3809,125,0.62891,-0.76861,-0.084345,0.081178,-0.031184,-0.003961,-0.13894,2.7048,4.2622,-8.1443
487678,3809_126,3809,126,0.62903,-0.7685,-0.084414,0.081231,-0.069153,0.013229,-0.13021,2.541,4.713,-9.4435
487679,3809_127,3809,127,0.62915,-0.76839,-0.084441,0.081284,-0.042769,0.034049,-0.1258,0.82391,4.2751,-10.498


In [6]:
robot_stats2 = ['row_id','measurement_number', 'orientation_X', 'orientation_Y', 'orientation_Z', 'orientation_W',
              'angular_velocity_X', 'angular_velocity_Y', 'angular_velocity_Z',	
              'linear_acceleration_X', 'linear_acceleration_Y', 'linear_acceleration_Z']

In [7]:
robot_stats = ['orientation_X', 'orientation_Y', 'orientation_Z', 'orientation_W',
              'angular_velocity_X', 'angular_velocity_Y', 'angular_velocity_Z',	
              'linear_acceleration_X', 'linear_acceleration_Y', 'linear_acceleration_Z']

In [8]:
le = LabelEncoder()
df_ytrain['surface'] = le.fit_transform(df_ytrain['surface'])
df_ytrain.head()

Unnamed: 0,series_id,group_id,surface
0,0,13,2
1,1,31,1
2,2,20,1
3,3,31,1
4,4,22,6


In [9]:
df = df_Xtrain.merge(df_ytrain, on = 'series_id', how = 'inner')
y_train = df.drop(df[robot_stats2],axis = 1)
y_train.head()

Unnamed: 0,series_id,group_id,surface
0,0,13,2
1,0,13,2
2,0,13,2
3,0,13,2
4,0,13,2


In [10]:
def Z_Score(df):
    score = df[robot_stats].apply(zscore)
    remove_above = score[(score < 3).all(axis = 1)]
    remove_below = remove_above[(-3 < remove_above).all(axis = 1)]
    X_train = pd.concat([pd.DataFrame(remove_below), df['series_id']] , axis=1)
    X_train.fillna(0, inplace = True)
    
    return (X_train)
    

In [11]:
X_train = Z_Score(df_Xtrain)
X_train.head()

Unnamed: 0,orientation_X,orientation_Y,orientation_Z,orientation_W,angular_velocity_X,angular_velocity_Y,angular_velocity_Z,linear_acceleration_X,linear_acceleration_Y,linear_acceleration_Z,series_id
0,-1.079897,-1.001675,-1.10725,-0.979549,0.912613,0.104012,0.087064,-0.469289,-0.366095,-0.136474,0
1,-1.079897,-1.001661,-1.107439,-0.979837,0.574656,0.243598,0.098489,0.112621,-0.644872,-0.01684,0
2,-1.079897,-1.001675,-1.107628,-0.979549,0.060266,0.232265,0.057627,-0.210398,-0.60478,0.224292,0
3,-1.079882,-1.001689,-1.107911,-0.979549,-0.112348,0.125292,0.044556,0.159072,-0.8351,-0.256952,0
4,-1.079882,-1.001675,-1.107911,-0.979453,0.042096,-0.007735,0.106605,-0.341586,-0.662395,-0.378203,0


In [12]:
X_test = Z_Score(df_Xtest)
X_test.head()

Unnamed: 0,orientation_X,orientation_Y,orientation_Z,orientation_W,angular_velocity_X,angular_velocity_Y,angular_velocity_Z,linear_acceleration_X,linear_acceleration_Y,linear_acceleration_Z,series_id
0,-0.085968,-1.552497,-1.537093,-0.006672,-0.060484,-0.16481,0.037262,-0.013643,0.620373,1.361529,0
1,-0.085834,-1.552469,-1.538475,-0.005791,-0.998919,0.786202,-0.104746,-0.455258,0.517448,0.475444,0
2,-0.085736,-1.552455,-1.539397,-0.004479,-0.706809,1.131384,-0.004601,0.72664,-0.29779,-0.691909,0
3,-0.08566,-1.552469,-1.538568,-0.004109,0.608521,0.2253,0.000844,0.3195,-0.50049,-1.023022,0
4,-0.085633,-1.552525,-1.536263,-0.005412,1.324627,-0.47725,0.088915,1.786588,-0.056565,-0.374711,0


In [13]:
folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=60)
predicted = np.zeros((X_test.shape[0],9))
measured= np.zeros((X_train.shape[0]))
score = 0
for times, (trn_idx, val_idx) in enumerate(folds.split(X_train.values,y_train['surface'].values)):
    model = RandomForestClassifier(n_estimators=700, n_jobs = -1)
    model.fit(X_train.iloc[trn_idx],y_train['surface'][trn_idx])
    measured[val_idx] = model.predict(X_train.iloc[val_idx])
    predicted += model.predict_proba(X_test)/folds.n_splits
    score += model.score(X_train.iloc[val_idx],y_train['surface'][val_idx])
    print("Fold: {} score: {}".format(times,model.score(X_train.iloc[val_idx],y_train['surface'][val_idx])))
    
    gc.collect()

Fold: 0 score: 0.997970188423923
Fold: 1 score: 0.9980111947183893
Fold: 2 score: 0.997990567972114
Fold: 3 score: 0.9981340988312487
Fold: 4 score: 0.9982570538057742
Fold: 5 score: 0.9978469488188977
Fold: 6 score: 0.9986875833077002
Fold: 7 score: 0.9985235312211628
Fold: 8 score: 0.9981748831104913
Fold: 9 score: 0.9981748831104913


In [14]:
print('Average score', score / folds.n_splits)

Average score 0.9981770933320193


In [15]:
confusion_matrix(measured,y_train['surface'])

array([[24151,     4,     0,     0,     1,     6,     9,    11,    46],
       [    9, 99492,    16,     0,   134,    18,     6,    18,    74],
       [    0,    20, 46422,     0,    39,     3,     0,     6,     1],
       [    0,     0,     0,  2688,     0,     0,     0,     0,     0],
       [    0,    43,     6,     0, 39219,    11,     1,     7,    11],
       [    1,    47,     3,     0,     4, 93603,     3,    41,     3],
       [    1,     6,     0,     0,     1,     4, 37967,     7,     4],
       [   19,    31,    16,     0,     4,    39,    26, 65695,     3],
       [   11,    69,     1,     0,    22,    12,     4,     7, 77554]],
      dtype=int64)

In [31]:
Pred1 = le.inverse_transform(predicted.argmax(axis = 1))
Pred = pd.DataFrame(Pred1)
Pred[Pred.columns].nunique()

0    9
dtype: int64