In [1]:
import os
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import shuffle

In [2]:
train_csv = pd.read_csv('../input/X_train.csv')
test_csv = pd.read_csv('../input/X_test.csv')
target = pd.read_csv('../input/y_train.csv')
ss = pd.read_csv('../input/sample_submission.csv')

In [3]:
def quaternion_to_euler(x, y, z, w):
    import math
    t0 = +2.0 * (w * x + y * z)
    t1 = +1.0 - 2.0 * (x * x + y * y)
    X = math.atan2(t0, t1)

    t2 = +2.0 * (w * y - z * x)
    t2 = +1.0 if t2 > +1.0 else t2
    t2 = -1.0 if t2 < -1.0 else t2
    Y = math.asin(t2)

    t3 = +2.0 * (w * z + x * y)
    t4 = +1.0 - 2.0 * (y * y + z * z)
    Z = math.atan2(t3, t4)

    return X, Y, Z

def fe(actual):
    new = pd.DataFrame()
    actual['total_angular_velocity'] = (actual['angular_velocity_X'] ** 2 + actual['angular_velocity_Y'] ** 2 + actual['angular_velocity_Z'] ** 2) ** 0.5
    actual['total_linear_acceleration'] = (actual['linear_acceleration_X'] ** 2 + actual['linear_acceleration_Y'] ** 2 + actual['linear_acceleration_Z'] ** 2) ** 0.5
    
    actual['acc_vs_vel'] = actual['total_linear_acceleration'] / actual['total_angular_velocity']
    
    x, y, z, w = actual['orientation_X'].tolist(), actual['orientation_Y'].tolist(), actual['orientation_Z'].tolist(), actual['orientation_W'].tolist()
    nx, ny, nz = [], [], []
    for i in range(len(x)):
        xx, yy, zz = quaternion_to_euler(x[i], y[i], z[i], w[i])
        nx.append(xx)
        ny.append(yy)
        nz.append(zz)
    
    actual['euler_x'] = nx
    actual['euler_y'] = ny
    actual['euler_z'] = nz
    
    actual['total_angle'] = (actual['euler_x'] ** 2 + actual['euler_y'] ** 2 + actual['euler_z'] ** 2) ** 0.5
    actual['angle_vs_acc'] = actual['total_angle'] / actual['total_linear_acceleration']
    actual['angle_vs_vel'] = actual['total_angle'] / actual['total_angular_velocity']
    
    def f1(x):
        return np.mean(np.diff(np.abs(np.diff(x))))
    
    def f2(x):
        return np.mean(np.abs(np.diff(x)))
    
    for col in actual.columns:
        if col in ['row_id', 'series_id','measurement_number','measurement_group']:
            continue
        new[col + '_mean'] = actual.groupby(['measurement_group'])[col].mean()
        new[col + '_min'] = actual.groupby(['measurement_group'])[col].min()
        new[col + '_max'] = actual.groupby(['measurement_group'])[col].max()
        new[col + '_std'] = actual.groupby(['measurement_group'])[col].std()
        new[col + '_max_to_min'] = new[col + '_max'] / new[col + '_min']
        
        # Change. 1st order.
        new[col + '_mean_abs_change'] = actual.groupby(['measurement_group'])[col].apply(f2)
        
        # Change of Change. 2nd order.
        new[col + '_mean_change_of_abs_change'] = actual.groupby(['measurement_group'])[col].apply(f1)
        
        new[col + '_abs_max'] = actual.groupby(['measurement_group'])[col].apply(lambda x: np.max(np.abs(x)))
        new[col + '_abs_min'] = actual.groupby(['measurement_group'])[col].apply(lambda x: np.min(np.abs(x)))

    return new

In [4]:
train_temp = train_csv.copy()
measurement_series = train_csv['measurement_number']/32
measurement_series = measurement_series.astype(int)
train_temp['measurement_group'] = (train_csv['series_id']).apply(str) + "_" + measurement_series.apply(str)
# grouped = train_temp.groupby(['series_id','measurement_group'])

In [5]:
train_temp.head(200)[['series_id','measurement_group']]
print(1)

1


In [6]:
train_temp.head(2)

Unnamed: 0,row_id,series_id,measurement_number,orientation_X,orientation_Y,orientation_Z,orientation_W,angular_velocity_X,angular_velocity_Y,angular_velocity_Z,linear_acceleration_X,linear_acceleration_Y,linear_acceleration_Z,measurement_group
0,0_0,0,0,-0.75853,-0.63435,-0.10488,-0.10597,0.10765,0.017561,0.000767,-0.74857,2.103,-9.7532,0_0
1,0_1,0,1,-0.75853,-0.63434,-0.1049,-0.106,0.067851,0.029939,0.003385,0.33995,1.5064,-9.4128,0_0


In [7]:
tr = fe(train_temp)
# te = fe(test_csv)

In [8]:
tr.shape

(15240, 171)

In [9]:
test_temp = test_csv.copy()
measurement_series = test_csv['measurement_number']/32
measurement_series = measurement_series.astype(int)
test_temp['measurement_group'] = (test_csv['series_id']).apply(str) + "_" + measurement_series.apply(str)

In [10]:
te = fe(test_temp)


In [11]:
tr.fillna(0, inplace = True)
te.fillna(0, inplace = True)
tr.replace(-np.inf, 0, inplace = True)
tr.replace(np.inf, 0, inplace = True)
te.replace(-np.inf, 0, inplace = True)
te.replace(np.inf, 0, inplace = True)

In [12]:
from IPython.display import HTML
import base64
def create_download_link(df, title = "Download ", filename = "data.csv"):  
    title = title + filename
    csv = df.to_csv(index=False)
    b64 = base64.b64encode(csv.encode())
    payload = b64.decode()
    html = '<a download="{filename}" href="data:text/csv;base64,{payload}" target="_blank">{title}</a>'
    html = html.format(payload=payload,title=title,filename=filename)
    return HTML(html)

In [13]:
train_labels= target['surface']
# train_concat = pd.concat([tr,target],axis=1)


In [14]:
# tr['series_id'] = tr.reset_index()['measurement_group'].apply(lambda x: x.split('_')[0])
# split('_')
# str[:1]
# tr = tr.drop(columns='series_id')

In [15]:
tr_2 = tr.copy()
tr_2  = tr_2.reset_index()
tr_2['series_id'] = tr_2['measurement_group'].apply(lambda x: x.split('_')[0])
tr_2['series_id'] = tr_2['series_id'].astype(int)

tr_2.head(10)['series_id']
# train_concat_2 = pd.concat([tr_2,target],axis = 1)
train_concat_2 = tr_2.merge(target, on = 'series_id')

In [16]:
train_concat_2.columns.values.tolist()
train_concat_2.head(2)
train_label_2 = train_concat_2['surface']

In [17]:
# train_label_2.head(50)
print(1)

1


In [18]:
random_forest = RandomForestClassifier(n_estimators = 100, random_state = 50)
random_forest.fit(tr, train_label_2)
predictions = random_forest.predict(te)

In [19]:
submit_df = pd.DataFrame(te.reset_index()['measurement_group'],columns=['measurement_group'])
# submit_df = pd.DataFrame(te.reset_index()['series_id'],columns=['series_id'])
submit_df['surface'] = predictions
create_download_link(submit_df, filename="1th_sol_hard_tile_eng.csv")

In [20]:
submit_df['surface'].value_counts()

concrete                  3696
wood                      2862
soft_pvc                  2398
soft_tiles                1902
hard_tiles_large_space    1318
tiled                     1283
fine_concrete             1239
carpet                     561
hard_tiles                   5
Name: surface, dtype: int64

In [21]:
submit_df = submit_df.copy()
submit_df  = submit_df.reset_index()
submit_df['series_id'] = submit_df['measurement_group'].apply(lambda x: x.split('_')[0])
submit_df['series_id'] = submit_df['series_id'].astype(int)

In [22]:
submit_df.head(10)

Unnamed: 0,index,measurement_group,surface,series_id
0,0,0_0,hard_tiles_large_space,0
1,1,0_1,hard_tiles_large_space,0
2,2,0_2,hard_tiles_large_space,0
3,3,0_3,hard_tiles_large_space,0
4,4,1000_0,concrete,1000
5,5,1000_1,concrete,1000
6,6,1000_2,concrete,1000
7,7,1000_3,soft_tiles,1000
8,8,1001_0,concrete,1001
9,9,1001_1,concrete,1001


In [23]:
# submit_df = submit_df.drop(columns='most_frequent')

In [24]:
submit_df.head(5)

Unnamed: 0,index,measurement_group,surface,series_id
0,0,0_0,hard_tiles_large_space,0
1,1,0_1,hard_tiles_large_space,0
2,2,0_2,hard_tiles_large_space,0
3,3,0_3,hard_tiles_large_space,0
4,4,1000_0,concrete,1000


In [25]:
submit_df_2 = submit_df.groupby(['series_id'])['surface'].agg(lambda x:x.value_counts().index[0])


In [26]:
submit_df_2 = submit_df_2.reset_index()

In [27]:
create_download_link(submit_df_2, filename="14th_sol_bin_spliting_eng.csv")