In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
import os
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import shuffle


In [2]:
tr = pd.read_csv('../input/X_train.csv')
te = pd.read_csv('../input/X_test.csv')
target = pd.read_csv('../input/y_train.csv')
ss = pd.read_csv('../input/sample_submission.csv')

In [3]:
def quaternion_to_euler(x, y, z, w):
    import math
    t0 = +2.0 * (w * x + y * z)
    t1 = +1.0 - 2.0 * (x * x + y * y)
    X = math.atan2(t0, t1)

    t2 = +2.0 * (w * y - z * x)
    t2 = +1.0 if t2 > +1.0 else t2
    t2 = -1.0 if t2 < -1.0 else t2
    Y = math.asin(t2)

    t3 = +2.0 * (w * z + x * y)
    t4 = +1.0 - 2.0 * (y * y + z * z)
    Z = math.atan2(t3, t4)

    return X, Y, Z

def fe(actual):
    new = pd.DataFrame()
    actual['total_angular_velocity'] = (actual['angular_velocity_X'] ** 2 + actual['angular_velocity_Y'] ** 2 + actual['angular_velocity_Z'] ** 2) ** 0.5
    actual['total_linear_acceleration'] = (actual['linear_acceleration_X'] ** 2 + actual['linear_acceleration_Y'] ** 2 + actual['linear_acceleration_Z'] ** 2) ** 0.5
    
    actual['acc_vs_vel'] = actual['total_linear_acceleration'] / actual['total_angular_velocity']
    
    x, y, z, w = actual['orientation_X'].tolist(), actual['orientation_Y'].tolist(), actual['orientation_Z'].tolist(), actual['orientation_W'].tolist()
    nx, ny, nz = [], [], []
    for i in range(len(x)):
        xx, yy, zz = quaternion_to_euler(x[i], y[i], z[i], w[i])
        nx.append(xx)
        ny.append(yy)
        nz.append(zz)
    
    actual['euler_x'] = nx
    actual['euler_y'] = ny
    actual['euler_z'] = nz
    
    actual['total_angle'] = (actual['euler_x'] ** 2 + actual['euler_y'] ** 2 + actual['euler_z'] ** 2) ** 0.5
    actual['angle_vs_acc'] = actual['total_angle'] / actual['total_linear_acceleration']
    actual['angle_vs_vel'] = actual['total_angle'] / actual['total_angular_velocity']
    
    def f1(x):
        return np.mean(np.diff(np.abs(np.diff(x))))
    
    def f2(x):
        return np.mean(np.abs(np.diff(x)))
    
    for col in actual.columns:
        if col in ['row_id', 'series_id', 'measurement_number']:
            continue
        new[col + '_mean'] = actual.groupby(['series_id'])[col].mean()
        new[col + '_min'] = actual.groupby(['series_id'])[col].min()
        new[col + '_max'] = actual.groupby(['series_id'])[col].max()
        new[col + '_std'] = actual.groupby(['series_id'])[col].std()
        new[col + '_max_to_min'] = new[col + '_max'] / new[col + '_min']
        
        # Change. 1st order.
        new[col + '_mean_abs_change'] = actual.groupby('series_id')[col].apply(f2)
        
        # Change of Change. 2nd order.
        new[col + '_mean_change_of_abs_change'] = actual.groupby('series_id')[col].apply(f1)
        
        new[col + '_abs_max'] = actual.groupby('series_id')[col].apply(lambda x: np.max(np.abs(x)))
        new[col + '_abs_min'] = actual.groupby('series_id')[col].apply(lambda x: np.min(np.abs(x)))

    return new

In [4]:
tr = fe(tr)
te = fe(te)

In [5]:
tr.fillna(0, inplace = True)
te.fillna(0, inplace = True)
tr.replace(-np.inf, 0, inplace = True)
tr.replace(np.inf, 0, inplace = True)
te.replace(-np.inf, 0, inplace = True)
te.replace(np.inf, 0, inplace = True)

In [6]:
from IPython.display import HTML
import base64
def create_download_link(df, title = "Download ", filename = "data.csv"):  
    title = title + filename
    csv = df.to_csv(index=False)
    b64 = base64.b64encode(csv.encode())
    payload = b64.decode()
    html = '<a download="{filename}" href="data:text/csv;base64,{payload}" target="_blank">{title}</a>'
    html = html.format(payload=payload,title=title,filename=filename)
    return HTML(html)

In [7]:
train_labels= target['surface']
train_concat = pd.concat([tr,target],axis=1)

In [8]:
random_forest = RandomForestClassifier(n_estimators = 100, random_state = 50)
random_forest.fit(tr, train_labels)
predictions = random_forest.predict(te)

In [9]:
test_concat = te.copy()
test_concat["surface"] = predictions

In [10]:
ldict = {
        'concrete': 0.16,
        'soft_pvc': 0.18,
        'wood': 0.06,
        'tiled': 0.03,
        'fine_concrete': 0.10,
        'hard_tiles_large_space': 0.12,
        'soft_tiles': 0.23,
        'carpet': 0.05,
        'hard_tiles': 0.07,
    }

In [11]:
from sklearn.svm import SVC 
svclassifier_rbc = SVC(kernel='rbf')  
# svclassifier_rbc.fit(tr_c, train_labels)  
# predictions_svm_rbc = svclassifier_rbc.predict(test_c)  
# submit_svm_rbf = pd.DataFrame(test_c.reset_index()['series_id'],columns=['series_id'])
# submit_svm_rbf['surface'] = predictions_svm_rbc


In [12]:
model_dict_2 = {}
unique_surfaces = target["surface"].unique()
hard_tile = train_concat[train_concat['surface'] =='hard_tiles']
hard_tile_number = train_concat['surface'].value_counts()["hard_tiles"]
for non_hard_tiled_surface in unique_surfaces:
    if non_hard_tiled_surface == "hard_tiles":
        continue
#     random_forest = RandomForestClassifier(n_estimators = 100, random_state = 50)
    svm_classifier_rbc = SVC(kernel='rbf')  
    non_hard_tiled = train_concat[train_concat['surface'] == non_hard_tiled_surface]
    model_dict_2[non_hard_tiled_surface] = []
    non_hard_tiled_count = (ldict[non_hard_tiled_surface]/ldict["hard_tiles"])*hard_tile_number
    n_splits = train_concat['surface'].value_counts()[non_hard_tiled_surface]/non_hard_tiled_count
    for t_s in np.array_split(non_hard_tiled, n_splits):
        combined_t_nt = pd.concat([hard_tile, t_s], axis = 0 )
        combined_t_nt.drop(combined_t_nt[['series_id', 'group_id']], axis=1, inplace = True)
        combined_t_nt = combined_t_nt.reset_index()
        combined_t_nt.drop(combined_t_nt[['series_id']], axis=1, inplace = True)
        combined_t_nt["surface"] = combined_t_nt["surface"] == "hard_tiles"
        combined_t_nt = shuffle(combined_t_nt)
        labels_surface_wise =  combined_t_nt["surface"]
        train_surface_wise = combined_t_nt.drop(combined_t_nt[['surface']], axis=1)
        model_dict_2[non_hard_tiled_surface].append(svm_classifier_rbc.fit(train_surface_wise, labels_surface_wise))
#         model_dict_2[non_hard_tiled_surface].append(random_forest.fit(train_surface_wise, labels_surface_wise))
   



In [13]:
test_concat_new = test_concat.reset_index()
conversions = {}
for non_hard_tiled_surface in unique_surfaces:
    if non_hard_tiled_surface == "hard_tiles":
        continue
    t_s = test_concat[test_concat["surface"]==non_hard_tiled_surface]
    t_s_features = t_s.drop(['surface'], axis =1)
    t_s_predictions = 0
    for model in model_dict_2[non_hard_tiled_surface]:
        t_s_predictions = t_s_predictions + model.predict(t_s_features)
    t_s_predictions = t_s_predictions/len(model_dict_2[non_hard_tiled_surface])
    index_list = t_s.iloc[t_s_predictions==True].index
    selected_series = index_list.values.tolist()
    test_concat_new.loc[test_concat_new['series_id'].isin(selected_series),'surface'] = 'hard_tiles'
    conversions[non_hard_tiled_surface] = {}
    conversions[non_hard_tiled_surface]["total_predicted"] =  t_s.shape[0]
    conversions[non_hard_tiled_surface]["converted"] = index_list.size
    conversions[non_hard_tiled_surface]["training_count"] = target['surface'].value_counts()[non_hard_tiled_surface]
conversions

{'fine_concrete': {'total_predicted': 328,
  'converted': 4,
  'training_count': 363},
 'concrete': {'total_predicted': 922, 'converted': 2, 'training_count': 779},
 'soft_tiles': {'total_predicted': 461,
  'converted': 17,
  'training_count': 297},
 'tiled': {'total_predicted': 326, 'converted': 326, 'training_count': 514},
 'soft_pvc': {'total_predicted': 696, 'converted': 16, 'training_count': 732},
 'hard_tiles_large_space': {'total_predicted': 289,
  'converted': 0,
  'training_count': 308},
 'carpet': {'total_predicted': 148, 'converted': 128, 'training_count': 189},
 'wood': {'total_predicted': 644, 'converted': 367, 'training_count': 607}}

In [14]:
submit_df = pd.DataFrame(test_concat_new[['series_id','surface']],columns=['series_id','surface'])
create_download_link(submit_df, filename="12th_sol_hard_tile_eng.csv")

In [15]:
submit_df["surface"].value_counts()

concrete                  920
hard_tiles                862
soft_pvc                  680
soft_tiles                444
fine_concrete             324
hard_tiles_large_space    289
wood                      277
carpet                     20
Name: surface, dtype: int64