In [62]:
import numpy as np
import pandas as pd
from math import pi

# spot check on engineered-features
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.model_selection import GroupKFold, KFold, StratifiedKFold

In [32]:
def load_data_raw():
    """Read the raw training tables from the ``Data`` directory.

    Returns:
        tuple: ``(X, y)`` where ``X`` is the DataFrame read from
        ``Data/X_train.csv`` and ``y`` the DataFrame read from
        ``Data/y_train.csv``.
    """
    features = pd.read_csv('Data/X_train.csv')
    labels = pd.read_csv('Data/y_train.csv')
    return features, labels

def get_values_X_2D(X):
    """Standardize the feature columns of ``X`` and group the rows per example.

    Args:
        X: DataFrame whose columns from position 3 onward hold the feature
            values (10 features per row are expected). Rows are assumed to
            arrive as consecutive runs of 128 time steps per example --
            TODO confirm against the CSV layout.

    Returns:
        numpy.ndarray of shape ``(n_example, 128, 10)`` holding the
        standardized values.
    """
    steps_per_example = 128
    features_per_step = 10

    raw_values = X.iloc[:, 3:].values  # drop the first three columns
    # Standardize the data with StandardScaler (MinMaxScaler is a possible
    # alternative scaler here).
    scaled = StandardScaler().fit_transform(raw_values)

    example_count = len(scaled) // steps_per_example
    # One (time step x feature) "image" per example.
    return np.reshape(scaled, (example_count, steps_per_example, features_per_step))


def get_data(flatten = False):
    """Load the training data and return model-ready features and labels.

    Args:
        flatten: When equal to ``True`` the features are returned as
            ``(n_example, n_timestep * n_features)``; otherwise they keep the
            ``(n_example, n_timestep, n_features)`` layout.

    Returns:
        tuple: ``(X_data, y_data)`` where ``y_data`` is the ``surface``
        column of the label table (the surface name).

    The shapes of both returned objects are printed as a side effect.
    """
    raw_features, raw_labels = load_data_raw()
    X_data = get_values_X_2D(raw_features)

    # Equality (not truthiness) is used on purpose so behaviour matches the
    # historical ``flatten == True`` check.
    if flatten == True:
        n_example, n_timestep, n_feature = np.shape(X_data)
        X_data = np.reshape(X_data, (n_example, n_timestep * n_feature))

    y_data = raw_labels.surface
    for item in (X_data, y_data):
        print(np.shape(item))
    return X_data, y_data
    
    

def define_models(models=None):
    """Create the dict of standard models to evaluate, ``{name: estimator}``.

    Args:
        models: Optional dict to add the models to; it is updated in place
            and returned. When omitted (or ``None``) a new dict is created on
            every call.

    Returns:
        dict: The dict holding the model objects keyed by short name.

    The number of models in the dict is printed as a side effect.
    """
    # A ``dict()`` default argument is built once at definition time and would
    # be shared by every call, so changes a caller makes to the returned dict
    # would leak into later calls. Build a fresh dict per call instead.
    if models is None:
        models = dict()
    # nonlinear models
    models['knn'] = KNeighborsClassifier(n_neighbors=7)
    models['cart'] = DecisionTreeClassifier()
    models['svm'] = SVC()
    models['bayes'] = GaussianNB()
    # ensemble models
    models['bag'] = BaggingClassifier(n_estimators=100)
    models['rf'] = RandomForestClassifier(n_estimators=100)
    models['et'] = ExtraTreesClassifier(n_estimators=100)
    models['gbm'] = GradientBoostingClassifier(n_estimators=100)
    print('Defined %d models' % len(models))
    return models
 
def evaluate_model(trainX, trainy, testX, testy, model):
    """Fit ``model`` on the training split and score it on the test split.

    Args:
        trainX, trainy: Training features and labels passed to ``model.fit``.
        testX, testy: Test features passed to ``model.predict`` and the labels
            the predictions are compared against.
        model: Object exposing ``fit`` and ``predict``; it is fitted in place.

    Returns:
        The accuracy on the test split as a percentage (accuracy * 100).
    """
    model.fit(trainX, trainy)
    predictions = model.predict(testX)
    return accuracy_score(testy, predictions) * 100.0
 
def evaluate_models(trainX, trainy, testX, testy, models):
    """Evaluate every model in ``models`` on the same train/test split.

    Args:
        trainX, trainy, testX, testy: Split forwarded unchanged to
            ``evaluate_model`` for each model.
        models: Dict ``{name: model}``.

    Returns:
        dict: ``{name: score}`` in the iteration order of ``models``.

    Each score is printed as soon as it is available to show progress.
    """
    scores = {}
    for label in models:
        score = evaluate_model(trainX, trainy, testX, testy, models[label])
        scores[label] = score
        print('>%s: %.3f' % (label, score))
    return scores
 
def summarize_results(results, maximize=True):
    """Print the scores in ``results`` as a ranked list.

    Args:
        results: Dict ``{name: score}``.
        maximize: When true the highest score is listed first (e.g. for
            accuracy); otherwise the lowest score is listed first.

    Nothing is returned; a blank line followed by one line per entry is
    printed.
    """
    # Sort ascending first and flip afterwards: unlike ``reverse=True`` this
    # also reverses the relative order of entries with equal scores.
    ranked = sorted(results.items(), key=lambda item: item[1])
    if maximize:
        ranked.reverse()
    print()
    for entry in ranked:
        print('Name=%s, Score=%.3f' % entry)

def conv_angles(x,y,z,w):
    """Convert quaternion components to three Euler angles (in radians).

    Args:
        x, y, z, w: Quaternion components; numpy arrays of a common shape
            (plain scalars also work). The inputs are not modified.

    Returns:
        tuple: ``(X, Y, Z)`` angles with the same shape as the inputs --
        presumably roll, pitch and yaw; verify the convention against the
        sensor documentation.
    """
    t0 = +2.0 * (w * x + y * z)
    t1 = +1.0 - 2.0 * (x * x + y * y)
    X = np.arctan2(t0, t1)

    t2 = +2.0 * (w * y - z * x)
    # Rounding error or a non-unit quaternion can push t2 outside [-1, 1],
    # where arcsin is undefined (NaN). Clamp so such values map to +/- pi/2.
    t2 = np.clip(t2, -1.0, 1.0)
    Y = np.arcsin(t2)

    t3 = +2.0 * (w * z + x * y)
    t4 = +1.0 - 2.0 * (y * y + z * z)
    Z = np.arctan2(t3, t4)

    return X, Y, Z
    
def conv_euler_angles(X_pd): # input is the panda dataframe of X
    """Append three angle features, computed by ``conv_angles``, to the raw features.

    Args:
        X_pd: DataFrame whose columns from position 3 onward hold 10 feature
            values per row, the first four being the quaternion components in
            x, y, z, w order. Rows are assumed to come in consecutive runs of
            128 time steps per example -- TODO confirm against the CSV layout.

    Returns:
        numpy.ndarray of shape ``(n_example, 128, 13)``: the 10 original
        (unscaled) features followed by the three angles returned by
        ``conv_angles``. The angles are not standardized.
    """
    steps_per_example, feature_count = 128, 10

    raw_values = X_pd.iloc[:, 3:].values
    example_count = np.shape(raw_values)[0] // steps_per_example
    series = np.reshape(raw_values, (example_count, steps_per_example, feature_count))

    # conv_angles works elementwise, so all examples are converted in one call.
    angles = np.zeros((example_count, steps_per_example, 3))
    angles[:, :, 0], angles[:, :, 1], angles[:, :, 2] = conv_angles(
        series[:, :, 0],  # x
        series[:, :, 1],  # y
        series[:, :, 2],  # z
        series[:, :, 3],  # w
    )

    return np.concatenate((series, angles), axis=2)


In [29]:
# Spot-check every model from define_models() with 2-fold stratified
# cross-validation and print a ranked summary per fold.
# NOTE(review): X_data and y_data are not defined in any cell shown here --
# presumably they come from get_data(flatten=True) run earlier in the kernel;
# confirm this cell still works after Restart Kernel -> Run All.
skf = StratifiedKFold(n_splits=2)

for train_index, test_index in skf.split(X_data, y_data):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X_data[train_index,:], X_data[test_index,:]
    # NOTE(review): if y_data is a pandas Series this indexes by label, which
    # only matches the positional split indices for a default integer index.
    y_train, y_test = y_data[train_index], y_data[test_index]
    models = define_models()
    # evaluate models
    results = evaluate_models(X_train, y_train, X_test, y_test, models)
    # summarize results
    summarize_results(results)


TRAIN: [ 804  826  830 ... 3807 3808 3809] TEST: [   0    1    2 ... 2223 2225 2230]
Defined 8 models
>knn: 17.453
>cart: 30.294




>svm: 34.015
>bayes: 27.830
>bag: 32.495
>rf: 31.132
>et: 39.413
>gbm: 38.260

Name=et, Score=39.413
Name=gbm, Score=38.260
Name=svm, Score=34.015
Name=bag, Score=32.495
Name=rf, Score=31.132
Name=cart, Score=30.294
Name=bayes, Score=27.830
Name=knn, Score=17.453
TRAIN: [   0    1    2 ... 2223 2225 2230] TEST: [ 804  826  830 ... 3807 3808 3809]
Defined 8 models
>knn: 20.768
>cart: 37.802




>svm: 28.601
>bayes: 17.823
>bag: 41.693
>rf: 43.323
>et: 47.056
>gbm: 45.058

Name=et, Score=47.056
Name=gbm, Score=45.058
Name=rf, Score=43.323
Name=bag, Score=41.693
Name=cart, Score=37.802
Name=svm, Score=28.601
Name=knn, Score=20.768
Name=bayes, Score=17.823
