In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
import os
import time
import numpy as np
import pandas as pd
from seaborn import countplot,lineplot, barplot
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, confusion_matrix

from bayes_opt import BayesianOptimization
import lightgbm as lgb
import xgboost as xgb
import catboost as cb

In [2]:
tr = pd.read_csv('../input/X_train.csv')
te = pd.read_csv('../input/X_test.csv')
target = pd.read_csv('../input/y_train.csv')
ss = pd.read_csv('../input/sample_submission.csv')

In [3]:
def quaternion_to_euler(x, y, z, w):
    import math
    t0 = +2.0 * (w * x + y * z)
    t1 = +1.0 - 2.0 * (x * x + y * y)
    X = math.atan2(t0, t1)

    t2 = +2.0 * (w * y - z * x)
    t2 = +1.0 if t2 > +1.0 else t2
    t2 = -1.0 if t2 < -1.0 else t2
    Y = math.asin(t2)

    t3 = +2.0 * (w * z + x * y)
    t4 = +1.0 - 2.0 * (y * y + z * z)
    Z = math.atan2(t3, t4)

    return X, Y, Z

def fe(actual):
    new = pd.DataFrame()
    actual['total_angular_velocity'] = (actual['angular_velocity_X'] ** 2 + actual['angular_velocity_Y'] ** 2 + actual['angular_velocity_Z'] ** 2) ** 0.5
    actual['total_linear_acceleration'] = (actual['linear_acceleration_X'] ** 2 + actual['linear_acceleration_Y'] ** 2 + actual['linear_acceleration_Z'] ** 2) ** 0.5
    
    actual['acc_vs_vel'] = actual['total_linear_acceleration'] / actual['total_angular_velocity']
    
    x, y, z, w = actual['orientation_X'].tolist(), actual['orientation_Y'].tolist(), actual['orientation_Z'].tolist(), actual['orientation_W'].tolist()
    nx, ny, nz = [], [], []
    for i in range(len(x)):
        xx, yy, zz = quaternion_to_euler(x[i], y[i], z[i], w[i])
        nx.append(xx)
        ny.append(yy)
        nz.append(zz)
    
    actual['euler_x'] = nx
    actual['euler_y'] = ny
    actual['euler_z'] = nz
    
    actual['total_angle'] = (actual['euler_x'] ** 2 + actual['euler_y'] ** 2 + actual['euler_z'] ** 2) ** 5
    actual['angle_vs_acc'] = actual['total_angle'] / actual['total_linear_acceleration']
    actual['angle_vs_vel'] = actual['total_angle'] / actual['total_angular_velocity']
    
    def f1(x):
        return np.mean(np.diff(np.abs(np.diff(x))))
    
    def f2(x):
        return np.mean(np.abs(np.diff(x)))
    
    for col in actual.columns:
        if col in ['row_id', 'series_id', 'measurement_number']:
            continue
        new[col + '_mean'] = actual.groupby(['series_id'])[col].mean()
        new[col + '_min'] = actual.groupby(['series_id'])[col].min()
        new[col + '_max'] = actual.groupby(['series_id'])[col].max()
        new[col + '_std'] = actual.groupby(['series_id'])[col].std()
        new[col + '_max_to_min'] = new[col + '_max'] / new[col + '_min']
        
        # Change. 1st order.
        new[col + '_mean_abs_change'] = actual.groupby('series_id')[col].apply(f2)
        
        # Change of Change. 2nd order.
        new[col + '_mean_change_of_abs_change'] = actual.groupby('series_id')[col].apply(f1)
        
        new[col + '_abs_max'] = actual.groupby('series_id')[col].apply(lambda x: np.max(np.abs(x)))
        new[col + '_abs_min'] = actual.groupby('series_id')[col].apply(lambda x: np.min(np.abs(x)))

    return new

In [4]:
tr = fe(tr)
te = fe(te)

In [5]:
tr.shape

(3810, 171)

In [6]:
from sklearn.ensemble import RandomForestClassifier
random_forest = RandomForestClassifier(n_estimators = 100, random_state = 50)

In [7]:
train_labels= target['surface']

In [8]:
tr.fillna(0, inplace = True)
te.fillna(0, inplace = True)
tr.replace(-np.inf, 0, inplace = True)
tr.replace(np.inf, 0, inplace = True)
te.replace(-np.inf, 0, inplace = True)
te.replace(np.inf, 0, inplace = True)

In [9]:
random_forest.fit(tr, train_labels)
predictions = random_forest.predict_proba(te)[:,1]

In [10]:
predict_class = random_forest.predict(te)

In [11]:
submit_df = pd.DataFrame(te.reset_index()['series_id'],columns=['series_id'])
submit_df['surface']= predict_class
submit_df.to_csv('Sixth_sol_random_forest_feature_eng.csv',index=False)


In [12]:
from IPython.display import HTML
import base64

In [13]:
def create_download_link(df, title = "Download CSV file", filename = "data.csv"):  
    csv = df.to_csv(index=False)
    b64 = base64.b64encode(csv.encode())
    payload = b64.decode()
    html = '<a download="{filename}" href="data:text/csv;base64,{payload}" target="_blank">{title}</a>'
    html = html.format(payload=payload,title=title,filename=filename)
    return HTML(html)

In [14]:
create_download_link(submit_df, filename="Sixth_sol_random_forest_feature_eng.csv")

In [15]:
from sklearn.model_selection import KFold
kf = KFold(n_splits = 3)

In [16]:
le = LabelEncoder()
train_label_encoded = le.fit_transform(train_labels)

In [17]:
splits = kf.split(tr, train_label_encoded)

In [18]:
print(tr.shape)

(3810, 171)


In [19]:
k_fold_test_predictions = np.zeros((te.shape[0], 9))

In [20]:
for i,(train_index,test_index) in enumerate(splits):
    k_fold_train = tr.iloc[train_index]
    k_fold_test = te.iloc[test_index]
    random_forest.fit(k_fold_train, train_label_encoded[train_index])
    predictions = random_forest.predict_proba(k_fold_test)[:,1]
    k_fold_test_predictions += random_forest.predict_proba(te) / kf.n_splits

    

In [21]:
k_fold_test_predictions
predictions_new = le.inverse_transform(k_fold_test_predictions.argmax(axis=1))

In [22]:
submit_df = pd.DataFrame(te.reset_index()['series_id'],columns=['series_id'])
submit_df['surface']= predictions_new
submit_df.to_csv('seventh_sol_random_forest_feature_eng.csv',index=False)

In [23]:
create_download_link(submit_df, filename="seventh_sol_random_forest_featue_eng_k_fold.csv")

In [24]:
import lightgbm as lgb
model = lgb.LGBMClassifier()
default_params = model.get_params()
del default_params['n_estimators']
# cv_results = lgb.csv(default_params, te, num_boost_round = 10000, early_stopping_rounds = 100. metric='auc', nfold=N_FOLDS, seed = 42)


In [25]:
N_FOLDS = 5
MAX_EVALS = 5

In [26]:
from sklearn.model_selection import train_test_split

In [27]:
train_features, test_features, train_labels, test_labels = train_test_split(tr,train_label_encoded,test_size = 0.33 ,random_state = 50)

In [28]:
train_set = lgb.Dataset(data = train_features, label = train_labels)
test_set = lgb.Dataset(data = test_features, label = test_labels)

In [29]:
np.unique(train_label_encoded)

array([0, 1, 2, 3, 4, 5, 6, 7, 8])

In [30]:
# params = {'task': 'train',
#     'boosting_type': 'gbdt',
#     'objective': 'multiclass',
#     'num_class':3,
#     'metric': 'multi_logloss',
#     'learning_rate': 0.002296,
#     'max_depth': 7,
#     'num_leaves': 17,
#     'feature_fraction': 0.4,
#     'bagging_fraction': 0.6,
#     'bagging_freq': 17}

In [31]:
params = default_params
params['metric'] = 'multi_error'
params['objective'] = 'multiclass'
params['num_class'] = 9

In [32]:
cv_results = lgb.cv(params, train_set, num_boost_round = 10000, early_stopping_rounds = 100, nfold=N_FOLDS, seed = 42)

Please use silent argument of the Dataset constructor to pass this parameter.
  .format(key))


In [33]:
n_estimators = len(cv_results['multi_error-mean'])

In [34]:
# model = lgb.train(params,train_features, train_labels)
model = lgb.train(params,
                       train_set,
                       valid_sets=test_set,
                       num_boost_round=5000,
                       early_stopping_rounds=50,
                      )

Please use silent argument of the Dataset constructor to pass this parameter.
  .format(key))


[1]	valid_0's multi_error: 0.600159
Training until validation scores don't improve for 50 rounds.
[2]	valid_0's multi_error: 0.350556
[3]	valid_0's multi_error: 0.272655
[4]	valid_0's multi_error: 0.254372
[5]	valid_0's multi_error: 0.221781
[6]	valid_0's multi_error: 0.203498
[7]	valid_0's multi_error: 0.196343
[8]	valid_0's multi_error: 0.183625
[9]	valid_0's multi_error: 0.174881
[10]	valid_0's multi_error: 0.166137
[11]	valid_0's multi_error: 0.156598
[12]	valid_0's multi_error: 0.153418
[13]	valid_0's multi_error: 0.150238
[14]	valid_0's multi_error: 0.143084
[15]	valid_0's multi_error: 0.141494
[16]	valid_0's multi_error: 0.141494
[17]	valid_0's multi_error: 0.142289
[18]	valid_0's multi_error: 0.143084
[19]	valid_0's multi_error: 0.138315
[20]	valid_0's multi_error: 0.135135
[21]	valid_0's multi_error: 0.133545
[22]	valid_0's multi_error: 0.130366
[23]	valid_0's multi_error: 0.128776
[24]	valid_0's multi_error: 0.122417
[25]	valid_0's multi_error: 0.121622
[26]	valid_0's multi_e

In [35]:
y_pred = model.predict(te)

In [36]:
y_pred.shape
predictions_new = le.inverse_transform(y_pred.argmax(axis=1))

In [37]:
submit_df = pd.DataFrame(te.reset_index()['series_id'],columns=['series_id'])
submit_df['surface']= predictions_new
create_download_link(submit_df, filename="Eight_sol_lighGBM_featue_eng_k_fold.csv")

In [38]:
train_csv = pd.read_csv('../input/X_train.csv')
test_csv = pd.read_csv('../input/X_test.csv')
target_csv = pd.read_csv('../input/y_train.csv')
ss_csv = pd.read_csv('../input/sample_submission.csv')

In [39]:
train_merge = train_csv.merge(target_csv,on='series_id')

In [40]:
train_merge.columns.values.tolist()
# np.unique(target_csv['group_id'])
grouped = target_csv.groupby(by=['group_id'])

In [41]:
# grouped.agg(['size'])

In [42]:
param_grid = {
    'boosting_type': ['gbdt', 'goss', 'dart'],
    'num_leaves': list(range(20, 150)),
    'learning_rate': list(np.logspace(np.log10(0.005), np.log10(0.5), base = 10, num = 1000)),
    'subsample_for_bin': list(range(20000, 300000, 20000)),
    'min_child_samples': list(range(20, 500, 5)),
    'reg_alpha': list(np.linspace(0, 1)),
    'reg_lambda': list(np.linspace(0, 1)),
    'colsample_bytree': list(np.linspace(0.6, 1, 10)),
    'subsample': list(np.linspace(0.5, 1, 100)),
    'is_unbalance': [True, False]
}


In [43]:
import random
random.seed(50)

In [44]:
boosting_type = random.sample(param_grid['boosting_type'], 1)[0]

# Set subsample depending on boosting type
subsample = 1.0 if boosting_type == 'goss' else random.sample(param_grid['subsample'], 1)[0]

In [45]:
# random.sample(param_grid['boosting_type'], 2)

In [46]:
print('Boosting type: ', boosting_type)
print('Subsample ratio: ', subsample)

Boosting type:  goss
Subsample ratio:  1.0


In [47]:
random_results = pd.DataFrame(columns = ['score', 'params', 'iteration'],
                              index = list(range(MAX_EVALS)))

grid_results = pd.DataFrame(columns = ['score', 'params', 'iteration'],
                              index = list(range(MAX_EVALS)))


In [48]:
def objective2(train_set, hyperparameters, iteration):
    """Objective function for grid and random search. Returns
       the cross validation score from a set of hyperparameters."""
    
    # Number of estimators will be found using early stopping
    if 'n_estimators' in hyperparameters.keys():
        del hyperparameters['n_estimators']
    
     # Perform n_folds cross validation
#     num_boost_round = 10000
    cv_results = lgb.cv(hyperparameters, train_set, num_boost_round = 10, nfold = N_FOLDS, 
                        early_stopping_rounds = 100, seed = 42)
    
#     results to retun
    score = cv_results['multi_error-mean'][-1]
#     cv_results['multi_error-mean'][-1]
    estimators = len(cv_results['multi_error-mean'])
    hyperparameters['n_estimators'] = estimators 
    
    return score, hyperparameters

In [49]:
score, hyperparameters = objective2(train_set, params, 1)

In [50]:
# hyperparameters

In [51]:
cv_results['multi_error-mean']
print(1)

1


In [52]:
import itertools


In [53]:
def grid_search(param_grid, max_evals = MAX_EVALS):
    """Grid search algorithm (with limit on max evals)"""
    
    # Dataframe to store results
    results = pd.DataFrame(columns = ['score', 'params', 'iteration'],
                              index = list(range(MAX_EVALS)))
    
    # https://codereview.stackexchange.com/questions/171173/list-all-possible-permutations-from-a-python-dictionary-of-lists
    keys, values = zip(*param_grid.items())
    
    i = 0
    
    # Iterate through every possible combination of hyperparameters
    for v in itertools.product(*values):
        
        # Create a hyperparameter dictionary
        hyperparameters = dict(zip(keys, v))
        
        # Set the subsample ratio accounting for boosting type
        hyperparameters['subsample'] = 1.0 if hyperparameters['boosting_type'] == 'goss' else hyperparameters['subsample']
        hyperparameters['metric'] = 'multi_error'
        hyperparameters['objective'] = 'multiclass'
        hyperparameters['num_class'] = 9
        # Evalute the hyperparameters
        eval_results,hyperparameters = objective2(train_set,hyperparameters, i)
        print(hyperparameters)
        print(eval_results)
        results.loc[i, :] = eval_results
        
        i += 1
        
        # Normally would not limit iterations
        if i > MAX_EVALS:
            break
       
    # Sort with best score on top
    results.sort_values('score', ascending = False, inplace = True)
    results.reset_index(inplace = True)
    
    return results    

In [54]:
# param_grid = {
#     'boosting_type': ['gbdt', 'goss', 'dart'],
#     'num_leaves': list(range(20, 150)),
#     'learning_rate': list(np.logspace(np.log10(0.005), np.log10(0.5), base = 10, num = 1000)),
#     'subsample_for_bin': list(range(20000, 300000, 20000)),
#     'min_child_samples': list(range(20, 500, 5)),
#     'reg_alpha': list(np.linspace(0, 1)),
#     'reg_lambda': list(np.linspace(0, 1)),
#     'colsample_bytree': list(np.linspace(0.6, 1, 10)),
#     'subsample': list(np.linspace(0.5, 1, 100)),
#     'is_unbalance': [True, False]
# }

param_grid_dummy = {
    'boosting_type': ['gbdt', 'goss', 'dart'],
    'subsample': list(np.linspace(0.5, 1, 100)),
    'num_leaves': list(range(20, 150)),
    'learning_rate': list(np.logspace(np.log10(0.005), np.log10(0.5), base = 10, num = 1000))
}


In [55]:
grid_results = grid_search(param_grid_dummy)

{'boosting_type': 'gbdt', 'subsample': 0.5, 'num_leaves': 20, 'learning_rate': 0.004999999999999999, 'metric': 'multi_error', 'objective': 'multiclass', 'num_class': 9, 'n_estimators': 10}
0.6516457139706852
{'boosting_type': 'gbdt', 'subsample': 0.5, 'num_leaves': 20, 'learning_rate': 0.005023102106734065, 'metric': 'multi_error', 'objective': 'multiclass', 'num_class': 9, 'n_estimators': 10}
0.6512558504229269
{'boosting_type': 'gbdt', 'subsample': 0.5, 'num_leaves': 20, 'learning_rate': 0.005046310954935236, 'metric': 'multi_error', 'objective': 'multiclass', 'num_class': 9, 'n_estimators': 10}
0.6496902687105145
{'boosting_type': 'gbdt', 'subsample': 0.5, 'num_leaves': 20, 'learning_rate': 0.005069627037794076, 'metric': 'multi_error', 'objective': 'multiclass', 'num_class': 9, 'n_estimators': 10}
0.6492981118477694
{'boosting_type': 'gbdt', 'subsample': 0.5, 'num_leaves': 20, 'learning_rate': 0.005093050850779875, 'metric': 'multi_error', 'objective': 'multiclass', 'num_class': 9,

In [56]:
grid_results

Unnamed: 0,index,score,params,iteration
0,0,0.651646,0.651646,0.651646
1,1,0.651256,0.651256,0.651256
2,2,0.64969,0.64969,0.64969
3,4,0.649688,0.649688,0.649688
4,5,0.649688,0.649688,0.649688
5,3,0.649298,0.649298,0.649298


In [57]:
def random_search(param_grid, max_evals = MAX_EVALS):
    """Random search for hyperparameter optimization"""
    
    # Dataframe for results
    results = pd.DataFrame(columns = ['score', 'params', 'iteration'],
                                  index = list(range(MAX_EVALS)))
    
    # Keep searching until reach max evaluations
    for i in range(MAX_EVALS):
        
        # Choose random hyperparameters
        hyperparameters = {k: random.sample(v, 1)[0] for k, v in param_grid.items()}
        hyperparameters['subsample'] = 1.0 if hyperparameters['boosting_type'] == 'goss' else hyperparameters['subsample']
        hyperparameters['metric'] = 'multi_error'
        hyperparameters['objective'] = 'multiclass'
        hyperparameters['num_class'] = 9
        # Evaluate randomly selected hyperparameters
        eval_results,hyperparameters = objective2(train_set, hyperparameters, i)
        print(eval_results)
        print(hyperparameters)
        results.loc[i, :] = eval_results
    
    # Sort with best score on top
    results.sort_values('score', ascending = False, inplace = True)
    results.reset_index(inplace = True)
    return results 

In [58]:
random_results = random_search(param_grid_dummy)

0.18067722675820497
{'boosting_type': 'goss', 'subsample': 1.0, 'num_leaves': 82, 'learning_rate': 0.13133770518619184, 'metric': 'multi_error', 'objective': 'multiclass', 'num_class': 9, 'n_estimators': 10}
0.6418646714855607
{'boosting_type': 'goss', 'subsample': 1.0, 'num_leaves': 104, 'learning_rate': 0.007466966080621258, 'metric': 'multi_error', 'objective': 'multiclass', 'num_class': 9, 'n_estimators': 10}




0.18497260572850655
{'boosting_type': 'dart', 'subsample': 0.702020202020202, 'num_leaves': 77, 'learning_rate': 0.12143821912302256, 'metric': 'multi_error', 'objective': 'multiclass', 'num_class': 9, 'n_estimators': 10}




0.3876291338639891
{'boosting_type': 'dart', 'subsample': 0.5505050505050505, 'num_leaves': 59, 'learning_rate': 0.0256850677167567, 'metric': 'multi_error', 'objective': 'multiclass', 'num_class': 9, 'n_estimators': 10}
0.4938242610205951
{'boosting_type': 'gbdt', 'subsample': 0.7222222222222222, 'num_leaves': 101, 'learning_rate': 0.014237195832336229, 'metric': 'multi_error', 'objective': 'multiclass', 'num_class': 9, 'n_estimators': 10}
