# Load the training data

In [3]:
import pandas as pd
feature_df = pd.read_csv('../data/train.csv')

In [4]:
display(feature_df)

Unnamed: 0,cos_dist_0,cos_dist_1,cos_dist_2,cos_dist_3,cos_dist_4,cos_dist_5,cos_dist_6,cos_dist_7,cos_dist_8,cos_dist_9,cos_dist_10,True Index,Pred Index
0,0.869066,0.724019,0.755597,0.806772,0.600662,0.611901,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
1,0.654552,0.611382,0.636398,0.756420,0.601566,0.634997,0.000000,0.0,0.0,0.0,0.0,3.0,3.0
2,0.582731,0.611122,0.533559,0.713906,0.403162,0.803759,0.000000,0.0,0.0,0.0,0.0,5.0,5.0
3,0.582731,0.611122,0.533559,0.713906,0.403162,0.803759,0.000000,0.0,0.0,0.0,0.0,5.0,5.0
4,0.713523,0.630822,0.746562,0.632307,0.577110,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
23227,0.799940,0.552146,0.727635,0.672169,0.666680,0.635924,0.563466,0.0,0.0,0.0,0.0,0.0,0.0
23228,0.556298,0.756451,0.539537,0.531432,0.676728,0.494054,0.675802,0.0,0.0,0.0,0.0,1.0,1.0
23229,0.625296,0.580917,0.731684,0.726662,0.668341,0.658828,0.612406,0.0,0.0,0.0,0.0,2.0,2.0
23230,0.637104,0.455297,0.680673,0.694067,0.592344,0.694899,0.423192,0.0,0.0,0.0,0.0,3.0,5.0


# Train

In [5]:
new_feature_df = feature_df[["cos_dist_0",
                            "cos_dist_1",
                            "cos_dist_2",
                            "cos_dist_3",
                            "cos_dist_4",
                            "cos_dist_5",
                            "cos_dist_6",
                            "cos_dist_7",
                            "cos_dist_8",
                            "cos_dist_9",
                            "cos_dist_10",
                            "True Index"]].copy()

In [6]:
new_feature_df.apply(max, axis = 0)

cos_dist_0     1.000000
cos_dist_1     0.986369
cos_dist_2     1.000000
cos_dist_3     1.000000
cos_dist_4     0.981930
cos_dist_5     0.983972
cos_dist_6     0.988034
cos_dist_7     0.977589
cos_dist_8     0.955797
cos_dist_9     0.871140
cos_dist_10    0.000000
True Index     9.000000
dtype: float64

In [7]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

scaler = MinMaxScaler()
X = scaler.fit_transform(new_feature_df.iloc[:,:-1])

In [8]:
from sklearn.model_selection import train_test_split

train_x, test_x, train_y, test_y = train_test_split(X,
new_feature_df.iloc[:,-1], train_size=0.8, random_state = 5)

# Multinomial Logistic regression

In [9]:
from sklearn import linear_model,metrics

mul_lr = linear_model.LogisticRegression(multi_class='multinomial', solver='newton-cg')
mul_lr.fit(train_x, train_y)

print("Multinomial Logistic regression Train Accuracy : ", metrics.accuracy_score(train_y, mul_lr.predict(train_x)))
print("Multinomial Logistic regression Test Accuracy : ", metrics.accuracy_score(test_y, mul_lr.predict(test_x)))

Multinomial Logistic regression Train Accuracy :  0.6802797955340328
Multinomial Logistic regression Test Accuracy :  0.6935657413384979


# Random Forest Classifier

In [10]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(min_samples_leaf=8, n_estimators=60)
rf.fit(train_x, train_y)

print("RandomForestClassifier Train Accuracy : ", metrics.accuracy_score(train_y, rf.predict(train_x)))
print("RandomForestClassifier Test Accuracy : ", metrics.accuracy_score(test_y, rf.predict(test_x)))

RandomForestClassifier Train Accuracy :  0.7437180521926284
RandomForestClassifier Test Accuracy :  0.6853884226382613


# XGBoost

In [11]:
import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
model = xgb.XGBClassifier()

param_dist = {"max_depth": [3,5,10],
              "min_child_weight" : [1,5,10],
              "learning_rate": [0.07, 0.1,0.2],
               }

# run randomized search
grid_search = GridSearchCV(model, param_grid=param_dist, cv = 3, 
                                   verbose=5, n_jobs=-1)
grid_search.fit(train_x, train_y)

Fitting 3 folds for each of 27 candidates, totalling 81 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   54.1s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:  6.8min
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:  8.9min finished


GridSearchCV(cv=3, error_score=nan,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None, gamma=None,
                                     gpu_id=None, importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                                     n_estim...
                                     objective='binary:logistic',
                                     random_state=None, reg_alpha=None,
                                     reg_lambda=None, scale_pos_weight=None,
                                     subsample=None, tree_method=Non

In [12]:
grid_search.best_estimator_

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.1, max_delta_step=0, max_depth=5,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1,
              objective='multi:softprob', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [13]:
xg = xgb.XGBClassifier(max_depth=5)
xg.fit(train_x, train_y)

print("XGBClassifier Train Accuracy : ", metrics.accuracy_score(train_y, xg.predict(train_x)))
print("XGBClassifier Test Accuracy : ", metrics.accuracy_score(test_y, xg.predict(test_x)))

XGBClassifier Train Accuracy :  0.7793919827818133
XGBClassifier Test Accuracy :  0.6821605336776415
