In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import f1_score, accuracy_score, matthews_corrcoef
from imblearn.over_sampling import SMOTE, SVMSMOTE
from tqdm import tqdm
import random

random.seed(42)
np.random.seed(42)

In [3]:
train_1 = pd.read_csv('Data/PRML_Datacontest_MKN_JUL_2021/Dataset_1_Training.csv')
train_1 = train_1.set_index('ID_REF').T
train_1.head()

ID_REF,gene_1,gene_2,gene_3,gene_4,gene_5,gene_6,gene_7,gene_8,gene_9,gene_10,...,gene_22276,gene_22277,gene_22278,gene_22279,gene_22280,gene_22281,gene_22282,gene_22283,CO: 1,CO: 2
Train_01,12.444,8.3774,6.7866,10.2851,5.9064,8.3767,8.0356,6.6745,6.2325,6.845,...,15.6236,15.2785,3.2915,3.6526,2.6412,1.2652,3.069,2.0271,0.0,1.0
Train_02,12.2005,7.8592,8.0963,10.4624,4.9582,9.2973,7.0581,6.4607,6.9047,5.8878,...,15.3234,15.1286,3.3811,2.588,4.4798,4.8098,3.1637,2.4758,0.0,1.0
Train_03,12.6709,8.6762,7.4812,10.1887,5.2332,9.1721,8.6061,7.0932,6.594,5.6843,...,15.4604,15.2674,3.1665,3.9743,5.2597,4.3815,2.8034,2.4669,0.0,0.0
Train_04,11.6619,8.2557,7.9923,10.7705,6.3296,9.3777,8.4776,6.5878,6.0877,6.5169,...,15.5185,15.1655,4.0045,3.8503,5.9114,0.7882,3.1831,3.482,0.0,0.0
Train_05,11.8397,8.7971,7.8321,10.2869,5.8389,7.0841,7.3419,7.3167,6.3456,6.1708,...,15.3143,14.9506,3.0514,3.2946,5.1537,3.9179,3.1881,2.9769,0.0,0.0


In [4]:
train_1.shape

(130, 22285)

In [5]:
test_1 = pd.read_csv('Data/PRML_Datacontest_MKN_JUL_2021/Dataset_1_Testing.csv')
test_1 = test_1.set_index('ID_REF').T
test_1.head()

ID_REF,gene_1,gene_2,gene_3,gene_4,gene_5,gene_6,gene_7,gene_8,gene_9,gene_10,...,gene_22274,gene_22275,gene_22276,gene_22277,gene_22278,gene_22279,gene_22280,gene_22281,gene_22282,gene_22283
Test_01,12.3446,7.0781,7.5017,10.6764,6.4327,9.2305,8.1481,6.1196,4.2718,7.1375,...,8.5471,10.4398,15.7291,15.2421,3.5092,3.9866,5.9038,2.4871,6.4559,3.8567
Test_02,12.0376,7.6011,7.3458,10.5366,6.5568,9.118,8.3105,7.1575,8.454,6.6935,...,8.1799,10.1184,15.904,15.4787,3.6931,3.9041,4.5131,2.0954,6.0398,1.8521
Test_03,10.9684,7.4696,8.3759,11.1175,7.0579,9.3514,8.1214,7.7247,9.9479,7.7319,...,10.6116,11.6278,16.5099,15.923,4.9476,4.6681,6.1192,3.4668,6.3087,2.7725
Test_04,12.1004,7.9643,6.7908,10.7172,6.8571,9.1938,7.6762,6.6899,6.2971,4.7757,...,8.704,9.8122,15.7377,15.3561,5.802,4.0437,4.1655,1.1474,3.5803,2.8827
Test_05,11.5133,7.6576,9.9053,11.3643,7.2523,8.7346,8.316,6.9291,9.5212,6.8443,...,9.8935,10.1678,16.5602,16.1432,5.2615,4.8368,6.6252,3.1672,4.556,3.0307


In [6]:
test_1.shape

(100, 22283)

In [7]:
sample_submission = pd.read_csv('Data/PRML_Datacontest_MKN_JUL_2021/dummy_submission.csv')
sample_submission.head()

Unnamed: 0,Id,Predicted
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0


In [8]:
sample_submission.tail()

Unnamed: 0,Id,Predicted
1051,1051,0
1052,1052,0
1053,1053,0
1054,1054,0
1055,1055,1


In [9]:
train_1.isnull().sum().sum()

0

In [10]:
train_2 = pd.read_csv('Data/PRML_Datacontest_MKN_JUL_2021/Dataset_2_Training.csv')
train_2 = train_2.set_index('ID_REF').T
train_2.head()

ID_REF,gene_1,gene_2,gene_3,gene_4,gene_5,gene_6,gene_7,gene_8,gene_9,gene_10,...,gene_54670,gene_54671,gene_54672,gene_54673,gene_54674,gene_54675,CO: 3,CO: 4,CO: 5,CO: 6
Train_001,8.2843,8.7174,8.3019,11.8611,4.6883,9.8524,8.2301,4.8406,7.5667,8.4165,...,6.3546,7.2572,3.608,3.9483,4.52,4.5195,0.0,0.0,1.0,1.0
Train_002,9.0419,8.8729,8.4559,11.2112,5.7059,10.2263,7.9439,4.6463,6.1828,7.0852,...,4.9707,6.4847,5.1648,1.8369,3.8223,3.4288,0.0,0.0,1.0,1.0
Train_003,7.6171,8.3904,8.1184,11.0341,5.805,9.5912,6.8044,4.0878,8.1184,7.3179,...,3.9571,6.2647,3.4216,4.7911,5.491,3.8438,0.0,0.0,0.0,1.0
Train_004,9.6283,8.384,9.9061,12.0571,5.1193,10.5774,8.1628,5.3257,5.6398,7.6829,...,4.6561,7.5776,6.2385,4.1413,5.3047,5.4693,0.0,0.0,0.0,1.0
Train_005,10.1735,9.1585,7.9649,11.0843,5.0119,8.7764,8.1379,3.6227,7.7334,7.6336,...,3.7007,3.872,3.1052,5.4053,5.3599,2.8328,0.0,0.0,0.0,1.0


In [11]:
train_2.shape

(340, 54679)

In [12]:
test_2 = pd.read_csv('Data/PRML_Datacontest_MKN_JUL_2021/Dataset_2_Testing.csv')
test_2 = test_2.set_index('ID_REF').T
test_2.head()

ID_REF,gene_1,gene_2,gene_3,gene_4,gene_5,gene_6,gene_7,gene_8,gene_9,gene_10,...,gene_54666,gene_54667,gene_54668,gene_54669,gene_54670,gene_54671,gene_54672,gene_54673,gene_54674,gene_54675
Sample_001,10.6521,8.3713,8.3213,10.706,4.1977,10.4983,7.2623,3.9235,7.989,7.6231,...,12.9485,12.5151,14.9606,14.6512,4.0528,3.9327,3.7199,2.8913,5.2739,5.2258
Sample_002,9.8898,8.6521,9.3985,11.7578,6.7244,8.8695,8.4035,4.5643,4.4399,6.2953,...,12.9284,12.4207,14.7406,14.4576,4.0443,6.7296,3.4096,4.1144,4.8281,5.1647
Sample_003,7.8641,7.7326,10.9325,11.7974,5.0897,9.509,7.9873,5.5779,3.2286,8.2194,...,13.5891,12.8904,15.3215,15.1596,5.0116,6.9377,4.6141,4.4477,5.2113,7.2413
Sample_004,9.6442,8.0831,8.7473,11.3764,4.9031,10.0253,6.952,4.4099,4.6201,7.5609,...,13.2997,12.8575,15.3859,15.2592,3.8695,4.8133,6.0293,5.4598,6.2368,6.7245
Sample_005,8.7543,8.5221,8.0261,10.3775,4.7883,10.9461,8.0983,4.1806,6.0419,8.8402,...,15.4379,15.3208,16.7965,16.4698,4.5736,3.8728,2.826,6.2769,6.6379,3.993


In [13]:
train_2.shape

(340, 54679)

In [14]:
test_2.shape

(214, 54675)

In [15]:
X_data = train_1.drop(['CO: 1', 'CO: 2'], axis = 1)
y_data = train_1['CO: 1']

pca = PCA(n_components=100)

X_data = pd.DataFrame(pca.fit_transform(X_data), columns = ['compressed_'+str(i) for i in range(100)])

#sc = StandardScaler()
#X_data = pd.DataFrame(sc.fit_transform(X_data), columns = ['compressed_'+str(i) for i in range(100)])


#X_data = pd.DataFrame(np.square(X_data), columns = cols)

#X_data.fillna(-1, inplace = True)

#X_data = SelectKBest(f_classif, k=2000).fit_transform(X_data, y_data)

kfold = StratifiedKFold(random_state = 10, n_splits = 5, shuffle = True)
splits = kfold.split(X_data, y_data)

sm = SMOTE(random_state = 10, k_neighbors = 7, n_jobs = -1)

macro_scores = []
acc_scores = []
weighted_scores = []
mcc_scores = []
Co1_models = []

for Train, Test in splits:
    X_Train, X_Test, Y_Train, Y_Test = X_data.iloc[Train], X_data.iloc[Test], y_data.iloc[Train], y_data.iloc[Test]
    xgb = XGBClassifier(n_estimators = 2000, max_depth = 8, n_jobs = -1,
                        learning_rate = 0.01, reg_lambda = 0.1, random_state = 10)
    #X_Train, Y_Train = sm.fit_resample(X_Train, Y_Train)
    xgb.fit(X_Train, Y_Train, early_stopping_rounds = 200,
            eval_set = [(X_Train, Y_Train), (X_Test, Y_Test)],
            eval_metric = 'logloss', verbose = True)
    pred = xgb.predict(X_Test)
    
    #svm = SVC(verbose = True, max_iter = 20000, probability = True, kernel = 'rbf')
    #svm.fit(X_Train, Y_Train)
    
    #pred = svm.predict(X_Test)
    
    macro_scores.append(f1_score(Y_Test, pred, average = 'macro'))
    weighted_scores.append(f1_score(Y_Test, pred, average = 'weighted'))
    acc_scores.append(accuracy_score(Y_Test, pred)*100)
    mcc_scores.append(matthews_corrcoef(Y_Test, pred))
    Co1_models.append(xgb)



[0]	validation_0-logloss:0.68488	validation_1-logloss:0.68832
[1]	validation_0-logloss:0.67763	validation_1-logloss:0.68563
[2]	validation_0-logloss:0.67052	validation_1-logloss:0.68285
[3]	validation_0-logloss:0.66354	validation_1-logloss:0.68001
[4]	validation_0-logloss:0.65670	validation_1-logloss:0.67742
[5]	validation_0-logloss:0.64998	validation_1-logloss:0.67458
[6]	validation_0-logloss:0.64334	validation_1-logloss:0.67022
[7]	validation_0-logloss:0.63686	validation_1-logloss:0.66788
[8]	validation_0-logloss:0.63050	validation_1-logloss:0.66559
[9]	validation_0-logloss:0.62425	validation_1-logloss:0.66340
[10]	validation_0-logloss:0.61785	validation_1-logloss:0.66012
[11]	validation_0-logloss:0.61180	validation_1-logloss:0.65787
[12]	validation_0-logloss:0.60586	validation_1-logloss:0.65570
[13]	validation_0-logloss:0.59973	validation_1-logloss:0.65413
[14]	validation_0-logloss:0.59397	validation_1-logloss:0.65220
[15]	validation_0-logloss:0.58828	validation_1-logloss:0.65023
[1

[130]	validation_0-logloss:0.21509	validation_1-logloss:0.52230
[131]	validation_0-logloss:0.21349	validation_1-logloss:0.52183
[132]	validation_0-logloss:0.21199	validation_1-logloss:0.52202
[133]	validation_0-logloss:0.21042	validation_1-logloss:0.52157
[134]	validation_0-logloss:0.20887	validation_1-logloss:0.52113
[135]	validation_0-logloss:0.20766	validation_1-logloss:0.52058
[136]	validation_0-logloss:0.20614	validation_1-logloss:0.52026
[137]	validation_0-logloss:0.20464	validation_1-logloss:0.51968
[138]	validation_0-logloss:0.20341	validation_1-logloss:0.51921
[139]	validation_0-logloss:0.20194	validation_1-logloss:0.51868
[140]	validation_0-logloss:0.20064	validation_1-logloss:0.51831
[141]	validation_0-logloss:0.19920	validation_1-logloss:0.51780
[142]	validation_0-logloss:0.19777	validation_1-logloss:0.51730
[143]	validation_0-logloss:0.19646	validation_1-logloss:0.51706
[144]	validation_0-logloss:0.19506	validation_1-logloss:0.51660
[145]	validation_0-logloss:0.19377	valid

[259]	validation_0-logloss:0.09760	validation_1-logloss:0.50403
[260]	validation_0-logloss:0.09711	validation_1-logloss:0.50414
[261]	validation_0-logloss:0.09660	validation_1-logloss:0.50375
[262]	validation_0-logloss:0.09614	validation_1-logloss:0.50377
[263]	validation_0-logloss:0.09567	validation_1-logloss:0.50369
[264]	validation_0-logloss:0.09517	validation_1-logloss:0.50332
[265]	validation_0-logloss:0.09475	validation_1-logloss:0.50346
[266]	validation_0-logloss:0.09427	validation_1-logloss:0.50358
[267]	validation_0-logloss:0.09387	validation_1-logloss:0.50422
[268]	validation_0-logloss:0.09338	validation_1-logloss:0.50354
[269]	validation_0-logloss:0.09299	validation_1-logloss:0.50352
[270]	validation_0-logloss:0.09255	validation_1-logloss:0.50344
[271]	validation_0-logloss:0.09207	validation_1-logloss:0.50279
[272]	validation_0-logloss:0.09154	validation_1-logloss:0.50324
[273]	validation_0-logloss:0.09109	validation_1-logloss:0.50337
[274]	validation_0-logloss:0.09071	valid

[388]	validation_0-logloss:0.05886	validation_1-logloss:0.52160
[389]	validation_0-logloss:0.05865	validation_1-logloss:0.52125
[390]	validation_0-logloss:0.05851	validation_1-logloss:0.52167
[391]	validation_0-logloss:0.05833	validation_1-logloss:0.52187
[392]	validation_0-logloss:0.05813	validation_1-logloss:0.52283
[393]	validation_0-logloss:0.05800	validation_1-logloss:0.52290
[394]	validation_0-logloss:0.05783	validation_1-logloss:0.52337
[395]	validation_0-logloss:0.05770	validation_1-logloss:0.52337
[396]	validation_0-logloss:0.05752	validation_1-logloss:0.52407
[397]	validation_0-logloss:0.05734	validation_1-logloss:0.52440
[398]	validation_0-logloss:0.05720	validation_1-logloss:0.52502
[399]	validation_0-logloss:0.05702	validation_1-logloss:0.52546
[400]	validation_0-logloss:0.05686	validation_1-logloss:0.52594
[401]	validation_0-logloss:0.05668	validation_1-logloss:0.52611
[402]	validation_0-logloss:0.05649	validation_1-logloss:0.52660
[403]	validation_0-logloss:0.05637	valid



[45]	validation_0-logloss:0.43169	validation_1-logloss:0.62486
[46]	validation_0-logloss:0.42781	validation_1-logloss:0.62490
[47]	validation_0-logloss:0.42398	validation_1-logloss:0.62465
[48]	validation_0-logloss:0.41999	validation_1-logloss:0.62200
[49]	validation_0-logloss:0.41613	validation_1-logloss:0.62133
[50]	validation_0-logloss:0.41224	validation_1-logloss:0.61838
[51]	validation_0-logloss:0.40861	validation_1-logloss:0.61775
[52]	validation_0-logloss:0.40482	validation_1-logloss:0.61549
[53]	validation_0-logloss:0.40114	validation_1-logloss:0.61437
[54]	validation_0-logloss:0.39764	validation_1-logloss:0.61425
[55]	validation_0-logloss:0.39399	validation_1-logloss:0.61214
[56]	validation_0-logloss:0.39039	validation_1-logloss:0.60957
[57]	validation_0-logloss:0.38689	validation_1-logloss:0.60865
[58]	validation_0-logloss:0.38355	validation_1-logloss:0.60859
[59]	validation_0-logloss:0.38008	validation_1-logloss:0.60656
[60]	validation_0-logloss:0.37665	validation_1-logloss:

[174]	validation_0-logloss:0.15362	validation_1-logloss:0.53439
[175]	validation_0-logloss:0.15253	validation_1-logloss:0.53355
[176]	validation_0-logloss:0.15155	validation_1-logloss:0.53280
[177]	validation_0-logloss:0.15061	validation_1-logloss:0.53292
[178]	validation_0-logloss:0.14961	validation_1-logloss:0.53313
[179]	validation_0-logloss:0.14863	validation_1-logloss:0.53353
[180]	validation_0-logloss:0.14771	validation_1-logloss:0.53377
[181]	validation_0-logloss:0.14684	validation_1-logloss:0.53382
[182]	validation_0-logloss:0.14601	validation_1-logloss:0.53307
[183]	validation_0-logloss:0.14518	validation_1-logloss:0.53312
[184]	validation_0-logloss:0.14409	validation_1-logloss:0.53335
[185]	validation_0-logloss:0.14313	validation_1-logloss:0.53372
[186]	validation_0-logloss:0.14217	validation_1-logloss:0.53409
[187]	validation_0-logloss:0.14133	validation_1-logloss:0.53432
[188]	validation_0-logloss:0.14040	validation_1-logloss:0.53471
[189]	validation_0-logloss:0.13942	valid

[303]	validation_0-logloss:0.07595	validation_1-logloss:0.53590
[304]	validation_0-logloss:0.07563	validation_1-logloss:0.53624
[305]	validation_0-logloss:0.07532	validation_1-logloss:0.53680
[306]	validation_0-logloss:0.07501	validation_1-logloss:0.53715
[307]	validation_0-logloss:0.07466	validation_1-logloss:0.53749
[308]	validation_0-logloss:0.07436	validation_1-logloss:0.53805
[309]	validation_0-logloss:0.07405	validation_1-logloss:0.53841
[310]	validation_0-logloss:0.07369	validation_1-logloss:0.53881
[311]	validation_0-logloss:0.07339	validation_1-logloss:0.53938
[312]	validation_0-logloss:0.07311	validation_1-logloss:0.53946
[313]	validation_0-logloss:0.07279	validation_1-logloss:0.54005
[314]	validation_0-logloss:0.07253	validation_1-logloss:0.54014
[315]	validation_0-logloss:0.07224	validation_1-logloss:0.54049
[316]	validation_0-logloss:0.07196	validation_1-logloss:0.54069
[317]	validation_0-logloss:0.07159	validation_1-logloss:0.54085
[318]	validation_0-logloss:0.07127	valid

[29]	validation_0-logloss:0.50732	validation_1-logloss:0.62016
[30]	validation_0-logloss:0.50236	validation_1-logloss:0.61858
[31]	validation_0-logloss:0.49721	validation_1-logloss:0.61669
[32]	validation_0-logloss:0.49239	validation_1-logloss:0.61535
[33]	validation_0-logloss:0.48739	validation_1-logloss:0.61385
[34]	validation_0-logloss:0.48270	validation_1-logloss:0.61239
[35]	validation_0-logloss:0.47828	validation_1-logloss:0.61174
[36]	validation_0-logloss:0.47372	validation_1-logloss:0.61055
[37]	validation_0-logloss:0.46923	validation_1-logloss:0.60941
[38]	validation_0-logloss:0.46473	validation_1-logloss:0.60692
[39]	validation_0-logloss:0.46037	validation_1-logloss:0.60586
[40]	validation_0-logloss:0.45575	validation_1-logloss:0.60475
[41]	validation_0-logloss:0.45150	validation_1-logloss:0.60365
[42]	validation_0-logloss:0.44732	validation_1-logloss:0.60254
[43]	validation_0-logloss:0.44332	validation_1-logloss:0.60058
[44]	validation_0-logloss:0.43924	validation_1-logloss:



[56]	validation_0-logloss:0.39412	validation_1-logloss:0.58843
[57]	validation_0-logloss:0.39050	validation_1-logloss:0.58692
[58]	validation_0-logloss:0.38711	validation_1-logloss:0.58635
[59]	validation_0-logloss:0.38377	validation_1-logloss:0.58593
[60]	validation_0-logloss:0.38030	validation_1-logloss:0.58515
[61]	validation_0-logloss:0.37704	validation_1-logloss:0.58486
[62]	validation_0-logloss:0.37365	validation_1-logloss:0.58360
[63]	validation_0-logloss:0.37047	validation_1-logloss:0.58327
[64]	validation_0-logloss:0.36733	validation_1-logloss:0.58299
[65]	validation_0-logloss:0.36422	validation_1-logloss:0.58170
[66]	validation_0-logloss:0.36115	validation_1-logloss:0.58134
[67]	validation_0-logloss:0.35813	validation_1-logloss:0.58100
[68]	validation_0-logloss:0.35492	validation_1-logloss:0.58028
[69]	validation_0-logloss:0.35197	validation_1-logloss:0.58030
[70]	validation_0-logloss:0.34886	validation_1-logloss:0.57922
[71]	validation_0-logloss:0.34598	validation_1-logloss:

[185]	validation_0-logloss:0.14562	validation_1-logloss:0.54717
[186]	validation_0-logloss:0.14459	validation_1-logloss:0.54753
[187]	validation_0-logloss:0.14372	validation_1-logloss:0.54723
[188]	validation_0-logloss:0.14282	validation_1-logloss:0.54728
[189]	validation_0-logloss:0.14181	validation_1-logloss:0.54763
[190]	validation_0-logloss:0.14085	validation_1-logloss:0.54804
[191]	validation_0-logloss:0.14002	validation_1-logloss:0.54888
[192]	validation_0-logloss:0.13915	validation_1-logloss:0.54926
[193]	validation_0-logloss:0.13817	validation_1-logloss:0.54962
[194]	validation_0-logloss:0.13732	validation_1-logloss:0.54999
[195]	validation_0-logloss:0.13640	validation_1-logloss:0.55043
[196]	validation_0-logloss:0.13567	validation_1-logloss:0.55082
[197]	validation_0-logloss:0.13480	validation_1-logloss:0.55121
[198]	validation_0-logloss:0.13401	validation_1-logloss:0.55206
[199]	validation_0-logloss:0.13317	validation_1-logloss:0.55227
[200]	validation_0-logloss:0.13238	valid

[314]	validation_0-logloss:0.07422	validation_1-logloss:0.58796
[315]	validation_0-logloss:0.07395	validation_1-logloss:0.58799
[316]	validation_0-logloss:0.07368	validation_1-logloss:0.58808
[317]	validation_0-logloss:0.07336	validation_1-logloss:0.58867
[318]	validation_0-logloss:0.07310	validation_1-logloss:0.58883
[319]	validation_0-logloss:0.07282	validation_1-logloss:0.58889
[320]	validation_0-logloss:0.07252	validation_1-logloss:0.58871
[321]	validation_0-logloss:0.07222	validation_1-logloss:0.58909
[322]	validation_0-logloss:0.07196	validation_1-logloss:0.58981
[323]	validation_0-logloss:0.07170	validation_1-logloss:0.58985
[324]	validation_0-logloss:0.07145	validation_1-logloss:0.59033
[325]	validation_0-logloss:0.07115	validation_1-logloss:0.59047
[326]	validation_0-logloss:0.07085	validation_1-logloss:0.59107
[327]	validation_0-logloss:0.07060	validation_1-logloss:0.59126
[328]	validation_0-logloss:0.07034	validation_1-logloss:0.59134
[329]	validation_0-logloss:0.07005	valid



[51]	validation_0-logloss:0.39870	validation_1-logloss:0.58712
[52]	validation_0-logloss:0.39493	validation_1-logloss:0.58580
[53]	validation_0-logloss:0.39155	validation_1-logloss:0.58422
[54]	validation_0-logloss:0.38823	validation_1-logloss:0.58348
[55]	validation_0-logloss:0.38448	validation_1-logloss:0.58180
[56]	validation_0-logloss:0.38063	validation_1-logloss:0.58130
[57]	validation_0-logloss:0.37684	validation_1-logloss:0.58085
[58]	validation_0-logloss:0.37365	validation_1-logloss:0.57936
[59]	validation_0-logloss:0.37053	validation_1-logloss:0.57846
[60]	validation_0-logloss:0.36700	validation_1-logloss:0.57746
[61]	validation_0-logloss:0.36337	validation_1-logloss:0.57722
[62]	validation_0-logloss:0.36036	validation_1-logloss:0.57698
[63]	validation_0-logloss:0.35695	validation_1-logloss:0.57562
[64]	validation_0-logloss:0.35358	validation_1-logloss:0.57472
[65]	validation_0-logloss:0.35041	validation_1-logloss:0.57305
[66]	validation_0-logloss:0.34752	validation_1-logloss:

[180]	validation_0-logloss:0.14951	validation_1-logloss:0.56709
[181]	validation_0-logloss:0.14874	validation_1-logloss:0.56793
[182]	validation_0-logloss:0.14779	validation_1-logloss:0.56816
[183]	validation_0-logloss:0.14674	validation_1-logloss:0.56784
[184]	validation_0-logloss:0.14571	validation_1-logloss:0.56789
[185]	validation_0-logloss:0.14478	validation_1-logloss:0.56819
[186]	validation_0-logloss:0.14377	validation_1-logloss:0.56798
[187]	validation_0-logloss:0.14292	validation_1-logloss:0.56891
[188]	validation_0-logloss:0.14205	validation_1-logloss:0.56982
[189]	validation_0-logloss:0.14106	validation_1-logloss:0.56990
[190]	validation_0-logloss:0.14021	validation_1-logloss:0.57059
[191]	validation_0-logloss:0.13950	validation_1-logloss:0.57146
[192]	validation_0-logloss:0.13853	validation_1-logloss:0.57124
[193]	validation_0-logloss:0.13770	validation_1-logloss:0.57194
[194]	validation_0-logloss:0.13675	validation_1-logloss:0.57187
[195]	validation_0-logloss:0.13590	valid

[309]	validation_0-logloss:0.07617	validation_1-logloss:0.63016
[310]	validation_0-logloss:0.07590	validation_1-logloss:0.63125
[311]	validation_0-logloss:0.07562	validation_1-logloss:0.63064
[312]	validation_0-logloss:0.07529	validation_1-logloss:0.63096
[313]	validation_0-logloss:0.07502	validation_1-logloss:0.63184
[314]	validation_0-logloss:0.07478	validation_1-logloss:0.63290
[315]	validation_0-logloss:0.07450	validation_1-logloss:0.63263
[316]	validation_0-logloss:0.07416	validation_1-logloss:0.63260
[317]	validation_0-logloss:0.07387	validation_1-logloss:0.63236
[318]	validation_0-logloss:0.07362	validation_1-logloss:0.63263
[319]	validation_0-logloss:0.07335	validation_1-logloss:0.63231
[320]	validation_0-logloss:0.07309	validation_1-logloss:0.63339
[321]	validation_0-logloss:0.07283	validation_1-logloss:0.63427
[322]	validation_0-logloss:0.07256	validation_1-logloss:0.63416
[323]	validation_0-logloss:0.07233	validation_1-logloss:0.63523
[324]	validation_0-logloss:0.07207	valid



[62]	validation_0-logloss:0.37682	validation_1-logloss:0.64727
[63]	validation_0-logloss:0.37350	validation_1-logloss:0.64596
[64]	validation_0-logloss:0.37064	validation_1-logloss:0.64566
[65]	validation_0-logloss:0.36724	validation_1-logloss:0.64625
[66]	validation_0-logloss:0.36401	validation_1-logloss:0.64660
[67]	validation_0-logloss:0.36084	validation_1-logloss:0.64578
[68]	validation_0-logloss:0.35811	validation_1-logloss:0.64513
[69]	validation_0-logloss:0.35486	validation_1-logloss:0.64523
[70]	validation_0-logloss:0.35165	validation_1-logloss:0.64481
[71]	validation_0-logloss:0.34884	validation_1-logloss:0.64546
[72]	validation_0-logloss:0.34582	validation_1-logloss:0.64558
[73]	validation_0-logloss:0.34325	validation_1-logloss:0.64589
[74]	validation_0-logloss:0.34040	validation_1-logloss:0.64639
[75]	validation_0-logloss:0.33750	validation_1-logloss:0.64586
[76]	validation_0-logloss:0.33502	validation_1-logloss:0.64585
[77]	validation_0-logloss:0.33205	validation_1-logloss:

[191]	validation_0-logloss:0.14429	validation_1-logloss:0.69451
[192]	validation_0-logloss:0.14364	validation_1-logloss:0.69406
[193]	validation_0-logloss:0.14262	validation_1-logloss:0.69579
[194]	validation_0-logloss:0.14190	validation_1-logloss:0.69460
[195]	validation_0-logloss:0.14102	validation_1-logloss:0.69342
[196]	validation_0-logloss:0.14016	validation_1-logloss:0.69225
[197]	validation_0-logloss:0.13925	validation_1-logloss:0.69345
[198]	validation_0-logloss:0.13833	validation_1-logloss:0.69251
[199]	validation_0-logloss:0.13760	validation_1-logloss:0.69414
[200]	validation_0-logloss:0.13700	validation_1-logloss:0.69366
[201]	validation_0-logloss:0.13621	validation_1-logloss:0.69246
[202]	validation_0-logloss:0.13550	validation_1-logloss:0.69411
[203]	validation_0-logloss:0.13470	validation_1-logloss:0.69345
[204]	validation_0-logloss:0.13412	validation_1-logloss:0.69246
[205]	validation_0-logloss:0.13342	validation_1-logloss:0.69412
[206]	validation_0-logloss:0.13255	valid

In [16]:
print("\n\n Classification Model Metrics:")
print("Macro F1: ", sum(macro_scores)/len(macro_scores))
print("Weighted F1: ", sum(weighted_scores)/len(weighted_scores))
print("Accuracy: ", sum(acc_scores)/len(acc_scores), "%")
print("MCC: ", sum(mcc_scores)/len(mcc_scores))



 Classification Model Metrics:
Macro F1:  0.6086789275910689
Weighted F1:  0.717168720231681
Accuracy:  73.84615384615384 %
MCC:  0.24099641850641546


In [17]:
X_data = train_1.drop(['CO: 1', 'CO: 2'], axis = 1)
y_data = train_1['CO: 2']

pca = PCA(n_components=100)

X_data = pd.DataFrame(pca.fit_transform(X_data), columns = ['compressed_'+str(i) for i in range(100)])

#cols = X_data.columns
#sc = StandardScaler()
#X_data = pd.DataFrame(sc.fit_transform(X_data), columns = cols)


#X_data = pd.DataFrame(np.square(X_data), columns = cols)

#X_data.fillna(-1, inplace = True)

#X_data = SelectKBest(f_classif, k=2000).fit_transform(X_data, y_data)

kfold = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 10)
splits = kfold.split(X_data, y_data)

sm = SVMSMOTE(random_state = 10, k_neighbors = 7, n_jobs = -1)

macro_scores = []
acc_scores = []
weighted_scores = []
mcc_scores = []
Co2_models = []

for Train, Test in splits:
    X_Train, X_Test, Y_Train, Y_Test = X_data.iloc[Train], X_data.iloc[Test], y_data.iloc[Train], y_data.iloc[Test]
    xgb = XGBClassifier(n_estimators = 2000, max_depth = 8, n_jobs = -1, learning_rate = 0.01, reg_lambda = 0.1, random_state = 0)
    #X_Train, Y_Train = sm.fit_resample(X_Train, Y_Train)
    xgb.fit(X_Train, Y_Train, early_stopping_rounds = 100, eval_set = [(X_Train, Y_Train), (X_Test, Y_Test)],  verbose = True)
    pred = xgb.predict(X_Test)
    
    #rf = RandomForestClassifier(n_estimators = 2000, n_jobs = -1, random_state = 10)
    #rf.fit(X_Train, Y_Train)
    
    #pred = rf.predict(X_Test)
    
    macro_scores.append(f1_score(Y_Test, pred, average = 'macro'))
    weighted_scores.append(f1_score(Y_Test, pred, average = 'weighted'))
    acc_scores.append(accuracy_score(Y_Test, pred)*100)
    mcc_scores.append(matthews_corrcoef(Y_Test, pred))
    Co2_models.append(xgb)

[0]	validation_0-logloss:0.68507	validation_1-logloss:0.69225
[1]	validation_0-logloss:0.67734	validation_1-logloss:0.69212
[2]	validation_0-logloss:0.66974	validation_1-logloss:0.69159
[3]	validation_0-logloss:0.66229	validation_1-logloss:0.69146
[4]	validation_0-logloss:0.65497	validation_1-logloss:0.69094
[5]	validation_0-logloss:0.64779	validation_1-logloss:0.69097
[6]	validation_0-logloss:0.64073	validation_1-logloss:0.69027
[7]	validation_0-logloss:0.63381	validation_1-logloss:0.69028
[8]	validation_0-logloss:0.62700	validation_1-logloss:0.69032
[9]	validation_0-logloss:0.62032	validation_1-logloss:0.69064
[10]	validation_0-logloss:0.61375	validation_1-logloss:0.69088




[11]	validation_0-logloss:0.60729	validation_1-logloss:0.69131
[12]	validation_0-logloss:0.60095	validation_1-logloss:0.69191
[13]	validation_0-logloss:0.59471	validation_1-logloss:0.69168
[14]	validation_0-logloss:0.58859	validation_1-logloss:0.69188
[15]	validation_0-logloss:0.58256	validation_1-logloss:0.69240
[16]	validation_0-logloss:0.57663	validation_1-logloss:0.69281
[17]	validation_0-logloss:0.57100	validation_1-logloss:0.69319
[18]	validation_0-logloss:0.56532	validation_1-logloss:0.69375
[19]	validation_0-logloss:0.55969	validation_1-logloss:0.69419
[20]	validation_0-logloss:0.55432	validation_1-logloss:0.69450
[21]	validation_0-logloss:0.54890	validation_1-logloss:0.69555
[22]	validation_0-logloss:0.54354	validation_1-logloss:0.69607
[23]	validation_0-logloss:0.53829	validation_1-logloss:0.69682
[24]	validation_0-logloss:0.53325	validation_1-logloss:0.69763
[25]	validation_0-logloss:0.52812	validation_1-logloss:0.69813
[26]	validation_0-logloss:0.52311	validation_1-logloss:

[141]	validation_0-logloss:0.20461	validation_1-logloss:0.72526
[142]	validation_0-logloss:0.20333	validation_1-logloss:0.72536
[143]	validation_0-logloss:0.20198	validation_1-logloss:0.72559
[144]	validation_0-logloss:0.20058	validation_1-logloss:0.72467
[145]	validation_0-logloss:0.19926	validation_1-logloss:0.72491
[146]	validation_0-logloss:0.19793	validation_1-logloss:0.72314
[147]	validation_0-logloss:0.19664	validation_1-logloss:0.72340
[0]	validation_0-logloss:0.68571	validation_1-logloss:0.69387
[1]	validation_0-logloss:0.67890	validation_1-logloss:0.69479
[2]	validation_0-logloss:0.67194	validation_1-logloss:0.69540
[3]	validation_0-logloss:0.66511	validation_1-logloss:0.69695
[4]	validation_0-logloss:0.65840	validation_1-logloss:0.69882
[5]	validation_0-logloss:0.65204	validation_1-logloss:0.69992
[6]	validation_0-logloss:0.64555	validation_1-logloss:0.70161
[7]	validation_0-logloss:0.63941	validation_1-logloss:0.70306
[8]	validation_0-logloss:0.63293	validation_1-logloss:0.



[38]	validation_0-logloss:0.46911	validation_1-logloss:0.72079
[39]	validation_0-logloss:0.46495	validation_1-logloss:0.72046
[40]	validation_0-logloss:0.46055	validation_1-logloss:0.71908
[41]	validation_0-logloss:0.45595	validation_1-logloss:0.72143
[42]	validation_0-logloss:0.45166	validation_1-logloss:0.72086
[43]	validation_0-logloss:0.44771	validation_1-logloss:0.71986
[44]	validation_0-logloss:0.44328	validation_1-logloss:0.72222
[45]	validation_0-logloss:0.43891	validation_1-logloss:0.72429
[46]	validation_0-logloss:0.43478	validation_1-logloss:0.72365
[47]	validation_0-logloss:0.43053	validation_1-logloss:0.72612
[48]	validation_0-logloss:0.42633	validation_1-logloss:0.72859
[49]	validation_0-logloss:0.42261	validation_1-logloss:0.72772
[50]	validation_0-logloss:0.41872	validation_1-logloss:0.72698
[51]	validation_0-logloss:0.41467	validation_1-logloss:0.72947
[52]	validation_0-logloss:0.41068	validation_1-logloss:0.73169
[53]	validation_0-logloss:0.40728	validation_1-logloss:



[46]	validation_0-logloss:0.42801	validation_1-logloss:0.76401
[47]	validation_0-logloss:0.42411	validation_1-logloss:0.76596
[48]	validation_0-logloss:0.41987	validation_1-logloss:0.76866
[49]	validation_0-logloss:0.41597	validation_1-logloss:0.77041
[50]	validation_0-logloss:0.41184	validation_1-logloss:0.77314
[51]	validation_0-logloss:0.40788	validation_1-logloss:0.77547
[52]	validation_0-logloss:0.40424	validation_1-logloss:0.77750
[53]	validation_0-logloss:0.40027	validation_1-logloss:0.78027
[54]	validation_0-logloss:0.39663	validation_1-logloss:0.78211
[55]	validation_0-logloss:0.39273	validation_1-logloss:0.78349
[56]	validation_0-logloss:0.38899	validation_1-logloss:0.78391
[57]	validation_0-logloss:0.38498	validation_1-logloss:0.78570
[58]	validation_0-logloss:0.38102	validation_1-logloss:0.78753
[59]	validation_0-logloss:0.37712	validation_1-logloss:0.78938
[60]	validation_0-logloss:0.37328	validation_1-logloss:0.79127
[61]	validation_0-logloss:0.36929	validation_1-logloss:



[41]	validation_0-logloss:0.46193	validation_1-logloss:0.67917
[42]	validation_0-logloss:0.45794	validation_1-logloss:0.67783
[43]	validation_0-logloss:0.45356	validation_1-logloss:0.67818
[44]	validation_0-logloss:0.44968	validation_1-logloss:0.67615
[45]	validation_0-logloss:0.44532	validation_1-logloss:0.67583
[46]	validation_0-logloss:0.44139	validation_1-logloss:0.67605
[47]	validation_0-logloss:0.43765	validation_1-logloss:0.67480
[48]	validation_0-logloss:0.43344	validation_1-logloss:0.67452
[49]	validation_0-logloss:0.42937	validation_1-logloss:0.67487
[50]	validation_0-logloss:0.42576	validation_1-logloss:0.67393
[51]	validation_0-logloss:0.42171	validation_1-logloss:0.67370
[52]	validation_0-logloss:0.41805	validation_1-logloss:0.67397
[53]	validation_0-logloss:0.41457	validation_1-logloss:0.67210
[54]	validation_0-logloss:0.41066	validation_1-logloss:0.67192
[55]	validation_0-logloss:0.40680	validation_1-logloss:0.67207
[56]	validation_0-logloss:0.40344	validation_1-logloss:

[170]	validation_0-logloss:0.17629	validation_1-logloss:0.63378
[171]	validation_0-logloss:0.17509	validation_1-logloss:0.63310
[172]	validation_0-logloss:0.17416	validation_1-logloss:0.63277
[173]	validation_0-logloss:0.17298	validation_1-logloss:0.63212
[174]	validation_0-logloss:0.17183	validation_1-logloss:0.63256
[175]	validation_0-logloss:0.17086	validation_1-logloss:0.63319
[176]	validation_0-logloss:0.16971	validation_1-logloss:0.63257
[177]	validation_0-logloss:0.16872	validation_1-logloss:0.63258
[178]	validation_0-logloss:0.16785	validation_1-logloss:0.63113
[179]	validation_0-logloss:0.16690	validation_1-logloss:0.63147
[180]	validation_0-logloss:0.16580	validation_1-logloss:0.63153
[181]	validation_0-logloss:0.16477	validation_1-logloss:0.63214
[182]	validation_0-logloss:0.16368	validation_1-logloss:0.63222
[183]	validation_0-logloss:0.16246	validation_1-logloss:0.63189
[184]	validation_0-logloss:0.16141	validation_1-logloss:0.63252
[185]	validation_0-logloss:0.16063	valid

[299]	validation_0-logloss:0.09252	validation_1-logloss:0.60035
[300]	validation_0-logloss:0.09216	validation_1-logloss:0.60040
[301]	validation_0-logloss:0.09168	validation_1-logloss:0.60034
[302]	validation_0-logloss:0.09130	validation_1-logloss:0.60025
[303]	validation_0-logloss:0.09096	validation_1-logloss:0.60033
[304]	validation_0-logloss:0.09063	validation_1-logloss:0.60006
[305]	validation_0-logloss:0.09025	validation_1-logloss:0.59946
[306]	validation_0-logloss:0.08989	validation_1-logloss:0.59949
[307]	validation_0-logloss:0.08957	validation_1-logloss:0.59883
[308]	validation_0-logloss:0.08930	validation_1-logloss:0.59921
[309]	validation_0-logloss:0.08893	validation_1-logloss:0.59785
[310]	validation_0-logloss:0.08859	validation_1-logloss:0.59791
[311]	validation_0-logloss:0.08823	validation_1-logloss:0.59782
[312]	validation_0-logloss:0.08788	validation_1-logloss:0.59816
[313]	validation_0-logloss:0.08754	validation_1-logloss:0.59819
[314]	validation_0-logloss:0.08723	valid

[428]	validation_0-logloss:0.06028	validation_1-logloss:0.57304
[429]	validation_0-logloss:0.06013	validation_1-logloss:0.57293
[430]	validation_0-logloss:0.05994	validation_1-logloss:0.57213
[431]	validation_0-logloss:0.05979	validation_1-logloss:0.57272
[432]	validation_0-logloss:0.05963	validation_1-logloss:0.57327
[433]	validation_0-logloss:0.05945	validation_1-logloss:0.57373
[434]	validation_0-logloss:0.05932	validation_1-logloss:0.57358
[435]	validation_0-logloss:0.05912	validation_1-logloss:0.57391
[436]	validation_0-logloss:0.05896	validation_1-logloss:0.57332
[437]	validation_0-logloss:0.05881	validation_1-logloss:0.57322
[438]	validation_0-logloss:0.05865	validation_1-logloss:0.57307
[439]	validation_0-logloss:0.05851	validation_1-logloss:0.57357
[440]	validation_0-logloss:0.05834	validation_1-logloss:0.57406
[441]	validation_0-logloss:0.05817	validation_1-logloss:0.57357
[442]	validation_0-logloss:0.05801	validation_1-logloss:0.57412
[443]	validation_0-logloss:0.05787	valid

[22]	validation_0-logloss:0.55238	validation_1-logloss:0.68111
[23]	validation_0-logloss:0.54705	validation_1-logloss:0.68042
[24]	validation_0-logloss:0.54224	validation_1-logloss:0.67986
[25]	validation_0-logloss:0.53697	validation_1-logloss:0.67848
[26]	validation_0-logloss:0.53158	validation_1-logloss:0.67725
[27]	validation_0-logloss:0.52659	validation_1-logloss:0.67731
[28]	validation_0-logloss:0.52161	validation_1-logloss:0.67706
[29]	validation_0-logloss:0.51653	validation_1-logloss:0.67659
[30]	validation_0-logloss:0.51174	validation_1-logloss:0.67656
[31]	validation_0-logloss:0.50671	validation_1-logloss:0.67550
[32]	validation_0-logloss:0.50198	validation_1-logloss:0.67544
[33]	validation_0-logloss:0.49738	validation_1-logloss:0.67566
[34]	validation_0-logloss:0.49263	validation_1-logloss:0.67556
[35]	validation_0-logloss:0.48808	validation_1-logloss:0.67552
[36]	validation_0-logloss:0.48353	validation_1-logloss:0.67411
[37]	validation_0-logloss:0.47910	validation_1-logloss:



[42]	validation_0-logloss:0.45754	validation_1-logloss:0.66997
[43]	validation_0-logloss:0.45341	validation_1-logloss:0.66934
[44]	validation_0-logloss:0.44933	validation_1-logloss:0.66942
[45]	validation_0-logloss:0.44527	validation_1-logloss:0.66894
[46]	validation_0-logloss:0.44129	validation_1-logloss:0.66892
[47]	validation_0-logloss:0.43717	validation_1-logloss:0.66752
[48]	validation_0-logloss:0.43335	validation_1-logloss:0.66682
[49]	validation_0-logloss:0.42949	validation_1-logloss:0.66625
[50]	validation_0-logloss:0.42568	validation_1-logloss:0.66543
[51]	validation_0-logloss:0.42203	validation_1-logloss:0.66492
[52]	validation_0-logloss:0.41839	validation_1-logloss:0.66430
[53]	validation_0-logloss:0.41463	validation_1-logloss:0.66307
[54]	validation_0-logloss:0.41099	validation_1-logloss:0.66258
[55]	validation_0-logloss:0.40748	validation_1-logloss:0.66204
[56]	validation_0-logloss:0.40401	validation_1-logloss:0.66131
[57]	validation_0-logloss:0.40050	validation_1-logloss:

[171]	validation_0-logloss:0.17554	validation_1-logloss:0.65228
[172]	validation_0-logloss:0.17439	validation_1-logloss:0.65294
[173]	validation_0-logloss:0.17331	validation_1-logloss:0.65296
[174]	validation_0-logloss:0.17231	validation_1-logloss:0.65249
[175]	validation_0-logloss:0.17115	validation_1-logloss:0.65209
[176]	validation_0-logloss:0.17014	validation_1-logloss:0.65356
[177]	validation_0-logloss:0.16903	validation_1-logloss:0.65388
[178]	validation_0-logloss:0.16806	validation_1-logloss:0.65402
[179]	validation_0-logloss:0.16692	validation_1-logloss:0.65566
[180]	validation_0-logloss:0.16595	validation_1-logloss:0.65714
[181]	validation_0-logloss:0.16501	validation_1-logloss:0.65660
[182]	validation_0-logloss:0.16405	validation_1-logloss:0.65809
[183]	validation_0-logloss:0.16304	validation_1-logloss:0.65797
[184]	validation_0-logloss:0.16198	validation_1-logloss:0.66013
[185]	validation_0-logloss:0.16106	validation_1-logloss:0.66013
[186]	validation_0-logloss:0.16003	valid

In [18]:
print("\n\n Classification Model Metrics:")
print("Macro F1: ", sum(macro_scores)/len(macro_scores))
print("Weighted F1: ", sum(weighted_scores)/len(weighted_scores))
print("Accuracy: ", sum(acc_scores)/len(acc_scores), "%")
print("MCC: ", sum(mcc_scores)/len(mcc_scores))



 Classification Model Metrics:
Macro F1:  0.5118260872938293
Weighted F1:  0.535142837449289
Accuracy:  55.38461538461538 %
MCC:  0.05840538001632776


In [27]:
X_data = train_2.drop(['CO: 3', 'CO: 4', 'CO: 5', 'CO: 6'], axis = 1)
y_data = train_2['CO: 3']

cols = X_data.columns
sc = StandardScaler()
X_data = pd.DataFrame(sc.fit_transform(X_data), columns = cols)


#X_data = pd.DataFrame(np.square(X_data), columns = cols)

#X_data = X_data.apply(lambda x: x.fillna(x.mean()),axis=0)
#X_data.fillna(-1, inplace = True)
#X_data = SelectKBest(f_classif, k=2000).fit_transform(X_data, y_data)

kfold = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 10)
splits = kfold.split(X_data, y_data)

sm = SMOTE(random_state = 10, k_neighbors = 7, n_jobs = -1)

macro_scores = []
acc_scores = []
weighted_scores = []
mcc_scores = []
Co3_models = []

for Train, Test in splits:
    X_Train, X_Test, Y_Train, Y_Test = X_data.iloc[Train], X_data.iloc[Test], y_data.iloc[Train], y_data.iloc[Test]
    #xgb = XGBClassifier(n_estimators = 2000, max_depth = 8, n_jobs = -1, learning_rate = 0.01, reg_lambda = 0.1)
    #X_Train, Y_Train = sm.fit_resample(X_Train, Y_Train)
    #xgb.fit(X_Train, Y_Train, early_stopping_rounds = 200, eval_set = [(X_Train, Y_Train), (X_Test, Y_Test)],  verbose = True)
    #pred = xgb.predict(X_Test)
    #'''
    rf = RandomForestClassifier(n_estimators = 3000, n_jobs = -1, random_state = 10)
    rf.fit(X_Train, Y_Train)
    
    pred = rf.predict(X_Test)
    #'''
    macro_scores.append(f1_score(Y_Test, pred, average = 'macro'))
    weighted_scores.append(f1_score(Y_Test, pred, average = 'weighted'))
    acc_scores.append(accuracy_score(Y_Test, pred)*100)
    mcc_scores.append(matthews_corrcoef(Y_Test, pred))
    Co3_models.append(rf)

In [28]:
print("\n\n Classification Model Metrics:")
print("Macro F1: ", sum(macro_scores)/len(macro_scores))
print("Weighted F1: ", sum(weighted_scores)/len(weighted_scores))
print("Accuracy: ", sum(acc_scores)/len(acc_scores), "%")
print("MCC: ", sum(mcc_scores)/len(mcc_scores))



 Classification Model Metrics:
Macro F1:  0.5285555163816568
Weighted F1:  0.7020897921743481
Accuracy:  77.35294117647058 %
MCC:  0.23432691358123506


In [29]:
X_data = train_2.drop(['CO: 3', 'CO: 4', 'CO: 5', 'CO: 6'], axis = 1)
y_data = train_2['CO: 4']

cols = X_data.columns
sc = StandardScaler()
X_data = pd.DataFrame(sc.fit_transform(X_data), columns = cols)


#X_data = pd.DataFrame(np.square(X_data), columns = cols)

#X_data = X_data.apply(lambda x: x.fillna(x.mean()),axis=0)
#X_data.fillna(-1, inplace = True)
#X_data = SelectKBest(f_classif, k=2000).fit_transform(X_data, y_data)

kfold = StratifiedKFold(n_splits = 10, shuffle = True, random_state = 10)
splits = kfold.split(X_data, y_data)

sm = SMOTE(random_state = 42, k_neighbors = 7, n_jobs = -1)

macro_scores = []
acc_scores = []
weighted_scores = []
mcc_scores = []
Co4_models = []

for Train, Test in splits:
    X_Train, X_Test, Y_Train, Y_Test = X_data.iloc[Train], X_data.iloc[Test], y_data.iloc[Train], y_data.iloc[Test]
    #xgb = XGBClassifier(n_estimators = 2000, max_depth = 8, n_jobs = -1, learning_rate = 0.01, reg_lambda = 0.1)
    #X_Train, Y_Train = sm.fit_resample(X_Train, Y_Train)
    #xgb.fit(X_Train, Y_Train, early_stopping_rounds = 200, eval_set = [(X_Train, Y_Train), (X_Test, Y_Test)],  verbose = True)
    #pred = xgb.predict(X_Test)
    '''
    svm = SVC(verbose = True, probability = True)
    svm.fit(X_Train, Y_Train)
    
    pred = svm.predict(X_Test)
    #'''
    lr = LogisticRegression()
    lr.fit(X_Train, Y_Train)
    pred = lr.predict(X_Test)
    macro_scores.append(f1_score(Y_Test, pred, average = 'macro'))
    weighted_scores.append(f1_score(Y_Test, pred, average = 'weighted'))
    acc_scores.append(accuracy_score(Y_Test, pred)*100)
    mcc_scores.append(matthews_corrcoef(Y_Test, pred))
    Co4_models.append(lr)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/sta

In [30]:
print("\n\n Classification Model Metrics:")
print("Macro F1: ", sum(macro_scores)/len(macro_scores))
print("Weighted F1: ", sum(weighted_scores)/len(weighted_scores))
print("Accuracy: ", sum(acc_scores)/len(acc_scores), "%")
print("MCC: ", sum(mcc_scores)/len(mcc_scores))



 Classification Model Metrics:
Macro F1:  0.5024799803580872
Weighted F1:  0.7939674330864022
Accuracy:  85.0 %
MCC:  0.08866203028815722


In [31]:
X_data = train_2.drop(['CO: 3', 'CO: 4', 'CO: 5', 'CO: 6'], axis = 1)
y_data = train_2['CO: 5']

cols = X_data.columns
sc = StandardScaler()
X_data = pd.DataFrame(sc.fit_transform(X_data), columns = cols)


#X_data = pd.DataFrame(np.square(X_data), columns = cols)

#X_data.fillna(-1, inplace = True)

#X_data = SelectKBest(f_classif, k=2000).fit_transform(X_data, y_data)

kfold = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 10)
splits = kfold.split(X_data, y_data)

sm = SMOTE(random_state = 42, k_neighbors = 7, n_jobs = -1)

macro_scores = []
acc_scores = []
weighted_scores = []
mcc_scores = []
Co5_models = []

for Train, Test in splits:
    X_Train, X_Test, Y_Train, Y_Test = X_data.iloc[Train], X_data.iloc[Test], y_data.iloc[Train], y_data.iloc[Test]
    xgb = XGBClassifier(n_estimators = 2000, max_depth = 8, n_jobs = -1, learning_rate = 0.01, reg_lambda = 0.1, random_state = 10)
    #X_Train, Y_Train = sm.fit_resample(X_Train, Y_Train)
    xgb.fit(X_Train, Y_Train, early_stopping_rounds = 100, eval_set = [(X_Train, Y_Train), (X_Test, Y_Test)],  verbose = True)
    pred = xgb.predict(X_Test)
    
    macro_scores.append(f1_score(Y_Test, pred, average = 'macro'))
    weighted_scores.append(f1_score(Y_Test, pred, average = 'weighted'))
    acc_scores.append(accuracy_score(Y_Test, pred)*100)
    mcc_scores.append(matthews_corrcoef(Y_Test, pred))
    Co5_models.append(xgb)



[0]	validation_0-logloss:0.68395	validation_1-logloss:0.68678
[1]	validation_0-logloss:0.67502	validation_1-logloss:0.68022
[2]	validation_0-logloss:0.66635	validation_1-logloss:0.67385
[3]	validation_0-logloss:0.65775	validation_1-logloss:0.66769
[4]	validation_0-logloss:0.64931	validation_1-logloss:0.66192
[5]	validation_0-logloss:0.64103	validation_1-logloss:0.65571
[6]	validation_0-logloss:0.63290	validation_1-logloss:0.65017
[7]	validation_0-logloss:0.62491	validation_1-logloss:0.64446
[8]	validation_0-logloss:0.61708	validation_1-logloss:0.63911
[9]	validation_0-logloss:0.60938	validation_1-logloss:0.63351
[10]	validation_0-logloss:0.60181	validation_1-logloss:0.62832
[11]	validation_0-logloss:0.59439	validation_1-logloss:0.62300
[12]	validation_0-logloss:0.58709	validation_1-logloss:0.61803
[13]	validation_0-logloss:0.57992	validation_1-logloss:0.61299
[14]	validation_0-logloss:0.57287	validation_1-logloss:0.60813
[15]	validation_0-logloss:0.56594	validation_1-logloss:0.60334
[1

[126]	validation_0-logloss:0.17629	validation_1-logloss:0.31035
[127]	validation_0-logloss:0.17452	validation_1-logloss:0.30886
[128]	validation_0-logloss:0.17277	validation_1-logloss:0.30729
[129]	validation_0-logloss:0.17105	validation_1-logloss:0.30578
[130]	validation_0-logloss:0.16934	validation_1-logloss:0.30426
[131]	validation_0-logloss:0.16765	validation_1-logloss:0.30280
[132]	validation_0-logloss:0.16598	validation_1-logloss:0.30142
[133]	validation_0-logloss:0.16433	validation_1-logloss:0.30005
[134]	validation_0-logloss:0.16285	validation_1-logloss:0.29900
[135]	validation_0-logloss:0.16124	validation_1-logloss:0.29781
[136]	validation_0-logloss:0.15965	validation_1-logloss:0.29632
[137]	validation_0-logloss:0.15807	validation_1-logloss:0.29495
[138]	validation_0-logloss:0.15651	validation_1-logloss:0.29372
[139]	validation_0-logloss:0.15497	validation_1-logloss:0.29239
[140]	validation_0-logloss:0.15345	validation_1-logloss:0.29116
[141]	validation_0-logloss:0.15208	valid

[255]	validation_0-logloss:0.05781	validation_1-logloss:0.20947
[256]	validation_0-logloss:0.05737	validation_1-logloss:0.20864
[257]	validation_0-logloss:0.05697	validation_1-logloss:0.20865
[258]	validation_0-logloss:0.05654	validation_1-logloss:0.20812
[259]	validation_0-logloss:0.05612	validation_1-logloss:0.20768
[260]	validation_0-logloss:0.05568	validation_1-logloss:0.20711
[261]	validation_0-logloss:0.05531	validation_1-logloss:0.20708
[262]	validation_0-logloss:0.05489	validation_1-logloss:0.20664
[263]	validation_0-logloss:0.05447	validation_1-logloss:0.20618
[264]	validation_0-logloss:0.05411	validation_1-logloss:0.20592
[265]	validation_0-logloss:0.05371	validation_1-logloss:0.20516
[266]	validation_0-logloss:0.05330	validation_1-logloss:0.20471
[267]	validation_0-logloss:0.05294	validation_1-logloss:0.20451
[268]	validation_0-logloss:0.05255	validation_1-logloss:0.20384
[269]	validation_0-logloss:0.05221	validation_1-logloss:0.20361
[270]	validation_0-logloss:0.05183	valid

[384]	validation_0-logloss:0.02553	validation_1-logloss:0.18010
[385]	validation_0-logloss:0.02540	validation_1-logloss:0.18002
[386]	validation_0-logloss:0.02528	validation_1-logloss:0.18018
[387]	validation_0-logloss:0.02515	validation_1-logloss:0.18006
[388]	validation_0-logloss:0.02502	validation_1-logloss:0.18000
[389]	validation_0-logloss:0.02492	validation_1-logloss:0.17991
[390]	validation_0-logloss:0.02480	validation_1-logloss:0.18006
[391]	validation_0-logloss:0.02467	validation_1-logloss:0.18018
[392]	validation_0-logloss:0.02455	validation_1-logloss:0.17972
[393]	validation_0-logloss:0.02443	validation_1-logloss:0.17984
[394]	validation_0-logloss:0.02431	validation_1-logloss:0.17978
[395]	validation_0-logloss:0.02418	validation_1-logloss:0.17967
[396]	validation_0-logloss:0.02407	validation_1-logloss:0.17922
[397]	validation_0-logloss:0.02398	validation_1-logloss:0.17914
[398]	validation_0-logloss:0.02386	validation_1-logloss:0.17911
[399]	validation_0-logloss:0.02377	valid

[513]	validation_0-logloss:0.01528	validation_1-logloss:0.17641
[514]	validation_0-logloss:0.01524	validation_1-logloss:0.17613
[515]	validation_0-logloss:0.01520	validation_1-logloss:0.17632
[516]	validation_0-logloss:0.01515	validation_1-logloss:0.17636
[517]	validation_0-logloss:0.01511	validation_1-logloss:0.17635
[518]	validation_0-logloss:0.01506	validation_1-logloss:0.17621
[519]	validation_0-logloss:0.01502	validation_1-logloss:0.17621
[520]	validation_0-logloss:0.01497	validation_1-logloss:0.17625
[521]	validation_0-logloss:0.01492	validation_1-logloss:0.17631
[522]	validation_0-logloss:0.01488	validation_1-logloss:0.17649
[523]	validation_0-logloss:0.01484	validation_1-logloss:0.17650
[524]	validation_0-logloss:0.01479	validation_1-logloss:0.17636
[525]	validation_0-logloss:0.01475	validation_1-logloss:0.17609
[526]	validation_0-logloss:0.01470	validation_1-logloss:0.17616
[527]	validation_0-logloss:0.01468	validation_1-logloss:0.17616
[528]	validation_0-logloss:0.01463	valid



[0]	validation_0-logloss:0.68387	validation_1-logloss:0.68611
[1]	validation_0-logloss:0.67490	validation_1-logloss:0.68013
[2]	validation_0-logloss:0.66610	validation_1-logloss:0.67334
[3]	validation_0-logloss:0.65747	validation_1-logloss:0.66753
[4]	validation_0-logloss:0.64900	validation_1-logloss:0.66056
[5]	validation_0-logloss:0.64068	validation_1-logloss:0.65507
[6]	validation_0-logloss:0.63252	validation_1-logloss:0.64872
[7]	validation_0-logloss:0.62450	validation_1-logloss:0.64306
[8]	validation_0-logloss:0.61663	validation_1-logloss:0.63677
[9]	validation_0-logloss:0.60890	validation_1-logloss:0.63148
[10]	validation_0-logloss:0.60131	validation_1-logloss:0.62565
[11]	validation_0-logloss:0.59385	validation_1-logloss:0.62064
[12]	validation_0-logloss:0.58652	validation_1-logloss:0.61520
[13]	validation_0-logloss:0.57932	validation_1-logloss:0.61021
[14]	validation_0-logloss:0.57224	validation_1-logloss:0.60481
[15]	validation_0-logloss:0.56528	validation_1-logloss:0.60050
[1

[126]	validation_0-logloss:0.17274	validation_1-logloss:0.37677
[127]	validation_0-logloss:0.17109	validation_1-logloss:0.37559
[128]	validation_0-logloss:0.16942	validation_1-logloss:0.37446
[129]	validation_0-logloss:0.16781	validation_1-logloss:0.37346
[130]	validation_0-logloss:0.16618	validation_1-logloss:0.37202
[131]	validation_0-logloss:0.16469	validation_1-logloss:0.37111
[132]	validation_0-logloss:0.16314	validation_1-logloss:0.37028
[133]	validation_0-logloss:0.16157	validation_1-logloss:0.36892
[134]	validation_0-logloss:0.16013	validation_1-logloss:0.36811
[135]	validation_0-logloss:0.15862	validation_1-logloss:0.36687
[136]	validation_0-logloss:0.15710	validation_1-logloss:0.36620
[137]	validation_0-logloss:0.15563	validation_1-logloss:0.36491
[138]	validation_0-logloss:0.15425	validation_1-logloss:0.36401
[139]	validation_0-logloss:0.15278	validation_1-logloss:0.36311
[140]	validation_0-logloss:0.15135	validation_1-logloss:0.36195
[141]	validation_0-logloss:0.14990	valid

[255]	validation_0-logloss:0.05733	validation_1-logloss:0.30933
[256]	validation_0-logloss:0.05694	validation_1-logloss:0.30942
[257]	validation_0-logloss:0.05650	validation_1-logloss:0.30880
[258]	validation_0-logloss:0.05608	validation_1-logloss:0.30820
[259]	validation_0-logloss:0.05565	validation_1-logloss:0.30743
[260]	validation_0-logloss:0.05527	validation_1-logloss:0.30705
[261]	validation_0-logloss:0.05489	validation_1-logloss:0.30718
[262]	validation_0-logloss:0.05448	validation_1-logloss:0.30652
[263]	validation_0-logloss:0.05407	validation_1-logloss:0.30592
[264]	validation_0-logloss:0.05370	validation_1-logloss:0.30593
[265]	validation_0-logloss:0.05333	validation_1-logloss:0.30570
[266]	validation_0-logloss:0.05293	validation_1-logloss:0.30502
[267]	validation_0-logloss:0.05256	validation_1-logloss:0.30549
[268]	validation_0-logloss:0.05217	validation_1-logloss:0.30467
[269]	validation_0-logloss:0.05178	validation_1-logloss:0.30401
[270]	validation_0-logloss:0.05143	valid

[384]	validation_0-logloss:0.02613	validation_1-logloss:0.29484
[385]	validation_0-logloss:0.02601	validation_1-logloss:0.29482
[386]	validation_0-logloss:0.02587	validation_1-logloss:0.29472
[387]	validation_0-logloss:0.02575	validation_1-logloss:0.29449
[388]	validation_0-logloss:0.02563	validation_1-logloss:0.29393
[389]	validation_0-logloss:0.02549	validation_1-logloss:0.29339
[390]	validation_0-logloss:0.02535	validation_1-logloss:0.29317
[391]	validation_0-logloss:0.02523	validation_1-logloss:0.29321
[392]	validation_0-logloss:0.02511	validation_1-logloss:0.29354
[393]	validation_0-logloss:0.02499	validation_1-logloss:0.29364
[394]	validation_0-logloss:0.02487	validation_1-logloss:0.29341
[395]	validation_0-logloss:0.02475	validation_1-logloss:0.29322
[396]	validation_0-logloss:0.02464	validation_1-logloss:0.29338
[397]	validation_0-logloss:0.02452	validation_1-logloss:0.29284
[398]	validation_0-logloss:0.02441	validation_1-logloss:0.29262
[399]	validation_0-logloss:0.02428	valid

[513]	validation_0-logloss:0.01546	validation_1-logloss:0.29215
[514]	validation_0-logloss:0.01541	validation_1-logloss:0.29230
[515]	validation_0-logloss:0.01537	validation_1-logloss:0.29241
[516]	validation_0-logloss:0.01532	validation_1-logloss:0.29250
[517]	validation_0-logloss:0.01528	validation_1-logloss:0.29269
[518]	validation_0-logloss:0.01523	validation_1-logloss:0.29287
[519]	validation_0-logloss:0.01518	validation_1-logloss:0.29299
[520]	validation_0-logloss:0.01513	validation_1-logloss:0.29299
[521]	validation_0-logloss:0.01508	validation_1-logloss:0.29304
[522]	validation_0-logloss:0.01504	validation_1-logloss:0.29319
[523]	validation_0-logloss:0.01500	validation_1-logloss:0.29323
[524]	validation_0-logloss:0.01495	validation_1-logloss:0.29341
[525]	validation_0-logloss:0.01490	validation_1-logloss:0.29349
[526]	validation_0-logloss:0.01486	validation_1-logloss:0.29353
[527]	validation_0-logloss:0.01484	validation_1-logloss:0.29355
[528]	validation_0-logloss:0.01479	valid



[0]	validation_0-logloss:0.68380	validation_1-logloss:0.68666
[1]	validation_0-logloss:0.67472	validation_1-logloss:0.68019
[2]	validation_0-logloss:0.66581	validation_1-logloss:0.67363
[3]	validation_0-logloss:0.65707	validation_1-logloss:0.66762
[4]	validation_0-logloss:0.64850	validation_1-logloss:0.66151
[5]	validation_0-logloss:0.64008	validation_1-logloss:0.65538
[6]	validation_0-logloss:0.63182	validation_1-logloss:0.64981
[7]	validation_0-logloss:0.62371	validation_1-logloss:0.64416
[8]	validation_0-logloss:0.61575	validation_1-logloss:0.63847
[9]	validation_0-logloss:0.60794	validation_1-logloss:0.63319
[10]	validation_0-logloss:0.60026	validation_1-logloss:0.62784
[11]	validation_0-logloss:0.59272	validation_1-logloss:0.62235
[12]	validation_0-logloss:0.58531	validation_1-logloss:0.61730
[13]	validation_0-logloss:0.57803	validation_1-logloss:0.61240
[14]	validation_0-logloss:0.57088	validation_1-logloss:0.60735
[15]	validation_0-logloss:0.56385	validation_1-logloss:0.60278
[1

[126]	validation_0-logloss:0.16850	validation_1-logloss:0.36838
[127]	validation_0-logloss:0.16703	validation_1-logloss:0.36737
[128]	validation_0-logloss:0.16556	validation_1-logloss:0.36673
[129]	validation_0-logloss:0.16412	validation_1-logloss:0.36561
[130]	validation_0-logloss:0.16263	validation_1-logloss:0.36451
[131]	validation_0-logloss:0.16120	validation_1-logloss:0.36363
[132]	validation_0-logloss:0.15981	validation_1-logloss:0.36236
[133]	validation_0-logloss:0.15832	validation_1-logloss:0.36185
[134]	validation_0-logloss:0.15683	validation_1-logloss:0.36148
[135]	validation_0-logloss:0.15549	validation_1-logloss:0.36031
[136]	validation_0-logloss:0.15406	validation_1-logloss:0.35975
[137]	validation_0-logloss:0.15261	validation_1-logloss:0.35949
[138]	validation_0-logloss:0.15121	validation_1-logloss:0.35825
[139]	validation_0-logloss:0.14982	validation_1-logloss:0.35712
[140]	validation_0-logloss:0.14848	validation_1-logloss:0.35628
[141]	validation_0-logloss:0.14712	valid

[255]	validation_0-logloss:0.05747	validation_1-logloss:0.29828
[256]	validation_0-logloss:0.05706	validation_1-logloss:0.29817
[257]	validation_0-logloss:0.05664	validation_1-logloss:0.29806
[258]	validation_0-logloss:0.05620	validation_1-logloss:0.29793
[259]	validation_0-logloss:0.05577	validation_1-logloss:0.29769
[260]	validation_0-logloss:0.05536	validation_1-logloss:0.29752
[261]	validation_0-logloss:0.05497	validation_1-logloss:0.29739
[262]	validation_0-logloss:0.05455	validation_1-logloss:0.29727
[263]	validation_0-logloss:0.05413	validation_1-logloss:0.29710
[264]	validation_0-logloss:0.05373	validation_1-logloss:0.29678
[265]	validation_0-logloss:0.05335	validation_1-logloss:0.29669
[266]	validation_0-logloss:0.05298	validation_1-logloss:0.29643
[267]	validation_0-logloss:0.05258	validation_1-logloss:0.29623
[268]	validation_0-logloss:0.05221	validation_1-logloss:0.29603
[269]	validation_0-logloss:0.05183	validation_1-logloss:0.29575
[270]	validation_0-logloss:0.05143	valid

[384]	validation_0-logloss:0.02420	validation_1-logloss:0.27915
[385]	validation_0-logloss:0.02404	validation_1-logloss:0.27933
[386]	validation_0-logloss:0.02391	validation_1-logloss:0.27893
[387]	validation_0-logloss:0.02379	validation_1-logloss:0.27936
[388]	validation_0-logloss:0.02367	validation_1-logloss:0.27958
[389]	validation_0-logloss:0.02354	validation_1-logloss:0.27978
[390]	validation_0-logloss:0.02342	validation_1-logloss:0.27959
[391]	validation_0-logloss:0.02329	validation_1-logloss:0.27920
[392]	validation_0-logloss:0.02319	validation_1-logloss:0.27955
[393]	validation_0-logloss:0.02305	validation_1-logloss:0.27966
[394]	validation_0-logloss:0.02292	validation_1-logloss:0.27999
[395]	validation_0-logloss:0.02281	validation_1-logloss:0.28000
[396]	validation_0-logloss:0.02269	validation_1-logloss:0.27963
[397]	validation_0-logloss:0.02258	validation_1-logloss:0.27973
[398]	validation_0-logloss:0.02245	validation_1-logloss:0.28002
[399]	validation_0-logloss:0.02235	valid



[0]	validation_0-logloss:0.68402	validation_1-logloss:0.68480
[1]	validation_0-logloss:0.67526	validation_1-logloss:0.67697
[2]	validation_0-logloss:0.66667	validation_1-logloss:0.66978
[3]	validation_0-logloss:0.65824	validation_1-logloss:0.66255
[4]	validation_0-logloss:0.64996	validation_1-logloss:0.65540
[5]	validation_0-logloss:0.64184	validation_1-logloss:0.64794
[6]	validation_0-logloss:0.63387	validation_1-logloss:0.64107
[7]	validation_0-logloss:0.62604	validation_1-logloss:0.63426
[8]	validation_0-logloss:0.61836	validation_1-logloss:0.62760
[9]	validation_0-logloss:0.61081	validation_1-logloss:0.62084
[10]	validation_0-logloss:0.60340	validation_1-logloss:0.61440
[11]	validation_0-logloss:0.59611	validation_1-logloss:0.60813
[12]	validation_0-logloss:0.58875	validation_1-logloss:0.60371
[13]	validation_0-logloss:0.58172	validation_1-logloss:0.59765
[14]	validation_0-logloss:0.57461	validation_1-logloss:0.59341
[15]	validation_0-logloss:0.56761	validation_1-logloss:0.58740
[1

[126]	validation_0-logloss:0.17663	validation_1-logloss:0.33743
[127]	validation_0-logloss:0.17500	validation_1-logloss:0.33637
[128]	validation_0-logloss:0.17338	validation_1-logloss:0.33568
[129]	validation_0-logloss:0.17183	validation_1-logloss:0.33486
[130]	validation_0-logloss:0.17024	validation_1-logloss:0.33417
[131]	validation_0-logloss:0.16873	validation_1-logloss:0.33333
[132]	validation_0-logloss:0.16718	validation_1-logloss:0.33217
[133]	validation_0-logloss:0.16570	validation_1-logloss:0.33143
[134]	validation_0-logloss:0.16419	validation_1-logloss:0.33049
[135]	validation_0-logloss:0.16274	validation_1-logloss:0.32980
[136]	validation_0-logloss:0.16131	validation_1-logloss:0.32905
[137]	validation_0-logloss:0.15981	validation_1-logloss:0.32818
[138]	validation_0-logloss:0.15841	validation_1-logloss:0.32736
[139]	validation_0-logloss:0.15695	validation_1-logloss:0.32638
[140]	validation_0-logloss:0.15557	validation_1-logloss:0.32585
[141]	validation_0-logloss:0.15415	valid

[255]	validation_0-logloss:0.05890	validation_1-logloss:0.28229
[256]	validation_0-logloss:0.05851	validation_1-logloss:0.28241
[257]	validation_0-logloss:0.05807	validation_1-logloss:0.28227
[258]	validation_0-logloss:0.05763	validation_1-logloss:0.28214
[259]	validation_0-logloss:0.05717	validation_1-logloss:0.28205
[260]	validation_0-logloss:0.05680	validation_1-logloss:0.28225
[261]	validation_0-logloss:0.05638	validation_1-logloss:0.28230
[262]	validation_0-logloss:0.05596	validation_1-logloss:0.28207
[263]	validation_0-logloss:0.05553	validation_1-logloss:0.28167
[264]	validation_0-logloss:0.05511	validation_1-logloss:0.28113
[265]	validation_0-logloss:0.05469	validation_1-logloss:0.28133
[266]	validation_0-logloss:0.05427	validation_1-logloss:0.28085
[267]	validation_0-logloss:0.05387	validation_1-logloss:0.28084
[268]	validation_0-logloss:0.05347	validation_1-logloss:0.28097
[269]	validation_0-logloss:0.05307	validation_1-logloss:0.28060
[270]	validation_0-logloss:0.05267	valid

[384]	validation_0-logloss:0.02603	validation_1-logloss:0.28546
[385]	validation_0-logloss:0.02590	validation_1-logloss:0.28562
[386]	validation_0-logloss:0.02575	validation_1-logloss:0.28537
[387]	validation_0-logloss:0.02562	validation_1-logloss:0.28553
[388]	validation_0-logloss:0.02545	validation_1-logloss:0.28560
[389]	validation_0-logloss:0.02531	validation_1-logloss:0.28536
[390]	validation_0-logloss:0.02518	validation_1-logloss:0.28553
[391]	validation_0-logloss:0.02505	validation_1-logloss:0.28588
[392]	validation_0-logloss:0.02489	validation_1-logloss:0.28593
[393]	validation_0-logloss:0.02475	validation_1-logloss:0.28592
[394]	validation_0-logloss:0.02462	validation_1-logloss:0.28640
[395]	validation_0-logloss:0.02450	validation_1-logloss:0.28681
[396]	validation_0-logloss:0.02438	validation_1-logloss:0.28686
[397]	validation_0-logloss:0.02426	validation_1-logloss:0.28700
[398]	validation_0-logloss:0.02410	validation_1-logloss:0.28698
[399]	validation_0-logloss:0.02397	valid



[0]	validation_0-logloss:0.68388	validation_1-logloss:0.68627
[1]	validation_0-logloss:0.67498	validation_1-logloss:0.67946
[2]	validation_0-logloss:0.66637	validation_1-logloss:0.67280
[3]	validation_0-logloss:0.65780	validation_1-logloss:0.66595
[4]	validation_0-logloss:0.64938	validation_1-logloss:0.65988
[5]	validation_0-logloss:0.64113	validation_1-logloss:0.65363
[6]	validation_0-logloss:0.63302	validation_1-logloss:0.64762
[7]	validation_0-logloss:0.62506	validation_1-logloss:0.64163
[8]	validation_0-logloss:0.61724	validation_1-logloss:0.63606
[9]	validation_0-logloss:0.60956	validation_1-logloss:0.63044
[10]	validation_0-logloss:0.60201	validation_1-logloss:0.62471
[11]	validation_0-logloss:0.59470	validation_1-logloss:0.61946
[12]	validation_0-logloss:0.58741	validation_1-logloss:0.61414
[13]	validation_0-logloss:0.58025	validation_1-logloss:0.60905
[14]	validation_0-logloss:0.57321	validation_1-logloss:0.60406
[15]	validation_0-logloss:0.56629	validation_1-logloss:0.59900
[1

[126]	validation_0-logloss:0.17933	validation_1-logloss:0.34652
[127]	validation_0-logloss:0.17778	validation_1-logloss:0.34614
[128]	validation_0-logloss:0.17616	validation_1-logloss:0.34532
[129]	validation_0-logloss:0.17460	validation_1-logloss:0.34452
[130]	validation_0-logloss:0.17304	validation_1-logloss:0.34383
[131]	validation_0-logloss:0.17155	validation_1-logloss:0.34350
[132]	validation_0-logloss:0.17000	validation_1-logloss:0.34289
[133]	validation_0-logloss:0.16841	validation_1-logloss:0.34268
[134]	validation_0-logloss:0.16689	validation_1-logloss:0.34201
[135]	validation_0-logloss:0.16546	validation_1-logloss:0.34174
[136]	validation_0-logloss:0.16403	validation_1-logloss:0.34082
[137]	validation_0-logloss:0.16251	validation_1-logloss:0.33950
[138]	validation_0-logloss:0.16107	validation_1-logloss:0.33895
[139]	validation_0-logloss:0.15959	validation_1-logloss:0.33760
[140]	validation_0-logloss:0.15825	validation_1-logloss:0.33718
[141]	validation_0-logloss:0.15679	valid

[255]	validation_0-logloss:0.06022	validation_1-logloss:0.28369
[256]	validation_0-logloss:0.05981	validation_1-logloss:0.28397
[257]	validation_0-logloss:0.05935	validation_1-logloss:0.28350
[258]	validation_0-logloss:0.05891	validation_1-logloss:0.28318
[259]	validation_0-logloss:0.05846	validation_1-logloss:0.28263
[260]	validation_0-logloss:0.05806	validation_1-logloss:0.28269
[261]	validation_0-logloss:0.05760	validation_1-logloss:0.28222
[262]	validation_0-logloss:0.05717	validation_1-logloss:0.28235
[263]	validation_0-logloss:0.05678	validation_1-logloss:0.28243
[264]	validation_0-logloss:0.05637	validation_1-logloss:0.28214
[265]	validation_0-logloss:0.05594	validation_1-logloss:0.28159
[266]	validation_0-logloss:0.05550	validation_1-logloss:0.28115
[267]	validation_0-logloss:0.05513	validation_1-logloss:0.28124
[268]	validation_0-logloss:0.05473	validation_1-logloss:0.28088
[269]	validation_0-logloss:0.05432	validation_1-logloss:0.28028
[270]	validation_0-logloss:0.05390	valid

[384]	validation_0-logloss:0.02618	validation_1-logloss:0.27346
[385]	validation_0-logloss:0.02605	validation_1-logloss:0.27367
[386]	validation_0-logloss:0.02591	validation_1-logloss:0.27403
[387]	validation_0-logloss:0.02573	validation_1-logloss:0.27379
[388]	validation_0-logloss:0.02556	validation_1-logloss:0.27392
[389]	validation_0-logloss:0.02539	validation_1-logloss:0.27369
[390]	validation_0-logloss:0.02527	validation_1-logloss:0.27390
[391]	validation_0-logloss:0.02510	validation_1-logloss:0.27426
[392]	validation_0-logloss:0.02493	validation_1-logloss:0.27450
[393]	validation_0-logloss:0.02481	validation_1-logloss:0.27461
[394]	validation_0-logloss:0.02469	validation_1-logloss:0.27453
[395]	validation_0-logloss:0.02453	validation_1-logloss:0.27489
[396]	validation_0-logloss:0.02437	validation_1-logloss:0.27513
[397]	validation_0-logloss:0.02420	validation_1-logloss:0.27550
[398]	validation_0-logloss:0.02404	validation_1-logloss:0.27574
[399]	validation_0-logloss:0.02393	valid

In [32]:
print("\n\n Classification Model Metrics:")
print("Macro F1: ", sum(macro_scores)/len(macro_scores))
print("Weighted F1: ", sum(weighted_scores)/len(weighted_scores))
print("Accuracy: ", sum(acc_scores)/len(acc_scores), "%")
print("MCC: ", sum(mcc_scores)/len(mcc_scores))



 Classification Model Metrics:
Macro F1:  0.9010001731585714
Weighted F1:  0.9029388131532787
Accuracy:  90.29411764705883 %
MCC:  0.8038591413632978


In [33]:
X_data = train_2.drop(['CO: 3', 'CO: 4', 'CO: 5', 'CO: 6'], axis = 1)
y_data = train_2['CO: 6']

cols = X_data.columns
sc = StandardScaler()
X_data = pd.DataFrame(sc.fit_transform(X_data), columns = cols)


#X_data = pd.DataFrame(np.square(X_data), columns = cols)

#X_data = X_data.fillna(-1)

kfold = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 10)
splits = kfold.split(X_data, y_data)

sm = SMOTE(random_state = 10, k_neighbors = 7, n_jobs = -1)

macro_scores = []
acc_scores = []
weighted_scores = []
mcc_scores = []
Co6_models = []

for Train, Test in splits:
    X_Train, X_Test, Y_Train, Y_Test = X_data.iloc[Train], X_data.iloc[Test], y_data.iloc[Train], y_data.iloc[Test]
    #xgb = XGBClassifier(n_estimators = 2000, max_depth = 8, n_jobs = -1, learning_rate = 0.01, reg_lambda = 0.1)
    #X_Train, Y_Train = sm.fit_resample(X_Train, Y_Train)
    #xgb.fit(X_Train, Y_Train, early_stopping_rounds = 200, eval_set = [(X_Train, Y_Train), (X_Test, Y_Test)],  verbose = True)
    #pred = xgb.predict(X_Test)
    #'''
    rf = RandomForestClassifier(n_jobs = -1, n_estimators = 175, random_state = 10)

    ada_rf = AdaBoostClassifier(base_estimator=rf, n_estimators=100, learning_rate = 0.5, random_state = 10)

    ada_rf.fit(X_Train, Y_Train)
    pred = ada_rf.predict(X_Test)
    #'''
    
    #lr = LogisticRegression()
    #lr.fit(X_Train, Y_Train)
    #pred = lr.predict(X_Test)
    
    macro_scores.append(f1_score(Y_Test, pred, average = 'macro'))
    weighted_scores.append(f1_score(Y_Test, pred, average = 'weighted'))
    acc_scores.append(accuracy_score(Y_Test, pred)*100)
    mcc_scores.append(matthews_corrcoef(Y_Test, pred))
    Co6_models.append(ada_rf)

In [34]:
print("\n\n Classification Model Metrics:")
print("Macro F1: ", sum(macro_scores)/len(macro_scores))
print("Weighted F1: ", sum(weighted_scores)/len(weighted_scores))
print("Accuracy: ", sum(acc_scores)/len(acc_scores), "%")
print("MCC: ", sum(mcc_scores)/len(mcc_scores))



 Classification Model Metrics:
Macro F1:  0.466711170961171
Weighted F1:  0.5119800209506092
Accuracy:  59.11764705882352 %
MCC:  0.07820546092511826


In [37]:
test_1 = pca.fit_transform(test_1)
test_2 = sc.fit_transform(test_2)


CO1_preds = np.argmax(Co1_models[0].predict_proba(test_1) +
                      Co1_models[1].predict_proba(test_1) +
                      Co1_models[2].predict_proba(test_1) +
                      Co1_models[3].predict_proba(test_1) +
                      #Co1_models[4].predict_proba(test_1) +
                      #Co1_models[5].predict_proba(test_1) +
                      #Co1_models[6].predict_proba(test_1) +
                      #Co1_models[7].predict_proba(test_1) +
                      #Co1_models[8].predict_proba(test_1) +
                      Co1_models[4].predict_proba(test_1), axis = 1)

CO2_preds = np.argmax(Co2_models[0].predict_proba(test_1) +
                      Co2_models[1].predict_proba(test_1) +
                      Co2_models[2].predict_proba(test_1) +
                      Co2_models[3].predict_proba(test_1) +
                      #Co2_models[4].predict_proba(test_1) +
                      #Co2_models[5].predict_proba(test_1) +
                      #Co2_models[6].predict_proba(test_1) +
                      #Co2_models[7].predict_proba(test_1) +
                      #Co2_models[8].predict_proba(test_1) +
                      Co2_models[4].predict_proba(test_1), axis = 1)

CO3_preds = np.argmax(Co3_models[0].predict_proba(test_2) +
                      Co3_models[1].predict_proba(test_2) +
                      Co3_models[2].predict_proba(test_2) +
                      Co3_models[3].predict_proba(test_2) +
                      Co3_models[4].predict_proba(test_2), axis = 1)

CO4_preds = np.argmax(Co4_models[0].predict_proba(test_2) +
                      Co4_models[1].predict_proba(test_2) +
                      Co4_models[2].predict_proba(test_2) +
                      Co4_models[3].predict_proba(test_2) +
                      Co4_models[4].predict_proba(test_2) +
                      Co4_models[5].predict_proba(test_2) +
                      Co4_models[6].predict_proba(test_2) +
                      Co4_models[7].predict_proba(test_2) +
                      Co4_models[8].predict_proba(test_2) +
                      Co4_models[9].predict_proba(test_2), axis = 1)

CO5_preds = np.argmax(Co5_models[0].predict_proba(test_2) +
                      Co5_models[1].predict_proba(test_2) +
                      Co5_models[2].predict_proba(test_2) +
                      Co5_models[3].predict_proba(test_2) +
                      Co5_models[4].predict_proba(test_2), axis = 1)

CO6_preds = np.argmax(Co6_models[0].predict_proba(test_2) +
                      Co6_models[1].predict_proba(test_2) +
                      Co6_models[2].predict_proba(test_2) +
                      Co6_models[3].predict_proba(test_2) +
                      #Co6_models[4].predict_proba(test_2) +
                      #Co6_models[5].predict_proba(test_2) +
                      #Co6_models[6].predict_proba(test_2) +
                      #Co6_models[7].predict_proba(test_2) +
                      #Co6_models[8].predict_proba(test_2) +
                      Co6_models[4].predict_proba(test_2), axis = 1)

In [38]:
len(CO1_preds)

100

In [39]:
print(len(CO1_preds[CO1_preds==1]))
print(len(CO2_preds[CO2_preds==1]))
print(len(CO3_preds[CO3_preds==1]))
print(len(CO4_preds[CO4_preds==1]))
print(len(CO5_preds[CO5_preds==1]))
print(len(CO6_preds[CO6_preds==1]))

25
8
9
4
87
201


In [40]:
df1 = {'Id': range(100), 'Predicted': CO1_preds}
df1 = pd.DataFrame(df1)

df2 = {'Id': range(100, 200), 'Predicted': CO2_preds}
df2 = pd.DataFrame(df2)

df3 = {'Id': range(200, 414), 'Predicted': CO3_preds}
df3 = pd.DataFrame(df3)

df4 = {'Id': range(414, 628), 'Predicted': CO4_preds}
df4 = pd.DataFrame(df4)

df5 = {'Id': range(628, 842), 'Predicted': CO5_preds}
df5 = pd.DataFrame(df5)

df6 = {'Id': range(842, 1056), 'Predicted': CO6_preds}
df6 = pd.DataFrame(df6)

In [41]:
pred_df = pd.concat([df1,df2,df3,df4,df5,df6])

pred_df.shape

(1056, 2)

In [42]:
pred_df.head()

Unnamed: 0,Id,Predicted
0,0,0
1,1,1
2,2,1
3,3,0
4,4,1


In [43]:
pred_df.tail()

Unnamed: 0,Id,Predicted
209,1051,1
210,1052,1
211,1053,1
212,1054,1
213,1055,1


In [44]:
pred_df.reset_index(drop = True, inplace = True)

In [45]:
pred_df.to_csv('Submissions/Baseline-42.csv', index = False)