In [None]:
import pandas as pd
import numpy as np

# Names of the four target columns and the 256 feature columns used throughout.
labels = [f'label_{n}' for n in range(1, 5)]
features = ['feature_' + str(n) for n in range(1, 257)]

In [None]:
# Load the train / validation / test splits from Google Drive in one pass.
train_df, valid_df, test_df = map(
    pd.read_csv,
    [
        '/content/drive/MyDrive/Colab Notebooks/files/train.csv',
        '/content/drive/MyDrive/Colab Notebooks/files/valid.csv',
        '/content/drive/MyDrive/Colab Notebooks/files/test.csv',
    ],
)

# Show (rows, columns) of each split as the cell output.
tuple(frame.shape for frame in (train_df, valid_df, test_df))

((28520, 260), (750, 260), (750, 260))

In [None]:
from sklearn.preprocessing import StandardScaler

# Per-label containers, keyed by label name:
#   x_*  -> standardized feature frames (columns named after `features`)
#   y_*  -> the matching target column
x_train = {}
y_train = {}

x_valid = {}
y_valid = {}

x_test = {}

for label in labels:
  # For label_2 only, training rows whose label_2 value is missing are dropped;
  # every other label trains on the full training frame.
  tr_df = train_df[train_df['label_2'].notna()] if label == 'label_2' else train_df
  vl_df = valid_df
  ts_df = test_df

  # The scaler is fitted on the training features only and then reused,
  # unchanged, for the validation and test features.
  scaler = StandardScaler()
  x_train[label] = pd.DataFrame(scaler.fit_transform(tr_df.drop(labels, axis = 1)), columns = features)
  y_train[label] = tr_df[label]

  x_valid[label] = pd.DataFrame(scaler.transform(vl_df.drop(labels, axis = 1)), columns = features)
  y_valid[label] = vl_df[label]

  # Fix: the test features must come from test_df. The earlier version
  # transformed vl_df here, so every "test" prediction was really a
  # prediction on the validation set.
  x_test[label] = pd.DataFrame(scaler.transform(ts_df.drop(labels, axis = 1)), columns = features)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Baseline: 3-nearest-neighbours classifier for label_4 on all scaled features.
knn = KNeighborsClassifier(n_neighbors=3).fit(x_train['label_4'], y_train['label_4'])
knn


In [None]:
# Baseline predictions on the validation features for label_4.
y_pred = knn.predict(X=x_valid['label_4'])

In [None]:
# Baseline predictions for x_test['label_4'], kept for the final output table.
y_pred_test_before = knn.predict(X=x_test['label_4'])

In [None]:
from sklearn import metrics

# Confusion matrix, accuracy, weighted precision and weighted recall of the
# current validation predictions, one result per line.
print(
    metrics.confusion_matrix(y_valid['label_4'], y_pred),
    metrics.accuracy_score(y_valid['label_4'], y_pred),
    metrics.precision_score(y_valid['label_4'], y_pred, average='weighted'),
    metrics.recall_score(y_valid['label_4'], y_pred, average='weighted'),
    sep='\n',
)

[[ 21   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [  0  10   0   0   0   0   1   0   0   0   0   0   0   0]
 [  0   0  27   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   8   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0  15   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0  10   1   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0 532   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   1  31   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0  19   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0  17   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0  10   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0  11   0   0]
 [  0   0   0   0   0   0   0   1   0   0   0   0  25   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0  10]]
0.9946666666666667
0.9946890965732088
0.9946666666666667


In [None]:
from sklearn.feature_selection import SelectKBest, f_classif

# Keep the 60 features that score highest under f_classif; the selector is
# fitted on the training split and then applied to all three splits.
selector = SelectKBest(score_func=f_classif, k=60)
selector.fit(x_train['label_4'], y_train['label_4'])

x_train_kBest = selector.transform(x_train['label_4'])
x_valid_kBest = selector.transform(x_valid['label_4'])
x_test_kBest = selector.transform(x_test['label_4'])

# One shape per line: train, validation, test.
print(x_train_kBest.shape, x_valid_kBest.shape, x_test_kBest.shape, sep='\n')

(28520, 60)
(750, 60)
(750, 60)


In [None]:
# Refit the 3-NN classifier on the SelectKBest-reduced training features.
knn = KNeighborsClassifier(n_neighbors=3).fit(x_train_kBest, y_train['label_4'])
knn

In [None]:
# Predict with the model trained on the selected features.
y_pred = knn.predict(X=x_valid_kBest)
y_pred_test = knn.predict(X=x_test_kBest)

# Confusion matrix, accuracy, weighted precision and weighted recall on the
# validation split, one result per line.
print(
    metrics.confusion_matrix(y_valid['label_4'], y_pred),
    metrics.accuracy_score(y_valid['label_4'], y_pred),
    metrics.precision_score(y_valid['label_4'], y_pred, average='weighted'),
    metrics.recall_score(y_valid['label_4'], y_pred, average='weighted'),
    sep='\n',
)

[[ 20   0   0   0   0   0   1   0   0   0   0   0   0   0]
 [  0  11   0   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0  27   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   8   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0  15   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0  10   1   0   0   0   0   0   0   0]
 [  1   0   0   0   0   0 531   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   1  31   0   0   0   0   0   0]
 [  0   0   0   0   0   0   1   0  18   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0  17   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0  10   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0  11   0   0]
 [  0   0   0   0   0   0   3   1   0   0   0   0  22   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0  10]]
0.988
0.9881040892193308
0.988


In [None]:
from sklearn.decomposition import PCA

# PCA keeping enough components to explain 95% of the variance, fitted on the
# selected training features only.
pca = PCA(n_components=0.95, svd_solver='full').fit(x_train_kBest)

# Project every split with the same fitted PCA and wrap each result in a DataFrame.
x_train_trf, x_valid_trf, x_test_trf = (
    pd.DataFrame(pca.transform(split))
    for split in (x_train_kBest, x_valid_kBest, x_test_kBest)
)

# One shape per line: train, validation, test.
print(x_train_trf.shape, x_valid_trf.shape, x_test_trf.shape, sep='\n')

(28520, 37)
(750, 37)
(750, 37)


In [None]:
# Refit the 3-NN classifier on the PCA-transformed training features.
knn = KNeighborsClassifier(n_neighbors=3).fit(x_train_trf, y_train['label_4'])
knn

In [None]:
# Predict with the model trained on the PCA-transformed features.
y_pred = knn.predict(X=x_valid_trf)
y_pred_test_after = knn.predict(X=x_test_trf)

# Confusion matrix, accuracy, weighted precision and weighted recall on the
# validation split, one result per line.
print(
    metrics.confusion_matrix(y_valid['label_4'], y_pred),
    metrics.accuracy_score(y_valid['label_4'], y_pred),
    metrics.precision_score(y_valid['label_4'], y_pred, average='weighted'),
    metrics.recall_score(y_valid['label_4'], y_pred, average='weighted'),
    sep='\n',
)

[[ 20   0   0   0   0   0   1   0   0   0   0   0   0   0]
 [  0  10   0   0   0   0   0   1   0   0   0   0   0   0]
 [  0   0  27   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   8   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0  15   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0  10   1   0   0   0   0   0   0   0]
 [  1   0   0   0   0   0 531   0   0   0   0   0   0   0]
 [  1   0   0   0   0   0   1  30   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   1  18   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0  17   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0  10   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0  11   0   0]
 [  0   1   0   0   0   0   3   0   0   0   0   0  22   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0  10]]
0.9853333333333333
0.9855290333502624
0.9853333333333333


In [None]:
# Number of columns in the PCA-transformed test frame.
num_new_features = len(x_test_trf.columns)
print(num_new_features)

37


In [None]:
# Assemble the submission table in one step: the two prediction columns and the
# (constant) feature count, followed by the transformed test features renamed
# from their integer positions 0..n-1 to 'new feature 1'..'new feature n'.
output_df = pd.concat(
    [
        pd.DataFrame({
            'Predicted labels before feature engineering': y_pred_test_before,
            'Predicted labels after feature engineering': y_pred_test_after,
            'No. of new features': x_test_trf.shape[1]
        }),
        x_test_trf.rename(columns=lambda position: f'new feature {position+1}'),
    ],
    axis=1,
)

output_df

Unnamed: 0,Predicted labels before feature engineering,Predicted labels after feature engineering,No. of new features,new feature 1,new feature 2,new feature 3,new feature 4,new feature 5,new feature 6,new feature 7,...,new feature 28,new feature 29,new feature 30,new feature 31,new feature 32,new feature 33,new feature 34,new feature 35,new feature 36,new feature 37
0,6,6,37,-3.313408,3.971337,1.744291,-0.804551,2.756000,-1.223531,-0.225108,...,-0.723412,0.744039,-0.894402,1.539021,-0.305422,-0.314277,-0.854320,0.447029,0.516338,-0.608075
1,6,6,37,-4.340096,3.241674,-1.058111,-1.382537,4.635603,-1.649134,-0.052092,...,-0.796131,1.362043,-0.149149,1.266280,0.036471,0.433333,-0.955409,-0.820698,0.330251,-0.128613
2,6,6,37,-3.711165,1.282441,2.467684,0.332289,2.056713,-1.179571,2.993641,...,-1.575480,0.713239,-0.146178,-0.258767,-0.129922,-0.138110,-0.494576,0.289759,-0.500893,0.695137
3,6,6,37,-2.151661,-0.213248,3.624044,-0.046254,1.672402,-0.770450,1.351400,...,0.825445,0.792099,0.198310,0.031420,-0.335194,-0.782582,0.064115,0.249965,-0.470602,-0.293663
4,6,6,37,-4.479495,-0.420705,3.052763,-0.525286,3.458332,-0.049744,0.000096,...,-1.270208,1.936805,-0.146642,0.133789,0.110009,-0.701704,-0.733898,-0.442450,0.523941,-0.482758
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
745,6,6,37,-1.061815,-1.941989,-0.653881,-0.239028,1.576374,-1.518824,2.429679,...,-0.025595,0.203848,-0.237023,0.531899,-0.314948,1.250377,-0.278585,-0.164459,0.637839,0.935229
746,6,6,37,-4.198447,1.330081,-2.351318,1.980400,-2.561068,-0.411105,-1.372720,...,-0.110202,0.369643,-0.309416,0.796288,0.895817,1.278864,-0.342137,0.535824,-0.339322,0.299136
747,6,6,37,-1.224442,-2.176131,-1.344708,-1.061307,-0.928321,-2.590136,2.567057,...,-0.548131,0.679761,-0.089944,0.661706,-0.305274,1.001046,-0.166890,0.912449,0.399666,0.487644
748,6,6,37,-0.701875,-3.937709,0.348316,-1.681283,-0.470682,-0.098422,1.446497,...,-0.804841,-1.316879,-0.174902,-0.054609,-0.009838,1.212103,0.255034,0.407127,-0.088884,1.372577


In [None]:
# Write the submission table to Google Drive without the index column.
output_df.to_csv(
    path_or_buf='/content/drive/MyDrive/Colab Notebooks/files/190377T_label_4.csv',
    index=False,
)