In [None]:
import pandas as pd
import numpy as np

# Names of the four target columns and of the 256 feature columns
# (feature_1 .. feature_256) referenced throughout the notebook.
labels = [f'label_{n}' for n in range(1, 5)]
features = ['feature_' + str(n) for n in range(1, 257)]

In [None]:
# Read the train / validation / test splits from the mounted Drive folder.
train_df, valid_df, test_df = map(
    pd.read_csv,
    (
        '/content/drive/MyDrive/Colab Notebooks/files/train.csv',
        '/content/drive/MyDrive/Colab Notebooks/files/valid.csv',
        '/content/drive/MyDrive/Colab Notebooks/files/test.csv',
    ),
)

# (rows, columns) of each split, displayed as the cell's output.
(train_df.shape, valid_df.shape, test_df.shape)

((28520, 260), (750, 260), (750, 260))

In [None]:
from sklearn.preprocessing import StandardScaler

# Per-label feature matrices and targets, keyed by label name.
# A separate scaler is fitted for every label because the set of usable
# training rows can differ between labels (rows with a missing target are dropped).
x_train = {}
y_train = {}

x_valid = {}
y_valid = {}

x_test = {}

for label in labels:
  # Keep only the rows whose target for this label is present, in both the
  # training and the validation split. For labels without missing targets this
  # is a no-op. The index is renumbered from 0 so that each target Series shares
  # the RangeIndex of the feature frame built from the same rows; without this,
  # index-aligned pandas operations between x_* and y_* would silently misalign.
  tr_df = train_df[train_df[label].notna()].reset_index(drop=True)
  vl_df = valid_df[valid_df[label].notna()].reset_index(drop=True)
  # The test split is left untouched.
  tst_df = test_df

  # Scaling statistics come from the training rows only; validation and test
  # are transformed with those same statistics.
  scaler = StandardScaler()
  x_train[label] = pd.DataFrame(scaler.fit_transform(tr_df.drop(labels, axis = 1)), columns = features)
  y_train[label] = tr_df[label]

  x_valid[label] = pd.DataFrame(scaler.transform(vl_df.drop(labels, axis = 1)), columns = features)
  y_valid[label] = vl_df[label]

  x_test[label] = pd.DataFrame(scaler.transform(tst_df.drop(labels, axis = 1)), columns = features)


In [None]:
from sklearn import svm

# Baseline model: linear-kernel SVM trained on the full scaled feature set for label_3.
clf = svm.SVC(kernel='linear')
clf.fit(X=x_train['label_3'], y=y_train['label_3'])

In [None]:
# Validation-set predictions of the classifier currently bound to `clf`.
y_pred = clf.predict(X=x_valid['label_3'])

In [None]:
# Test-set predictions made before any feature reduction; stored for the output table.
y_pred_test_before = clf.predict(X=x_test['label_3'])

In [None]:
from sklearn import metrics

# Score the validation predictions: confusion matrix, then accuracy,
# precision and recall, printed one per line in that order.
for metric_fn in (
    metrics.confusion_matrix,
    metrics.accuracy_score,
    metrics.precision_score,
    metrics.recall_score,
):
    print(metric_fn(y_valid['label_3'], y_pred))

[[142   0]
 [  1 607]]
0.9986666666666667
1.0
0.9983552631578947


In [None]:
from sklearn.feature_selection import SelectKBest, f_classif

# Keep the 15 features that score highest under f_classif against label_3.
# The selector is fitted on the training split only and then applied to all three splits.
selector = SelectKBest(score_func=f_classif, k=15)
selector.fit(x_train['label_3'], y_train['label_3'])
x_train_kBest, x_valid_kBest, x_test_kBest = (
    selector.transform(split['label_3']) for split in (x_train, x_valid, x_test)
)
# One shape per line: train, validation, test.
print(x_train_kBest.shape, x_valid_kBest.shape, x_test_kBest.shape, sep='\n')

(28520, 15)
(750, 15)
(750, 15)


In [None]:
# Retrain a fresh linear SVM, this time on the selected features only.
clf = svm.SVC(kernel='linear')
clf.fit(X=x_train_kBest, y=y_train['label_3'])

In [None]:
# Predict with the reduced-feature model on validation and test,
# then print the validation scores (confusion matrix, accuracy, precision, recall).
y_pred, y_pred_test = clf.predict(x_valid_kBest), clf.predict(x_test_kBest)
for score in (
    metrics.confusion_matrix,
    metrics.accuracy_score,
    metrics.precision_score,
    metrics.recall_score,
):
    print(score(y_valid['label_3'], y_pred))

[[137   5]
 [  7 601]]
0.984
0.9917491749174917
0.9884868421052632


In [None]:
from sklearn.decomposition import PCA

# Fit PCA on the selected training features, keeping as many components as are
# needed to reach the 0.95 explained-variance threshold, then project every split
# with that same fitted transform.
pca = PCA(n_components=0.95, svd_solver='full').fit(x_train_kBest)
x_train_trf, x_valid_trf, x_test_trf = (
    pd.DataFrame(pca.transform(block))
    for block in (x_train_kBest, x_valid_kBest, x_test_kBest)
)
# One shape per line: train, validation, test.
print(x_train_trf.shape, x_valid_trf.shape, x_test_trf.shape, sep='\n')

(28520, 12)
(750, 12)
(750, 12)


In [None]:
# Linear SVM trained on the PCA-transformed features.
clf = svm.SVC(kernel='linear').fit(x_train_trf, y_train['label_3'])

# Validation predictions are scored below; test predictions go into the output table.
y_pred, y_pred_test_after = clf.predict(x_valid_trf), clf.predict(x_test_trf)
for scorer in (
    metrics.confusion_matrix,
    metrics.accuracy_score,
    metrics.precision_score,
    metrics.recall_score,
):
    print(scorer(y_valid['label_3'], y_pred))

[[137   5]
 [  8 600]]
0.9826666666666667
0.9917355371900827
0.9868421052631579


In [None]:
# Column count of the transformed test frame, i.e. how many engineered features remain.
num_new_features = len(x_test_trf.columns)
print(num_new_features)

12


In [None]:
# Assemble the output table: both sets of test predictions, the engineered
# feature count (the same scalar on every row), then one column per
# engineered feature, named 'new feature 1', 'new feature 2', ...
output_df = pd.DataFrame({
    'Predicted labels before feature engineering': y_pred_test_before,
    'Predicted labels after feature engineering': y_pred_test_after,
    'No. of new features': x_test_trf.shape[1]
}).assign(**{
    f'new feature {pos + 1}': x_test_trf.iloc[:, pos]
    for pos in range(x_test_trf.shape[1])
})
output_df

Unnamed: 0,Predicted labels before feature engineering,Predicted labels after feature engineering,No. of new features,new feature 1,new feature 2,new feature 3,new feature 4,new feature 5,new feature 6,new feature 7,new feature 8,new feature 9,new feature 10,new feature 11,new feature 12
0,1,1,12,-1.560401,0.547585,0.613460,0.488944,0.233534,-0.595970,-1.346150,0.241082,-0.362530,-0.266361,0.589218,-0.842507
1,1,1,12,-2.143333,0.825362,0.818877,1.007486,-0.521076,-0.882749,-1.472594,-0.885638,0.275080,-0.507292,-0.119707,-0.661068
2,1,1,12,-3.575549,0.320855,-0.381191,-0.512013,0.594970,-0.054969,-1.969873,0.420373,-0.889453,0.243310,-0.107887,-0.089135
3,1,1,12,-2.634428,-0.830557,1.080932,0.930836,0.641772,0.400238,-0.947103,0.453250,0.135590,-0.550631,0.315002,0.261183
4,1,1,12,-3.865250,-0.991637,1.061084,0.450643,-0.025507,-1.421258,-0.290875,0.872319,-1.385020,-0.298720,0.012161,-0.168566
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
745,1,1,12,-0.968818,-0.900845,-0.323168,0.289818,0.042629,-0.385851,0.261177,0.180923,-0.580402,-0.488867,0.741220,-0.295386
746,1,1,12,-1.661320,-0.106156,-0.635330,-1.335061,0.560128,0.510406,0.485416,-1.015275,-0.553690,-0.783628,-0.227181,-0.843730
747,1,1,12,-1.045689,-0.918966,-2.093063,-1.008008,1.185267,0.134043,-0.013664,-0.131177,0.153730,-0.312214,0.293061,0.508886
748,1,1,12,-2.592318,-1.767958,0.917525,-1.123659,-0.010075,0.499242,0.750225,0.169183,-0.675073,0.380341,0.689389,0.827870


In [None]:
# Write the output table to Drive as CSV, omitting the row index.
output_df.to_csv(
    path_or_buf='/content/drive/MyDrive/Colab Notebooks/files/190377T_label_3.csv',
    index=False,
)