In [None]:
import pandas as pd
import numpy as np

# Names of the four target columns and of the 256 feature columns
# (feature_1 ... feature_256) used throughout the notebook.
labels = [f'label_{n}' for n in range(1, 5)]
features = ['feature_' + str(idx) for idx in range(1, 257)]


In [None]:
# Training split: read the CSV from the mounted Drive folder, then display
# its (rows, columns) shape as the cell output.
train_df = pd.read_csv(
    '/content/drive/MyDrive/Colab Notebooks/files/train.csv'
)
train_df.shape

(28520, 260)

In [None]:
# Validation split: read the CSV from the mounted Drive folder, then display
# its (rows, columns) shape as the cell output.
valid_df = pd.read_csv(
    '/content/drive/MyDrive/Colab Notebooks/files/valid.csv'
)
valid_df.shape

(750, 260)

In [None]:
# Test split: read the CSV from the mounted Drive folder, then display
# its (rows, columns) shape as the cell output.
test_df = pd.read_csv(
    '/content/drive/MyDrive/Colab Notebooks/files/test.csv'
)
test_df.shape

(750, 260)

In [None]:
from sklearn.preprocessing import StandardScaler

# Per-label containers: each dict maps a label name to that label's
# standardized feature frame (x_*) or its target column (y_*).
x_train = {}
y_train = {}

x_valid = {}
y_valid = {}

x_test = {}

for label in labels:
  # Only label_2 has its rows with a missing target removed; every other
  # label trains on the full training frame.
  tr_df = train_df[train_df['label_2'].notna()] if label == 'label_2' else train_df
  vl_df = valid_df

  # The scaler is fitted on this label's training rows only; validation and
  # test rows are transformed with those same fitted statistics.
  scaler = StandardScaler()
  x_train[label] = pd.DataFrame(scaler.fit_transform(tr_df.drop(labels, axis = 1)), columns = features)
  y_train[label] = tr_df[label]

  x_valid[label] = pd.DataFrame(scaler.transform(vl_df.drop(labels, axis = 1)), columns = features)
  y_valid[label] = vl_df[label]

  # Test features must come from test_df, not from the validation frame.
  # errors='ignore' keeps this working if the test file carries no label columns.
  x_test[label] = pd.DataFrame(scaler.transform(test_df.drop(columns = labels, errors = 'ignore')), columns = features)


In [None]:
from sklearn import svm

# Baseline model: linear-kernel SVM trained on all standardized features
# for label_1. The fitted estimator is the cell's displayed value.
clf = svm.SVC(kernel='linear')
clf.fit(X=x_train['label_1'], y=y_train['label_1'])

In [None]:
# Baseline predictions for label_1 on the standardized validation features.
y_pred = clf.predict(x_valid['label_1'])

In [None]:
# Baseline predictions on x_test['label_1']; kept under a separate name because
# the output table later reports them as the "before feature engineering" column.
y_pred_test_before = clf.predict(x_test['label_1'])


In [None]:
from sklearn import metrics

# Score the baseline predictions against the label_1 validation targets:
# confusion matrix, accuracy, then weighted precision and weighted recall.
y_true = y_valid['label_1']
print(metrics.confusion_matrix(y_true, y_pred))
print(metrics.accuracy_score(y_true, y_pred))
for weighted_score in (metrics.precision_score, metrics.recall_score):
    print(weighted_score(y_true, y_pred, average='weighted'))

[[13  0  0 ...  0  0  0]
 [ 0  9  0 ...  0  0  0]
 [ 0  0 12 ...  0  0  0]
 ...
 [ 0  0  0 ... 19  0  0]
 [ 0  0  0 ...  0 10  0]
 [ 0  0  0 ...  0  0 10]]
0.9906666666666667
0.9914608132608133
0.9906666666666667


In [None]:
from sklearn.feature_selection import SelectKBest, f_classif

# Univariate selection: keep the 130 features that score highest under
# f_classif against label_1. The selector is fitted on the training data only
# and then applied unchanged to the validation and test features.
selector = SelectKBest(f_classif, k=130)
selector.fit(x_train['label_1'], y_train['label_1'])
x_train_kBest = selector.transform(x_train['label_1'])
x_valid_kBest = selector.transform(x_valid['label_1'])
x_test_kBest = selector.transform(x_test['label_1'])

# Report the reduced shape of each split.
for reduced in (x_train_kBest, x_valid_kBest, x_test_kBest):
    print (reduced.shape)



(28520, 130)
(750, 130)
(750, 130)


In [None]:
# Retrain a fresh linear-kernel SVM on the SelectKBest-reduced training
# features. The fitted estimator is the cell's displayed value.
clf = svm.SVC(kernel='linear')
clf.fit(X=x_train_kBest, y=y_train['label_1'])

In [None]:
# Predict with the model trained on the selected features, for both the
# validation and the test feature sets.
y_pred = clf.predict(x_valid_kBest)
y_pred_test = clf.predict(x_test_kBest)

# Validation scores: confusion matrix, accuracy, weighted precision, weighted recall.
y_true = y_valid['label_1']
print(metrics.confusion_matrix(y_true, y_pred))
print(metrics.accuracy_score(y_true, y_pred))
for weighted_score in (metrics.precision_score, metrics.recall_score):
    print(weighted_score(y_true, y_pred, average='weighted'))

[[12  0  0 ...  0  0  0]
 [ 0  8  0 ...  0  0  0]
 [ 0  0 12 ...  0  0  0]
 ...
 [ 0  0  0 ... 19  0  1]
 [ 0  0  0 ...  0 10  0]
 [ 0  0  0 ...  1  0  9]]
0.9786666666666667
0.9797212121212121
0.9786666666666667


In [None]:
from sklearn.decomposition import PCA

# PCA on the selected features, keeping enough components to explain 95% of
# the variance. Fitted on the training data only, then used to project all
# three splits into the same component space.
pca = PCA(n_components=0.95, svd_solver='full').fit(x_train_kBest)
x_train_trf, x_valid_trf, x_test_trf = (
    pd.DataFrame(pca.transform(split))
    for split in (x_train_kBest, x_valid_kBest, x_test_kBest)
)

# Report the projected shape of each split.
for projected in (x_train_trf, x_valid_trf, x_test_trf):
    print(projected.shape)


(28520, 54)
(750, 54)
(750, 54)


In [None]:
# Final model: linear-kernel SVM trained on the PCA-projected features.
clf = svm.SVC(kernel='linear')
clf.fit(X=x_train_trf, y=y_train['label_1'])

# Predictions on the projected validation and test features; the test
# predictions feed the "after feature engineering" column of the output table.
y_pred = clf.predict(x_valid_trf)
y_pred_test_after = clf.predict(x_test_trf)

# Validation scores: confusion matrix, accuracy, weighted precision, weighted recall.
y_true = y_valid['label_1']
print(metrics.confusion_matrix(y_true, y_pred))
print(metrics.accuracy_score(y_true, y_pred))
for weighted_score in (metrics.precision_score, metrics.recall_score):
    print(weighted_score(y_true, y_pred, average='weighted'))

[[12  0  0 ...  0  0  0]
 [ 0  8  0 ...  0  0  0]
 [ 0  0 12 ...  0  0  0]
 ...
 [ 0  0  0 ... 18  0  1]
 [ 0  0  0 ...  0 10  0]
 [ 0  0  0 ...  1  0  9]]
0.9573333333333334
0.9601381840381842
0.9573333333333334


In [None]:
# Number of engineered features = number of columns in the projected test frame.
num_new_features = len(x_test_trf.columns)
print(num_new_features)

54


In [None]:
# Assemble the submission table: one row per test sample with the prediction
# from the baseline model and from the feature-engineered model, plus a
# constant column holding the engineered feature count.
output_df = pd.DataFrame({
    'Predicted labels before feature engineering': y_pred_test_before,
    'Predicted labels after feature engineering': y_pred_test_after,
}).assign(**{'No. of new features': x_test_trf.shape[1]})

output_df

Unnamed: 0,Predicted labels before feature engineering,Predicted labels after feature engineering,No. of new features
0,45,45,54
1,45,45,54
2,45,45,54
3,45,45,54
4,45,45,54
...,...,...,...
745,39,39,54
746,39,39,54
747,39,39,54
748,39,39,54


In [None]:
# Append each projected test column to the table as 'new feature 1',
# 'new feature 2', ... in column order.
for position, column in enumerate(x_test_trf.columns, start=1):
    output_df[f'new feature {position}'] = x_test_trf[column]
output_df

Unnamed: 0,Predicted labels before feature engineering,Predicted labels after feature engineering,No. of new features,new feature 1,new feature 2,new feature 3,new feature 4,new feature 5,new feature 6,new feature 7,...,new feature 45,new feature 46,new feature 47,new feature 48,new feature 49,new feature 50,new feature 51,new feature 52,new feature 53,new feature 54
0,45,45,54,6.997687,-3.359792,-1.761418,-0.036624,1.086122,2.267507,-0.861734,...,-0.310533,0.509204,0.028219,-0.767454,0.125524,1.486168,-0.047735,0.306214,0.395076,-0.961466
1,45,45,54,7.092422,-2.074671,-2.545863,-1.232009,2.875451,2.211986,-2.079166,...,-0.560616,-0.373369,-0.452115,-0.993253,-1.000651,1.255284,0.082679,0.698936,-0.267131,-0.269837
2,45,45,54,9.042174,-3.809301,-0.756129,2.938039,0.255288,-0.614677,-2.441874,...,-0.488365,0.408846,-1.152733,1.373398,1.150884,0.062549,-0.887686,0.047044,0.098501,-0.071066
3,45,45,54,6.474868,-5.185906,0.712661,3.460085,2.823376,-1.034565,0.374977,...,0.482158,0.414148,0.131429,1.220631,0.834964,0.483232,0.147010,-0.951483,0.294424,-0.198577
4,45,45,54,6.775875,-1.612978,1.644100,1.618446,-0.120389,-2.100044,1.194769,...,0.276380,-1.735726,0.088651,-0.460153,0.267455,-0.365053,0.336121,1.180247,0.426476,-0.433151
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
745,39,39,54,2.787523,-0.883056,-1.743477,1.538904,5.845113,-3.166498,3.236710,...,0.220386,-0.684359,0.865676,-0.768115,-1.261132,0.194899,-1.035923,-1.215269,0.060274,0.807771
746,39,39,54,2.854899,6.047017,-3.474011,-0.346020,-0.587577,0.425215,1.611746,...,-1.309792,0.076844,0.045159,-1.026839,-0.361555,-0.684781,0.225183,-0.525476,-0.250319,0.368014
747,39,39,54,1.589084,0.599613,-1.383467,-1.049447,3.411239,-1.563739,2.328931,...,0.282794,-0.695700,0.036120,0.600618,-0.932108,0.810727,-0.239678,0.177241,0.317068,-0.039797
748,39,39,54,0.606284,-0.193786,1.137545,-2.226705,3.134322,-1.924299,2.835627,...,0.128833,0.178209,-1.817799,0.623336,-1.060034,-0.730585,-0.816369,-0.390094,0.572571,0.118925


In [None]:
# Write the assembled table to Drive as the label_1 results file; index=False
# leaves the DataFrame's row index out of the CSV.
output_df.to_csv('/content/drive/MyDrive/Colab Notebooks/files/190377T_label_1.csv', index=False)