In [None]:
import pandas as pd
import numpy as np

# Names of the four label columns and of the 256 feature columns in the CSVs.
labels = [f'label_{n}' for n in range(1, 5)]
features = ['feature_' + str(idx) for idx in range(1, 257)]

In [None]:
# Read the train / validation / test splits, in that order, from Google Drive.
train_df, valid_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/drive/MyDrive/Colab Notebooks/files/train.csv',
        '/content/drive/MyDrive/Colab Notebooks/files/valid.csv',
        '/content/drive/MyDrive/Colab Notebooks/files/test.csv',
    )
)

# Show (rows, columns) of each split as a quick sanity check.
tuple(split_df.shape for split_df in (train_df, valid_df, test_df))

((28520, 260), (750, 260), (750, 260))

In [None]:
from sklearn.preprocessing import StandardScaler

# Per-label containers: scaled feature frames (x_*) and target Series (y_*),
# each keyed by label name.
x_train = {}
y_train = {}

x_valid = {}
y_valid = {}

x_test = {}


def _scale_features(scaler, df, fit = False):
  """Standardise the non-label columns of ``df`` and keep its row index.

  Args:
    scaler: StandardScaler to apply; it is fitted on ``df`` first when ``fit`` is True.
    df: frame holding the label columns plus the feature columns.
    fit: fit the scaler on ``df`` before transforming (training data only).

  Returns:
    DataFrame of scaled values with columns ``features`` and the same index as
    ``df``, so it stays row-aligned with any label Series taken from ``df``.
  """
  raw = df.drop(labels, axis = 1)
  scaled = scaler.fit_transform(raw) if fit else scaler.transform(raw)
  # Passing index=df.index matters after rows were filtered out: without it the
  # features would get a fresh 0..n-1 index while the targets keep the original
  # one, and any index-based pandas operation on the pair would misalign rows.
  return pd.DataFrame(scaled, columns = features, index = df.index)


for label in labels:
  # When label_2 is the target, rows whose label_2 is missing are dropped from
  # the train and validation splits; the test split is always used whole.
  tr_df = train_df[train_df['label_2'].notna()] if label == 'label_2' else train_df
  vl_df = valid_df[valid_df['label_2'].notna()] if label == 'label_2' else valid_df
  tst_df = test_df

  # A fresh scaler per label, fitted on that label's training rows only and
  # then reused unchanged for the validation and test rows.
  scaler = StandardScaler()
  x_train[label] = _scale_features(scaler, tr_df, fit = True)
  y_train[label] = tr_df[label]

  x_valid[label] = _scale_features(scaler, vl_df)
  y_valid[label] = vl_df[label]

  x_test[label] = _scale_features(scaler, tst_df)

In [None]:
from sklearn.neighbors import KNeighborsRegressor

# Baseline: 3-nearest-neighbour regressor on the scaled features for label_2.
baseline_inputs, baseline_target = x_train['label_2'], y_train['label_2']
knn = KNeighborsRegressor(n_neighbors=3)
knn.fit(baseline_inputs, baseline_target)

In [None]:
# Baseline predictions for label_2: validation split first, then test split.
y_pred, y_pred_test_before = (
    knn.predict(split['label_2']) for split in (x_valid, x_test)
)

In [None]:
from sklearn.metrics import mean_squared_error
from sklearn import metrics

# Score the current validation predictions (y_pred) against the true label_2 values.
validation_target = y_valid['label_2']
print("Validation Set Performance:")
for caption, score_fn in (
    ("Mean Absolute Error:", metrics.mean_absolute_error),
    ("Mean Squared Error:", metrics.mean_squared_error),
    ("R-squared:", metrics.r2_score),
):
  print(caption, score_fn(validation_target, y_pred))


Validation Set Performance:
Mean Absolute Error: 0.07744565217391304
Mean Squared Error: 0.3715277777777778
R-squared: 0.991244390032218


In [None]:
from sklearn.feature_selection import SelectKBest, f_regression

# label_2 is modelled as a numeric target in this notebook (KNeighborsRegressor,
# scored with MAE / MSE / R-squared), so features are ranked with the univariate
# regression F-test. f_classif would instead treat every distinct target value
# as a separate class.
selector = SelectKBest(f_regression, k=200)

# Fit the selector on the training rows only, then apply the same column
# selection to the validation and test rows.
x_train_kBest = selector.fit_transform(x_train['label_2'], y_train['label_2'])
x_valid_kBest = selector.transform(x_valid['label_2'])
x_test_kBest = selector.transform(x_test['label_2'])
print (x_train_kBest.shape)
print (x_valid_kBest.shape)
print (x_test_kBest.shape)

(28040, 200)
(736, 200)
(736, 200)


In [None]:
# Refit a 3-nearest-neighbour regressor, this time on the SelectKBest features.
selected_target = y_train['label_2']
knn = KNeighborsRegressor(n_neighbors=3)
knn.fit(x_train_kBest, selected_target)

In [None]:
# Predictions from the SelectKBest model: validation split first, then test split.
y_pred, y_pred_test = (
    knn.predict(selected) for selected in (x_valid_kBest, x_test_kBest)
)

In [None]:
from sklearn.metrics import mean_squared_error
from sklearn import metrics

# Score the current validation predictions (y_pred) against the true label_2 values.
validation_target = y_valid['label_2']
print("Validation Set Performance:")
for caption, score_fn in (
    ("Mean Absolute Error:", metrics.mean_absolute_error),
    ("Mean Squared Error:", metrics.mean_squared_error),
    ("R-squared:", metrics.r2_score),
):
  print(caption, score_fn(validation_target, y_pred))

Validation Set Performance:
Mean Absolute Error: 0.09827898550724638
Mean Squared Error: 0.4904891304347826
R-squared: 0.9884408871250209


In [None]:
from sklearn.decomposition import PCA

# A float n_components with svd_solver='full' asks PCA to keep as many
# components as are needed to explain that fraction (0.95) of the variance.
pca = PCA(n_components= 0.95, svd_solver='full')
pca.fit(x_train_kBest)  # fitted on the training rows only

# Project all three splits with the fitted PCA and wrap each result in a DataFrame.
x_train_trf, x_valid_trf, x_test_trf = (
    pd.DataFrame(pca.transform(selected))
    for selected in (x_train_kBest, x_valid_kBest, x_test_kBest)
)
for projected in (x_train_trf, x_valid_trf, x_test_trf):
  print(projected.shape)

(28040, 63)
(736, 63)
(736, 63)


In [None]:
# Refit a 3-nearest-neighbour regressor on the PCA-projected training features.
projected_target = y_train['label_2']
knn = KNeighborsRegressor(n_neighbors=3)
knn.fit(x_train_trf, projected_target)

In [None]:
# Predictions from the PCA model: validation split first, then test split.
y_pred, y_pred_test_after = (
    knn.predict(projected) for projected in (x_valid_trf, x_test_trf)
)

In [None]:
from sklearn.metrics import mean_squared_error
from sklearn import metrics

# Score the current validation predictions (y_pred) against the true label_2 values.
validation_target = y_valid['label_2']
print("Validation Set Performance:")
for caption, score_fn in (
    ("Mean Absolute Error:", metrics.mean_absolute_error),
    ("Mean Squared Error:", metrics.mean_squared_error),
    ("R-squared:", metrics.r2_score),
):
  print(caption, score_fn(validation_target, y_pred))

Validation Set Performance:
Mean Absolute Error: 0.1177536231884058
Mean Squared Error: 0.6699879227053139
R-squared: 0.9842107285505826


In [None]:
# Number of columns in the PCA-projected test frame.
num_new_features = len(x_test_trf.columns)
print(num_new_features)

63


In [None]:
# Build the export table: test predictions from the baseline model and from the
# PCA model, the number of PCA columns (repeated on every row), and then one
# 'new feature N' column per PCA column of the test frame.
component_count = x_test_trf.shape[1]
prediction_cols = pd.DataFrame({
    'Predicted labels before feature engineering': y_pred_test_before,
    'Predicted labels after feature engineering': y_pred_test_after,
    'No. of new features': component_count
})

output_df = prediction_cols.assign(**{
    f'new feature {pos + 1}': x_test_trf.iloc[:, pos]
    for pos in range(component_count)
})

output_df

Unnamed: 0,Predicted labels before feature engineering,Predicted labels after feature engineering,No. of new features,new feature 1,new feature 2,new feature 3,new feature 4,new feature 5,new feature 6,new feature 7,...,new feature 54,new feature 55,new feature 56,new feature 57,new feature 58,new feature 59,new feature 60,new feature 61,new feature 62,new feature 63
0,25.0,25.0,63,2.002570,1.008005,-0.637817,-3.833345,2.369131,-3.733078,0.401496,...,-0.663071,0.827220,-0.427780,0.202375,-1.225741,0.008430,0.782036,-0.179641,0.325557,0.334613
1,25.0,25.0,63,4.887731,-2.057620,0.145484,-0.865058,-2.293970,-2.796303,2.140080,...,0.967900,-0.222786,-0.948916,-0.189948,-0.514662,0.138834,0.062203,-0.485803,-0.025816,-0.748126
2,25.0,25.0,63,0.014203,1.644746,-5.128421,-3.849604,0.837745,-3.798704,-0.242820,...,-1.473518,0.927577,-0.044119,1.404461,-0.966427,-0.140542,0.978434,-0.233260,0.360298,-0.341674
3,25.0,25.0,63,1.339128,2.866874,-6.449367,-2.191864,1.075924,-3.614299,-0.467487,...,0.029930,0.217091,-0.733589,0.800939,-0.501627,-0.215077,0.354504,1.343081,-0.430917,0.795790
4,25.0,25.0,63,1.899913,-0.645804,-2.468413,1.856774,0.502110,-1.782838,-4.613325,...,0.022576,-0.628454,-0.149471,0.102398,0.917642,0.150497,-0.209066,1.021327,0.737844,-0.083449
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
731,29.0,29.0,63,3.652322,1.236319,5.112444,3.013916,4.023141,-0.264678,-4.971714,...,0.765733,1.887753,1.450292,-0.828060,-0.535004,0.073929,-2.823294,-0.806746,-0.114847,0.186816
732,29.0,29.0,63,-1.789947,8.529895,0.545955,-0.658622,1.417001,3.615407,-1.170023,...,0.008034,0.029203,0.007810,-0.866801,-0.388897,0.251097,-0.013333,0.118808,-0.839660,0.225668
733,29.0,29.0,63,1.342084,2.008046,3.515062,0.300107,1.134492,1.605048,-5.004100,...,-1.016923,1.688013,-0.703982,-0.360291,0.286506,-0.573106,-0.431206,0.012209,0.200112,-0.674270
734,29.0,29.0,63,2.035663,-1.217966,-1.450449,1.473739,2.435615,2.834462,-8.707434,...,0.495554,0.374423,-0.229033,-0.311864,-1.719611,-1.685917,0.424514,-0.109508,-0.146349,-0.055599


In [None]:
# Write the export table to Google Drive without the row index.
submission_path = '/content/drive/MyDrive/Colab Notebooks/files/190377T_label_2.csv'
output_df.to_csv(submission_path, index=False)