#### Imports

In [107]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, StandardScaler, OrdinalEncoder
from sklearn.svm import SVR
from sklearn.metrics import root_mean_squared_error, mean_absolute_error, r2_score, mean_squared_error
from sklearn.model_selection import GridSearchCV

#### Dataset

In [108]:
# Read the labelled feature tables and their targets from disk.
# The target files are read with header=None, so their first row is data.
X_train = pd.read_csv('../data_students/labeled_data/X_train.csv')
X_test = pd.read_csv('../data_students/labeled_data/X_test.csv')
y_train = pd.read_csv('../data_students/labeled_data/y_train.csv', header=None)
y_test = pd.read_csv('../data_students/labeled_data/y_test.csv', header=None)

# Report the (rows, columns) shape of each table as a quick sanity check.
for table_name, table in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(f"{table_name}: {table.shape}")

X_train: (1000, 14)
X_test: (500, 14)
y_train: (1000, 1)
y_test: (500, 1)


#### Data pre-processing

In [109]:
# Drop the 'img_filename' column from both splits.
X_train = X_train.drop(columns='img_filename')
X_test = X_test.drop(columns='img_filename')

# Encode 'profession' as integer codes. The mapping is learned on the
# training split and then applied unchanged to the test split.
label_encoders = LabelEncoder()
X_train['profession'] = label_encoders.fit_transform(X_train['profession'])
X_test['profession'] = label_encoders.transform(X_test['profession'])


# Every column that is still of object dtype is encoded on the ordered
# satisfaction scale below ('Very low' -> 0.0, ..., 'Very high' -> 4.0).
satisfaction_order = ['Very low', 'Low', 'Moderate', 'High', 'Very high']
categorical_columns = X_train.select_dtypes(include=['object']).columns
ordinal_encoders = OrdinalEncoder(categories=[satisfaction_order])
for column in categorical_columns:
    # The encoder expects 2-D input, hence the single-column reshape.
    train_column = X_train[column].values.reshape(-1, 1)
    test_column = X_test[column].values.reshape(-1, 1)
    X_train[column] = ordinal_encoders.fit_transform(train_column)
    X_test[column] = ordinal_encoders.transform(test_column)

# Standardize the float64/int64 columns. The scaler statistics come from the
# training split only and are reused for the test split.
numerical_columns = X_train.select_dtypes(include=['float64', 'int64']).columns
scaler = StandardScaler().fit(X_train[numerical_columns])
X_train[numerical_columns] = scaler.transform(X_train[numerical_columns])
X_test[numerical_columns] = scaler.transform(X_test[numerical_columns])



#### Feature Selection

In [110]:
# Rank features by absolute Pearson correlation with the target
def corr(X_train, y_train, k):
    """Return the names of the k features most correlated with the target.

    Features are ranked by the absolute value of their Pearson correlation
    coefficient with the target, strongest first.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Feature table; it is not modified.
    y_train : pandas.DataFrame, pandas.Series or array-like
        Target values, one per row of ``X_train`` and in the same row order.
    k : int
        Number of feature names to return.

    Returns
    -------
    list
        Column names of ``X_train``, at most ``k`` of them.

    Raises
    ------
    ValueError
        If the number of target values differs from the number of rows.
    """
    # Attach the target by position, not by index label. Assigning a pandas
    # Series would align on the index and silently yield NaN (or shuffled)
    # targets whenever X_train's index differs from y_train's.
    target = np.asarray(y_train).ravel()
    X_train_copy = X_train.copy()
    X_train_copy['target'] = target

    correlations = X_train_copy.corr(method='pearson')['target'].drop('target')
    abs_corr = correlations.abs().sort_values(ascending=False)
    top_features = abs_corr.head(k).index.tolist()
    return top_features

# Show the ten features ranked highest by corr() on the training data.
corr(X_train, y_train, k=10)


['blood pressure',
 'weight',
 'sarsaparilla',
 'cholesterol',
 'smurfin donuts',
 'age',
 'height',
 'potassium',
 'smurfberry liquor',
 'calcium']

#### Model Selection and Implementation

In [111]:
# Joint search over the number of top-correlated features and the SVR
# hyper-parameters, scored by 5-fold cross-validation on the training set.
model = SVR()

param_grid = {
    'kernel': ['poly', 'rbf', 'sigmoid'],
    'C': [0.01, 0.05, 0.1, 0.5, 1],
    'epsilon': [0.01, 0.05, 0.1, 0.5, 1]
}

# Candidate feature counts to try.
k = [5, 7, 9, 11, 13]

# Start from +inf so the first candidate always becomes the incumbent. A
# fixed starting value such as 1 would leave 'params' as None whenever every
# cross-validated RMSE is >= 1, which then breaks the final model fit.
best = {'k': 0, 'score': np.inf, 'params': None}

# NOTE(review): the feature ranking is computed on the full training set
# before cross-validation, so each validation fold has already influenced
# which features are kept. The CV scores may therefore be slightly optimistic.
for i in k:
    print("Top " + str(i) + " features")
    top_features = corr(X_train, y_train, i)
    X_train_top = X_train[top_features]
    grid_search = GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_squared_error')
    grid_search.fit(X_train_top, y_train.values.ravel())

    # best_score_ is the negated mean squared error; flip the sign and take
    # the square root once to get the RMSE used for comparison.
    cv_rmse = np.sqrt(-grid_search.best_score_)

    print("Best parameters: ", grid_search.best_params_)
    print("Best score: ", grid_search.best_score_)
    print("RMSE: ", cv_rmse)
    print()

    if cv_rmse < best['score']:
        best['score'] = cv_rmse
        best['k'] = i
        best['params'] = grid_search.best_params_


print("Best k: ", best['k'])
print("Best score: ", best['score'])
print("Best parameters: ", best['params'])


Top 5 features
Best parameters:  {'C': 0.01, 'epsilon': 0.05, 'kernel': 'poly'}
Best score:  -0.005730474220828155
RMSE:  0.07569989577818556

Top 7 features
Best parameters:  {'C': 0.1, 'epsilon': 0.05, 'kernel': 'rbf'}
Best score:  -0.005591658898469999
RMSE:  0.0747773956384548

Top 9 features
Best parameters:  {'C': 0.1, 'epsilon': 0.05, 'kernel': 'rbf'}
Best score:  -0.0052892992040090405
RMSE:  0.07272756839059753

Top 11 features
Best parameters:  {'C': 0.1, 'epsilon': 0.05, 'kernel': 'rbf'}
Best score:  -0.005405283502290697
RMSE:  0.07352063317389682

Top 13 features
Best parameters:  {'C': 0.01, 'epsilon': 0.05, 'kernel': 'poly'}
Best score:  -0.005499223788460965
RMSE:  0.07415675146917484

Best k:  9
Best score:  0.07272756839059753
Best parameters:  {'C': 0.1, 'epsilon': 0.05, 'kernel': 'rbf'}


In [112]:
# Rebuild the winning feature subset for both splits.
best_k = best['k']
top_features = corr(X_train, y_train, best_k)
X_train_top, X_test_top = X_train[top_features], X_test[top_features]

# Refit an SVR with the selected hyper-parameters on the whole training set.
best_params = best['params']
model = SVR(
    kernel=best_params['kernel'],
    C=best_params['C'],
    epsilon=best_params['epsilon'],
)
model.fit(X_train_top, y_train.values.ravel())

# Evaluate on the held-out test split.
y_pred = model.predict(X_test_top)
test_mse = mean_squared_error(y_test, y_pred)
print("RMSE: ", np.sqrt(test_mse))

RMSE:  0.07321504537951955
