# AML — Task 1
## Predict the age of a brain from MRI features
---

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import zscore
from pandas_profiling import ProfileReport

In [2]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.linear_model import Ridge, Lasso
from sklearn.svm import SVR
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectPercentile, f_regression
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import IsolationForest

In [3]:
from warnings import simplefilter
from sklearn.exceptions import ConvergenceWarning

## Import datasets

In [55]:
# Load the training features and the training targets; the 'id' column is dropped from both
X_train = pd.read_csv('data/X_train.csv').drop(columns=['id'])
y_train = pd.read_csv('data/y_train.csv').drop(columns=['id'])

In [56]:
# Load the test features, dropping the 'id' column like for the training set
X_test = pd.read_csv('data/X_test.csv').drop(columns=['id'])

In [57]:
# Quick overview of the training features: number of rows/columns, dtypes and memory usage
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1212 entries, 0 to 1211
Columns: 832 entries, x0 to x831
dtypes: float64(832)
memory usage: 7.7 MB


In [58]:
# Per-column summary statistics; a `count` lower than the number of rows means the column has missing values
X_train.describe()

Unnamed: 0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,...,x822,x823,x824,x825,x826,x827,x828,x829,x830,x831
count,1118.0,1114.0,1117.0,1106.0,1117.0,1128.0,1105.0,1127.0,1116.0,1124.0,...,1134.0,1125.0,1098.0,1121.0,1120.0,1109.0,1115.0,1112.0,1124.0,1091.0
mean,10.026057,832442.85929,20585.524887,1048.958235,1000291.0,10.08501,597900.429955,10389.657239,999842.2,785176.225858,...,1049674.0,-876.044006,13492.600186,10.554762,10.057767,1066.141107,10.008269,1050199.0,99798.480171,104903.905758
std,0.968347,0.028258,0.029051,28.430733,97408.91,0.968026,0.028128,1655.843472,102244.1,0.028799,...,28395.79,164.585576,2519.835006,0.283844,0.982656,226.606986,1.01893,28142.1,9576.12872,2768.40535
min,6.672068,832442.808579,20585.473808,1000.063783,680021.5,6.984052,597900.381003,3644.074892,609573.0,785176.176297,...,1000105.0,-1597.766964,2536.030655,10.010366,6.841039,496.007706,6.466963,1000002.0,73207.994891,100012.896777
25%,9.381273,832442.835941,20585.501013,1024.969967,936088.2,9.470582,597900.40611,9339.537887,932293.7,785176.201279,...,1025054.0,-975.398714,11947.954006,10.321039,9.379001,899.067501,9.325229,1027575.0,93416.2524,102596.190683
50%,10.000079,832442.860041,20585.524817,1047.985497,1000557.0,10.089601,597900.429787,10295.013382,1001261.0,785176.225608,...,1049296.0,-875.508235,13352.186179,10.55426,10.11437,1049.027077,10.005684,1050262.0,99802.127899,104846.235709
75%,10.664998,832442.882951,20585.550525,1073.180317,1064617.0,10.752707,597900.452983,11304.073469,1068359.0,785176.250421,...,1074354.0,-773.174562,14893.726023,10.792195,10.74537,1215.057985,10.65812,1073831.0,106400.748441,107098.66935
max,12.956099,832442.908334,20585.573514,1099.977638,1331630.0,12.747734,597900.48081,17347.531573,1284804.0,785176.276168,...,1099771.0,-281.030205,24815.260375,11.09105,13.530204,2122.032859,13.163113,1099918.0,130694.436443,109984.169649


---
## Outlier detection: Isolation Forest

### Imputation for outlier detection

In [59]:
# Remember where the raw training features are missing, so that the temporary
# imputation done for the outlier detection can be undone and redone properly later
X_train_null_mask = X_train.isna().to_numpy()

In [60]:
# Need to impute nan values for the outlier detection to work (cannot deal with nan).
# This median fill is only temporary: the original nan positions were saved in
# X_train_null_mask beforehand.
# `verbose` is intentionally not passed: SimpleImputer deprecated it in
# scikit-learn 1.1 and removed it in 1.3, where passing it raises a TypeError.
X_train = SimpleImputer(strategy="median").fit_transform(X_train)

In [61]:
# Fit the outlier detector on the median-imputed training features
# (random_state is fixed so that the fitted forest is reproducible)
clf = IsolationForest(random_state=0, verbose=1).fit(X_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.7s finished


In [62]:
# Label every training sample; the following cells treat -1 as "outlier" and 1 as "inlier"
X_train_outliers_prediction = clf.predict(X_train)

In [63]:
# Report how many samples were flagged as outliers (label -1)
n_outliers = (X_train_outliers_prediction == -1).sum()
n_samples = X_train_outliers_prediction.shape[0]
print(f"Detected {n_outliers} outliers, out of {n_samples} samples ({100 * n_outliers / n_samples:.2f}%).")

Detected 5 outliers, out of 1212 samples (0.41%).


In [64]:
# Keep only the samples labelled as inliers (prediction == 1) in the training set
inlier_rows = X_train_outliers_prediction == 1
X_train = X_train[inlier_rows, :]
y_train = np.array(y_train)[inlier_rows, :].reshape(-1)

# Filter the null mask with the same rows so it stays aligned with X_train
X_train_null_mask = X_train_null_mask[inlier_rows, :]

---
## Data scaling
Scaling is done as early as possible because the scale of the features affects later steps (e.g. the distances computed by `KNNImputer`).

In [65]:
# Standardise the features. The scaler is fitted on the training set only and
# kept around so that X_test goes through exactly the same transformation.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test.to_numpy())

---
## Feature selection

In [66]:
# Drop features that are strongly correlated with an earlier feature.
# Only the strict upper triangle of the correlation matrix is inspected, so a
# column is flagged when its absolute correlation with any column that comes
# before it exceeds the threshold.
corr_threshold = 0.7
X_train_corr_ = pd.DataFrame(X_train).corr()
corr_size = len(X_train_corr_)
lower_triangle = np.tril(np.ones((corr_size, corr_size), dtype=bool))
upper_abs_corr = X_train_corr_.mask(lower_triangle).abs()
X_train_not_correlated = ~(upper_abs_corr > corr_threshold).any()
X_train_not_corr_columns = X_train_not_correlated[X_train_not_correlated].index

In [67]:
# Report how many columns survive the correlation filter
n_uncorrelated = len(X_train_not_corr_columns)
n_columns = X_train.shape[1]
print(f"With a correlation threshold of {corr_threshold}, there are {n_uncorrelated} uncorrelated columns, out of {n_columns} ({100*n_uncorrelated/n_columns:.2f}%).")

With a correlation threshold of 0.7, there are 707 uncorrelated columns, out of 832 (84.98%).


In [70]:
# Save where X_test is missing, then fill the gaps with the column medians so
# that X_test can go through the feature selection; the nan values are put
# back from X_test_null_mask once the selection is done.
# `verbose` is intentionally not passed: SimpleImputer deprecated it in
# scikit-learn 1.1 and removed it in 1.3, where passing it raises a TypeError.
X_test_null_mask = np.array(pd.DataFrame(X_test).isna())
X_test = SimpleImputer(strategy="median").fit_transform(X_test)

In [72]:
# Apply the correlation filter to the data and to the matching null masks,
# so that all four stay column-aligned
X_train, X_test, X_train_null_mask, X_test_null_mask = (
    pd.DataFrame(values).loc[:, X_train_not_correlated]
    for values in (X_train, X_test, X_train_null_mask, X_test_null_mask)
)

In [73]:
# Keep the 80% of features with the best univariate f_regression score against
# the target. The selector is fitted on the training set and then applied to the
# data and to the null masks alike, so that they keep the same columns.
selector = SelectPercentile(f_regression, percentile=80).fit(X_train, y_train)
X_train, X_train_null_mask, X_test, X_test_null_mask = (
    selector.transform(values)
    for values in (X_train, X_train_null_mask, X_test, X_test_null_mask)
)

  correlation_coefficient /= X_norms


In [74]:
# Put back the nan values of X_test that were median-imputed only for the feature selection
X_test = np.where(X_test_null_mask, np.nan, X_test)

---
## Imputation of missing values

In [75]:
# Report the share of missing values among the selected training features
n_missing = X_train_null_mask.sum().sum()
n_cells = X_train_null_mask.shape[0]*X_train_null_mask.shape[1]
print(f"For the train dataset, there are {n_missing} nan values, out of {n_cells} ({100*n_missing/n_cells:.2f}%).")

For the train dataset, there are 51933 nan values, out of 681955 (7.62%).


In [76]:
# Put back nan values where we previously imputed for the outlier detection,
# so that the final imputation works from the original missing-value pattern
X_train = np.where(X_train_null_mask, np.nan, X_train)

In [77]:
# Do the imputation.
# The imputer is fitted once, on the training set, and then applied to both
# sets: X_test is filled in from the same reference samples as X_train and is
# guaranteed to keep the same columns, instead of being imputed by a second,
# independent imputer fitted on the test data.
knn_imputer = KNNImputer(n_neighbors=5, weights='uniform').fit(X_train)
X_train = knn_imputer.transform(X_train)
X_test = knn_imputer.transform(X_test)

---
## Models

### Model 1: Lasso

In [83]:
# Lasso regressor with a large cap on the number of solver iterations
# NOTE(review): presumably raised to avoid convergence warnings — confirm
lasso = Lasso(max_iter=100000)

In [84]:
# Grid of regularisation strengths: 20 log-spaced values from 10**-1 to 10**0
gs_lasso_params = {
    'alpha': np.logspace(-1, 0, 20),
}

In [85]:
# 5-fold cross-validated grid search over the Lasso alpha grid
# NOTE(review): the scaler, the feature selection and the imputation were fitted
# on the whole training set before this search, i.e. outside the CV folds, so the
# validation scores may be optimistic — consider moving them into a Pipeline
gs_lasso = GridSearchCV(lasso, gs_lasso_params, cv=5, verbose=3)

In [86]:
# Run the grid search on the preprocessed training data (verbose=3 logs every fold)
gs_lasso.fit(X_train, y_train)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV 1/5] END .........................alpha=0.1;, score=0.358 total time=   0.1s
[CV 2/5] END .........................alpha=0.1;, score=0.208 total time=   0.1s
[CV 3/5] END .........................alpha=0.1;, score=0.334 total time=   0.1s
[CV 4/5] END .........................alpha=0.1;, score=0.311 total time=   0.1s
[CV 5/5] END .........................alpha=0.1;, score=0.346 total time=   0.1s
[CV 1/5] END .........alpha=0.11288378916846889;, score=0.372 total time=   0.1s
[CV 2/5] END .........alpha=0.11288378916846889;, score=0.222 total time=   0.1s
[CV 3/5] END .........alpha=0.11288378916846889;, score=0.342 total time=   0.1s
[CV 4/5] END .........alpha=0.11288378916846889;, score=0.318 total time=   0.1s
[CV 5/5] END .........alpha=0.11288378916846889;, score=0.354 total time=   0.1s
[CV 1/5] END .........alpha=0.12742749857031338;, score=0.383 total time=   0.1s
[CV 2/5] END .........alpha=0.12742749857031338

GridSearchCV(cv=5, estimator=Lasso(max_iter=100000),
             param_grid={'alpha': array([0.1       , 0.11288379, 0.1274275 , 0.14384499, 0.16237767,
       0.18329807, 0.20691381, 0.23357215, 0.26366509, 0.29763514,
       0.33598183, 0.37926902, 0.42813324, 0.48329302, 0.54555948,
       0.61584821, 0.6951928 , 0.78475997, 0.88586679, 1.        ])},
             verbose=3)

In [87]:
# Report the best mean cross-validation score and the alpha that achieved it
print(f"The best validation score obtained is {gs_lasso.best_score_:.5f} with\n\talpha: {gs_lasso.best_params_['alpha']}")

The best validation score obtained is 0.35426 with
	alpha: 0.42813323987193935


### Model 2: SVR (SVM for regression)

In [78]:
# Support vector regressor with default settings; kernel, C and epsilon are tuned by the grid search
svr = SVR()

In [79]:
# Grid for the SVR: three kernels, 4 log-spaced values of C from 10**-1 to 10**2.2
# and 3 log-spaced values of epsilon from 10**-2 to 10**1 (36 candidates in total)
gs_svr_params = {
    'kernel': ['poly', 'rbf', 'sigmoid'],
    'C': np.logspace(-1, 2.2, 4),
    'epsilon': np.logspace(-2, 1, 3),
}

In [80]:
# 5-fold cross-validated grid search over the SVR grid
# NOTE(review): as the preprocessing was fitted on the whole training set before
# this search, the validation scores may be optimistic
gs_svr = GridSearchCV(svr, gs_svr_params, cv=5, verbose=3)

In [81]:
# Run the grid search on the preprocessed training data (verbose=3 logs every fold)
gs_svr.fit(X_train, y_train)

Fitting 5 folds for each of 36 candidates, totalling 180 fits
[CV 1/5] END ..C=0.1, epsilon=0.01, kernel=poly;, score=0.005 total time=   0.4s
[CV 2/5] END .C=0.1, epsilon=0.01, kernel=poly;, score=-0.003 total time=   0.3s
[CV 3/5] END ..C=0.1, epsilon=0.01, kernel=poly;, score=0.004 total time=   0.2s
[CV 4/5] END ..C=0.1, epsilon=0.01, kernel=poly;, score=0.004 total time=   0.3s
[CV 5/5] END .C=0.1, epsilon=0.01, kernel=poly;, score=-0.003 total time=   0.2s
[CV 1/5] END ...C=0.1, epsilon=0.01, kernel=rbf;, score=0.060 total time=   0.3s
[CV 2/5] END ...C=0.1, epsilon=0.01, kernel=rbf;, score=0.037 total time=   0.3s
[CV 3/5] END ...C=0.1, epsilon=0.01, kernel=rbf;, score=0.050 total time=   0.3s
[CV 4/5] END ...C=0.1, epsilon=0.01, kernel=rbf;, score=0.050 total time=   0.3s
[CV 5/5] END ...C=0.1, epsilon=0.01, kernel=rbf;, score=0.035 total time=   0.3s
[CV 1/5] END C=0.1, epsilon=0.01, kernel=sigmoid;, score=0.185 total time=   0.2s
[CV 2/5] END C=0.1, epsilon=0.01, kernel=sigmo

[CV 5/5] END C=1.1659144011798317, epsilon=10.0, kernel=sigmoid;, score=0.278 total time=   0.1s
[CV 1/5] END C=13.593563908785255, epsilon=0.01, kernel=poly;, score=0.174 total time=   0.3s
[CV 2/5] END C=13.593563908785255, epsilon=0.01, kernel=poly;, score=-0.011 total time=   0.4s
[CV 3/5] END C=13.593563908785255, epsilon=0.01, kernel=poly;, score=0.142 total time=   0.3s
[CV 4/5] END C=13.593563908785255, epsilon=0.01, kernel=poly;, score=0.125 total time=   0.3s
[CV 5/5] END C=13.593563908785255, epsilon=0.01, kernel=poly;, score=0.097 total time=   0.4s
[CV 1/5] END C=13.593563908785255, epsilon=0.01, kernel=rbf;, score=0.487 total time=   0.4s
[CV 2/5] END C=13.593563908785255, epsilon=0.01, kernel=rbf;, score=0.335 total time=   0.3s
[CV 3/5] END C=13.593563908785255, epsilon=0.01, kernel=rbf;, score=0.378 total time=   0.3s
[CV 4/5] END C=13.593563908785255, epsilon=0.01, kernel=rbf;, score=0.377 total time=   0.4s
[CV 5/5] END C=13.593563908785255, epsilon=0.01, kernel=rbf;

[CV 2/5] END C=158.48931924611142, epsilon=10.0, kernel=rbf;, score=0.129 total time=   0.1s
[CV 3/5] END C=158.48931924611142, epsilon=10.0, kernel=rbf;, score=0.232 total time=   0.1s
[CV 4/5] END C=158.48931924611142, epsilon=10.0, kernel=rbf;, score=0.176 total time=   0.1s
[CV 5/5] END C=158.48931924611142, epsilon=10.0, kernel=rbf;, score=0.271 total time=   0.1s
[CV 1/5] END C=158.48931924611142, epsilon=10.0, kernel=sigmoid;, score=-75.512 total time=   0.6s
[CV 2/5] END C=158.48931924611142, epsilon=10.0, kernel=sigmoid;, score=-87.774 total time=   0.7s
[CV 3/5] END C=158.48931924611142, epsilon=10.0, kernel=sigmoid;, score=-47.749 total time=   0.7s
[CV 4/5] END C=158.48931924611142, epsilon=10.0, kernel=sigmoid;, score=-58.056 total time=   0.8s
[CV 5/5] END C=158.48931924611142, epsilon=10.0, kernel=sigmoid;, score=-43.129 total time=   0.6s


GridSearchCV(cv=5, estimator=SVR(),
             param_grid={'C': array([1.00000000e-01, 1.16591440e+00, 1.35935639e+01, 1.58489319e+02]),
                         'epsilon': array([ 0.01      ,  0.31622777, 10.        ]),
                         'kernel': ['poly', 'rbf', 'sigmoid']},
             verbose=3)

In [82]:
# Report the best mean cross-validation score and the SVR parameters that achieved it
print(f"""The best validation score obtained is {gs_svr.best_score_:.5f} with
\tkernel: {gs_svr.best_params_['kernel']}
\tC: {gs_svr.best_params_['C']}
\tepsilon: {gs_svr.best_params_['epsilon']}""")

The best validation score obtained is 0.39749 with
	kernel: rbf
	C: 13.593563908785255
	epsilon: 0.31622776601683794


---
## Prediction

In [None]:
# Fitted grid search used to produce the submission (chosen by hand)
# NOTE(review): in the recorded outputs gs_svr reached a higher best validation
# score (0.39749) than gs_lasso (0.35426) — confirm that this choice is intended
best_pipe = gs_lasso ## modify here

In [None]:
# Predict the target for the preprocessed test set; the values end up in a single column named 0
prediction = pd.DataFrame(best_pipe.predict(X_test))

In [None]:
# Submission number and path prefix; the output file name is basepath + sub_id + '.csv'
sub_id = 6 ## modify here
basepath = 'submissions/task1-sub' 

In [None]:
# Build the submission table: an 'id' column counting up from 0, followed by the predictions as 'y'
result = (
    prediction
    .rename(columns={0: 'y'})
    .assign(id=lambda frame: range(0, len(frame)))
    .loc[:, ['id', 'y']]
)

In [None]:
#result.to_csv(basepath+str(sub_id) + '.csv', index=False)