# AML — Task 1
## Predict the age of a brain from MRI features
---

In [1]:
import numpy as np
import pandas as pd
from scipy.stats import zscore
from pandas_profiling import ProfileReport

In [2]:
from sklearn.pipeline import Pipeline
from sklearn.impute import KNNImputer
from sklearn.linear_model import Lasso
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler

## Import datasets

In [3]:
# Read the training features and targets from disk. The `id` column is removed
# from both frames so that it is not used as a feature or as a target.
X_train = pd.read_csv('data/X_train.csv').drop('id', axis=1)
y_train = pd.read_csv('data/y_train.csv').drop('id', axis=1)

In [4]:
# Read the test features; as for the training set, the `id` column is removed.
X_test = pd.read_csv('data/X_test.csv').drop('id', axis=1)

## Explore datasets

In [5]:
# Per-feature summary statistics. The `count` row differs from column to
# column, i.e. the features contain missing values.
X_train.describe()

Unnamed: 0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,...,x822,x823,x824,x825,x826,x827,x828,x829,x830,x831
count,1118.0,1114.0,1117.0,1106.0,1117.0,1128.0,1105.0,1127.0,1116.0,1124.0,...,1134.0,1125.0,1098.0,1121.0,1120.0,1109.0,1115.0,1112.0,1124.0,1091.0
mean,10.026057,832442.85929,20585.524887,1048.958235,1000291.0,10.08501,597900.429955,10389.657239,999842.2,785176.225858,...,1049674.0,-876.044006,13492.600186,10.554762,10.057767,1066.141107,10.008269,1050199.0,99798.480171,104903.905758
std,0.968347,0.028258,0.029051,28.430733,97408.91,0.968026,0.028128,1655.843472,102244.1,0.028799,...,28395.79,164.585576,2519.835006,0.283844,0.982656,226.606986,1.01893,28142.1,9576.12872,2768.40535
min,6.672068,832442.808579,20585.473808,1000.063783,680021.5,6.984052,597900.381003,3644.074892,609573.0,785176.176297,...,1000105.0,-1597.766964,2536.030655,10.010366,6.841039,496.007706,6.466963,1000002.0,73207.994891,100012.896777
25%,9.381273,832442.835941,20585.501013,1024.969967,936088.2,9.470582,597900.40611,9339.537887,932293.7,785176.201279,...,1025054.0,-975.398714,11947.954006,10.321039,9.379001,899.067501,9.325229,1027575.0,93416.2524,102596.190683
50%,10.000079,832442.860041,20585.524817,1047.985497,1000557.0,10.089601,597900.429787,10295.013382,1001261.0,785176.225608,...,1049296.0,-875.508235,13352.186179,10.55426,10.11437,1049.027077,10.005684,1050262.0,99802.127899,104846.235709
75%,10.664998,832442.882951,20585.550525,1073.180317,1064617.0,10.752707,597900.452983,11304.073469,1068359.0,785176.250421,...,1074354.0,-773.174562,14893.726023,10.792195,10.74537,1215.057985,10.65812,1073831.0,106400.748441,107098.66935
max,12.956099,832442.908334,20585.573514,1099.977638,1331630.0,12.747734,597900.48081,17347.531573,1284804.0,785176.276168,...,1099771.0,-281.030205,24815.260375,11.09105,13.530204,2122.032859,13.163113,1099918.0,130694.436443,109984.169649


In [6]:
# Quick look at the outliers of a single feature (x543); the whiskers are at 1.5 IQR.
X_train.boxplot(column=['x543'])

<AxesSubplot:>

---
## Outlier detection

### Method 1: IQR
- Drop every sample (row) that has an outlier in at least one feature.
- This method is not usable here, as it drops far too many samples.

In [7]:
# First and third quartile of every feature, computed in a single call, and
# the inter-quartile range derived from them.
quartiles = X_train.quantile([0.25, 0.75])
Q1 = quartiles.loc[0.25]
Q3 = quartiles.loc[0.75]
IQR = Q3 - Q1

In [8]:
# A value is flagged when it lies more than 1.5 IQR below Q1 or above Q3.
# Missing values compare False on both sides, so they are never flagged.
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
is_outlier = (X_train < lower_fence) | (X_train > upper_fence)

# Keep only the rows in which no feature was flagged.
rows_with_outlier = is_outlier.any(axis=1)
no_outliers = X_train[~rows_with_outlier]

In [9]:
# Report how many rows the IQR filter removed and how many survive.
n_total = X_train.shape[0]
n_kept = no_outliers.shape[0]
print(f'We dropped {n_total - n_kept} samples and have now {n_kept} of them.')

We dropped 1176 samples and have now 36 of them.


### Method 2: Z-score
- For every feature, we compute the absolute `z-score` of each sample's value for that feature.
- We then replace every value whose score is not below a tunable threshold (3 here) with NaN; this filters out the outliers and leaves the gaps to be imputed later.

In [10]:
# Cell-wise outlier masking: a value whose absolute z-score (computed per
# feature, ignoring NaNs) reaches the threshold is replaced by NaN.
Z_SCORE_THRESHOLD = 3  # tunable: smaller values mask more cells

X_train_abs_z_scores = np.abs(zscore(X_train, nan_policy='omit'))

# Flag only the cells that are known to reach the threshold. A NaN z-score
# (missing value, or a feature whose z-score is undefined such as one with
# zero variance) compares False here, so those cells are left untouched
# instead of being blanked.
is_outlier = X_train_abs_z_scores >= Z_SCORE_THRESHOLD

# `mask` writes NaN where the condition is True and keeps the frame's shape.
# NOTE: X_train is overwritten, so re-running this cell without reloading the
# data applies the filter again on the already-filtered frame.
X_train = X_train.mask(is_outlier)

---
## Pipeline Models

### Pipeline 1: Lasso Regression

1. **Imputation of missing values** using sklearn `KNNImputer`
2. **Data normalization** using sklearn `StandardScaler`
3. **Feature selection and regression** using sklearn `Lasso` (L1-regularized linear regression)

In [11]:
# Pipeline 1: impute missing values, standardize the features, then fit a
# Lasso regression. The steps run in the order listed.
# NOTE(review): the imputer runs before the scaler, so it sees the raw,
# unscaled features; confirm that this ordering is intended.
pipe1 = Pipeline(steps=[
    ('imputer', KNNImputer()),
    ('scaler', StandardScaler()),
    ('lasso', Lasso(alpha=0.1, max_iter=100_000)),
])

In [12]:
# Fit every step of the pipeline (imputer -> scaler -> lasso) on the training data.
pipe1.fit(X_train, y_train)

Pipeline(steps=[('imputer', KNNImputer()), ('scaler', StandardScaler()),
                ('lasso', Lasso(alpha=0.1, max_iter=100000))])

In [13]:
# Score of the fitted pipeline on the same data it was trained on, i.e. an
# in-sample score that is an optimistic estimate of performance.
# NOTE(review): no held-out or cross-validated score is computed in this notebook.
pipe1.score(X_train, y_train)

0.7107226982657424

## Predict

In [14]:
# Predict the target for the test features and wrap the raw predictions in a
# DataFrame.
# NOTE(review): unlike X_train, X_test is not passed through the z-score
# outlier masking before prediction; confirm that this is intended.
test_predictions = pipe1.predict(X_test)
prediction = pd.DataFrame(test_predictions)

## Output solution

In [15]:
# Submission number and path prefix; together they form the output file name
# '<basepath><sub_id>.csv' used when the result is written to disk.
sub_id = 2
basepath = 'submissions/task1-sub'

In [16]:
# Build the submission table from the predictions: name the prediction column
# 'y', add a running 'id' starting at 0, and put 'id' first.
# NOTE(review): the ids are generated from the row position, not read from the
# test file; this presumably matches the ids in X_test.csv, so verify.
result = (
    prediction
    .rename(columns={0: 'y'})
    .assign(id=range(0, len(prediction)))
    .loc[:, ['id', 'y']]
)

In [17]:
# Write the submission file '<basepath><sub_id>.csv' without the index column.
result.to_csv(f'{basepath}{sub_id}.csv', index=False)