# AML — Task 1
## Predict the age of a brain from MRI features
---

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import zscore
from pandas_profiling import ProfileReport

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, KNNImputer, IterativeImputer
from sklearn.linear_model import Ridge, Lasso
from sklearn.svm import SVR
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, SelectPercentile, f_regression
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
from sklearn.ensemble import IsolationForest, GradientBoostingRegressor, AdaBoostRegressor, RandomForestRegressor
from sklearn.neighbors import LocalOutlierFactor

## Import dataset

In [None]:
def load_raw_data():
    """
    Read the raw train/test CSV files from ``data/`` and drop their ``id`` column.

    Return
    ------
    (pd.df, pd.df, pd.df)
        The training features, the training labels and the test features,
        in that order.
    """
    paths = ('data/X_train.csv', 'data/y_train.csv', 'data/X_test.csv')
    # Every file carries an 'id' column that is not a feature/label: strip it on load.
    X_train, y_train, X_test = (
        pd.read_csv(path).drop(columns=['id']) for path in paths
    )
    return X_train, y_train, X_test

## Export dataset to `csv`

In [None]:
def export_to_csv(X_train_cleaned, y_train_cleaned, X_test_cleaned):
    """
    Write the cleaned train features, train labels and test features to
    fixed CSV paths under ``data/``. The frames' indexes are not written.

    Parameters
    ----------
    X_train_cleaned : pd.df
        The cleaned training features
    y_train_cleaned : pd.df
        The training labels matching ``X_train_cleaned``
    X_test_cleaned : pd.df
        The cleaned test features
    """
    outputs = (
        (X_train_cleaned, 'data/X_train_cleaned.csv'),
        (y_train_cleaned, 'data/y_train_cleaned.csv'),
        (X_test_cleaned, 'data/X_test_cleaned.csv'),
    )
    for frame, path in outputs:
        frame.to_csv(path, index=False)

## Export `csv` submission file

In [None]:
def create_submission(prediction, sub_id, basepath = 'submissions/task1-sub'):
    """
    Write a submission CSV with the columns ``id`` and ``y``.

    The file is written to ``basepath + str(sub_id) + '.csv'``. The parent
    directory of that path is created when it does not exist yet.

    Parameters
    ----------
    prediction : pd.df or array-like
        The predicted values. Either a DataFrame whose prediction column is
        labelled ``0`` (what ``pd.DataFrame(model.predict(X))`` yields) or
        ``'y'``, or a raw array-like of predictions, which is flattened.
    sub_id : int or str
        Identifier appended to ``basepath`` to build the file name.
    basepath : str, optional
        Path prefix of the submission file.
    """
    import os

    if isinstance(prediction, pd.DataFrame):
        result = prediction.copy()
    else:
        # Accept the raw output of `model.predict` (ndarray / list / Series) as well.
        result = pd.DataFrame(np.asarray(prediction).ravel())

    result = result.rename(columns={0: 'y'})
    # Ids are simply the row positions 0..n-1.
    result['id'] = range(0, len(result))
    result = result[['id', 'y']]

    # `to_csv` does not create missing directories, so make sure the target exists.
    out_dir = os.path.dirname(basepath)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    result.to_csv(basepath+str(sub_id) + '.csv', index=False)

## Outlier detection

In [None]:
def remove_outliers(X_train, y_train, contamination='auto', verbose=1):
    """
    Remove the outliers from our dataset. Temporarily replace the nan values by
    the median to perform the outlier detection, then put the nan values back.

    Parameters
    ----------
    X_train : pd.df
        The features (what we will use to see the outliers)
    y_train : pd.df
        The labels, with one row per row of ``X_train`` (same order)
    contamination : 'auto' or float, optional
        The proportion of outliers, forwarded to the outlier detector.
    verbose : int, optional
        If truthy, print how many outliers were detected.

    Return
    ------
    (pd.df, pd.df)
        The data with the outliers rows removed. The returned features have
        positional (integer) column labels and keep their nan values.
    """
    # Save a mask of the missing values so they can be restored once the outlier detection is done.
    # Kept as a plain array so it is applied by position (the imputed frame has fresh labels).
    null_mask = X_train.isna().to_numpy()

    # Need to impute nan values for the outlier detection to work (cannot deal with nan).
    # `verbose` is deliberately not forwarded: SimpleImputer deprecated that argument in
    # scikit-learn 1.1 and removed it in 1.3, where passing it raises a TypeError.
    # NOTE: a column that is entirely nan is discarded by the median imputer, which would
    # make the shapes below disagree; the input is assumed to contain no such column.
    X_train_imputed = pd.DataFrame(SimpleImputer(strategy="median").fit_transform(X_train))

    # Alternative detector: IsolationForest(contamination=contamination, random_state=0)
    clf = LocalOutlierFactor(contamination=contamination) # modify here
    labels = clf.fit_predict(X_train_imputed)  # 1 for inliers, -1 for outliers
    inlier_mask = labels == 1

    if verbose:
        n_samples = labels.shape[0]
        n_outliers = (labels == -1).sum()
        print(f"Detected {n_outliers} outliers, out of {n_samples} samples ({100 * n_outliers / n_samples:.2f}%).")

    # Put back the nan values
    X_train_no_outliers = X_train_imputed.mask(null_mask)

    # Remove outliers from the training set. A numpy boolean mask selects rows by
    # position, so this also works when y_train does not carry a default 0..n-1 index.
    X_train_no_outliers = X_train_no_outliers.loc[inlier_mask, :]
    y_train_no_outliers = y_train.loc[inlier_mask, :]

    return (X_train_no_outliers, y_train_no_outliers)

## Data scaling

In [None]:
def scale(X_train, X_test):
    """
    Standardise the train and test features with a single StandardScaler
    fitted on the training features only.

    Parameters
    ----------
    X_train : pd.df
        The training features (used to fit the scaler)
    X_test : pd.df
        The test features (only transformed)

    Return
    ------
    (pd.df, pd.df)
        The scaled train and test features, as new frames with positional
        (integer) row and column labels.
    """
    # nan values need no imputation here: the scaler leaves them out when fitting.
    fitted_scaler = StandardScaler()
    fitted_scaler.fit(X_train)

    train_scaled = pd.DataFrame(fitted_scaler.transform(X_train))

    # X_test goes through np.array to avoid the warning about a model fitted
    # without feature names receiving an input that has some.
    test_values = np.array(X_test)
    test_scaled = pd.DataFrame(fitted_scaler.transform(test_values))

    return (train_scaled, test_scaled)

## Data imputation

In [None]:
def impute_values(X_train, X_test, method='knn', max_iter=15):
    """
    Fill the nan values of the train and test features with an imputer
    fitted on the training features only.

    Parameters
    ----------
    X_train : pd.df
        The training features (used to fit the imputer)
    X_test : pd.df
        The test features (only transformed)
    method : {'knn', 'iterative'}, optional
        'knn' uses a KNNImputer (6 neighbours, uniform weights);
        'iterative' uses an IterativeImputer (runs VERY slowly).
    max_iter : int, optional
        Maximum number of rounds of the iterative imputer. Ignored for 'knn'.

    Return
    ------
    (pd.df, pd.df)
        The imputed train and test features, as new frames with positional
        (integer) row and column labels.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names.
    """
    # Fail fast with a clear message instead of crashing later on a None imputer.
    if method not in ('knn', 'iterative'):
        raise ValueError(f"Unknown imputation method {method!r}; expected 'knn' or 'iterative'.")

    n_missing = X_train.isna().to_numpy().sum()
    n_cells = X_train.shape[0] * X_train.shape[1]
    # Guard the percentage against an empty frame.
    missing_pct = 100 * n_missing / n_cells if n_cells else 0.0
    print(f"For the train dataset, there are {n_missing} nan values, out of {n_cells} ({missing_pct:.2f}%).")

    if method == 'knn':
        imputer = KNNImputer(n_neighbors=6, weights='uniform').fit(X_train)
    else:
        # Runs VERY slowly
        imputer = IterativeImputer(random_state=0, max_iter=max_iter, verbose=2).fit(X_train)

    X_train_imputed = pd.DataFrame(imputer.transform(X_train))
    X_test_imputed = pd.DataFrame(imputer.transform(X_test))
    return (X_train_imputed, X_test_imputed)

## Feature selection

In [None]:
def remove_constant_features(X_train, X_test, verbose=1):
    """
    Drop the features that take exactly one distinct non-nan value in the
    training set, from both the train and the test features.

    Parameters
    ----------
    X_train : pd.df
        The training features (used to decide which columns are constant)
    X_test : pd.df
        The test features, with the same column labels as ``X_train``
    verbose : int, optional
        If truthy, print how many features were removed.

    Return
    ------
    (pd.df, pd.df)
        The train and test features restricted to the kept columns.
    """
    # nunique ignores nan, so a column with no observed value counts 0 and is kept.
    distinct_counts = X_train.apply(pd.Series.nunique)
    keep_column = distinct_counts.ne(1)

    train_kept = X_train.loc[:, keep_column]
    test_kept = X_test.loc[:, keep_column]

    if verbose:
        n_before = X_train.shape[1]
        n_after = train_kept.shape[1]
        print(f"{n_before-n_after} features removed because of constant values ({100*(n_before-n_after)/n_before:.2f}%).")

    return train_kept, test_kept

In [None]:
def remove_too_correlated_features(X_train, X_test, threshold=0.98, verbose=1):
    """
    Drop every feature whose absolute correlation with an earlier feature
    (in column order) of the training set exceeds ``threshold``.

    Parameters
    ----------
    X_train : pd.df
        The training features (used to compute the correlations)
    X_test : pd.df
        The test features, with the same column labels as ``X_train``
    threshold : float, optional
        Absolute correlation above which the later feature of a pair is dropped.
    verbose : int, optional
        If truthy, print how many features were removed.

    Return
    ------
    (pd.df, pd.df)
        The train and test features restricted to the kept columns.
    """
    abs_corr = X_train.corr().abs()
    n_features = len(abs_corr)

    # Keep only the strict upper triangle: each pair is looked at once and the
    # diagonal (self-correlation) is ignored. Column j is flagged when it is too
    # correlated with any column i < j, so the first feature of a pair survives.
    strict_upper = np.triu(np.ones((n_features, n_features), dtype=bool), k=1)
    is_redundant = (abs_corr.where(strict_upper) > threshold).any()

    keep_column = ~is_redundant
    train_kept = X_train.loc[:, keep_column]
    test_kept = X_test.loc[:, keep_column]

    if verbose:
        n_before = X_train.shape[1]
        n_after = train_kept.shape[1]
        print(f"{n_before-n_after} features removed because of correlation with another feature > {threshold} ({100*(n_before-n_after)/n_before:.2f}%).")

    return train_kept, test_kept

In [None]:
def remove_random_features(X_train, y_train, X_test, Xtrm=None, Xtem=None, k=190, verbose=1):
    """
    Keep the ``k`` features that score best against the target under a
    univariate ``f_regression`` test, fitted on the training set.

    Parameters
    ----------
    X_train : pd.df
        The training features (used to fit the selector)
    y_train : pd.df
        The labels, flattened to one dimension before fitting
    X_test : pd.df
        The test features
    Xtrm : pd.df, optional
        Extra frame shaped like ``X_train`` (e.g. its nan mask) to push through
        the same column selection.
    Xtem : pd.df, optional
        Extra frame shaped like ``X_test`` to push through the same column selection.
    k : int, optional
        Number of features to keep.
    verbose : int, optional
        If truthy, print how many features were removed.

    Return
    ------
    (pd.df, pd.df, pd.df or None, pd.df or None)
        The reduced train features, test features, ``Xtrm`` and ``Xtem``
        (the last two stay ``None`` when not given). All returned frames have
        positional (integer) labels.
    """
    target = np.array(y_train).ravel()
    selector = SelectKBest(f_regression, k=k) # modify here
    selector.fit(X_train, target)

    def _keep_selected(frame):
        # Apply the fitted column selection and wrap the result in a fresh frame.
        return pd.DataFrame(selector.transform(frame))

    train_kept = _keep_selected(X_train)
    test_kept = _keep_selected(X_test)
    if Xtrm is not None:
        Xtrm = _keep_selected(Xtrm)
    if Xtem is not None:
        Xtem = _keep_selected(Xtem)

    if verbose:
        n_before = X_train.shape[1]
        n_after = train_kept.shape[1]
        print(f"{n_before-n_after} features removed because of low correlation with target ({100*(n_before-n_after)/n_before:.2f}%).")

    return train_kept, test_kept, Xtrm, Xtem

## Data cleaning

In [None]:
def clean_data(contamination='auto', corr_threshold=0.98, k=190, max_iter=100):
    """
    Run the whole cleaning pipeline on the raw data and export the result.

    Steps, in order: load the raw csv files, remove the outlier rows, scale,
    remove constant and too correlated features, keep the ``k`` best features,
    impute the remaining nan values with the iterative imputer, and export
    the cleaned data to csv.

    Parameters
    ----------
    contamination : 'auto' or float, optional
        Forwarded to ``remove_outliers``.
    corr_threshold : float, optional
        Forwarded to ``remove_too_correlated_features`` as ``threshold``.
    k : int, optional
        Number of features kept by ``remove_random_features``.
    max_iter : int, optional
        Maximum number of rounds of the final iterative imputation.

    Return
    ------
    (pd.df, pd.df, pd.df)
        The cleaned training features, the training labels (outlier rows
        removed) and the cleaned test features.
    """
    print("Loading raw data...")
    X_train, y_train, X_test = load_raw_data()

    print("Removing outliers...")
    X_train, y_train = remove_outliers(X_train, y_train, contamination=contamination)

    print("Scaling data...")
    X_train, X_test = scale(X_train, X_test)

    print("Selecting features...")
    X_train, X_test = remove_constant_features(X_train, X_test)
    X_train, X_test = remove_too_correlated_features(X_train, X_test, threshold=corr_threshold)

    # Remember where the nan values are before the temporary imputation below.
    X_train_mask = X_train.isna()
    X_test_mask = X_test.isna()
    # Temporary knn imputation, only used to fit the k-best selection on complete data.
    X_train, X_test = impute_values(X_train, X_test, method='knn')
    # The nan masks go through the same selection so they line up with the kept columns.
    X_train, X_test, X_train_mask, X_test_mask = remove_random_features(
        X_train, y_train, X_test, Xtrm=X_train_mask, Xtem=X_test_mask, k=k)
    # Blank the temporarily imputed cells again; the final imputation happens below.
    X_train = X_train.mask(np.array(X_train_mask))
    X_test = X_test.mask(np.array(X_test_mask))

    print("Imputing nan values...")
    X_train, X_test = impute_values(X_train, X_test, method='iterative', max_iter=max_iter)

    print("Exporting clean data to csv...")
    export_to_csv(X_train, y_train, X_test)

    print("All done!")

    return X_train, y_train, X_test

## Models

In [None]:
def best_svr(X_train, y_train):
    """
    Grid-search an SVR with 5-fold cross-validation and print the best setting.

    The grid covers the rbf kernel, 6 log-spaced values of ``C`` between 1 and
    1000 and 7 log-spaced values of ``epsilon`` between 1e-4 and 1e-1.

    Parameters
    ----------
    X_train : pd.df
        The training features
    y_train : array-like
        The training labels

    Return
    ------
    GridSearchCV
        The fitted grid search.
    """
    param_grid = dict(
        kernel=['rbf'],  # 'poly' and 'sigmoid' are currently left out of the search
        C=np.logspace(0, 3, 6),
        epsilon=np.logspace(-4, -1, 7),
    )
    search = GridSearchCV(SVR(), param_grid, cv=5, verbose=3)
    search.fit(X_train, y_train)

    best = search.best_params_
    print(f"""The best validation score obtained is {search.best_score_:.5f} with
    \tkernel: {best['kernel']}
    \tC: {best['C']}
    \tepsilon: {best['epsilon']}""")

    return search

In [None]:
def best_gbr(X_train, y_train):
    """
    Grid-search a GradientBoostingRegressor with 4-fold cross-validation and
    print the best setting.

    Parameters
    ----------
    X_train : pd.df
        The training features
    y_train : array-like
        The training labels

    Return
    ------
    GridSearchCV
        The fitted grid search. Any failing fit is re-raised immediately
        (``error_score='raise'``) instead of being scored as nan.
    """
    gbr = GradientBoostingRegressor()
    gs_gbr_params = {
     # Least-squares loss. The former alias "ls" was deprecated in scikit-learn 1.0
     # and removed in 1.2, where it makes every fit raise.
     "loss":["squared_error"],
     "learning_rate": [0.03, 0.035, 0.04],
     "n_estimators": np.arange(330, 421, 30),
     "subsample": np.arange(0.3, 0.41, 0.05),
     "max_depth": [4, 8],
     "min_samples_split": [4, 5, 6],
     "min_samples_leaf": [1, 5],
     "random_state": [0], 
     "verbose": [1]
    }
    gs_gbr = GridSearchCV(gbr, gs_gbr_params, cv=4, verbose=3, error_score='raise')
    gs_gbr.fit(X_train, y_train)

    print(f"""The best validation score obtained is {gs_gbr.best_score_:.5f} with
    \tloss: {gs_gbr.best_params_['loss']}
    \tlearning_rate: {gs_gbr.best_params_['learning_rate']}
    \tn_estimators: {gs_gbr.best_params_['n_estimators']}
    \tsubsample: {gs_gbr.best_params_['subsample']}
    \tmax_depth: {gs_gbr.best_params_['max_depth']}
    \tmin_samples_split: {gs_gbr.best_params_['min_samples_split']}
    \tmin_samples_leaf: {gs_gbr.best_params_['min_samples_leaf']}
    """)
        
    return gs_gbr

## Model selection

In [None]:
# Run the full cleaning pipeline; besides returning the frames it also writes them to csv.
X_train, y_train, X_test = clean_data()

In [None]:
# Grid-search the SVR; the label frame is flattened to a 1-D array first.
svr = best_svr(X_train, np.array(y_train).ravel())

In [None]:
# Grid-search the gradient boosting regressor on the same data, labels flattened to 1-D.
gbr = best_gbr(X_train, np.array(y_train).ravel())

## Create submission

# Predict the cleaned test set with the fitted SVR grid search; swap in `gbr` to use the other model.
prediction = pd.DataFrame(svr.predict(X_test)) # modify here
# Number appended to the submission file name.
sub_id = 19 # modify here
# Writing the submission file is currently disabled; uncomment the call to export it.
#create_submission(prediction, sub_id)