# AML — Task 1
## Predict the age of a brain from MRI features
---

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import zscore
from pandas_profiling import ProfileReport

In [2]:
from sklearn.pipeline import Pipeline
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, KNNImputer, IterativeImputer
from sklearn.linear_model import Ridge, Lasso
from sklearn.svm import SVR
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectPercentile, f_regression
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
from sklearn.ensemble import IsolationForest, GradientBoostingRegressor
from sklearn.neighbors import LocalOutlierFactor

In [3]:
from warnings import simplefilter
from sklearn.exceptions import ConvergenceWarning

## Import datasets

In [4]:
def import_data(folder="data/", extension=""):
    X_train = pd.read_csv(folder + 'X_train' + extension + '.csv').drop(columns=['id'])
    y_train = pd.read_csv(folder + 'y_train' + extension + '.csv').drop(columns=['id'])
    X_test = pd.read_csv(folder + 'X_test' + extension + '.csv').drop(columns=['id'])
    return X_train, y_train, X_test

---
## Exporting Data as csv

In [5]:
def export_to_csv(X_train_cleaned, y_train_cleaned, X_test_cleaned, folder="data/"):
    X_train_cleaned.to_csv(folder + 'X_train_cleaned.csv', index=False)
    y_train_cleaned.to_csv(folder + 'y_train_cleaned.csv', index=False)
    X_test_cleaned.to_csv(folder + 'X_test_cleaned.csv', index=False)

---
## Task 1: Outlier detection

In [6]:
"""Remove the ouliers from our dataset. Replace temporarily the Nan values by the mean to perform outlier selection

Parameters
----------
X_train : pd.df
    The features (what we will use to see the outliers)
y_train : pd.df
    The labels
contamination : int, optional
    The percent of outliers found by the isolation forest if it's used.
verbose : int, optional
    If not set to 0, print messages
    
Return
------
(pd.df, pd.df)
    The data with the outliers rows removed
"""
def remove_outliers(X_train, y_train, contamination='auto', verbose=1, method="LocalOutlierFactor"):
    # Save a mask of the imputed values to be able to redo the imputation once the outlier detection is done
    X_train_null_mask = X_train.isna()
    
    # Need to impute nan values for the outlier detection to work (cannot deal with nan)
    X_train_imputed = pd.DataFrame(SimpleImputer(strategy="median", verbose=verbose).fit_transform(X_train))
    
    clf = None
    if method=="LocalOutlierFactor":
        clf = LocalOutlierFactor(contamination=contamination)
    elif method=="IsolationForest":
        clf = IsolationForest(contamination=contamination, random_state=0, verbose=verbose)
    else:
        raise AttributeError(f"Unvalid argument for method, must be 'LocalOutlierFactor' or 'IsolationForest', not '{method}'")
        
    outliers_mask = pd.Series(clf.fit_predict(X_train_imputed)).map({1:1, -1:0}) #Mask with 0 for outliers and 1 for non outliers
    
    if verbose:
        print(f"Detected {(outliers_mask == 0).sum()} outliers with method {method}, out of {outliers_mask.shape[0]} samples ({100 * (outliers_mask == 0).sum() / outliers_mask.shape[0]:.2f}%).")
    
    #Replace the Nan values (The outlier detection shouldn't replace NaN values by itself)
    X_train = pd.DataFrame(X_train).mask(X_train_null_mask, other=np.NaN, inplace=False)
    
    # Remove outliers from the training set
    X_train = np.array(X_train)[outliers_mask == 1, :]
    y_train = np.array(y_train)[outliers_mask == 1, :]
    
    X_train = pd.DataFrame(X_train)
    y_train = pd.DataFrame(y_train)
    
    return (X_train, y_train)

---
## Task 2: Data scaling
Done as soon as possible because can have an effect (e.g. on distances for `KNNImputer`)

In [7]:
def scale(X_train_no_outliers, X_test):
    # Do the scaling, saving the scaler to use it for X_test too. No need imputation, just ignore Nan values
    scaler = StandardScaler().fit(X_train)
    X_train_scaled = pd.DataFrame(scaler.transform(X_train_no_outliers))
    # Cast X_test to np.array to avoid warning of model trained without feature names but X having some.
    X_test_scaled = pd.DataFrame(scaler.transform(np.array(X_test)))
    return (X_train_scaled, X_test_scaled)

---
## Task 3: impute values

In [83]:
def impute_values(X_train, X_test, method='knn'):
    print(f"For the train dataset, there are {np.array(X_train.isna()).sum().sum()} nan values, out of {X_train.shape[0]*X_train.shape[1]} ({100*np.array(X_train.isna()).sum().sum()/(X_train.shape[0]*X_train.shape[1]):.2f}%).")
    
    imputer = None
    if method == 'knn':
        imputer = KNNImputer(n_neighbors=6, weights='uniform').fit(X_train)
    elif method == 'iterative':
        # Runs VERY slowly and might cause the kernel to crash due to intenisve use of RAM
        imputer = IterativeImputer(random_state=0, max_iter=15, verbose=2).fit(X_train)
    else:
        raise AttributeError(f"Unvalid argument for method, must be 'knn' or 'iterative', not '{method}'")
    
    X_train_imputed = pd.DataFrame(imputer.transform(X_train))
    X_test_imputed = pd.DataFrame(imputer.transform(X_test))
    return (X_train_imputed, X_test_imputed)

---
## Task 4: Feature selection

In [9]:
def select_features(X_train_imputed, X_test_imputed):
    X_train_selected_features, X_test_selected_features = remove_constant_features(X_train_imputed, X_test_imputed)
    X_train_selected_features, X_test_selected_features = remove_too_coorelated_features(X_train_selected_features, X_test_selected_features)
    X_train_selected_features, X_test_selected_features = remove_random_features(X_train_selected_features, y_train, X_test_selected_features, percentile=80)

    return X_train_selected_features, X_test_selected_features

In [10]:
def remove_constant_features(X_train_imputed, X_test_imputed, verbose=1):
    X_train_selected_features = X_train_imputed.loc[:, np.logical_and(X_train_imputed != X_train_imputed.iloc[0], X_train_imputed.notna()).any()]
    X_test_selected_features = X_test_imputed.loc[:, np.logical_and(X_train_imputed != X_train_imputed.iloc[0], X_train_imputed.notna()).any()]
    
    if verbose:
        print(f"{X_train_imputed.shape[1]-X_train_selected_features.shape[1]} features removed because of constant values ({100*(X_train_imputed.shape[1]-X_train_selected_features.shape[1])/X_train_imputed.shape[1]:.2f}%)")
    
    return X_train_selected_features, X_test_selected_features

In [11]:
def remove_too_coorelated_features(X_train, X_test, threshold=0.7, verbose=1):
    X_train_corr_ = X_train.corr()

    X_train_too_correlated = (X_train_corr_.mask(
        np.tril(np.ones([len(X_train_corr_)]*2, dtype=bool))).abs() > threshold).any()
    
    
    X_train_selected_features = X_train.loc[:, (~X_train_too_correlated)]
    X_test_selected_features = X_test.loc[:, (~X_train_too_correlated)]
    
    if verbose:
        print(f"{X_train.shape[1]-X_train_selected_features.shape[1]} features removed because of correlation > {threshold} ({100*(X_train.shape[1]-X_train_selected_features.shape[1])/X_train.shape[1]:.2f}%)")

    return X_train_selected_features, X_test_selected_features

In [12]:
def remove_random_features(X_train, y_train, X_test, Xtrm=None, Xtem=None, percentile=80, verbose=1):
    selector = SelectPercentile(f_regression, percentile=percentile) # modify here
    selector.fit(X_train, np.array(y_train).ravel())
    X_train_selected_features = pd.DataFrame(selector.transform(X_train))
    X_test_selected_features = pd.DataFrame(selector.transform(X_test))
    if Xtrm is not None:
        Xtrm = pd.DataFrame(selector.transform(Xtrm))
    if Xtem is not None:
        Xtem = pd.DataFrame(selector.transform(Xtem))
    
    if verbose:
        print(f"{X_train.shape[1]-X_train_selected_features.shape[1]} features removed because of low correlation with target ({100*(X_train.shape[1]-X_train_selected_features.shape[1])/X_train.shape[1]:.2f}%).")
        
    return X_train_selected_features, X_test_selected_features, Xtrm, Xtem

---
## Task 5 Models

In [84]:
def model(X_train_cleaned, y_train_cleaned):
    gs_lasso = best_lasso(X_train_cleaned, y_train_cleaned)
    gs_svr = best_svr(X_train_cleaned, y_train_cleaned)
    gs_gbr = best_gbr(X_train_cleaned, y_train_cleaned)
    max_score = max(max(gs_gbr.best_score_, gs_lasso.best_score_), gs_svr.best_score_)

    return gs_lasso if gs_lasso.best_score_ == max_score else gs_svr if gs_svr.best_score_ == max_score else gbr

### Model 1: Lasso

In [22]:
def best_lasso(X_train_cleaned, y_train_cleaned):
    lasso = Lasso(max_iter=100000)
    gs_lasso_params = {
    'alpha': np.logspace(-1, 0, 20),
    }
    gs_lasso = GridSearchCV(lasso, gs_lasso_params, cv=5, verbose=3)
    gs_lasso.fit(X_train_cleaned, y_train_cleaned)
    print(f"The best validation score obtained is {gs_lasso.best_score_:.5f} with\n\talpha: {gs_lasso.best_params_['alpha']}")
    return gs_lasso;

### Model 2: SVR (SVM for regression)

In [23]:
def best_svr(X_train_cleaned, y_train_cleaned):
    svr = SVR()
    gs_svr_params = {
    'kernel': ['poly', 'rbf', 'sigmoid'],
    'C': np.logspace(-1, 2.2, 4),
    'epsilon': np.logspace(-2, 1, 3),
    }
    gs_svr = GridSearchCV(svr, gs_svr_params, cv=5, verbose=3)
    gs_svr.fit(X_train_cleaned, y_train_cleaned)
    print(f"""The best validation score obtained is {gs_svr.best_score_:.5f} with
    \tkernel: {gs_svr.best_params_['kernel']}
    \tC: {gs_svr.best_params_['C']}
    \tepsilon: {gs_svr.best_params_['epsilon']}""")
    return gs_svr

### Model 3: Gradient Boosting

In [45]:
def best_gbr(X_train, y_train):
    gbr = GradientBoostingRegressor()
    gs_gbr_params = {
     "loss":["ls"],
     "learning_rate": np.logspace(-2, -1, 4),
     "n_estimators": np.arange(50, 500, 50),
     "subsample": np.arange(0.4, 1, 0.2),
     "max_depth": np.arange(2, 8, 1),
     "min_samples_split": np.arange(2, 8, 1),
     "min_samples_leaf": np.arange(1, 9, 2),
     "random_state": [0], 
     "verbose": [1]
    }
    gs_gbr = GridSearchCV(gbr, gs_gbr_params, cv=4, verbose=2, error_score='raise')
    gs_gbr.fit(X_train, y_train)

    print(f"""The best validation score obtained is {gs_gbr.best_score_:.5f} with
    \tloss: {gs_gbr.best_params_['loss']}
    \tlearning_rate: {gs_gbr.best_params_['learning_rate']}
    \tn_estimators: {gs_gbr.best_params_['n_estimators']}
    \tsubsample: {gs_gbr.best_params_['subsample']}
    \tmax_depth: {gs_gbr.best_params_['max_depth']}
    \tmin_samples_split: {gs_gbr.best_params_['min_samples_split']}
    \tmin_samples_leaf: {gs_gbr.best_params_['min_samples_leaf']}
    """)
        
    return gs_gbr

In [27]:
def old_best_gbr(X_train, y_train):

    params = {
        "loss":"ls",
        "n_estimators": 250,
        "learning_rate": 0.025,
        "subsample": 0.75,
        "max_depth": 6,
        "min_samples_split": 5,
        "n_iter_no_change": 100,
        "validation_fraction": 0.1,
        "random_state": 0, 
        "verbose": 1,
    }

    gbr = GradientBoostingRegressor(**params)
    
    # Get the validation score
    gbr_cv_scores = cross_val_score(gbr, X_train, y_train, n_jobs=-1, verbose=3)
    print(f"""Validation score obtained is {np.mean(gbr_cv_scores):.4f} with
    \t{params}""")
    
    # Fit model (because previous function does not return fitted model)
    X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.2, random_state=0)
    gbr.fit(X_train, y_train)
    
    test_score = np.zeros(gbr.n_estimators_, dtype=np.float64)
    for i, y_pred in enumerate(gbr.staged_predict(X_test)):
        test_score[i] = gbr.loss_(y_test, y_pred)

    fig = plt.figure(figsize=(6, 6))
    plt.subplot(1, 1, 1)
    plt.title("Deviance")
    plt.plot(
        np.arange(gbr.n_estimators_),
        gbr.train_score_,
        "b-",
        label="Training Set Deviance",
    )
    plt.plot(
        np.arange(gbr.n_estimators_), test_score, "r-", label="Test Set Deviance"
    )
    plt.legend(loc="upper right")
    plt.xlabel("Boosting Iterations")
    plt.ylabel("Deviance")
    fig.tight_layout()
    plt.show()
    
    return gbr

---
## Create CSV for submission

In [18]:
def create_submission(prediction, sub_id, basepath = 'submissions/task1-sub'):
    result = prediction.copy()
    result = result.rename(columns={0: 'y'})
    result['id'] = range(0, len(result))
    result = result[['id', 'y']]
    result.to_csv(basepath+str(sub_id) + '.csv', index=False)

---
## Final Pipeline

In [87]:
print("Loading raw data...")
X_train, y_train, X_test = import_data()

print("Removing outliers...")
#X_train_no_outliers, y_train_cleaned = remove_outliers(X_train, y_train, method="IsolationForest")
#X_train_no_outliers, y_train_cleaned = remove_outliers(X_train_no_outliers, y_train_cleaned, method="LocalOutlierFactor") #Remove outliers in x
X_train_no_outliers, y_train_cleaned = remove_outliers(X_train, y_train, method="LocalOutlierFactor") #Remove outliers in x


print("Scaling data...")
X_train_scaled, X_test_scaled = scale(X_train_no_outliers, X_test)

print("Selecting features...")
X_train_no_constant_features, X_test_no_constant_features = remove_constant_features(X_train_scaled, X_test_scaled)
X_train_no_too_correlated_features, X_test_no_too_correlated_features = remove_too_coorelated_features(X_train_no_constant_features, X_test_no_constant_features, threshold=0.98)

#TODO: Clean pipeline
# If we want iterative imputation, and don't want to run it for one hour (should try at one point though), need to
# do feature selection before we do imputation. Except biggest filter for feature selection is f_regression which
# cannot deal with nan values. Therefore, intermediary dumb knn imputation was quickly/dirtily 
# implemented to see results.
# Conclusion: we achieve ~same or better validation scores with a LOT less features, seems like an 
# interresting way forward (even though test scores are a bit lower once submitted, but to not 
# overfit on the public ones and fail on the secret ones, should only watch validation score)

#We first impute with knn which is cheap and then later on reaply mask and use iterative imputing
X_train_mask = X_train_no_too_correlated_features.isna()
X_test_mask = X_test_no_too_correlated_features.isna()
X_train_knn_imputed, X_test_knn_imputed = impute_values(X_train_no_too_correlated_features, X_test_no_too_correlated_features, method='knn')
X_train_selected_features, X_test_selected_features, X_train_mask, X_test_mask = remove_random_features(X_train_knn_imputed, y_train_cleaned, X_test_knn_imputed, X_train_mask, X_test_mask, percentile=25)
X_train_no_imputation = X_train_selected_features.mask(np.array(X_train_mask))
X_test_no_imputation = X_test_selected_features.mask(np.array(X_test_mask))

print("Imputing nan values...")
X_train_cleaned, X_test_cleaned = impute_values(X_train_no_imputation, X_test_no_imputation, method='iterative')

print("Exporting clean data to csv...")
export_to_csv(X_train_cleaned, y_train_cleaned, X_test_cleaned)

print("All done!")

Loading raw data...
Removing outliers...
Detected 50 outliers with method LocalOutlierFactor, out of 1212 samples (4.13%).
Scaling data...
Selecting features...
3 features removed because of constant values (0.36%)
35 features removed because of correlation > 0.98 (4.22%)
For the train dataset, there are 70349 nan values, out of 922628 (7.62%).


  corr /= X_norms


595 features removed because of low correlation with target (74.94%).
Imputing nan values...
For the train dataset, there are 17599 nan values, out of 231238 (7.61%).
[IterativeImputer] Completing matrix with shape (1162, 199)
[IterativeImputer] Ending imputation round 1/15, elapsed time 2.58
[IterativeImputer] Change: 46.61194186774277, scaled tolerance: 0.014114220441223293 
[IterativeImputer] Ending imputation round 2/15, elapsed time 5.78
[IterativeImputer] Change: 4.496222306752934, scaled tolerance: 0.014114220441223293 
[IterativeImputer] Ending imputation round 3/15, elapsed time 8.96
[IterativeImputer] Change: 4.935402922708507, scaled tolerance: 0.014114220441223293 
[IterativeImputer] Ending imputation round 4/15, elapsed time 12.00
[IterativeImputer] Change: 1.5411124373844296, scaled tolerance: 0.014114220441223293 
[IterativeImputer] Ending imputation round 5/15, elapsed time 15.05
[IterativeImputer] Change: 0.5037903245494088, scaled tolerance: 0.014114220441223293 
[Ite



[IterativeImputer] Ending imputation round 4/15, elapsed time 0.26
[IterativeImputer] Ending imputation round 5/15, elapsed time 0.33
[IterativeImputer] Ending imputation round 6/15, elapsed time 0.41
[IterativeImputer] Ending imputation round 7/15, elapsed time 0.52
[IterativeImputer] Ending imputation round 8/15, elapsed time 0.59
[IterativeImputer] Ending imputation round 9/15, elapsed time 0.64
[IterativeImputer] Ending imputation round 10/15, elapsed time 0.70
[IterativeImputer] Ending imputation round 11/15, elapsed time 0.77
[IterativeImputer] Ending imputation round 12/15, elapsed time 0.83
[IterativeImputer] Ending imputation round 13/15, elapsed time 0.89
[IterativeImputer] Ending imputation round 14/15, elapsed time 0.95
[IterativeImputer] Ending imputation round 15/15, elapsed time 1.02
[IterativeImputer] Completing matrix with shape (776, 199)
[IterativeImputer] Ending imputation round 1/15, elapsed time 0.05
[IterativeImputer] Ending imputation round 2/15, elapsed time 0.

In [47]:
#gs = model(X_train_cleaned, np.array(y_train_cleaned).ravel())

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV 1/5] END .........................alpha=0.1;, score=0.474 total time=   0.0s
[CV 2/5] END .........................alpha=0.1;, score=0.435 total time=   0.0s
[CV 3/5] END .........................alpha=0.1;, score=0.441 total time=   0.0s
[CV 4/5] END .........................alpha=0.1;, score=0.501 total time=   0.0s
[CV 5/5] END .........................alpha=0.1;, score=0.402 total time=   0.0s
[CV 1/5] END .........alpha=0.11288378916846889;, score=0.474 total time=   0.0s
[CV 2/5] END .........alpha=0.11288378916846889;, score=0.434 total time=   0.0s
[CV 3/5] END .........alpha=0.11288378916846889;, score=0.440 total time=   0.0s
[CV 4/5] END .........alpha=0.11288378916846889;, score=0.497 total time=   0.0s
[CV 5/5] END .........alpha=0.11288378916846889;, score=0.400 total time=   0.0s
[CV 1/5] END .........alpha=0.12742749857031338;, score=0.474 total time=   0.0s
[CV 2/5] END .........alpha=0.12742749857031338

[CV 2/5] END ..C=0.1, epsilon=0.01, kernel=poly;, score=0.033 total time=   0.1s
[CV 3/5] END ..C=0.1, epsilon=0.01, kernel=poly;, score=0.039 total time=   0.1s
[CV 4/5] END ..C=0.1, epsilon=0.01, kernel=poly;, score=0.049 total time=   0.1s
[CV 5/5] END ..C=0.1, epsilon=0.01, kernel=poly;, score=0.019 total time=   0.1s
[CV 1/5] END ...C=0.1, epsilon=0.01, kernel=rbf;, score=0.129 total time=   0.1s
[CV 2/5] END ...C=0.1, epsilon=0.01, kernel=rbf;, score=0.109 total time=   0.1s
[CV 3/5] END ...C=0.1, epsilon=0.01, kernel=rbf;, score=0.102 total time=   0.1s
[CV 4/5] END ...C=0.1, epsilon=0.01, kernel=rbf;, score=0.115 total time=   0.1s
[CV 5/5] END ...C=0.1, epsilon=0.01, kernel=rbf;, score=0.110 total time=   0.1s
[CV 1/5] END C=0.1, epsilon=0.01, kernel=sigmoid;, score=0.313 total time=   0.1s
[CV 2/5] END C=0.1, epsilon=0.01, kernel=sigmoid;, score=0.265 total time=   0.1s
[CV 3/5] END C=0.1, epsilon=0.01, kernel=sigmoid;, score=0.239 total time=   0.1s
[CV 4/5] END C=0.1, epsil

[CV 2/5] END C=13.593563908785255, epsilon=0.01, kernel=poly;, score=0.391 total time=   0.1s
[CV 3/5] END C=13.593563908785255, epsilon=0.01, kernel=poly;, score=0.391 total time=   0.1s
[CV 4/5] END C=13.593563908785255, epsilon=0.01, kernel=poly;, score=0.384 total time=   0.1s
[CV 5/5] END C=13.593563908785255, epsilon=0.01, kernel=poly;, score=0.360 total time=   0.1s
[CV 1/5] END C=13.593563908785255, epsilon=0.01, kernel=rbf;, score=0.618 total time=   0.1s
[CV 2/5] END C=13.593563908785255, epsilon=0.01, kernel=rbf;, score=0.542 total time=   0.1s
[CV 3/5] END C=13.593563908785255, epsilon=0.01, kernel=rbf;, score=0.510 total time=   0.1s
[CV 4/5] END C=13.593563908785255, epsilon=0.01, kernel=rbf;, score=0.556 total time=   0.1s
[CV 5/5] END C=13.593563908785255, epsilon=0.01, kernel=rbf;, score=0.575 total time=   0.1s
[CV 1/5] END C=13.593563908785255, epsilon=0.01, kernel=sigmoid;, score=-38.302 total time=   0.1s
[CV 2/5] END C=13.593563908785255, epsilon=0.01, kernel=sigm

[CV 1/5] END C=158.48931924611142, epsilon=10.0, kernel=sigmoid;, score=-6689.595 total time=   0.1s
[CV 2/5] END C=158.48931924611142, epsilon=10.0, kernel=sigmoid;, score=-1260.672 total time=   0.1s
[CV 3/5] END C=158.48931924611142, epsilon=10.0, kernel=sigmoid;, score=-3596.458 total time=   0.1s
[CV 4/5] END C=158.48931924611142, epsilon=10.0, kernel=sigmoid;, score=-2335.250 total time=   0.1s
[CV 5/5] END C=158.48931924611142, epsilon=10.0, kernel=sigmoid;, score=-4441.638 total time=   0.1s
The best validation score obtained is 0.56008 with
    	kernel: rbf
    	C: 13.593563908785255
    	epsilon: 0.01
Fitting 4 folds for each of 1 candidates, totalling 4 fits
      Iter       Train Loss      OOB Improve   Remaining Time 
         1          86.3646           6.2093            1.72s
         2          74.4087           5.5380            1.72s
         3          70.2060           5.1252            1.70s
         4          66.6197           3.8355            1.68s
         5 

In [88]:
gs = old_best_gbr(X_train_cleaned, np.array(y_train_cleaned).ravel())

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:   15.8s remaining:   23.6s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   20.1s finished


Validation score obtained is 0.6257 with
    	{'loss': 'ls', 'n_estimators': 250, 'learning_rate': 0.025, 'subsample': 0.75, 'max_depth': 6, 'min_samples_split': 5, 'n_iter_no_change': 100, 'validation_fraction': 0.1, 'random_state': 0, 'verbose': 1}
      Iter       Train Loss      OOB Improve   Remaining Time 
         1          91.3291           2.8179           16.80s
         2          90.5564           2.5407           13.68s
         3          87.3021           2.4079           12.35s
         4          84.6068           2.0784           11.69s
         5          80.9514           1.9908           11.39s
         6          77.3235           1.9075           11.13s
         7          77.5718           1.8483           10.95s
         8          73.2426           2.1333           10.87s
         9          72.2031           1.4538           10.74s
        10          69.2085           1.6761           10.67s
        20          49.0434           1.0633            9.95s
    

  plt.show()


In [89]:
prediction = pd.DataFrame(gs.predict(X_test_cleaned))
SUB_ID = 15 #to modify
create_submission(prediction, SUB_ID)