# SCRIPT 1 - STACKING

Toutes les explications de ce script, ainsi que celles des deux autres, sont disponibles dans mon rapport.

# Imports

In [3]:
import numpy as np
import pandas as pd
import seaborn as sns

import matplotlib
import matplotlib.pyplot as plt

from scipy import stats
from scipy.stats import norm, skew

from sklearn.preprocessing import LabelEncoder, RobustScaler, StandardScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split, KFold, GridSearchCV, cross_val_predict, RandomizedSearchCV
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, Lasso, LassoCV, ElasticNet, ElasticNetCV
from sklearn.pipeline import make_pipeline
from sklearn.compose import TransformedTargetRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingClassifier, VotingClassifier, StackingClassifier
from sklearn.metrics import r2_score
from sklearn.base import clone
from sklearn.tree import DecisionTreeClassifier,plot_tree
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
# explicitly require this experimental feature
from sklearn.experimental import enable_iterative_imputer  # noqa
# now you can import normally from sklearn.impute
from sklearn.impute import IterativeImputer, SimpleImputer

#évaluer la performance des modèles
from sklearn.model_selection import cross_validate

#hyperparamétrisation
from sklearn.model_selection import GridSearchCV,cross_val_score

#visualisation d'un arbre
from sklearn.tree import export_graphviz,export_text
from subprocess import call
from sklearn.impute import KNNImputer
from sklearn.feature_selection import SequentialFeatureSelector, SelectKBest, f_classif, SelectFromModel, chi2, RFECV
from sklearn.linear_model import SGDClassifier
from imblearn.over_sampling import SMOTENC
from sklearn.metrics import roc_auc_score

from xgboost.sklearn import XGBClassifier
from sklearn import metrics   
import matplotlib.pylab as plt
import xgboost as xgb
from sklearn.neighbors import KNeighborsClassifier
import lightgbm as lgb
from mlxtend.classifier import StackingCVClassifier

# Script à lancer (Assurance Vie)

In [None]:
# End-to-end pipeline for the 'Assvie' target: preprocessing, feature
# selection, base learners, stacking, then probabilities for test.csv.
# Every transformer (scaler, label encoders, imputer, feature selector) is
# fitted on training rows only and then re-used, so the submission set is
# encoded and scaled exactly like the data the models were trained on.

# Load the training set and drop the 'Unnamed: 0' column
data = pd.read_csv("train.csv")
data = data.drop(['Unnamed: 0'], axis=1)


# Variable types
nominal = ['Nat', 'Statmat', 'Sexe', 'Couple', 'Occ', 'Statpro', 'Herit', 'Pere', 'Mere', 'Gpp', 'Gpm', 'Evtgrav', 'Livep',
           'Eplog', 'Pep', 'Vmob', 'Assdec', 'Livdf', 'Pel', 'Cel', 'Capi', 'Epsal', 'Pea', 'Zres', 'Logt', 'Terre',
           'Dette', 'Detlog', 'Detvo', 'Dip', 'Work', 'Urbani']

ordinal = []

discrete = ['Nbenf', 'Age']

# Standardisation: keep the fitted scaler so test.csv gets the same mean/scale
scaler = StandardScaler().fit(data[discrete])
data[discrete] = scaler.transform(data[discrete])

# Label encoding: one encoder per nominal column, fitted on the non-null
# training values and kept so the same label -> code mapping is applied later
encoders = {col: LabelEncoder().fit(data[col].dropna()) for col in nominal}


def _encode_nominal(frame):
    """Return frame[nominal] with labels replaced by the codes learned on train.

    Missing values, and labels that were never seen when the encoders were
    fitted, come back as NaN so that the imputer fills them in.
    """
    return frame[nominal].apply(
        lambda series: series.map(
            {label: code for code, label in enumerate(encoders[series.name].classes_)}
        )
    )


data[nominal] = _encode_nominal(data)

# Missing-value imputation on the encoded nominal columns
imp_cat = IterativeImputer(estimator=GradientBoostingClassifier(random_state=47),
                           initial_strategy='most_frequent',
                           max_iter=10, random_state=47, verbose=2)
data[nominal] = imp_cat.fit_transform(data[nominal])

# Split
y = data['Assvie'] == "O"
X = data.drop(['Assvie', 'Retraite'], axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# Feature selection: fitted on the training split only, otherwise the held-out
# rows influence which features are kept and the AUC below is optimistic
selector = RFECV(GradientBoostingClassifier(random_state=47, max_features='sqrt'), step=4, scoring="roc_auc")
selector.fit(X_train, y_train)
X_train_selected = selector.transform(X_train)
X_test_selected = selector.transform(X_test)

# Logistic Regression
lr = LogisticRegression(C=0.1, max_iter=1000, penalty='l2', solver='lbfgs', random_state=42)
lr.fit(X_train_selected, y_train)

# KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=50, weights='distance')
knn.fit(X_train_selected, y_train)

# Random Forest Classifier
rfc = RandomForestClassifier(criterion='gini', max_depth=12, max_features=2, n_estimators=500, random_state=46)
rfc.fit(X_train_selected, y_train)

# Gradient Boosting Classifier
boost = GradientBoostingClassifier(max_features=0.3, n_iter_no_change=26, random_state=12, subsample=0.5, n_estimators=177, max_depth=2, learning_rate=0.1)
boost.fit(X_train_selected, y_train)

# XGBoostClassifier
# NOTE(review): this rebinds `xgb`, which the import cell also uses as the
# alias of the xgboost module; the name is kept because other cells use it.
xgb = XGBClassifier(max_depth=4, learning_rate=0.1, colsample_bytree=0.6, random_state=29, subsample=0.7, n_estimators=90, min_child_weight=8, eval_metric='auc', gamma=0.1)
xgb.fit(X_train_selected, y_train)

# LGBMClassifier
lgbc = lgb.LGBMClassifier(max_depth=4, learning_rate=0.1, min_split_gain=0.3, num_leaves=10, reg_alpha=1.2, reg_lambda=1.2,
                          random_state=9, subsample=0.8, n_estimators=177, colsample_bytree=0.8, subsample_freq=10)
lgbc.fit(X_train_selected, y_train)

# Stacking
best_models_auc = [
    ('Boosting', boost),
    ('Random Forest', rfc),
    ('Logistic Regression', lr),
    ('KNeighbours', knn),
    ('XGBoost', xgb),
    ('LGBoost', lgbc)
]

# ensemble = base models + meta-learner
stacking = StackingClassifier(best_models_auc, cv=10, final_estimator=LogisticRegression())

# train
stacking.fit(X_train_selected, y_train)

# Held-out ROC AUC computed from the probabilities in column 1
score = roc_auc_score(y_test, stacking.predict_proba(X_test_selected)[:, 1])

# show
print('K-fold stacking: {:.4f}'.format(score))

#################################################################################################################
# Submission set: apply the transformers fitted above, never re-fit them here
test = pd.read_csv("test.csv")
test = test.drop(['Unnamed: 0'], axis=1)

test[discrete] = scaler.transform(test[discrete])
test[nominal] = imp_cat.transform(_encode_nominal(test))

test2 = selector.transform(test)

# Probabilities from column 1 of predict_proba, one per test row
proba_assvie = stacking.predict_proba(test2)
Assvie = list(proba_assvie[:, 1])

# Script à lancer (Retraite)

In [9]:
# Pipeline for the 'Retraite' target. It re-uses `data` and `test` as they
# were prepared by the previous cell and only rebuilds the split, the feature
# selector and the models.

# Split
y = data['Retraite'] == "O"
X = data.drop(['Assvie', 'Retraite'], axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# Feature selection: fitted on the training split only, otherwise the held-out
# rows influence which features are kept and the AUC below is optimistic
selector = RFECV(GradientBoostingClassifier(random_state=47, max_features='sqrt'), step=3, scoring="roc_auc")
selector.fit(X_train, y_train)
X_train_selected = selector.transform(X_train)
X_test_selected = selector.transform(X_test)

# Logistic Regression
lr = LogisticRegression(C=1.0, max_iter=70, penalty='l2', solver='liblinear', random_state=42)
lr.fit(X_train_selected, y_train)

# KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=71, weights='distance')
knn.fit(X_train_selected, y_train)

# Random Forest Classifier
rfc = RandomForestClassifier(criterion='entropy', max_depth=8, max_features=4, n_estimators=200, random_state=42)
rfc.fit(X_train_selected, y_train)

# Gradient Boosting Classifier
boost = GradientBoostingClassifier(max_features=0.3, n_iter_no_change=24, learning_rate=0.1, random_state=163, subsample=0.5, n_estimators=500, max_depth=2)
boost.fit(X_train_selected, y_train)

# XGBoostClassifier
xgb = XGBClassifier(max_depth=2, learning_rate=0.05, colsample_bytree=0.6, random_state=124, subsample=0.9, n_estimators=500, min_child_weight=4, eval_metric='auc', gamma=0.1, reg_alpha=1)
xgb.fit(X_train_selected, y_train)

# LGBMClassifier
lgbc = lgb.LGBMClassifier(colsample_bytree=0.7, random_state=47, learning_rate=0.1, max_depth=4, min_split_gain=0.3, n_estimators=500, num_leaves=20, reg_alpha=1.2, reg_lambda=1.2, subsample=0.8, subsample_freq=20)
lgbc.fit(X_train_selected, y_train)

# Stacking
best_models_auc = [
    ('Boosting', boost),
    ('Random Forest', rfc),
    ('Logistic Regression', lr),
    ('KNeighbours', knn),
    ('XGBoost', xgb),
    ('LGBoost', lgbc)
]

# ensemble = base models + meta-learner
stacking = StackingClassifier(best_models_auc, cv=10, final_estimator=LogisticRegression())

# train
stacking.fit(X_train_selected, y_train)

# Held-out ROC AUC computed from the probabilities in column 1
score = roc_auc_score(y_test, stacking.predict_proba(X_test_selected)[:, 1])

# show
print('K-fold stacking: {:.4f}'.format(score))

# Keep only the selected features of the already-preprocessed submission set
test3 = selector.transform(test)

# Probabilities from column 1 of predict_proba, one per test row
proba_retraite = stacking.predict_proba(test3)
retraite = list(proba_retraite[:, 1])

# Script à lancer (création du CSV)

In [None]:
# Build the submission file: one row per test observation with a 0-based Id
# and the two predicted probabilities. `ids` replaces the former `id`
# variable, which shadowed the builtin of the same name.
ids = list(range(len(test)))
df = pd.DataFrame({"Id": ids, "Assvie": Assvie, "Retraite": retraite})
df.to_csv("script1.csv", index=False)

# Détails avec les grid search (à ne pas lancer)

# Preprocessing

In [5]:
# Variable types
nominal = [
    'Nat', 'Statmat', 'Sexe', 'Couple', 'Occ', 'Statpro', 'Herit', 'Pere',
    'Mere', 'Gpp', 'Gpm', 'Evtgrav', 'Livep', 'Eplog', 'Pep', 'Vmob',
    'Assdec', 'Livdf', 'Pel', 'Cel', 'Capi', 'Epsal', 'Pea', 'Zres',
    'Logt', 'Terre', 'Dette', 'Detlog', 'Detvo', 'Dip', 'Work', 'Urbani',
]

ordinal = []

discrete = ['Nbenf', 'Age']

# Standardise the numeric columns in place
data[discrete] = StandardScaler().fit_transform(data[discrete])


def _label_encode_observed(series):
    """Label-encode the non-null entries of a column, keeping their index."""
    observed = series[series.notnull()]
    codes = LabelEncoder().fit_transform(observed)
    return pd.Series(codes, index=observed.index)


# Encode each nominal column; rows that were null are absent from the helper's
# result, so they remain missing for the imputer below
data[nominal] = data[nominal].apply(_label_encode_observed)

# Fill the remaining gaps with an iterative, gradient-boosting based imputer
imp_cat = IterativeImputer(
    estimator=GradientBoostingClassifier(random_state=47),
    initial_strategy='most_frequent',
    max_iter=10,
    random_state=47,
    verbose=2,
)
data[nominal] = imp_cat.fit_transform(data[nominal])

  mode = stats.mode(array)


[IterativeImputer] Completing matrix with shape (10906, 32)
[IterativeImputer] Ending imputation round 1/10, elapsed time 55.24
[IterativeImputer] Change: 10.0, scaled tolerance: 0.007 
[IterativeImputer] Ending imputation round 2/10, elapsed time 110.97
[IterativeImputer] Change: 9.0, scaled tolerance: 0.007 
[IterativeImputer] Ending imputation round 3/10, elapsed time 166.56
[IterativeImputer] Change: 9.0, scaled tolerance: 0.007 
[IterativeImputer] Ending imputation round 4/10, elapsed time 222.29
[IterativeImputer] Change: 7.0, scaled tolerance: 0.007 
[IterativeImputer] Ending imputation round 5/10, elapsed time 278.26
[IterativeImputer] Change: 7.0, scaled tolerance: 0.007 
[IterativeImputer] Ending imputation round 6/10, elapsed time 337.03
[IterativeImputer] Change: 7.0, scaled tolerance: 0.007 
[IterativeImputer] Ending imputation round 7/10, elapsed time 396.54
[IterativeImputer] Change: 7.0, scaled tolerance: 0.007 
[IterativeImputer] Ending imputation round 8/10, elapsed t



# Assurance Vie

## Split

In [7]:
# Boolean target: True where the 'Assvie' column equals "O"
y = data['Assvie'].eq("O")
# Both target columns are left out of the feature matrix
X = data.drop(columns=['Assvie', 'Retraite'])
# Hold out 33% of the rows for evaluation
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.33
)

## Feature Selection

### Grid Search

In [None]:
# Search space for the recursive feature elimination: which estimator ranks
# the features and how many features are dropped per step
param_selector = dict(
    estimator=[
        GradientBoostingClassifier(random_state=47, max_features='sqrt'),
        RandomForestClassifier(random_state=47, max_features='sqrt'),
    ],
    step=[3, 4, 5],
    scoring=['roc_auc'],
)

# Selector whose parameters are searched
rfecv = RFECV(estimator=GradientBoostingClassifier(random_state=47, max_features='sqrt'))

# Shuffled 10-fold cross-validation
cv = KFold(n_splits=10, shuffle=True, random_state=47)

# Exhaustive search scored by ROC AUC
gridsearch = GridSearchCV(
    estimator=rfecv,
    param_grid=param_selector,
    scoring="roc_auc",
    cv=cv,
    verbose=True,
    n_jobs=-1,
)
gridsearch.fit(X, y)
print('Best hyperparameters:\n' + str(gridsearch.best_params_))

### Selector final

In [None]:
# Final feature selector for the 'Assvie' target.
# It is fitted on the training split only: fitting on the full X would let the
# held-out rows influence which features are kept, which inflates every score
# later computed on X_test_selected.
selector = RFECV(GradientBoostingClassifier(random_state=47, max_features='sqrt'), step=4, scoring="roc_auc")
selector.fit(X_train, y_train)
X_train_selected = selector.transform(X_train)
X_test_selected = selector.transform(X_test)

## Training

### LogisticRegression

#### GridSearch

In [None]:
# Grid search for the logistic regression.
# The grid is split so that only valid penalty/solver pairs are tried:
#   - 'l2' works with newton-cg, lbfgs and liblinear;
#   - 'l1' is only supported by liblinear among the solvers listed here.
# 'elasticnet' was dropped: it needs solver='saga' plus an l1_ratio, neither
# of which is in the grid, so every such candidate failed to fit.
_shared_grid = {'max_iter': [100, 1000],
                'C': np.logspace(-3, 3, 7)}
parameters = [
    dict(_shared_grid, penalty=['l2'], solver=['newton-cg', 'lbfgs', 'liblinear']),
    dict(_shared_grid, penalty=['l1'], solver=['liblinear']),
]

lr = LogisticRegression(random_state=42)

cv = KFold(n_splits=10)

gslr = GridSearchCV(lr, parameters, cv=cv, scoring="roc_auc", verbose=3, n_jobs=-1)
gslr.fit(X_train_selected, y_train)
print('---')
print('Best hyperparameters:\n' + str(gslr.best_params_))

#### Validation

In [None]:
# Take the best configuration found by the grid search
lr = gslr.best_estimator_

# Fit it on the training split
lr.fit(X_train_selected, y_train)

# ROC AUC on the held-out split, from the probabilities in column 1
test_proba = lr.predict_proba(X_test_selected)[:, 1]
score = roc_auc_score(y_test, test_proba)

print(score)

### KNeighborsClassifier

#### GridSearch

In [None]:
# Search space: neighbourhood sizes 1..199 and both weighting schemes
parameters = dict(
    n_neighbors=list(range(1, 200)),
    weights=['uniform', 'distance'],
)

knn = KNeighborsClassifier()

# Unshuffled 10-fold cross-validation
cv = KFold(n_splits=10)

# Exhaustive search scored by ROC AUC
gsknn = GridSearchCV(
    estimator=knn,
    param_grid=parameters,
    scoring="roc_auc",
    cv=cv,
    n_jobs=-1,
    verbose=3,
)
gsknn.fit(X_train_selected, y_train)
print('---')
print('Best hyperparameters:\n' + str(gsknn.best_params_))

#### Validation

In [None]:
# Take the best configuration found by the grid search
knn = gsknn.best_estimator_

# Fit it on the training split
knn.fit(X_train_selected, y_train)

# ROC AUC on the held-out split, from the probabilities in column 1
test_proba = knn.predict_proba(X_test_selected)[:, 1]
score = roc_auc_score(y_test, test_proba)

print(score)

### RandomForestClassifier

#### Grid Search

In [None]:
# Candidate values for the random-forest search
paramGrid = dict(
    n_estimators=[70, 90, 100, 200, 500],
    max_features=[0.8, 1, 2, 4, 6, 8, 10, 20],
    max_depth=[3, 4, 8, 10, 15],
    criterion=['gini', 'entropy'],
)

# Estimator whose parameters are searched
rfc = RandomForestClassifier(random_state=42)

# Shuffled 10-fold cross-validation
cv = KFold(n_splits=10, shuffle=True, random_state=42)

# Exhaustive search scored by ROC AUC
gsrf = GridSearchCV(
    estimator=rfc,
    param_grid=paramGrid,
    scoring="roc_auc",
    cv=cv,
    verbose=True,
    n_jobs=-1,
)
gsrf.fit(X_train_selected, y_train)

print('---')
print('Best hyperparameters:\n' + str(gsrf.best_params_))

#### Search of the best random_state

In [None]:
# Sweep random_state and max_depth for the random forest and report the
# combination with the highest ROC AUC.
# The running best starts at -inf (not at a hard-coded 0.74 with placeholder
# seed/depth), so the printed result always belongs to a model that was
# actually fitted.
# NOTE(review): candidates are ranked on X_test_selected / y_test, so the
# winning score is an optimistic estimate of performance on new data.
best_score = float('-inf')
debut = None
depth = None
for random_state in range(200):
    for max_depth in [6, 8, 10, 12, 14]:
        rfc = RandomForestClassifier(criterion='gini', max_depth=max_depth, max_features=2, n_estimators=500, random_state=random_state)
        rfc.fit(X_train_selected, y_train)

        score = roc_auc_score(y_test, rfc.predict_proba(X_test_selected)[:, 1])

        # Strict comparison: the first combination reaching a score is kept on ties
        if score > best_score:
            best_score, debut, depth = score, random_state, max_depth

print('Best Score: {:.4f} (with random_state {:d} and max_depth {:d} )'.format(best_score, debut, depth))

#### Validation

In [None]:
# Random forest with the retained hyper-parameters
rfc = RandomForestClassifier(
    n_estimators=500,
    criterion='gini',
    max_depth=12,
    max_features=2,
    random_state=46,
)

# Fit on the training split
rfc.fit(X_train_selected, y_train)

# ROC AUC on the held-out split, from the probabilities in column 1
test_proba = rfc.predict_proba(X_test_selected)[:, 1]
score = roc_auc_score(y_test, test_proba)

print(score)

### GradientBoostingClassifier

#### Grid Search

In [None]:
# Candidate values for the gradient-boosting search
paramGrid = dict(
    n_estimators=[100, 500, 1000],
    n_iter_no_change=[10, 15],
    learning_rate=[0.01, 0.1, 0.2],
    max_depth=[1, 2, 3],
    subsample=[0.4, 0.5, 0.6],
    max_features=[0.2, 0.3, 0.4],
)

# Estimator whose parameters are searched
boost = GradientBoostingClassifier(random_state=47)

# Shuffled 10-fold cross-validation
cv = KFold(n_splits=10, shuffle=True, random_state=47)

# Exhaustive search scored by ROC AUC
gsgbc = GridSearchCV(
    estimator=boost,
    param_grid=paramGrid,
    scoring='roc_auc',
    cv=cv,
    verbose=True,
    n_jobs=-1,
)
gsgbc.fit(X_train_selected, y_train)

print('---')
print('Best hyperparameters:\n' + str(gsgbc.best_params_))

#### Search of the best random_state and iter_to_change

In [None]:
# Sweep random_state and the early-stopping patience (n_iter_no_change) for
# the gradient-boosting model and report the best ROC AUC.
# - The patience range starts at 1: 0 is not a valid n_iter_no_change and is
#   rejected by scikit-learn's parameter validation in recent releases.
# - The running best starts at -inf (not at a hard-coded 0.74 with placeholder
#   values), so the printed result always belongs to a fitted model.
# NOTE(review): candidates are ranked on X_test_selected / y_test, so the
# winning score is an optimistic estimate of performance on new data.
best_score = float('-inf')
debut = None
n_iter = None
for random_state in range(200):
    for iter_to_change in range(1, 50):
        boost = GradientBoostingClassifier(max_depth=2, learning_rate=0.1, max_features=0.3, n_iter_no_change=iter_to_change,
                                           random_state=random_state, subsample=0.5, n_estimators=177)
        boost.fit(X_train_selected, y_train)

        score = roc_auc_score(y_test, boost.predict_proba(X_test_selected)[:, 1])

        # Strict comparison: the first combination reaching a score is kept on ties
        if score > best_score:
            best_score, debut, n_iter = score, random_state, iter_to_change

print('Best Score: {:.4f} (with random_state {:d} and iter_to_change {:d} )'.format(best_score, debut, n_iter))

#### Validation

In [None]:
# Gradient boosting with the retained hyper-parameters
boost = GradientBoostingClassifier(
    n_estimators=177,
    learning_rate=0.1,
    max_depth=2,
    max_features=0.3,
    subsample=0.5,
    n_iter_no_change=26,
    random_state=12,
)

# Fit on the training split
boost.fit(X_train_selected, y_train)

# ROC AUC on the held-out split, from the probabilities in column 1
test_proba = boost.predict_proba(X_test_selected)[:, 1]
score = roc_auc_score(y_test, test_proba)

# n_estimators_ is the number of trees actually fitted
print('Gradient boosting Classifier: {:.4f} (with {:d} trees)'.format(score, boost.n_estimators_))

### XGBClassifier

#### Grid Search

In [None]:
# Candidate values for the XGBoost search
params = dict(
    n_estimators=[90, 100, 500],           # number of trees
    min_child_weight=[5, 6, 7, 8],
    max_depth=[1, 2, 3, 4],                # maximum depth of each tree
    learning_rate=[0.01, 0.05, 0.1],       # learning rate
    colsample_bytree=[0.6, 0.7, 0.8],      # column subsampling
    subsample=[0.7, 0.8, 0.9],             # row subsampling
    eval_metric=['auc'],
    reg_alpha=[1],
    gamma=[0.2, 0.1],
)

# XGBoost estimator whose parameters are searched
xgb = XGBClassifier(seed=47)

# Shuffled 10-fold cross-validation
cv = KFold(n_splits=10, shuffle=True, random_state=47)

# Exhaustive search scored by ROC AUC
gsxgb = GridSearchCV(
    estimator=xgb,
    param_grid=params,
    scoring='roc_auc',
    cv=cv,
    verbose=True,
    n_jobs=-1,
)
gsxgb.fit(X_train_selected, y_train)

print('---')
print('Best hyperparameters:\n' + str(gsxgb.best_params_))

#### Search of the best random_state 

In [None]:
# Sweep random_state and max_depth for the XGBoost model and report the
# combination with the highest ROC AUC.
# The running best starts at -inf (not at a hard-coded 0.74 with placeholder
# seed/depth), so the printed result always belongs to a fitted model.
# NOTE(review): candidates are ranked on X_test_selected / y_test, so the
# winning score is an optimistic estimate of performance on new data.
best_score = float('-inf')
debut = None
depth = None
for random_state in range(200):
    for max_depth in [2, 4, 6, 8, 10, 15]:
        xgb = XGBClassifier(max_depth=max_depth, learning_rate=0.1, colsample_bytree=0.6, random_state=random_state, subsample=0.7, n_estimators=90, min_child_weight=8, eval_metric='auc', gamma=0.1)
        xgb.fit(X_train_selected, y_train)

        score = roc_auc_score(y_test, xgb.predict_proba(X_test_selected)[:, 1])

        # Strict comparison: the first combination reaching a score is kept on ties
        if score > best_score:
            best_score, debut, depth = score, random_state, max_depth

print('Best Score: {:.4f} (with random_state {:d} and max_depth {:d} )'.format(best_score, debut, depth))

#### Validation

In [None]:
# XGBoost with the retained hyper-parameters
xgb = XGBClassifier(
    n_estimators=90,
    learning_rate=0.1,
    max_depth=4,
    min_child_weight=8,
    gamma=0.1,
    subsample=0.7,
    colsample_bytree=0.6,
    eval_metric='auc',
    random_state=29,
)

# Fit on the training split
xgb.fit(X_train_selected, y_train)

# ROC AUC on the held-out split, from the probabilities in column 1
test_proba = xgb.predict_proba(X_test_selected)[:, 1]
score = roc_auc_score(y_test, test_proba)

print('XGB Classifier: {:.4f} '.format(score))

### LGBMClassifier

#### Grid Search

In [None]:
# Candidate values for the LightGBM search
parameters = dict(
    n_estimators=[100, 500, 1000],
    learning_rate=[0.1],
    colsample_bytree=[0.7, 0.8],
    max_depth=[4],
    num_leaves=[10, 20],
    reg_alpha=[1.1, 1.2],
    reg_lambda=[1.1, 1.2],
    min_split_gain=[0.3, 0.4],
    subsample=[0.8, 0.9],
    subsample_freq=[10, 20],
)

# LightGBM estimator whose parameters are searched
lgbc = lgb.LGBMClassifier(random_state=47, learning_rate=0.1)

# Shuffled 10-fold cross-validation
cv = KFold(n_splits=10, shuffle=True, random_state=47)

# Exhaustive search scored by ROC AUC
gslgb = GridSearchCV(
    estimator=lgbc,
    param_grid=parameters,
    scoring='roc_auc',
    cv=cv,
    verbose=True,
    n_jobs=-1,
)
gslgb.fit(X_train_selected, y_train)

print('---')
print('Best hyperparameters:\n' + str(gslgb.best_params_))

#### Search of the best random_state

In [None]:
# Sweep random_state for the LightGBM model and report the seed with the
# highest ROC AUC.
# The running best starts at -inf (not at a hard-coded 0.74 with a placeholder
# seed), so the printed result always belongs to a fitted model. The unused
# `n_iter` placeholder of the original cell was removed.
# NOTE(review): candidates are ranked on X_test_selected / y_test, so the
# winning score is an optimistic estimate of performance on new data.
best_score = float('-inf')
debut = None
for random_state in range(200):
    lgbc = lgb.LGBMClassifier(max_depth=4, learning_rate=0.1, min_split_gain=0.3, num_leaves=10, reg_alpha=1.2, reg_lambda=1.2,
                              random_state=random_state, subsample=0.8, n_estimators=177, colsample_bytree=0.8, subsample_freq=10)
    lgbc.fit(X_train_selected, y_train)

    score = roc_auc_score(y_test, lgbc.predict_proba(X_test_selected)[:, 1])

    # Strict comparison: the first seed reaching a score is kept on ties
    if score > best_score:
        best_score, debut = score, random_state
print('Best Score: {:.4f} (with random_state {:d} )'.format(best_score, debut))

#### Validation

In [None]:
# LightGBM with the retained hyper-parameters
lgbc = lgb.LGBMClassifier(
    n_estimators=177,
    learning_rate=0.1,
    max_depth=4,
    num_leaves=10,
    min_split_gain=0.3,
    reg_alpha=1.2,
    reg_lambda=1.2,
    subsample=0.8,
    subsample_freq=10,
    colsample_bytree=0.8,
    random_state=9,
)

# Fit on the training split
lgbc.fit(X_train_selected, y_train)

# ROC AUC on the held-out split, from the probabilities in column 1
test_proba = lgbc.predict_proba(X_test_selected)[:, 1]
score = roc_auc_score(y_test, test_proba)

print('LGB Classifier: {:.4f} '.format(score))

### Stacking

In [None]:
# Base learners: grid-search winners where available, explicit
# hyper-parameters for the two boosting models
lr = gslr.best_estimator_
knn = gsknn.best_estimator_
rfc = gsrf.best_estimator_
boost = GradientBoostingClassifier(
    n_estimators=177,
    max_depth=2,
    max_features=0.3,
    subsample=0.5,
    n_iter_no_change=26,
    random_state=12,
)
xgb = XGBClassifier(
    n_estimators=90,
    learning_rate=0.1,
    max_depth=4,
    min_child_weight=8,
    gamma=0.1,
    subsample=0.7,
    colsample_bytree=0.6,
    eval_metric='auc',
    random_state=29,
)
lgbc = gslgb.best_estimator_

# (name, estimator) pairs handed to the stacking ensemble
best_models_auc = [
    ('Boosting', boost),
    ('Random Forest', rfc),
    ('Logistic Regression', lr),
    ('KNeighbours', knn),
    ('XGBoost', xgb),
    ('LGBoost', lgbc),
]

# Stacked ensemble: the base models above feed a logistic-regression meta-learner
stacking = StackingClassifier(
    estimators=best_models_auc,
    final_estimator=LogisticRegression(),
    cv=10,
)

# Fit on the training split
stacking.fit(X_train_selected, y_train)

# ROC AUC on the held-out split, from the probabilities in column 1
test_proba = stacking.predict_proba(X_test_selected)[:, 1]
score = roc_auc_score(y_test, test_proba)

print('K-fold stacking: {:.4f}'.format(score))

## Test

### preprocessing

In [6]:
# Load the submission set and drop the 'Unnamed: 0' column
test=pd.read_csv("test.csv")
test=test.drop(['Unnamed: 0'], axis=1)

# NOTE(review): a fresh StandardScaler is fitted on the test rows here, so the
# mean/scale differ from the ones applied to the training data — the scaler
# fitted on train should probably be re-used instead; confirm.
test[discrete] = StandardScaler().fit_transform(test[discrete])

# Label-encode the non-null values of every nominal column; rows that were
# null are absent from the returned Series and so remain missing.
# NOTE(review): the encoders are fitted on the test rows, so a label only gets
# the same integer code as in training if both files contain exactly the same
# set of categories for that column — verify.
test[nominal] = test[nominal].apply(lambda series: pd.Series(
        LabelEncoder().fit_transform(series[series.notnull()]),
        index=series[series.notnull()].index
    ))
# NOTE(review): this rebinds imp_cat and fits a new imputer on the test rows
# rather than applying one fitted on the training data — confirm intent.
imp_cat = IterativeImputer(estimator=GradientBoostingClassifier(random_state=47), 
                               initial_strategy='most_frequent',
                               max_iter=10, random_state=47, verbose=2)
test[nominal] = imp_cat.fit_transform(test[nominal])
# Requires `selector` from the feature-selection cell to have been run first
test2=selector.transform(test)

  mode = stats.mode(array)


[IterativeImputer] Completing matrix with shape (5873, 32)
[IterativeImputer] Ending imputation round 1/10, elapsed time 29.91
[IterativeImputer] Change: 10.0, scaled tolerance: 0.007 
[IterativeImputer] Ending imputation round 2/10, elapsed time 59.92
[IterativeImputer] Change: 9.0, scaled tolerance: 0.007 
[IterativeImputer] Ending imputation round 3/10, elapsed time 88.68
[IterativeImputer] Change: 8.0, scaled tolerance: 0.007 
[IterativeImputer] Ending imputation round 4/10, elapsed time 120.22
[IterativeImputer] Change: 7.0, scaled tolerance: 0.007 
[IterativeImputer] Ending imputation round 5/10, elapsed time 151.37
[IterativeImputer] Change: 7.0, scaled tolerance: 0.007 
[IterativeImputer] Ending imputation round 6/10, elapsed time 181.53
[IterativeImputer] Change: 7.0, scaled tolerance: 0.007 
[IterativeImputer] Ending imputation round 7/10, elapsed time 211.64
[IterativeImputer] Change: 7.0, scaled tolerance: 0.007 
[IterativeImputer] Ending imputation round 8/10, elapsed time



NameError: name 'selector' is not defined

### Prediction

In [None]:
# Probabilities for the submission set: column 1 of predict_proba, one value
# per test row (a column slice replaces the manual index loop)
proba_assvie = stacking.predict_proba(test2)
Assvie = list(proba_assvie[:, 1])

# Retraite

## Split

In [None]:
# Boolean target: True where the 'Retraite' column equals "O"
y = data['Retraite'].eq("O")
# Both target columns are left out of the feature matrix
X = data.drop(columns=['Assvie', 'Retraite'])
# Hold out 33% of the rows for evaluation
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.33
)

## Feature Selection

In [None]:
# Search over the random_state of the estimator used inside RFECV
param_selector = {
    'estimator__random_state': list(range(100)),
    'scoring': ['roc_auc'],
}

# Selector whose parameters are searched
rfecv = RFECV(GradientBoostingClassifier(max_features='sqrt'), step=3)

# Shuffled 10-fold cross-validation
cv = KFold(n_splits=10, shuffle=True, random_state=47)

# Exhaustive search scored by ROC AUC
gridsearch = GridSearchCV(rfecv, param_selector, scoring="roc_auc", cv=cv, verbose=True, n_jobs=-1)
gridsearch.fit(X, y)

# Final selector for the 'Retraite' target.
# It is fitted on the training split only: fitting on the full X would let the
# held-out rows influence which features are kept, which inflates every score
# later computed on X_test_selected.
selector = RFECV(GradientBoostingClassifier(random_state=47, max_features='sqrt'), step=3, scoring="roc_auc")
selector.fit(X_train, y_train)
X_train_selected = selector.transform(X_train)
X_test_selected = selector.transform(X_test)

## Training

### Logistic Regression

#### Grid Search

In [None]:
# Grid search for the logistic regression.
# The grid is split so that only valid penalty/solver pairs are tried:
#   - 'l2' works with newton-cg, lbfgs and liblinear;
#   - 'l1' is only supported by liblinear among the solvers listed here.
# 'elasticnet' was dropped: it needs solver='saga' plus an l1_ratio, neither
# of which is in the grid, so every such candidate failed to fit.
_shared_grid = {'max_iter': [70, 100, 500, 1000],
                'C': np.logspace(-3, 3, 7)}
parameters = [
    dict(_shared_grid, penalty=['l2'], solver=['newton-cg', 'lbfgs', 'liblinear']),
    dict(_shared_grid, penalty=['l1'], solver=['liblinear']),
]

lr = LogisticRegression(random_state=42)

cv = KFold(n_splits=10)

gslr = GridSearchCV(lr, parameters, cv=cv, scoring="roc_auc", verbose=3, n_jobs=-1)
gslr.fit(X_train_selected, y_train)
print('---')
print('Best hyperparameters:\n' + str(gslr.best_params_))

#### Validation

In [None]:
# Take the best configuration found by the grid search
lr = gslr.best_estimator_

# Fit it on the training split
lr.fit(X_train_selected, y_train)

# ROC AUC on the held-out split, from the probabilities in column 1
test_proba = lr.predict_proba(X_test_selected)[:, 1]
score = roc_auc_score(y_test, test_proba)

print(score)

### KNeighborsClassifier

#### Grid Search

In [None]:
# Search space: neighbourhood sizes 1..199 and both weighting schemes
parameters = dict(
    n_neighbors=list(range(1, 200)),
    weights=['uniform', 'distance'],
)

knn = KNeighborsClassifier()

# Unshuffled 10-fold cross-validation
cv = KFold(n_splits=10)

# Exhaustive search scored by ROC AUC
gsknn = GridSearchCV(
    estimator=knn,
    param_grid=parameters,
    scoring="roc_auc",
    cv=cv,
    n_jobs=-1,
    verbose=3,
)
gsknn.fit(X_train_selected, y_train)
print('---')
print('Best hyperparameters:\n' + str(gsknn.best_params_))

#### Validation

In [None]:
# Take the best configuration found by the grid search
knn = gsknn.best_estimator_

# Fit it on the training split
knn.fit(X_train_selected, y_train)

# ROC AUC on the held-out split, from the probabilities in column 1
test_proba = knn.predict_proba(X_test_selected)[:, 1]
score = roc_auc_score(y_test, test_proba)

print(score)

### RandomForestClassifier

#### Grid Search

In [None]:
# Candidate values for the random-forest search
paramGrid = dict(
    n_estimators=[70, 90, 100, 200, 500],
    max_features=[0.8, 1, 2, 4, 6, 8, 10, 20],
    max_depth=[3, 4, 8, 10, 15],
    criterion=['gini', 'entropy'],
)

# Estimator whose parameters are searched
rfc = RandomForestClassifier(random_state=42)

# Shuffled 10-fold cross-validation
cv = KFold(n_splits=10, shuffle=True, random_state=42)

# Exhaustive search scored by ROC AUC
gsrf = GridSearchCV(
    estimator=rfc,
    param_grid=paramGrid,
    scoring="roc_auc",
    cv=cv,
    verbose=True,
    n_jobs=-1,
)
gsrf.fit(X_train_selected, y_train)

print('---')
print('Best hyperparameters:\n' + str(gsrf.best_params_))

#### Validation

In [None]:
# Take the best configuration found by the grid search
rfc = gsrf.best_estimator_

# Fit it on the training split
rfc.fit(X_train_selected, y_train)

# ROC AUC on the held-out split, from the probabilities in column 1
test_proba = rfc.predict_proba(X_test_selected)[:, 1]
score = roc_auc_score(y_test, test_proba)

print(score)

### GradientBoostingClassifier

#### Grid Search

In [None]:
# Candidate values for the gradient-boosting search
paramGrid = dict(
    n_estimators=[100, 500, 1000],
    n_iter_no_change=[10, 15],
    learning_rate=[0.01, 0.1, 0.2],
    max_depth=[1, 2, 3],
    subsample=[0.4, 0.5, 0.6],
    max_features=[0.2, 0.3, 0.4],
)

# Estimator whose parameters are searched
boost = GradientBoostingClassifier(random_state=47)

# Shuffled 10-fold cross-validation
cv = KFold(n_splits=10, shuffle=True, random_state=47)

# Exhaustive search scored by ROC AUC
gsgbc = GridSearchCV(
    estimator=boost,
    param_grid=paramGrid,
    scoring='roc_auc',
    cv=cv,
    verbose=True,
    n_jobs=-1,
)
gsgbc.fit(X_train_selected, y_train)

print('---')
print('Best hyperparameters:\n' + str(gsgbc.best_params_))

#### Search of the best random_state and iter_to_change

In [None]:
# Sweep random_state and the early-stopping patience (n_iter_no_change) for
# the gradient-boosting model and report the best ROC AUC.
# - The patience range starts at 1: 0 is not a valid n_iter_no_change and is
#   rejected by scikit-learn's parameter validation in recent releases.
# - The running best starts at -inf (not at a hard-coded 0.74 with placeholder
#   values), so the printed result always belongs to a fitted model.
# - The message now says 'Best Score', since the first value printed is the
#   score and not a random state.
# NOTE(review): candidates are ranked on X_test_selected / y_test, so the
# winning score is an optimistic estimate of performance on new data.
best_score = float('-inf')
debut = None
n_iter = None
for random_state in range(200):
    for iter_to_change in range(1, 50):
        boost = GradientBoostingClassifier(max_depth=2, learning_rate=0.1, max_features=0.3, n_iter_no_change=iter_to_change,
                                           random_state=random_state, subsample=0.5, n_estimators=500)
        boost.fit(X_train_selected, y_train)

        score = roc_auc_score(y_test, boost.predict_proba(X_test_selected)[:, 1])

        # Strict comparison: the first combination reaching a score is kept on ties
        if score > best_score:
            best_score, debut, n_iter = score, random_state, iter_to_change

print('Best Score: {:.4f} (with random_state {:d} and iter_to_change {:d} )'.format(best_score, debut, n_iter))

#### Validation

In [None]:
# Gradient boosting with the retained hyper-parameters
boost = GradientBoostingClassifier(
    n_estimators=500,
    learning_rate=0.1,
    max_depth=2,
    max_features=0.3,
    subsample=0.5,
    n_iter_no_change=24,
    random_state=163,
)

# Fit on the training split
boost.fit(X_train_selected, y_train)

# ROC AUC on the held-out split, from the probabilities in column 1
test_proba = boost.predict_proba(X_test_selected)[:, 1]
score = roc_auc_score(y_test, test_proba)

# n_estimators_ is the number of trees actually fitted
print('Gradient boosting Classifier: {:.4f} (with {:d} trees)'.format(score, boost.n_estimators_))

### XGBClassifier

#### Grid Search

In [None]:
# Candidate values for the XGBoost search
params = dict(
    n_estimators=[100, 500, 1000],         # number of trees
    min_child_weight=[4, 5, 6],
    max_depth=[1, 2, 3, 4],                # maximum depth of each tree
    learning_rate=[0.01, 0.05, 0.1],       # learning rate
    colsample_bytree=[0.6, 0.7, 0.8],      # column subsampling
    subsample=[0.8, 0.9],                  # row subsampling
    reg_alpha=[1, 1.2],
    gamma=[0.2, 0.1],
)

# XGBoost estimator whose parameters are searched
xgb = XGBClassifier(seed=47, eval_metric='auc')

# Shuffled 10-fold cross-validation
cv = KFold(n_splits=10, shuffle=True, random_state=47)

# Exhaustive search scored by ROC AUC
gsxgb = GridSearchCV(
    estimator=xgb,
    param_grid=params,
    scoring='roc_auc',
    cv=cv,
    verbose=True,
    n_jobs=-1,
)
gsxgb.fit(X_train_selected, y_train)

print('---')
print('Best hyperparameters:\n' + str(gsxgb.best_params_))

#### Search of the best random_state

In [None]:
# Sweep random_state for the XGBoost model and report the seed with the
# highest ROC AUC.
# - The running best starts at -inf (not at a hard-coded 0.74 with a
#   placeholder seed), so the printed result always belongs to a fitted model.
# - The unused `depth` placeholder of the original cell was removed.
# - The message now says 'Best Score', since the first value printed is the
#   score and not a random state.
# NOTE(review): candidates are ranked on X_test_selected / y_test, so the
# winning score is an optimistic estimate of performance on new data.
best_score = float('-inf')
debut = None
for random_state in range(200):
    xgb = XGBClassifier(max_depth=2, learning_rate=0.05, colsample_bytree=0.6, random_state=random_state, subsample=0.9, n_estimators=500, min_child_weight=4, eval_metric='auc', gamma=0.1, reg_alpha=1)
    xgb.fit(X_train_selected, y_train)

    score = roc_auc_score(y_test, xgb.predict_proba(X_test_selected)[:, 1])

    # Strict comparison: the first seed reaching a score is kept on ties
    if score > best_score:
        best_score, debut = score, random_state

print('Best Score: {:.4f} (with random_state {:d}  )'.format(best_score, debut))

#### Validation

In [None]:
# XGBoost with the retained hyper-parameters
xgb = XGBClassifier(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=2,
    min_child_weight=4,
    gamma=0.1,
    reg_alpha=1,
    subsample=0.9,
    colsample_bytree=0.6,
    eval_metric='auc',
    random_state=124,
)

# Fit on the training split
xgb.fit(X_train_selected, y_train)

# ROC AUC on the held-out split, from the probabilities in column 1
test_proba = xgb.predict_proba(X_test_selected)[:, 1]
score = roc_auc_score(y_test, test_proba)

print('XGB Classifier: {:.4f} '.format(score))

### LGBMClassifier

#### Grid Search

In [None]:
# Candidate values for the LightGBM search
parameters = dict(
    n_estimators=[100, 500, 1000],
    learning_rate=[0.1],
    colsample_bytree=[0.7, 0.8],
    max_depth=[4],
    num_leaves=[10, 20],
    reg_alpha=[1.1, 1.2],
    reg_lambda=[1.1, 1.2],
    min_split_gain=[0.3, 0.4],
    subsample=[0.8, 0.9],
    subsample_freq=[10, 20],
)

# LightGBM estimator whose parameters are searched
lgbc = lgb.LGBMClassifier(random_state=47, learning_rate=0.1)

# Shuffled 10-fold cross-validation
cv = KFold(n_splits=10, shuffle=True, random_state=47)

# Exhaustive search scored by ROC AUC
gslgb = GridSearchCV(
    estimator=lgbc,
    param_grid=parameters,
    scoring='roc_auc',
    cv=cv,
    verbose=True,
    n_jobs=-1,
)
gslgb.fit(X_train_selected, y_train)

print('---')
print('Best hyperparameters:\n' + str(gslgb.best_params_))

#### Validation

In [None]:
# Take the best configuration found by the grid search
lgbc = gslgb.best_estimator_

# Fit it on the training split
lgbc.fit(X_train_selected, y_train)

# ROC AUC on the held-out split, from the probabilities in column 1
test_proba = lgbc.predict_proba(X_test_selected)[:, 1]
score = roc_auc_score(y_test, test_proba)

print('LGB Classifier: {:.4f} '.format(score))

### Stacking

In [None]:
# (name, estimator) pairs handed to the stacking ensemble; the estimators are
# the ones left in scope by the validation cells
best_models_auc = [
    ('Boosting', boost),
    ('Random Forest', rfc),
    ('Logistic Regression', lr),
    ('KNeighbours', knn),
    ('XGBoost', xgb),
    ('LGBoost', lgbc),
]

# Stacked ensemble: the base models above feed a logistic-regression meta-learner
stacking = StackingClassifier(
    estimators=best_models_auc,
    final_estimator=LogisticRegression(),
    cv=10,
)

# Fit on the training split
stacking.fit(X_train_selected, y_train)

# ROC AUC on the held-out split, from the probabilities in column 1
test_proba = stacking.predict_proba(X_test_selected)[:, 1]
score = roc_auc_score(y_test, test_proba)

print('K-fold stacking: {:.4f}'.format(score))

# Test

## Prediction

In [None]:
# Keep the selected features of the submission set, then take column 1 of
# predict_proba for every row (a column slice replaces the manual index loop)
test3 = selector.transform(test)
proba_retraite = stacking.predict_proba(test3)
retraite = list(proba_retraite[:, 1])

# Create CSV to export

In [None]:
# Build the submission file: one row per test observation with a 0-based Id
# and the two predicted probabilities. `ids` replaces the former `id`
# variable, which shadowed the builtin of the same name.
ids = list(range(len(test)))
df = pd.DataFrame({"Id": ids, "Assvie": Assvie, "Retraite": retraite})
df.to_csv("yanis_perrin_15.csv", index=False)