In [44]:
#%load_ext pycodestyle_magic

In [45]:
#%pycodestyle_on

In [46]:
# Load libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from joblib import dump, load

from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import KFold, StratifiedKFold, GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score, precision_score, accuracy_score, f1_score
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import AdaBoostClassifier

#from xgboost import XGBClassifier

In [47]:
# load data
df = pd.read_csv("data/nba_logreg.csv", sep=",")
#df = df.drop_duplicates()
df.head()

Unnamed: 0,Name,GP,MIN,PTS,FGM,FGA,FG%,3P Made,3PA,3P%,...,FTA,FT%,OREB,DREB,REB,AST,STL,BLK,TOV,TARGET_5Yrs
0,Brandon Ingram,36,27.4,7.4,2.6,7.6,34.7,0.5,2.1,25.0,...,2.3,69.9,0.7,3.4,4.1,1.9,0.4,0.4,1.3,0.0
1,Andrew Harrison,35,26.9,7.2,2.0,6.7,29.6,0.7,2.8,23.5,...,3.4,76.5,0.5,2.0,2.4,3.7,1.1,0.5,1.6,0.0
2,JaKarr Sampson,74,15.3,5.2,2.0,4.7,42.2,0.4,1.7,24.4,...,1.3,67.0,0.5,1.7,2.2,1.0,0.5,0.3,1.0,0.0
3,Malik Sealy,58,11.6,5.7,2.3,5.5,42.6,0.1,0.5,22.6,...,1.3,68.9,1.0,0.9,1.9,0.8,0.6,0.1,1.0,1.0
4,Matt Geiger,48,11.5,4.5,1.6,3.0,52.4,0.0,0.1,0.0,...,1.9,67.4,1.0,1.5,2.5,0.3,0.3,0.4,0.8,1.0


In [48]:
#number of rows and columns of the dataset
df.shape

(1340, 21)

In [49]:
# class distribution
print(df.groupby('TARGET_5Yrs').size())

TARGET_5Yrs
0.0    509
1.0    831
dtype: int64


## Data Cleaning

- Dans cette étape le but est d'identifier l'existence ou pas de variables n'ont qu'une seule modalité ou très peu de modalité ainsi qu'identifier les lignes qui contiennent des duplicatas

In [50]:
# computes the modalities of each column 
df.nunique()

Name           1294
GP               70
MIN             325
PTS             191
FGM              87
FGA             159
FG%             284
3P Made          23
3PA              54
3P%             254
FTM              59
FTA              76
FT%             383
OREB             44
DREB             74
REB             101
AST              77
STL              26
BLK              28
TOV              41
TARGET_5Yrs       2
dtype: int64

- Les lignes qui contiennent des données semblables sont potentiellement inutiles pour l'apprentissage, et trompeuses lors de l'évaluation du modèle (Data Leakage) pour cela il est nécessaire d'identifier les lignes semblables.

In [51]:
# calculate duplicates
duplicates = df.duplicated()

In [52]:
# identify duplicates rows
duplicates
duplicates.any()

True

In [53]:
# Visualizing the duplicates rows
print(df[duplicates])

                  Name  GP   MIN   PTS  FGM   FGA   FG%  3P Made  3PA   3P%  \
163      Charles Smith  60   8.7   2.9  1.0   2.2  44.4      0.0  0.1   0.0   
166      Charles Smith  71  30.4  16.3  6.1  12.4  49.5      0.0  0.0   0.0   
169      Charles Smith  34   8.6   3.5  1.4   3.7  39.2      0.4  1.4  31.9   
243    Reggie Williams  35  24.5  10.4  4.3  12.2  35.6      0.4  1.7  22.4   
339        Ken Johnson  64  12.7   4.1  1.8   3.3  52.8      0.0  0.0   NaN   
366      Charles Jones  78  20.1   8.4  3.0   5.8  52.0      0.0  0.1   0.0   
369      Charles Jones  29  16.4   3.7  1.3   4.2  31.7      0.7  2.1  31.1   
504      Eddie Johnson  74  20.5   9.3  4.0   8.7  45.9      0.0  0.1   9.1   
733   Gerald Henderson  43   8.3   2.6  0.9   2.4  35.6      0.1  0.4  21.1   
824    Marcus Williams  79  16.6   6.8  2.6   6.7  39.5      0.6  2.1  28.2   
971      Mike Dunleavy  82  15.9   5.7  2.0   5.1  40.3      0.6  1.8  34.7   
1335       Chris Smith  80  15.8   4.3  1.6   3.6  4

In [54]:
# we delete the duplicates rows of the dataset
df.drop_duplicates(inplace=True)
df.shape

(1328, 21)

In [55]:
#df[df["Name"].duplicated()]

- À présent, nous allons essayer d'identifier les valeurs manquantes, puisque de nombreux algorithmes d'apprentissage ne prennent pas en charge les jeux de données avec des valeurs manquantes.

In [56]:
#identifying missing values
df.isnull().sum()

Name            0
GP              0
MIN             0
PTS             0
FGM             0
FGA             0
FG%             0
3P Made         0
3PA             0
3P%            10
FTM             0
FTA             0
FT%             0
OREB            0
DREB            0
REB             0
AST             0
STL             0
BLK             0
TOV             0
TARGET_5Yrs     0
dtype: int64

- Nous remplaçons les valeurs Nan uniquement présentes lorsqu'aucune tentative de 3 points n'a été effectuée par un joueur

In [57]:
# extract names, labels, features names and values
names = df['Name'].values.tolist() # players names
labels = df['TARGET_5Yrs'].values # labels
paramset = df.drop(['TARGET_5Yrs','Name'],axis=1).columns.values
df_vals = df.drop(['TARGET_5Yrs','Name'],axis=1).values

In [58]:
# replacing Nan values (only present when no 3 points attempts have been performed by a player)
for x in np.argwhere(np.isnan(df_vals)):
    df_vals[x]=0.0

In [59]:
# reconstruction of the dataframe after filling the missing values
X = pd.DataFrame(data = df_vals, 
                  columns = paramset)
X.head()

Unnamed: 0,GP,MIN,PTS,FGM,FGA,FG%,3P Made,3PA,3P%,FTM,FTA,FT%,OREB,DREB,REB,AST,STL,BLK,TOV
0,36.0,27.4,7.4,2.6,7.6,34.7,0.5,2.1,25.0,1.6,2.3,69.9,0.7,3.4,4.1,1.9,0.4,0.4,1.3
1,35.0,26.9,7.2,2.0,6.7,29.6,0.7,2.8,23.5,2.6,3.4,76.5,0.5,2.0,2.4,3.7,1.1,0.5,1.6
2,74.0,15.3,5.2,2.0,4.7,42.2,0.4,1.7,24.4,0.9,1.3,67.0,0.5,1.7,2.2,1.0,0.5,0.3,1.0
3,58.0,11.6,5.7,2.3,5.5,42.6,0.1,0.5,22.6,0.9,1.3,68.9,1.0,0.9,1.9,0.8,0.6,0.1,1.0
4,48.0,11.5,4.5,1.6,3.0,52.4,0.0,0.1,0.0,1.3,1.9,67.4,1.0,1.5,2.5,0.3,0.3,0.4,0.8


In [60]:
# verifiying the non existance of missing values
df.isnull().any().all()

False

In [61]:
# construction of the target variable
y = pd.Series(data = labels, 
                  name = "TARGET_5Yrs")
y.head()

0    0.0
1    0.0
2    0.0
3    1.0
4    1.0
Name: TARGET_5Yrs, dtype: float64

## Feature Selection

- La sélection des caractéristiques est le processus de réduction du nombre de variables d'entrée lors du développement d'un modèle prédictif dans le but d'améliorer les performances du modèle prédictif. Le but est de ne garder que les variables d'entrée qui qui ont la plus forte relation avec la variable à prédire. 

### Lasso Feature Selection

- 1- A first attempt into doing feature selection is to use regularized logistic regression with the l1 penalty which will shrink the coefficent of features into zero allowing for feature selection

In [62]:
# We make a pipeline that first scale our data 
# We then apply a penalized logistic regression model and we provide a list of tuning parameter lambda 
# The regul parameter that maximes the recall is picked and then the model is trained on the whole dataset 
penalized_lr = make_pipeline(MinMaxScaler(), 
                             LogisticRegressionCV(penalty="l1", Cs = [10, 1, .1, .001],
                                                         scoring="recall", solver="liblinear", cv=10, refit=True))

In [63]:
# we fit the model on the data
penalized_lr.fit(X, y)

Pipeline(steps=[('minmaxscaler', MinMaxScaler()),
                ('logisticregressioncv',
                 LogisticRegressionCV(Cs=[10, 1, 0.1, 0.001], cv=10,
                                      penalty='l1', scoring='recall',
                                      solver='liblinear'))])

In [64]:
# picked regularization parameter 
penalized_lr[1].C_

array([0.1])

In [65]:
# The coefficents of the built model and we can clearly see that the model performed feature selection
penalized_lr[1].coef_

array([[2.12145568, 0.1225175 , 0.        , 0.54237426, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.85602379, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        ]])

In [66]:
# Plotting the selected features by the penalized logistic regression 
penalized_lr_coef_df = pd.DataFrame(penalized_lr[1].coef_.reshape(-1), columns=["Coefficient"])
penalized_lr_coef_df["Feature"] = X.columns

fig1 = px.bar(penalized_lr_coef_df, x='Feature', y='Coefficient', title="Feature Selection by Penalized LR")
fig1.show()

In [67]:
# We save the selected feature by the penalized logistic regression into a variable
penalized_lr_selected_features = list(penalized_lr_coef_df[penalized_lr_coef_df["Coefficient"] != 0].Feature)
penalized_lr_selected_features

['GP', 'MIN', 'FGM', 'OREB']

### Random Forests Feature Selection

In [68]:
# We initialize the selection model using a random forest classifier
feature_selection_rf = SelectFromModel(RandomForestClassifier())
feature_selection_rf.fit(X, y)

SelectFromModel(estimator=RandomForestClassifier())

In [69]:
# Plotting the feature importance given by the random forest classifier
rd_feat_import_df = pd.DataFrame(feature_selection_rf.estimator_.feature_importances_, columns=["Feature Importance"])
rd_feat_import_df["Feature"] = X.columns

fig2 = px.bar(rd_feat_import_df, x='Feature', y='Feature Importance', title="Feature Importance by a RF")
fig2.show()

In [70]:
rf_selected_features = X.loc[:,feature_selection_rf.get_support()].columns.tolist()
rf_selected_features

['GP', 'MIN', 'PTS', 'FGM', 'FGA', 'FG%', 'FTM', 'FT%', 'REB']

### Comparing The Selected Features

In [71]:
# We compare the variables selected by each approach
feature_selection_df = pd.DataFrame({'Feature':X.columns, 'Penalized Logistic Regression':penalized_lr[1].coef_.reshape(-1) != 0, 
                                     'Random Forest Feature Selection':feature_selection_rf.get_support()})
feature_selection_df

Unnamed: 0,Feature,Penalized Logistic Regression,Random Forest Feature Selection
0,GP,True,True
1,MIN,True,True
2,PTS,False,True
3,FGM,True,True
4,FGA,False,True
5,FG%,False,True
6,3P Made,False,False
7,3PA,False,False
8,3P%,False,False
9,FTM,False,True


In [72]:
# scoring function using cross-validation
def score_classifier(dataset, classifier, labels, n_splits=10):
    """
    performs 3 random trainings/tests to build a confusion matrix and prints results with precision and recall scores
    :param dataset: the dataset to work on
    :param classifier: the classifier to use
    :param labels: the labels used for training and validation
    :n_splits: number of splits in cross-validation
    :return:
    """
    dataset = dataset.values
    kf = KFold(n_splits=n_splits, random_state=50, shuffle=True)
    confusion_mat = np.zeros((2, 2))
    recall = 0
    precision = 0
    for training_ids, test_ids in kf.split(dataset):
        training_set, training_labels = dataset[training_ids], labels[training_ids]
        test_set, test_labels = dataset[test_ids], labels[test_ids]
        classifier.fit(training_set, training_labels)
        predicted_labels = classifier.predict(test_set)
        confusion_mat += confusion_matrix(test_labels, predicted_labels)
        recall += recall_score(test_labels, predicted_labels)
        precision += precision_score(test_labels, predicted_labels)
    recall /= n_splits
    precision /= n_splits
    print(confusion_mat)
    print("The recall is: "+str(recall)+",\nThe precision is: "+str(precision))

One way the scoring function can be improved is by switching from cross-validation to stratified cross-validation due to the fact that the distribution of the target variable is imbalanced so it's better to use stratified cross-validation the fact that it preserves the distribution of the outcome variable

In [73]:
# scoring function using stratified cross-validation
def score_classifier_skf(dataset, classifier, labels, n_splits=10):
    """
    performs 3 random trainings/tests to build a confusion matrix and prints results with precision and recall scores
    :param dataset: the dataset to work on
    :param classifier: the classifier to use
    :param labels: the labels used for training and validation
    :n_splits: number of splits in cross-validation
    :return:
    """

    dataset = dataset.values
    skf = StratifiedKFold(n_splits=10, random_state=50, shuffle=True)
    confusion_mat = np.zeros((2, 2))
    recall = 0
    precision = 0
    accuracy = 0
    f1score = 0
    for training_ids, test_ids in skf.split(dataset, labels):
        training_set, training_labels = dataset[training_ids], labels[training_ids]
        test_set, test_labels = dataset[test_ids], labels[test_ids]
        classifier.fit(training_set, training_labels)
        predicted_labels = classifier.predict(test_set)
        confusion_mat += confusion_matrix(test_labels, predicted_labels)
        recall += recall_score(test_labels, predicted_labels)
        precision += precision_score(test_labels, predicted_labels)
        accuracy += accuracy_score(test_labels, predicted_labels)
        f1score += f1_score(test_labels, predicted_labels)
    recall /= n_splits
    precision /= n_splits
    accuracy /= n_splits
    f1score /= n_splits
    print(confusion_mat)
    print("The recall is: "+str(recall)+",\nThe precision is: "+str(precision)+
          "\nThe f1-score is: "+str(f1score)+"\nThe accuracy is: "+str(accuracy))

- In order to determine what subset of features we will keep for the next project we will train an svm model on a the two subset of features and compare the performance of the model on both dataset and we will also compare the performance when the model is trained on the dataset without using feature selection.

- The SVM model was chosen to compare both feature selection because SVM is notably sensitive to irrelevant predictors.

In [74]:
# RobustScaler was also used to scale the data in order to deal with outliers but not much difference in the performance was observed
benchmarking_feat_selec = make_pipeline(MinMaxScaler(), SVC())

In [75]:
# We compute the recall when the model is applied on the features selected by penalized logistic regression
score_classifier_skf(X[penalized_lr_selected_features], benchmarking_feat_selec, labels)

[[281. 223.]
 [158. 666.]]
The recall is: 0.8083455774316779,
The precision is: 0.750614412067417
The f1-score is: 0.7774970299825885
The accuracy is: 0.7131635907951697


In [76]:
# We compute the recall when the model is applied on the features selected by random forest
score_classifier_skf(X[rf_selected_features], benchmarking_feat_selec, labels)

[[262. 242.]
 [148. 676.]]
The recall is: 0.8204672347928298,
The precision is: 0.7379462545013143
The f1-score is: 0.775964976116479
The accuracy is: 0.7063624971519709


In [77]:
# We compute the recall when the model is applied on the whole dataset
score_classifier_skf(X, benchmarking_feat_selec, labels)

[[268. 236.]
 [141. 683.]]
The recall is: 0.8289450484866295,
The precision is: 0.7456269964011811
The f1-score is: 0.7837684704364408
The accuracy is: 0.7161654135338346


- We can clearly see that when we only used the 4 features selected by the $l_1$ penalty we had a slightly worse performance than when we used more variables. we can also see that when we used the 9 variables selected by random forest instead of all of the 19 variables of the dataset we got almost the difference is not noticeable so we choose to keep 9 variables for the rest of the project due to the fact that we will be building simpler model that can generalize better

### Metrics Choice

- The default scoring function came in with the recall metric but in our use case it really depends on the point of view of the investor. If the investor does not want to take risks by investing on a player that he shouldnt have(FP) then precision is the go to metric but the drawback of precision is that we may end up passing by investable players.

- On the other hand if the investor is more of the risk taking type then we want to catch as much as potential investable players even if we are not 100% sure but the drawback of this metric is that with only using a model that would predict 1 for all players we would get a model having 1 as a value for recall.

- F1-score is the trade-off between both metrics

## Building Classifiction Models

- At this stage we prepared the data and we selected the relavant features. Now we can start off building the predictive models. We will be implementing few models that we will try to optimize by performing some hyper-parameter tuning and we pick the best model that will be deployed in production

### Baseline Models

In [78]:
# We start of by building a first baseline model that will help us to judge the performance of the models built later on
dummy_clf = DummyClassifier(strategy="most_frequent")
score_classifier_skf(X, dummy_clf, labels)

[[  0. 504.]
 [  0. 824.]]
The recall is: 1.0,
The precision is: 0.6204830257461836
The f1-score is: 0.765794714514558
The accuracy is: 0.6204830257461836


In [79]:
# By a simple second baseline model we were able to get a better f1-score than the dummy model 
score_classifier_skf(X, LogisticRegression(max_iter=1000), labels)

[[277. 227.]
 [157. 667.]]
The recall is: 0.8095063179547457,
The precision is: 0.7475256636314102
The f1-score is: 0.7763265219436913
The accuracy is: 0.7109079516974254


### Models Comparison

In [124]:
models = []
models.append(('KNN', KNeighborsClassifier()))
models.append(('RF', RandomForestClassifier()))
models.append(('SVM', make_pipeline(MinMaxScaler(), SVC())))
models.append(('AdaBoostClassifier', AdaBoostClassifier()))
#models.append(("XGBOOST", XGBClassifier()))
for name, model in models:
    print("\nThe metrics for the model: "+str(name))
    score_classifier_skf(X[rf_selected_features], model, labels)
    print("---------------------------------------------------")


The metrics for the model: KNN
[[259. 245.]
 [209. 615.]]
The recall is: 0.7465324713488098,
The precision is: 0.7154262684594959
The f1-score is: 0.7297859841956003
The accuracy is: 0.6581510594668489
---------------------------------------------------

The metrics for the model: RF
[[268. 236.]
 [187. 637.]]
The recall is: 0.773170731707317,
The precision is: 0.7302754252922643
The f1-score is: 0.7503541770284056
The accuracy is: 0.6814935064935063
---------------------------------------------------

The metrics for the model: SVM
[[262. 242.]
 [148. 676.]]
The recall is: 0.8204672347928298,
The precision is: 0.7379462545013143
The f1-score is: 0.775964976116479
The accuracy is: 0.7063624971519709
---------------------------------------------------

The metrics for the model: AdaBoostClassifier
[[265. 239.]
 [185. 639.]]
The recall is: 0.775668527769615,
The precision is: 0.7290239761943059
The f1-score is: 0.7506745228367652
The accuracy is: 0.6807814992025518
---------------------

- We can observe that random forest and SVM gave the best trade off between recall and precision so we will try to fine tune these two algorithm in order to pick the one giving the best performance.

- The dataset is not very large se we wouldn't have an issue using gridsearch in order to fine tune the hyperparameter of the models

In [125]:
fine_tuning_rf = make_pipeline(GridSearchCV(RandomForestClassifier(),
                                      param_grid={'n_estimators': [200, 500,1000],
                                     'max_features': ['auto', 'sqrt', 'log2'],
                                     'max_depth' : [4,5,6,7,8,10],
                                     'criterion' :['gini', 'entropy']},
                                      cv=10,
                                      scoring='recall',
                                      refit=True))

fine_tuning_rf.fit(X[rf_selected_features], y)

Pipeline(steps=[('gridsearchcv',
                 GridSearchCV(cv=10, estimator=RandomForestClassifier(),
                              param_grid={'criterion': ['gini', 'entropy'],
                                          'max_depth': [4, 5, 6, 7, 8, 10],
                                          'max_features': ['auto', 'sqrt',
                                                           'log2'],
                                          'n_estimators': [200, 500, 1000]},
                              scoring='recall'))])

In [126]:
# Parameters maximizing recall
fine_tuning_rf[0].best_params_

{'criterion': 'gini',
 'max_depth': 10,
 'max_features': 'log2',
 'n_estimators': 200}

In [127]:
# We store the fine tuned model
fine_tuned_rf = fine_tuning_rf[0].best_estimator_

In [128]:
# we test the model using the score function in order to perform a fair comparison 
score_classifier_skf(X[rf_selected_features], fine_tuned_rf, y)

[[270. 234.]
 [177. 647.]]
The recall is: 0.7852189244784016,
The precision is: 0.7351995273135523
The f1-score is: 0.7586285496737799
The accuracy is: 0.6905274550011392


In [155]:
# fine tuning the model in order to maximize the recall
fine_tuning_svm_recall = make_pipeline(MinMaxScaler(),
                                      GridSearchCV(SVC(),
                                      param_grid={'C': [0.1, 1, 10, 100, 1000],
                                     'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
                                     'kernel' : ['rbf', 'linear']},
                                      cv=10,
                                      scoring='f1',
                                      refit=True))

fine_tuning_svm_recall.fit(X[rf_selected_features], y)

Pipeline(steps=[('minmaxscaler', MinMaxScaler()),
                ('gridsearchcv',
                 GridSearchCV(cv=10, estimator=SVC(),
                              param_grid={'C': [0.1, 1, 10, 100, 1000],
                                          'gamma': [1, 0.1, 0.01, 0.001,
                                                    0.0001],
                                          'kernel': ['rbf', 'linear']},
                              scoring='f1'))])

In [156]:
# Parameters maximizing recall
fine_tuning_svm_recall[1].best_params_

{'C': 0.1, 'gamma': 1, 'kernel': 'linear'}

In [157]:
# We store the fine tuned model
fine_tuned_svm_recall = make_pipeline(MinMaxScaler(), fine_tuning_svm_recall[1].best_estimator_)

In [158]:
# We can see that this model has a good recall
score_classifier_skf(X[rf_selected_features], fine_tuned_svm_recall, y)

[[243. 261.]
 [133. 691.]]
The recall is: 0.8386570672935644,
The precision is: 0.7270203473971932
The f1-score is: 0.7780086950468966
The accuracy is: 0.7033777625882888


In [162]:
fine_tuned_svm_precision = make_pipeline(StandardScaler(), SVC(class_weight='balanced'))
score_classifier_skf(X, fine_tuned_svm_precision, labels)

[[371. 133.]
 [288. 536.]]
The recall is: 0.6506317954745813,
The precision is: 0.8024895726388264
The f1-score is: 0.7168128589611965
The accuracy is: 0.6830143540669857


In [163]:
dump(fine_tuned_svm_recall, './model/fine_tuned_svm_recall.joblib') 
dump(fine_tuning_svm_precision, './model/fine_tuned_svm_precision.joblib') 

['./model/fine_tuned_svm_precision.joblib']

In [165]:
model_saved_recall = load('./model/fine_tuned_svm_recall.joblib') 
model_saved_precision = load('./model/fine_tuned_svm_precision.joblib') 

In [166]:
score_classifier_skf(X[rf_selected_features], model_saved_recall, y)

[[243. 261.]
 [133. 691.]]
The recall is: 0.8386570672935644,
The precision is: 0.7270203473971932
The f1-score is: 0.7780086950468966
The accuracy is: 0.7033777625882888


In [167]:
score_classifier_skf(X[rf_selected_features], model_saved_precision, y)

[[369. 135.]
 [285. 539.]]
The recall is: 0.6542903320599471,
The precision is: 0.8012308342961969
The f1-score is: 0.7191496555979756
The accuracy is: 0.6837662337662339


In [91]:
#cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=1, random_state=1)
#scores = cross_val_score(pipeline, X, y, scoring='recall', cv=cv, n_jobs=-1)
#scores