# Modélisation et évaluation

## Chargement et préparation des datasets en Scikit-Learn

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

In [2]:
# IRIS dataset

# Read the raw table from disk
iris_df = pd.read_csv("data/iris.csv")

# Separate the target column ('class') from the feature columns
y = iris_df['class']
X = iris_df.drop(columns='class')

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
(train_X_iris,
 test_X_iris,
 train_y_iris,
 test_y_iris) = train_test_split(X, y, test_size=0.2, train_size=0.8, random_state=42)

In [3]:
# TITANIC dataset

# Read the raw passenger table
titanic_df = pd.read_csv("data/titanic_train.csv")

# Feature engineering: number of relatives aboard, and a flag for passengers with none
titanic_df['FamilyNb'] = titanic_df['SibSp'] + titanic_df['Parch']
titanic_df['Alone'] = titanic_df['FamilyNb'] == 0

# One-hot encoding: Sex (first level dropped) and Embarked (extra column for missing values)
sex_df = pd.get_dummies(titanic_df['Sex'], prefix='sex', drop_first=True)
embarked_df = pd.get_dummies(titanic_df['Embarked'], prefix='embarked', dummy_na=True)
titanic_df = pd.concat([titanic_df, embarked_df, sex_df], axis=1)

# Remove the columns that are not used as features, including the raw columns
# that were just combined or re-encoded
titanic_df = titanic_df.drop(
    columns=['PassengerId', 'Name', 'Ticket', 'Cabin', 'SibSp', 'Parch', 'Sex', 'Embarked']
)

# Target / features, then an 80/20 split with a fixed seed
y = titanic_df['Survived']
X = titanic_df.drop(columns=['Survived'])
(train_X_titanic,
 test_X_titanic,
 train_y_titanic,
 test_y_titanic) = train_test_split(X, y, test_size=0.2, train_size=0.8, random_state=42)

# Mean imputation of missing values; the imputer is fitted on the training split only,
# and the same fitted statistics are then applied to both splits
col_names = train_X_titanic.columns
titanic_imputer = SimpleImputer(strategy='mean').fit(train_X_titanic)

# /!\ : transform() returns plain arrays here, so the DataFrames are rebuilt with the
# saved column names.
# NOTE: the rebuilt frames get a fresh 0..n-1 index, whereas train_y_titanic and
# test_y_titanic keep the row labels coming from the split.
train_X_titanic = pd.DataFrame(titanic_imputer.transform(train_X_titanic), columns=col_names)
test_X_titanic = pd.DataFrame(titanic_imputer.transform(test_X_titanic), columns=col_names)

In [4]:
# Preview the first rows of the imputed Titanic training features
train_X_titanic.head()

Unnamed: 0,Pclass,Age,Fare,FamilyNb,Alone,embarked_C,embarked_Q,embarked_S,embarked_nan,sex_male
0,1.0,45.5,28.5,0.0,1.0,0.0,0.0,1.0,0.0,1.0
1,2.0,23.0,13.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
2,3.0,32.0,7.925,0.0,1.0,0.0,0.0,1.0,0.0,1.0
3,3.0,26.0,7.8542,1.0,0.0,0.0,0.0,1.0,0.0,1.0
4,3.0,6.0,31.275,6.0,0.0,0.0,0.0,1.0,0.0,0.0


In [5]:
# BOSTON dataset

# The file is read as fixed-width text. The first 22 lines are skipped (presumably a
# descriptive preamble - confirm against data/boston.txt) and, as the file has no
# header row, the column names are supplied explicitly.
names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
         'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV']
boston_df = pd.read_fwf("data/boston.txt", names=names, header=None, skiprows=22)

# 'MEDV' is the regression target; every other column is a feature
y = boston_df['MEDV']
X = boston_df.drop(columns='MEDV')

# 80/20 train/test split with a fixed seed
(train_X_boston,
 test_X_boston,
 train_y_boston,
 test_y_boston) = train_test_split(X, y, test_size=0.2, train_size=0.8, random_state=42)

## Modélisation

In [6]:
# Create a model for Iris
from sklearn.tree import DecisionTreeClassifier

# Decision tree capped at depth 3, seeded, and trained on the Iris training split
# (fit() returns the estimator itself, so construction and training can be chained)
tree_classifier = DecisionTreeClassifier(random_state=42, max_depth=3).fit(train_X_iris, train_y_iris)

# Class predictions for the held-out test split
pred_y_iris = tree_classifier.predict(test_X_iris)

In [7]:
# Evaluate the model
from sklearn.metrics import accuracy_score

# Fraction of test samples whose predicted class matches the true class
accuracy_score(y_true=test_y_iris, y_pred=pred_y_iris)

1.0

In [8]:
# Same with K-Fold
from sklearn.model_selection import cross_validate

# Fresh, unfitted tree with the same settings as before
tree_classifier = DecisionTreeClassifier(max_depth=3, random_state=42)

# 5-fold cross-validation on the training split, tracking accuracy and macro-averaged F1
scores = cross_validate(
    estimator=tree_classifier,
    X=train_X_iris,
    y=train_y_iris,
    scoring=['accuracy', 'f1_macro'],
    cv=5,
)

In [9]:
# Accuracy measured on the validation part of each fold (one value per fold)
print(scores['test_accuracy'])

[0.95833333 1.         0.83333333 0.95833333 0.91666667]


In [10]:
# Save the model
import pickle

# First create a model
best_model = DecisionTreeClassifier(max_depth=3, random_state=42)
best_model.fit(train_X_iris, train_y_iris)

# And save it as Pickle file
# NOTE: open() does not create missing directories, so 'temp/' must already exist.
filename = 'temp/mymodel.bak'

# The context manager closes the handle as soon as the dump is done, which guarantees
# the bytes are flushed to disk before the file is read back elsewhere.
with open(filename, 'wb') as model_file:
    pickle.dump(best_model, model_file)

In [11]:
# Open an existing model
# NOTE: unpickling can execute arbitrary code; only load pickle files from a trusted source.
# The context manager makes sure the file handle is closed once the model is loaded.
with open(filename, 'rb') as model_file:
    old_model = pickle.load(model_file)

# Test model: the reloaded estimator is scored on the held-out test split
pred_y_iris = old_model.predict(test_X_iris)
accuracy_score(test_y_iris, pred_y_iris)

1.0

In [12]:
# Same with joblib
import joblib

# Persist the fitted model to disk ...
joblib.dump(value=best_model, filename='temp/mymodel2.bak')

# ... and read it back from the same path
old_model = joblib.load(filename='temp/mymodel2.bak')

## Fine Tuning

In [13]:
# Grid-search parameters
# Candidate values for each hyper-parameter: 3 x 2 x 2 = 12 combinations in total
params_grid = dict(
    max_depth=[3, 5, 7],
    splitter=['best', 'random'],
    min_samples_split=[2, 5],
)

In [14]:
# Random search parameters
# Import the submodule explicitly: a bare `import scipy` does not guarantee that
# `scipy.stats` is loaded, so relying on it only works if another package happened
# to import the submodule first. `import scipy.stats` still binds the name `scipy`.
import scipy.stats

# max_depth is drawn from a discrete uniform distribution over 3..7 (upper bound 8 is
# excluded); the two other hyper-parameters are drawn from explicit lists of candidates.
params_random = {'max_depth': scipy.stats.randint(3, 8),  # 8 not included
                 'splitter': ['best', 'random'],
                 'min_samples_split': [2, 5]}

In [15]:
# Grid search
from sklearn.model_selection import GridSearchCV

# Base estimator; the searched hyper-parameters are supplied through params_grid
tree_classifier = DecisionTreeClassifier(random_state=42)

# Every combination in params_grid is scored by 5-fold cross-validated accuracy
grid_classifier = GridSearchCV(
    estimator=tree_classifier,
    param_grid=params_grid,
    scoring='accuracy',
    cv=5,
)

grid_classifier.fit(train_X_iris, train_y_iris)

GridSearchCV(cv=5, estimator=DecisionTreeClassifier(random_state=42),
             param_grid={'max_depth': [3, 5, 7], 'min_samples_split': [2, 5],
                         'splitter': ['best', 'random']},
             scoring='accuracy')

In [16]:
# Random search
from sklearn.model_selection import RandomizedSearchCV

# Base estimator; the searched hyper-parameters are supplied through params_random
tree_classifier = DecisionTreeClassifier(random_state=42)

# n_iter=6 parameter settings are sampled from params_random and scored by 5-fold
# cross-validated accuracy. random_state seeds the sampling so the drawn candidates
# (and therefore the selected model) are the same on every run, consistent with the
# seed used for the splits and the trees in the rest of the notebook.
random_classifier = RandomizedSearchCV(
    tree_classifier,
    param_distributions=params_random,
    scoring='accuracy',
    cv=5,
    n_iter=6,
    random_state=42,
)

random_classifier.fit(train_X_iris, train_y_iris)

RandomizedSearchCV(cv=5, estimator=DecisionTreeClassifier(random_state=42),
                   n_iter=6,
                   param_distributions={'max_depth': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f93cc767910>,
                                        'min_samples_split': [2, 5],
                                        'splitter': ['best', 'random']},
                   scoring='accuracy')

In [17]:
# Best estimator found by the grid search (the parameter combination with the best
# cross-validated accuracy)
grid_classifier.best_estimator_

DecisionTreeClassifier(max_depth=3, random_state=42, splitter='random')

In [18]:
# Mean cross-validated accuracy obtained by the best grid-search candidate
grid_classifier.best_score_

0.95

In [19]:
# Best estimator found by the randomized search (best cross-validated accuracy among
# the sampled parameter settings)
random_classifier.best_estimator_

DecisionTreeClassifier(max_depth=3, random_state=42, splitter='random')

In [20]:
# Mean cross-validated accuracy obtained by the best randomized-search candidate
random_classifier.best_score_

0.95