**Auteurs:** Guillaume Poirier-Morency et Gabriel Lemyre

Chaque modèle est présenté successivement, entraîné et finalement testé selon les meilleurs paramètres obtenus par le processus de validation.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.datasets import fetch_mldata
from sklearn.preprocessing import LabelEncoder
from collections import OrderedDict
%matplotlib inline

Le jeu de données de salaire est déjà séparé en deux ensembles.

In [None]:
# Schema of the UCI "Adult" (salary) files, in file order: column name -> pandas dtype.
salary_dtype = OrderedDict([('age', 'int'),
                            ('workclass', 'category'),
                            ('financial_weight', 'int'),
                            ('education', 'category'),
                            ('education_code', 'int'),
                            ('marital_status', 'category'),
                            ('occupation', 'category'),
                            ('relationship', 'category'),
                            ('race', 'category'),
                            ('sex', 'category'),
                            ('capital_gain', 'int'),
                            ('capital_loss', 'int'),
                            ('hours_per_week', 'int'),
                            ('native_country', 'category'),
                            ('target', 'category')])
salary_continuous_columns = ['age', 'financial_weight', 'education_code', 'capital_gain', 'capital_loss', 'hours_per_week']
salary_categorical_columns = ['workclass', 'education', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'native_country']
# Fields are separated by ', ' and missing values are written as '?'.
salary_data = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data', sep=', ', engine='python', names=salary_dtype.keys(), dtype=salary_dtype, na_values=['?'])
# The first line of the test file is skipped (it is not a data row).
salary_test = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test', sep=', ', engine='python', skiprows=[0], names=salary_dtype.keys(), dtype=salary_dtype, na_values=['?'])
# Labels in adult.test end with a period ('<=50K.'); strip it so that they
# compare equal to the training labels. This is a no-op if no period is present.
salary_test['target'] = salary_test['target'].cat.rename_categories(
    [label.rstrip('.') for label in salary_test['target'].cat.categories])
# Each file was parsed on its own, so its categorical columns got their own
# category sets and therefore their own integer codes. Re-use the training
# categories on the test set so that `.cat.codes` mean the same thing in both.
for column in salary_categorical_columns + ['target']:
    salary_test[column] = salary_test[column].cat.set_categories(salary_data[column].cat.categories)
# Features are every column but the last one ('target').
salary_train_X, salary_train_Y = salary_data.iloc[:, :len(salary_dtype) - 1], salary_data['target']
salary_test_X, salary_test_Y = salary_test.iloc[:, :len(salary_dtype) - 1], salary_test['target']

In [None]:
# Per-column transforms for models that need purely numeric input:
# categorical columns become their integer category codes (this is NOT a
# one-hot encoding), numeric columns are passed through unchanged.
# NOTE(review): 'education_code' is not listed here, so it looks like
# DataFrame.transform(salary_transform) leaves it out — confirm this is intended.
def identity(x):
    """Return the column unchanged."""
    return x


def cat_to_codes(x):
    """Return the integer category codes of a categorical column."""
    return x.cat.codes


salary_transform = {
    'age': identity,
    'workclass': cat_to_codes,
    'financial_weight': identity,
    'education': cat_to_codes,
    'marital_status': cat_to_codes,
    'occupation': cat_to_codes,
    'relationship': cat_to_codes,
    'race': cat_to_codes,
    'sex': cat_to_codes,
    'capital_gain': identity,
    'capital_loss': identity,
    'hours_per_week': identity,
    'native_country': cat_to_codes,
}

On utilise un état déterministe pour la routine `train_test_split` afin de s'assurer de ne jamais toucher l'ensemble de test avant la toute fin.

In [None]:
# Download MNIST and hold out a test split; the fixed random_state makes the
# split reproducible from one run to the next.
# NOTE(review): fetch_mldata was deprecated in scikit-learn 0.20 and removed
# later; plan a migration to fetch_openml when upgrading scikit-learn.
mnist_data = fetch_mldata('mnist-original')
mnist_pixels, mnist_labels = mnist_data['data'], mnist_data['target']
(mnist_train_X, mnist_test_X,
 mnist_train_Y, mnist_test_Y) = train_test_split(mnist_pixels, mnist_labels, random_state=123)

# Classifieur de Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.preprocessing import OneHotEncoder

## Salary

Pour classer les features catégoriques du dataset de salaires, on les convertit en one-hot et on utilise un classifieur naïf ad hoc avec densité de Bernoulli. On considère ensuite la probabilité suivante: $\Pr[c \mid X_{cont}, X_{cat}] = \frac{\Pr[X_{cont}, X_{cat} \mid c]\Pr[c]}{\Pr[X_{cont}, X_{cat}]}$.

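Avec l'hypothèse naïve d'indépendance conditionnelle $\Pr[X_{cont},X_{cat} \mid c] = \Pr[X_{cont} \mid c] \Pr[X_{cat} \mid c]$, en supposant de même $\Pr[X_{cont},X_{cat}] = \Pr[X_{cont}] \Pr[X_{cat}]$, et en passant par le logarithme: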

$\implies \log \Pr[c \mid X_{cont}, X_{cat}] = \log \Pr[X_{cont} \mid c] + \log \Pr[X_{cat} \mid c] + \log \Pr[c] - (\log \Pr[X_{cont}] + \log \Pr[X_{cat}])$

Si l'on prend la somme des log-probabilités a posteriori des deux modèles, on compte deux fois le prior $\log \Pr[c]$. On remédie à cette situation en le soustrayant une fois.

In [None]:
from sklearn.base import BaseEstimator, ClassifierMixin
class MixedNB(BaseEstimator, ClassifierMixin):
    """
    Naive Bayes classifier for mixed continuous and categorical features.

    Continuous columns (``salary_continuous_columns``) are modelled with a
    Gaussian naive Bayes; categorical columns (``salary_categorical_columns``)
    are one-hot encoded and modelled with a Bernoulli naive Bayes. The two
    log-posteriors are added and the class log-prior, which is then counted
    twice, is subtracted once.

    Parameters
    ----------
    alpha : float, default 1.0
        Additive smoothing parameter forwarded to the Bernoulli model.
    """
    def __init__(self, alpha=1.0):
        self.alpha = alpha
        self.gnb = GaussianNB()
        # Keyword form: recent scikit-learn releases reject positional
        # constructor arguments.
        self.bnb = BernoulliNB(alpha=alpha)
        # A category that is absent from the training fold must not make
        # prediction fail; it is encoded as an all-zero group instead.
        self.encoder = OneHotEncoder(handle_unknown='ignore')
    def get_params(self, deep=False):
        """Return the hyper-parameters of the estimator as a dict."""
        return {'alpha': self.alpha}
    def set_params(self, **parameters):
        """Set hyper-parameters, propagate ``alpha`` to the Bernoulli model and return ``self``."""
        for name, val in parameters.items():
            setattr(self, name, val)
        self.bnb.set_params(alpha=self.alpha)
        return self
    @staticmethod
    def _categorical_codes(X):
        """Return the categorical columns of ``X`` as integer codes shifted by one.

        pandas encodes a missing value as code -1, so the shift keeps every
        code non-negative before one-hot encoding.
        """
        return X[salary_categorical_columns].transform(lambda column: column.cat.codes + 1)
    def fit(self, X, y):
        """Fit both sub-models on data frame ``X`` with labels ``y`` and return ``self``."""
        cat_as_codes = self._categorical_codes(X)
        # Keep the Bernoulli model in sync even if `alpha` was assigned directly.
        self.bnb.set_params(alpha=self.alpha)
        self.encoder.fit(cat_as_codes)
        self.gnb.fit(X[salary_continuous_columns], y)
        self.bnb.fit(self.encoder.transform(cat_as_codes), y)
        self.classes_ = self.gnb.classes_
        return self
    def predict(self, X):
        """Return, for each row of ``X``, the class with the highest combined log-posterior."""
        cat_as_codes = self._categorical_codes(X)
        gaussian_lp = self.gnb.predict_log_proba(X[salary_continuous_columns])
        bernoulli_lp = self.bnb.predict_log_proba(self.encoder.transform(cat_as_codes))
        # Each posterior already includes the class prior: remove one copy.
        lp = gaussian_lp + bernoulli_lp - self.bnb.class_log_prior_
        return self.gnb.classes_[np.argmax(lp, axis=1)]

In [None]:
# Cross-validated search over the smoothing parameter of MixedNB, on a
# logarithmic grid from 1e-1 to 1e2.
mnb_salary = GridSearchCV(
    MixedNB(),
    param_grid={'alpha': np.logspace(-1, 2)},
    scoring='accuracy',
    n_jobs=16,
    return_train_score=True,
)
mnb_salary.fit(X=salary_train_X, y=salary_train_Y)

In [None]:
# Training and validation error (1 - mean accuracy) of MixedNB as a function
# of the smoothing parameter.
r = pd.DataFrame(mnb_salary.cv_results_)
plt.plot(r.param_alpha, 1 - r.mean_train_score, label='Entraînement')
plt.plot(r.param_alpha, 1 - r.mean_test_score, label='Validation')
# The alpha grid is logarithmic (np.logspace); a linear axis would squeeze
# most of the points against the left edge.
plt.xscale('log')
plt.title('Courbe d\'apprentissage sur salary')
plt.xlabel('Lissage laplacien')
plt.ylabel('Erreur')
plt.legend()

In [None]:
# Cross-validated error (1 - mean accuracy) of a Gaussian naive Bayes on the
# MNIST training split; the value is displayed as the cell output.
gnb_mnist = GaussianNB()
1 - np.mean(cross_val_score(gnb_mnist, mnist_train_X, mnist_train_Y, scoring='accuracy', n_jobs=16))

# Arbres de décision

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Hyper-parameter grid shared by the decision-tree searches: depths 1 to 19.
dtc_param_grid = dict(max_depth=range(1, 20))

## Salary

In [None]:
%%time
# Cross-validated search over the tree depth on the salary data; categorical
# columns are first converted through salary_transform.
dtc_salary = GridSearchCV(
    DecisionTreeClassifier(),
    param_grid=dtc_param_grid,
    scoring='accuracy',
    n_jobs=16,
    return_train_score=True,
)
dtc_salary.fit(X=salary_train_X.transform(salary_transform), y=salary_train_Y)

In [None]:
# Training and validation error (1 - mean accuracy) of the decision tree on
# the salary data, as a function of the maximum depth.
r = pd.DataFrame(dtc_salary.cv_results_)
for score_column, curve_label in (('mean_train_score', 'Entraînement'),
                                  ('mean_test_score', 'Validation')):
    plt.plot(r['param_max_depth'], 1 - r[score_column], label=curve_label)
plt.title('Courbe d\'apprentissage sur salary')
plt.xlabel('Profondeur maximale')
plt.ylabel('Erreur')
plt.legend()

## MNIST

In [None]:
%%time
# Cross-validated search over the tree depth on the MNIST training split.
dtc_mnist = GridSearchCV(
    DecisionTreeClassifier(),
    param_grid=dtc_param_grid,
    scoring='accuracy',
    n_jobs=16,
    return_train_score=True,
)
dtc_mnist.fit(X=mnist_train_X, y=mnist_train_Y)

In [None]:
# Training and validation error (1 - mean accuracy) of the decision tree on
# MNIST, as a function of the maximum depth (one tick per tested depth).
r = pd.DataFrame(dtc_mnist.cv_results_)
depths = r['param_max_depth']
plt.plot(depths, 1 - r['mean_train_score'], label='Erreur d\'entraînement')
plt.plot(depths, 1 - r['mean_test_score'], label='Erreur de validation')
plt.title('Courbe d\'apprentissage sur MNIST')
plt.xlabel('Profondeur maximale')
plt.ylabel('Erreur')
plt.xticks(range(1, 20))
plt.legend()

# Arbres de décision + classifieurs de Bayes

# Modèle boosté

# Perceptron multi-couches

In [None]:
from keras.models import Sequential
from keras.layers import Dense
from keras.losses import categorical_crossentropy
from keras.optimizers import SGD

In [None]:
# Multi-layer perceptron: 784 inputs -> 784 hidden units -> 10 outputs.
model = Sequential()
# Without a non-linearity the two Dense layers collapse into a single linear map.
model.add(Dense(units=28*28, input_dim=28*28, activation='relu'))
# Softmax turns the 10 outputs into class probabilities, which is what a
# cross-entropy loss expects.
model.add(Dense(units=10, activation='softmax'))

In [None]:
# The MNIST labels used for training are a 1-D array of class ids (the same
# array is fed to the scikit-learn classifiers), not one-hot vectors, so the
# sparse variant of the cross-entropy is the one that matches them.
model.compile(loss='sparse_categorical_crossentropy', optimizer=SGD(), metrics=['accuracy'])

In [None]:
# Train the network on the MNIST training split with Keras' default settings.
# NOTE(review): the pixels look unscaled here — consider normalising the
# inputs consistently for training and prediction; confirm against the data.
model.fit(x=mnist_train_X, y=mnist_train_Y)

In [None]:
# Smoke test: run the network on a single all-zero 784-pixel input.
model.predict(np.zeros((1, 784)))

# Réseau de neurones convolutif

# Tests

Ici, on trouve le code pour les tests finaux qui ont été effectués à la toute fin, indépendamment du processus de validation, afin d'avoir la meilleure idée possible de la performance de généralisation de chaque modèle.