# Joseph ASSOUMA, Issa DIA, Thomas MARGNAC

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt

In [2]:
!pip install openpyxl

# A. Base de données

#### Importation du jeu de données

In [3]:
# Read the beer-quality workbook into a single DataFrame.
file = '../input/beer-quality/beer_quality.xlsx'
data = pd.read_excel(io=file)

#### Détachement des labels

In [4]:
# Move the 'quality' column out of the feature table and keep it as a
# one-column DataFrame. `pop` removes the column from `data` in place, so this
# cell cannot be re-run without reloading `data` first.
y = data.pop('quality').to_frame()

#### Séparation en jeu d'apprentissage et en test

In [5]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    y,
    test_size=0.3,
    random_state=42,
)

#### On centre-réduit les données

In [6]:
# Standardise the features. The scaler is fitted on the training split only and
# then applied unchanged to the test split, so no test statistics leak in.
norm = StandardScaler()

# Rebuild the DataFrames with the original row index as well as the column
# names. Without `index=...` the frames get a fresh 0..n-1 index, while
# y_train / y_test keep the shuffled labels from the split, so any later
# label-based alignment between X and y would silently mismatch rows.
X_train = pd.DataFrame(
    data=norm.fit_transform(X_train),
    columns=X_train.columns,
    index=X_train.index,
)
X_test = pd.DataFrame(
    data=norm.transform(X_test),
    columns=X_test.columns,
    index=X_test.index,
)

# B. Classification Binaire

#### 1. Création d'une nouvelle variable quantitative à deux modalités

Recherche de la médiane des labels

In [7]:
# Median quality score of the training split; `ybin` compares scores against it.
train_quality = y_train['quality'].to_numpy()
median = np.median(train_quality)
median

Application de la nouvelle variable en fonction de la médiane

In [8]:
def ybin(row, threshold=None):
    """Binarise a quality score against a cut point.

    Parameters
    ----------
    row : mapping-like
        Any object indexable by ``'quality'`` (for instance a row handed over
        by ``DataFrame.apply(..., axis=1)``).
    threshold : number, optional
        Cut point to compare against. When omitted, the notebook-level
        ``median`` variable is looked up at call time, which is what the
        original one-argument form did.

    Returns
    -------
    int
        0 if the score is strictly below the cut point, otherwise 1.
    """
    if threshold is None:
        # Backward-compatible default: fall back to the global computed earlier.
        threshold = median
    return 0 if row['quality'] < threshold else 1

In [9]:
# Add the binary target as a new 'ybin' column on both label frames.
for labels in (y_train, y_test):
    labels['ybin'] = labels.apply(ybin, axis=1)

#### 2. Optimiser rapidement un arbre de décision pour réaliser la classification

In [10]:
# Class balance of the binary target in the training split.
ybin_train_counts = y_train['ybin'].value_counts()
ybin_train_counts

In [11]:
# Class balance of the binary target in the test split.
ybin_test_counts = y_test['ybin'].value_counts()
ybin_test_counts

In [12]:
# Baseline: a single depth-limited decision tree on the binary target.
# The tree is seeded like the other trees in this notebook so that the printed
# test accuracy is the same on every Restart & Run All.
dt = DecisionTreeClassifier(
    criterion='gini',
    min_samples_split=2,
    max_depth=5,
    random_state=42
)
dt.fit(X_train, y_train['ybin'])
print(dt.score(X_test, y_test['ybin']))

#### 3. Entraîner un ensemble d’arbres de décision « faibles » (peu, voire très peu profonds) à l’aide de l’algorithme AdaBoost :

Analyser les performances en fonction des différents paramètres

In [13]:
# AdaBoost over 200 depth-5 trees, scored on the held-out split.
# The weak learner is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was
# removed in 1.4, so the positional form is the one that runs on both.
ab = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=5, random_state=42),
    n_estimators=200,
    random_state=10
)
ab.fit(X_train, y_train['ybin'])
print(ab.score(X_test, y_test['ybin']))

##### Max_depth=1

In [14]:
# Sweep the number of boosting rounds with depth-1 trees (stumps) and record
# train/test accuracy for each ensemble size.
test_acc = []
train_acc = []
estim = []
for i in range(1, 200, 5):
    estim.append(i)
    # The weak learner is passed positionally: the `base_estimator` keyword was
    # renamed to `estimator` in scikit-learn 1.2 and removed in 1.4.
    ab = AdaBoostClassifier(
        DecisionTreeClassifier(max_depth=1, min_samples_leaf=5, min_samples_split=5, random_state=42),
        n_estimators=i,
        random_state=10
    )
    ab.fit(X_train, y_train['ybin'])
    train_acc.append(ab.score(X_train, y_train['ybin']))
    test_acc.append(ab.score(X_test, y_test['ybin']))

# `ab` is left bound to the last (largest) ensemble for the cell that follows.
plt.plot(estim, train_acc, 'g', label='Train')
plt.plot(estim, test_acc, 'r', label='Test')
plt.title("Max_depth=1")
plt.xlabel("Number of estimators")
plt.ylabel("Accuracy")
plt.legend(loc="lower right", fontsize="medium")
plt.grid()
plt.show()

In [15]:
# Rank the features by the importances of the `ab` model currently in scope
# (the last ensemble fitted by the sweep above).
cols = X_train.columns.tolist()
imp = ab.feature_importances_.tolist()
carac5 = pd.DataFrame({'Name': cols, 'Importance': imp})
carac5.sort_values(by='Importance', ascending=False)

##### Max_depth=5

In [16]:
# Sweep the number of boosting rounds with depth-5 trees and record train/test
# accuracy for each ensemble size.
test_acc = []
train_acc = []
estim = []
for i in range(1, 200, 5):
    estim.append(i)
    # The weak learner is passed positionally: the `base_estimator` keyword was
    # renamed to `estimator` in scikit-learn 1.2 and removed in 1.4.
    ab = AdaBoostClassifier(
        DecisionTreeClassifier(max_depth=5, min_samples_leaf=5, min_samples_split=5, random_state=42),
        n_estimators=i,
        random_state=10
    )
    ab.fit(X_train, y_train['ybin'])
    train_acc.append(ab.score(X_train, y_train['ybin']))
    test_acc.append(ab.score(X_test, y_test['ybin']))

# `ab` is left bound to the last (largest) ensemble for the cell that follows.
plt.plot(estim, train_acc, 'g', label='Train')
plt.plot(estim, test_acc, 'r', label='Test')
plt.title("Max_depth=5")
plt.xlabel("Number of estimators")
plt.ylabel("Accuracy")
plt.legend(loc="lower right", fontsize="medium")
plt.grid()
plt.show()

In [17]:
# Rank the features by the importances of the `ab` model currently in scope
# (the last ensemble fitted by the sweep above).
cols = X_train.columns.tolist()
imp = ab.feature_importances_.tolist()
carac5 = pd.DataFrame({'Name': cols, 'Importance': imp})
carac5.sort_values(by='Importance', ascending=False)

# C. Classification Multiclasse

### 1. Création d'une nouvelle variable quantitative ymulti discrète à 3 modalités : qualité basse (0), moyenne (1) ou élevée (2).

On se base sur la distribution de la note de qualité pour faire la séparation en multiclasse

In [18]:
# Histogram of the training-split quality scores (default binning), used to
# pick the class boundaries for the three-class target.
quality_scores = y_train['quality'].values
plt.hist(quality_scores)
plt.show()

In [19]:
# Number of training rows per raw quality score.
quality_train_counts = y_train['quality'].value_counts()
quality_train_counts

In [20]:
def ymulti(row, low=5, high=7):
    """Map a quality score to one of three ordinal classes.

    Parameters
    ----------
    row : mapping-like
        Any object indexable by ``'quality'`` (for instance a row handed over
        by ``DataFrame.apply(..., axis=1)``).
    low, high : number, optional
        Class boundaries. The defaults (5 and 7) are the cut points that were
        previously hard-coded.

    Returns
    -------
    int
        0 when ``quality < low``, 1 when ``low <= quality < high``,
        2 otherwise.
    """
    quality = row['quality']
    # No lower bound on the first class: the former `0 <= quality` guard made
    # any negative score fall through to the final branch and be labelled 2.
    if quality < low:
        return 0
    if quality < high:
        return 1
    return 2

In [21]:
# Add the three-class target as a new 'ymulti' column on both label frames.
for labels in (y_train, y_test):
    labels['ymulti'] = labels.apply(ymulti, axis=1)

### 2. Déterminer les effectifs des différentes classes. Si nécessaire, équilibrer les données d’apprentissage (voir SMOTE). Dans la suite, on présentera les résultats obtenus avec et sans équilibrage.

In [22]:
# Class sizes of the three-class target in the training split.
ymulti_train_counts = y_train['ymulti'].value_counts()
ymulti_train_counts

In [23]:
# Class sizes of the three-class target in the test split.
ymulti_test_counts = y_test['ymulti'].value_counts()
ymulti_test_counts

##### SMOTE

In [24]:
# Resample the training split with SMOTE (default settings apart from the
# seed). Only the training data is resampled; X_test / y_test are untouched.
sm = SMOTE(random_state=42)
multi_labels = y_train['ymulti']
X_bal, y_bal = sm.fit_resample(X_train, multi_labels)

# Class sizes after resampling.
y_bal.value_counts()

## Partie 1

### 3. Entraîner un réseau de neurones à une couche cachée pour effectuer cette tâche de classification, avec early stopping sur la base de validation. L’optimiser rapidement en prenant soin d’éviter l’over-fitting.

##### Classes déséquilibrées (couche cachée <= 223)

In [None]:
# Sweep the width of a single hidden layer on the original (imbalanced)
# training split and record train/test accuracy for each width.
test_acc = []
train_acc = []
neurals = []
for i in range(1, 223, 10):
    neurals.append(i)
    # Seed the network: weight initialisation and the internal validation
    # split used by early stopping are random, so without a seed the curves
    # change on every run and widths cannot be compared fairly.
    clf_imb = MLPClassifier(hidden_layer_sizes=(i,), early_stopping=True, random_state=42)
    clf_imb.fit(X_train, y_train['ymulti'])
    train_acc.append(clf_imb.score(X_train, y_train['ymulti']))
    test_acc.append(clf_imb.score(X_test, y_test['ymulti']))
plt.plot(neurals, train_acc, 'g', label='Train')
plt.plot(neurals, test_acc, 'r', label='Test')
plt.title("Imbalanced")
plt.xlabel("Number of neurals")
plt.ylabel("Accuracy")
plt.legend(loc="lower right", fontsize="medium")
plt.grid()
plt.show()

##### Classes équilibrées (couche cachée <= 554)

In [26]:
# Sweep the width of a single hidden layer on the SMOTE-balanced training data
# and record accuracy on that data and on the untouched test split.
test_acc = []
train_acc = []
neurals = []
for i in range(1, 554, 10):
    neurals.append(i)
    # Stored as `clf_bal` (not `clf_imb`) so the model trained on the
    # imbalanced data is not silently overwritten by a balanced one.
    # Seeded for the same reason as the imbalanced sweep: initial weights and
    # the early-stopping validation split are otherwise random.
    clf_bal = MLPClassifier(hidden_layer_sizes=(i,), early_stopping=True, random_state=42)
    clf_bal.fit(X_bal, y_bal)
    train_acc.append(clf_bal.score(X_bal, y_bal))
    test_acc.append(clf_bal.score(X_test, y_test['ymulti']))
plt.plot(neurals, train_acc, 'g', label='Train')
plt.plot(neurals, test_acc, 'r', label='Test')
plt.title("Balanced")
plt.xlabel("Number of neurals")
plt.ylabel("Accuracy")
plt.legend(loc="lower right", fontsize="medium")
plt.grid()
plt.show()

### 4. Faire un bagging en utilisant comme classifieur de base le réseau de neurones.

##### Classes déséquilibrées

In [None]:
# Bagging of one-hidden-layer networks on the original (imbalanced) training
# split: sweep the number of bagged networks and record train/test accuracy.
test_acc = []
train_acc = []
estim = []
for i in range(2, 200, 10):
    estim.append(i)
    # The base network is passed positionally: the `base_estimator` keyword was
    # renamed to `estimator` in scikit-learn 1.2 and removed in 1.4.
    clf_bag = BaggingClassifier(MLPClassifier(hidden_layer_sizes=50, early_stopping=True),
                                n_estimators=i,
                                random_state=0)
    clf_bag.fit(X_train, y_train['ymulti'])
    train_acc.append(clf_bag.score(X_train, y_train['ymulti']))
    test_acc.append(clf_bag.score(X_test, y_test['ymulti']))
plt.plot(estim, train_acc, 'g', label='Train')
plt.plot(estim, test_acc, 'r', label='Test')
plt.title("Accuracy for n estimator(s)")
plt.xlabel("Number of estimators")
plt.ylabel("Accuracy")
plt.legend(loc="lower right", fontsize="medium")
plt.grid()
plt.show()

##### Classes équilibrées

In [None]:
# Bagging of one-hidden-layer networks on the SMOTE-balanced training data:
# sweep the number of bagged networks and record accuracy on that data and on
# the untouched test split.
test_acc = []
train_acc = []
estim = []
for i in range(2, 200, 10):
    estim.append(i)
    # The base network is passed positionally: the `base_estimator` keyword was
    # renamed to `estimator` in scikit-learn 1.2 and removed in 1.4.
    clf_bag = BaggingClassifier(MLPClassifier(hidden_layer_sizes=444, early_stopping=True),
                                n_estimators=i,
                                random_state=0)
    clf_bag.fit(X_bal, y_bal)
    train_acc.append(clf_bag.score(X_bal, y_bal))
    test_acc.append(clf_bag.score(X_test, y_test['ymulti']))
plt.plot(estim, train_acc, 'g', label='Train')
plt.plot(estim, test_acc, 'r', label='Test')
plt.title("Accuracy for n estimator(s)")
plt.xlabel("Number of estimators")
plt.ylabel("Accuracy")
plt.legend(loc="lower right", fontsize="medium")
plt.grid()
plt.show()

## Partie 2

#### 5. Entraîner une forêt aléatoire :

##### Classes déséquilibrées

In [None]:
# Randomised hyper-parameter search for a random forest on the original
# (imbalanced) training split.
# Both the forest and the search are seeded: otherwise the sampled candidates,
# the trees, and therefore the reported best parameters change on every run.
rf = RandomForestClassifier(random_state=42)
params = dict(max_depth=[1, 5, 10, 20], n_estimators=[1, 5, 10, 20])
rs_imb = RandomizedSearchCV(rf, params, random_state=42)
# `fit` returns the search object itself, so `search` and `rs_imb` are the same.
search = rs_imb.fit(X_train, y_train['ymulti'])
print(search.best_params_, search.best_score_)

In [None]:
# Refit a forest with the best parameters found on the imbalanced data.
# The parameters are read from `rs_imb` rather than the shared `search`
# variable, which the balanced-data search cell overwrites; re-running this
# cell after that one would otherwise pick up the wrong parameters.
best_imb = rs_imb.best_params_
rf_imb = RandomForestClassifier(
    max_depth=best_imb['max_depth'],
    n_estimators=best_imb['n_estimators'],
    random_state=42,  # seeded so the printed accuracies are reproducible
)
rf_imb.fit(X_train, y_train['ymulti'])
print("Train : ", rf_imb.score(X_train, y_train['ymulti']))
print("Test : ", rf_imb.score(X_test, y_test['ymulti']))

In [None]:
# Rank the features by the importances of the forest trained on the
# imbalanced data.
cols = X_train.columns.tolist()
imp = rf_imb.feature_importances_.tolist()
rf_imb_imp = pd.DataFrame({'Name': cols, 'Importance': imp})
rf_imb_imp.sort_values(by='Importance', ascending=False)

##### Classes équilibrées

In [None]:
# Randomised hyper-parameter search for a random forest on the SMOTE-balanced
# training data.
# Both the forest and the search are seeded: otherwise the sampled candidates,
# the trees, and therefore the reported best parameters change on every run.
# NOTE(review): the data was oversampled before cross-validation, so synthetic
# points can sit in a validation fold next to their source rows; the printed
# CV score is likely optimistic. Confirm against the test score.
rf = RandomForestClassifier(random_state=42)
params = dict(max_depth=[1, 5, 10, 20], n_estimators=[1, 5, 10, 20])
rs_bal = RandomizedSearchCV(rf, params, random_state=42)
# `fit` returns the search object itself, so `search` and `rs_bal` are the same.
search = rs_bal.fit(X_bal, y_bal)
print(search.best_params_, search.best_score_)

In [None]:
# Refit a forest with the best parameters found on the balanced data.
# The parameters are read from `rs_bal` rather than the shared `search`
# variable, which the imbalanced-data search cell also assigns; re-running
# cells out of order would otherwise pick up the wrong parameters.
best_bal = rs_bal.best_params_
rf_bal = RandomForestClassifier(
    max_depth=best_bal['max_depth'],
    n_estimators=best_bal['n_estimators'],
    random_state=42,  # seeded so the printed accuracies are reproducible
)
rf_bal.fit(X_bal, y_bal)
print("Train : ", rf_bal.score(X_bal, y_bal))
print("Test : ", rf_bal.score(X_test, y_test['ymulti']))

In [None]:
# Rank the features by the importances of the forest trained on the
# balanced data.
cols = X_bal.columns.tolist()
imp = rf_bal.feature_importances_.tolist()
rf_bal_imp = pd.DataFrame({'Name': cols, 'Importance': imp})
rf_bal_imp.sort_values(by='Importance', ascending=False)