<a href="https://colab.research.google.com/github/adrien-chinour/ia-data/blob/master/05-machine-learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Procédure en machine learning

1. Changer les données/ explorer / visulaliser / préparer
2. Découper les données `(test, train)`
3. Entraîner le modèle
4. Évaluer le modèle
5. Chercher les meilleurs `hyperparamètres`

# Exemple 1

## Étape 1 : Chargement des données

In [0]:
# chargement du dataset depuis sklearn
from sklearn import datasets
wine = datasets.load_wine()

In [0]:
print(wine.feature_names) # label X
print(wine.target_names) # label Y
print(wine.data) # X
print(wine.target) # Y

['alcohol', 'malic_acid', 'ash', 'alcalinity_of_ash', 'magnesium', 'total_phenols', 'flavanoids', 'nonflavanoid_phenols', 'proanthocyanins', 'color_intensity', 'hue', 'od280/od315_of_diluted_wines', 'proline']
['class_0' 'class_1' 'class_2']
[[1.423e+01 1.710e+00 2.430e+00 ... 1.040e+00 3.920e+00 1.065e+03]
 [1.320e+01 1.780e+00 2.140e+00 ... 1.050e+00 3.400e+00 1.050e+03]
 [1.316e+01 2.360e+00 2.670e+00 ... 1.030e+00 3.170e+00 1.185e+03]
 ...
 [1.327e+01 4.280e+00 2.260e+00 ... 5.900e-01 1.560e+00 8.350e+02]
 [1.317e+01 2.590e+00 2.370e+00 ... 6.000e-01 1.620e+00 8.400e+02]
 [1.413e+01 4.100e+00 2.740e+00 ... 6.100e-01 1.600e+00 5.600e+02]]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2

## Étape 2 : découpage des données

In [0]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(wine.data, wine.target, test_size=.3, random_state=109)

## Étape 3 : Entrainement du modèle

In [0]:
# Entrainement du modèle Gaussian Naive Bayes
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(x_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

## Étape 4 : Évaluation du modèle

In [0]:
# calcul des prédiction par notre modèle
y_pred = gnb.predict(x_test)
print(y_pred)
print(y_test)

[0 0 1 2 0 1 0 0 1 0 2 2 2 2 0 1 1 0 0 1 2 1 0 2 0 0 1 2 0 1 2 1 1 0 1 1 0
 2 2 0 2 1 0 0 0 2 2 0 1 1 2 0 0 2]
[0 0 1 2 0 1 0 1 1 0 1 1 2 2 0 1 1 0 0 1 2 1 0 2 0 0 1 2 0 1 2 1 1 0 1 1 0
 2 2 0 2 0 0 0 0 2 2 0 1 1 2 1 0 2]


In [0]:
# comparaison des résultats obtenus (performance)
from sklearn import metrics
scores = metrics.accuracy_score(y_test, y_pred)
print('Accuracy :', '{:2.2%}'.format(scores))

Accuracy : 90.74%


In [0]:
# matrices des prédictions (par classe)
cm = metrics.confusion_matrix(y_test, y_pred)
print(cm)

[[20  1  0]
 [ 2 15  2]
 [ 0  0 14]]


In [0]:
# On retrouve à partir de la matrice l'indice de performance (juste/total)
print('{:2.2%}'.format((20+15+14)/(20+1+2+15+2+14)))

90.74%


# Exemple 2

## Étape 1 : Chargement des données

In [0]:
import pandas
bank_data = pandas.read_csv('https://www.labri.fr/~zemmari/datasets/bill_authentication.csv')
bank_data.head()

Unnamed: 0,Variance,Skewness,Curtosis,Entropy,Class
0,3.6216,8.6661,-2.8073,-0.44699,0
1,4.5459,8.1674,-2.4586,-1.4621,0
2,3.866,-2.6383,1.9242,0.10645,0
3,3.4566,9.5228,-4.0112,-3.5944,0
4,0.32924,-4.4552,4.5718,-0.9888,0


Plus d'info sur le dataset [ici](https://archives.ics.uci.edu/ml/datasets/banknote+authentication)

## Étape 2 : Découpage des données

In [0]:
X = bank_data.drop('Class', axis=1)
Y = bank_data['Class']
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=.3, random_state=100)

## Étape 3 : Entrainement du modèle

In [0]:
# Entrainement du modèle avec https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html
from sklearn.svm import SVC 
classif = SVC(kernel='linear')
classif.fit(x_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

## Étape 4 : Évaluation du modèle

In [0]:
# Calcul de la prédiction
y_pred = classif.predict(x_test)

In [0]:
# comparaison des résultats obtenus (performance)
scores = metrics.accuracy_score(y_test, y_pred)
print('Accuracy :', '{:2.2%}'.format(scores))

Accuracy : 98.79%


# Exemple 3

## Étape 1 : Chargement des données

In [0]:
cols_names=['pregnant', 'glucose', 'bp', 'skin', 'insulin', 'bmi', 'pedigree', 'age', 'label']
data = pandas.read_csv('https://www.labri.fr/~zemmari/datasets/pima-indians-diabetes.csv', header=None, names=cols_names)
data.head()

Unnamed: 0,pregnant,glucose,bp,skin,insulin,bmi,pedigree,age,label
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


## Étape 2 : Découpage des données

In [0]:
X = data.drop('label', axis=1)
Y = data.label
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=.3, random_state=100)

## Étape 3 : Entrainement du modèle

In [0]:
# Decision Tree
from sklearn.tree import DecisionTreeClassifier
decision_tree = DecisionTreeClassifier()
decision_tree.fit(x_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [0]:
# Random Forest
from sklearn.ensemble import RandomForestClassifier
random_forest = RandomForestClassifier()
random_forest.fit(x_train, y_train)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

## Étape 4 : Évaluation du modèle

In [0]:
# Calcul de la prédiction
y_pred_decision_tree = decision_tree.predict(x_test)
y_pred_random_forest = random_forest.predict(x_test)

In [0]:
# comparaison des résultats obtenus (performance)
scores_decision_tree = metrics.accuracy_score(y_test, y_pred_decision_tree)
scores_random_forest = metrics.accuracy_score(y_test, y_pred_random_forest)
print('Accuracy Decision Tree:', '{:2.2%}'.format(scores_decision_tree))
print('Accuracy Random Forest:', '{:2.2%}'.format(scores_random_forest))

Accuracy Decision Tree: 65.80%
Accuracy Random Forest: 70.13%
