# Imports

In [1]:
from sklearn.datasets import load_wine
import pandas as pd
import numpy as np
import os

from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import SGDClassifier, LogisticRegression

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score

from sklearn.metrics import confusion_matrix, accuracy_score, precision_score
from sklearn.ensemble import StackingClassifier, RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier


from sklearn.naive_bayes import GaussianNB

import joblib
from joblib import dump, load

# Creation du Dataset

In [2]:
wines = load_wine()
df = pd.DataFrame(data= np.c_[wines['data'], wines['target']],
                     columns= wines['feature_names'] + ['target'])
df

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,target
0,14.23,1.71,2.43,15.6,127.0,2.80,3.06,0.28,2.29,5.64,1.04,3.92,1065.0,0.0
1,13.20,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.40,1050.0,0.0
2,13.16,2.36,2.67,18.6,101.0,2.80,3.24,0.30,2.81,5.68,1.03,3.17,1185.0,0.0
3,14.37,1.95,2.50,16.8,113.0,3.85,3.49,0.24,2.18,7.80,0.86,3.45,1480.0,0.0
4,13.24,2.59,2.87,21.0,118.0,2.80,2.69,0.39,1.82,4.32,1.04,2.93,735.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
173,13.71,5.65,2.45,20.5,95.0,1.68,0.61,0.52,1.06,7.70,0.64,1.74,740.0,2.0
174,13.40,3.91,2.48,23.0,102.0,1.80,0.75,0.43,1.41,7.30,0.70,1.56,750.0,2.0
175,13.27,4.28,2.26,20.0,120.0,1.59,0.69,0.43,1.35,10.20,0.59,1.56,835.0,2.0
176,13.17,2.59,2.37,20.0,120.0,1.65,0.68,0.53,1.46,9.30,0.60,1.62,840.0,2.0


# Visualisation du Dataset

In [3]:
def df_info(df):
    len_df = len(df)
    all_columns = len(df.columns)
    all_nan = df.isna().sum().sum()

    print(f"""
    Longueur du dataset : {len_df} enregistrements
    Nombre de colonnes : {all_columns}
    """)

    echantillonColonnes = []
    for i in df.columns:
        listcolumn = str(list(df[i].head(5)))
        echantillonColonnes.append(listcolumn)
        echantillonColonnes[0:5]
    obs = pd.DataFrame({'type': list(df.dtypes),
                        'Echantillon': echantillonColonnes,
                        "% de valeurs nulles":
                        round(df.isna().sum() / len_df * 100, 2),
                        "nb de valeurs uniques":df.nunique()
                       })
    return obs
df_info(df)


    Longueur du dataset : 178 enregistrements
    Nombre de colonnes : 14
    


Unnamed: 0,type,Echantillon,% de valeurs nulles,nb de valeurs uniques
alcohol,float64,"[14.23, 13.2, 13.16, 14.37, 13.24]",0.0,126
malic_acid,float64,"[1.71, 1.78, 2.36, 1.95, 2.59]",0.0,133
ash,float64,"[2.43, 2.14, 2.67, 2.5, 2.87]",0.0,79
alcalinity_of_ash,float64,"[15.6, 11.2, 18.6, 16.8, 21.0]",0.0,63
magnesium,float64,"[127.0, 100.0, 101.0, 113.0, 118.0]",0.0,53
total_phenols,float64,"[2.8, 2.65, 2.8, 3.85, 2.8]",0.0,97
flavanoids,float64,"[3.06, 2.76, 3.24, 3.49, 2.69]",0.0,132
nonflavanoid_phenols,float64,"[0.28, 0.26, 0.3, 0.24, 0.39]",0.0,39
proanthocyanins,float64,"[2.29, 1.28, 2.81, 2.18, 1.82]",0.0,101
color_intensity,float64,"[5.64, 4.38, 5.68, 7.8, 4.32]",0.0,132


# Creation des X et y

In [4]:
X = df.drop('target', axis = 1)
y = df['target']

# Recherche des Modèles pertinents

In [5]:
def test_model(models):
    for model in models:
        final_model = make_pipeline(StandardScaler(), model)
        print(f"model : {model} => {cross_val_score(final_model, X, y, cv=5).mean()}")

In [6]:
a = [
    LogisticRegression(), 
    SGDClassifier(),  
    RandomForestClassifier(n_estimators=100),
    GaussianNB(),
    LinearSVC(),
    KNeighborsClassifier(n_neighbors=10),
    StackingClassifier([
            ('model_1', LogisticRegression()),('model_2', LogisticRegression())
        ], final_estimator=LogisticRegression()),
    AdaBoostClassifier(),
    GradientBoostingClassifier()
]

test_model(a)

model : LogisticRegression() => 0.9831746031746033
model : SGDClassifier() => 0.9661904761904762
model : RandomForestClassifier() => 0.9665079365079364
model : GaussianNB() => 0.9663492063492063
model : LinearSVC() => 0.9776190476190475
model : KNeighborsClassifier(n_neighbors=10) => 0.9552380952380952
model : StackingClassifier(estimators=[('model_1', LogisticRegression()),
                               ('model_2', LogisticRegression())],
                   final_estimator=LogisticRegression()) => 0.9831746031746033
model : AdaBoostClassifier() => 0.8085714285714285
model : GradientBoostingClassifier() => 0.9385714285714286


# Initiation du Modèle

In [7]:
model = LogisticRegression()

In [8]:
final_model = make_pipeline(StandardScaler(), model)

# Mesure du modèle

In [9]:
X_train, X_test,y_train, y_test = train_test_split(
                X, y, test_size=0.33, random_state=42)

In [10]:
final_model.fit(X_train, y_train)
prediction = final_model.predict(X_test)
accuracy_score(y_test, prediction)


confusion_matrix (y_test, prediction)

array([[20,  0,  0],
       [ 0, 23,  1],
       [ 0,  0, 15]])

In [11]:
#num = X.columns
#preprocess = make_column_transformer((StandardScaler(),num))

# Entraînement

In [12]:
final_model = make_pipeline(StandardScaler(), model)
fit_model = final_model.fit(X, y)

# Prédiction simple

In [13]:
for i in X.columns:
    print(f"{i} =                             #min = {X[i].min()} et max = {X[i].max()}")

alcohol =                             #min = 11.03 et max = 14.83
malic_acid =                             #min = 0.74 et max = 5.8
ash =                             #min = 1.36 et max = 3.23
alcalinity_of_ash =                             #min = 10.6 et max = 30.0
magnesium =                             #min = 70.0 et max = 162.0
total_phenols =                             #min = 0.98 et max = 3.88
flavanoids =                             #min = 0.34 et max = 5.08
nonflavanoid_phenols =                             #min = 0.13 et max = 0.66
proanthocyanins =                             #min = 0.41 et max = 3.58
color_intensity =                             #min = 1.28 et max = 13.0
hue =                             #min = 0.48 et max = 1.71
od280/od315_of_diluted_wines =                             #min = 1.27 et max = 4.0
proline =                             #min = 278.0 et max = 1680.0


In [14]:
alcohol = 13.56                       #min = 11.03 et max = 14.83
malic_acid = 1.71                     #min = 0.74 et max = 5.8
ash = 2.31                            #min = 1.36 et max = 3.23
alcalinity_of_ash = 16                #min = 10.6 et max = 30.0
magnesium = 117.0                     #min = 70.0 et max = 162.0
total_phenols = 3.1                   #min = 0.98 et max = 3.88
flavanoids = 3.29                     #min = 0.34 et max = 5.08
nonflavanoid_phenols = 0.34           #min = 0.13 et max = 0.66
proanthocyanins = 2.3                 #min = 0.41 et max = 3.58
color_intensity = 6.13                #min = 1.28 et max = 13.0
hue =  0.95                           #min = 0.48 et max = 1.71
od280_od315_of_diluted_wines = 3.38   #min = 1.27 et max = 4.0
proline = 795.0                       #min = 278.0 et max = 1680.0

In [15]:
a = pd.DataFrame([[alcohol, malic_acid, ash, alcalinity_of_ash, magnesium,
       total_phenols, flavanoids, nonflavanoid_phenols,
       proanthocyanins, color_intensity, hue,
       od280_od315_of_diluted_wines, proline ]])

fit_model.predict(a)

array([0.])

# Sauvegarde et Chargement 

In [16]:
def save_model(fit_model, name):
        #---------------------------------------------------------
        """ • Save model fitted
            • Return None"""
        #---------------------------------------------------------
        path = os.path.abspath('')
        dump(fit_model, f"{path}/{name}")

def load_model(name):
    #---------------------------------------------------------
    """ • Load model fitted
        • Return Model"""
    #---------------------------------------------------------
    
    path = os.path.abspath('')
    model = load(f"{path}/{name}")
    return model

In [17]:
save_model(fit_model, 'wine_fit')

In [18]:
fitness = load_model('wine_fit')

In [19]:
fitness.predict(a)

array([0.])

# Création de la classe

In [20]:
class Wine :
    def __init__(self):
        wines = load_wine()
        
        self.df = pd.DataFrame(data= np.c_[wines['data'], wines['target']],
                     columns= wines['feature_names'] + ['target'])
        self.X = self.df.drop('target', axis = 1)
        self.parameters = [[X[i].min(), X[i].max(), i] for i in self.X.columns]

wine = Wine()

In [21]:
wine.parameters

[[11.03, 14.83, 'alcohol'],
 [0.74, 5.8, 'malic_acid'],
 [1.36, 3.23, 'ash'],
 [10.6, 30.0, 'alcalinity_of_ash'],
 [70.0, 162.0, 'magnesium'],
 [0.98, 3.88, 'total_phenols'],
 [0.34, 5.08, 'flavanoids'],
 [0.13, 0.66, 'nonflavanoid_phenols'],
 [0.41, 3.58, 'proanthocyanins'],
 [1.28, 13.0, 'color_intensity'],
 [0.48, 1.71, 'hue'],
 [1.27, 4.0, 'od280/od315_of_diluted_wines'],
 [278.0, 1680.0, 'proline']]