<a href="https://colab.research.google.com/github/alexliqu09/wine-ML-classification/blob/main/Wine_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pprint
import seaborn as sns
from sklearn.datasets import load_wine
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import PowerTransformer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
import warnings
warnings.filterwarnings('ignore')

#**Cargar el dataset**

In [None]:
# Load the scikit-learn wine data and build a single 2-D array whose last
# column is the class label, placed after all the feature columns.
wine_dataset = load_wine()
train = np.column_stack((wine_dataset['data'], wine_dataset['target']))
# Extend the column names so they line up with the extra label column.
wine_dataset['feature_names'] = np.array([*wine_dataset['feature_names'], 'target'])

In [None]:
# Wrap the combined array in a labelled DataFrame and shuffle its rows.
# The shuffle is seeded so the row order is the same on every run; without
# a seed, the later split and the reported scores change between executions.
pd_wine_dataset = (
    pd.DataFrame(train, columns=wine_dataset['feature_names'])
    .sample(frac=1, random_state=7)
    .reset_index(drop=True)  # discard the pre-shuffle index
)

#Estadística Descriptiva


In [None]:
# Report the (rows, columns) shape of the DataFrame.
print(f"el tamaño del dataset es: {pd_wine_dataset.shape}")

In [None]:
# Preview the first rows of the dataset.
pd_wine_dataset.head()

In [None]:
# Summary statistics for every column of the dataset.
pd_wine_dataset.describe() # apply some transformation

In [None]:
# Number of rows belonging to each value of the `target` column.
pd_wine_dataset.groupby('target').size()

#Visualización

##Univariable

In [None]:
# Box plots for eight features, one per cell of a 2x4 grid (filled row by row).
# Each series is passed explicitly as `x=` so the result does not depend on how
# the installed seaborn release interprets its first positional argument.
f, axes = plt.subplots(2, 4, figsize=(10, 7))
for axis, feature in zip(
    axes.flat,
    [
        "alcohol",
        "malic_acid",
        "ash",
        "alcalinity_of_ash",
        "magnesium",
        "total_phenols",
        "flavanoids",
        "proanthocyanins",
    ],
):
    sns.boxplot(x=pd_wine_dataset[feature], ax=axis)

In [None]:
# Box plots for three more features, laid out side by side in a single row.
# Each series is passed explicitly as `x=` so the result does not depend on how
# the installed seaborn release interprets its first positional argument.
f, axes = plt.subplots(1, 3, figsize=(10, 7))
for axis, feature in zip(
    axes,
    ["color_intensity", "od280/od315_of_diluted_wines", "proline"],
):
    sns.boxplot(x=pd_wine_dataset[feature], ax=axis)

In [None]:
# One box plot per DataFrame column, drawn by pandas on a 7x2 grid of
# independent (non-shared) axes inside a 10x15 inch figure.
fig, ax = plt.subplots(figsize=(10, 15))
pd_wine_dataset.plot.box(
    ax=ax,
    subplots=True,
    layout=(7, 2),
    sharex=False,
    sharey=False,
)
plt.show()

In [None]:
# Distribution (histogram + KDE curve) of eight features on a 2x4 grid.
# `sns.distplot` is deprecated; `sns.histplot` with a density-scaled histogram
# and a KDE overlay is the replacement suggested by seaborn. `cut=3` lets the
# KDE curve extend past the data range, as the old function did.
f, axes = plt.subplots(2, 4, figsize=(10, 7))
for axis, feature in zip(
    axes.flat,
    [
        "alcohol",
        "malic_acid",
        "ash",
        "alcalinity_of_ash",
        "magnesium",
        "total_phenols",
        "flavanoids",
        "proanthocyanins",
    ],
):
    sns.histplot(
        x=pd_wine_dataset[feature],
        kde=True,
        stat="density",
        kde_kws={"cut": 3},
        ax=axis,
    )

In [None]:
# Histogram of every column of the dataset, drawn inside one 10x10 inch figure.
fig, ax = plt.subplots(figsize=(10, 10))
pd_wine_dataset.hist(ax=ax)
plt.show()

##Multivariable

In [None]:
# Pair plot with `target` as the only row (y) variable; the x variables are
# left to seaborn's default selection.
sns.pairplot(pd_wine_dataset, y_vars='target', aspect=1)

In [None]:
# Show every column name except the last one (the 'target' label appended
# when the dataset was loaded).
wine_dataset['feature_names'][:-1]

In [None]:
# Scatter matrix of the feature columns only (the trailing label is dropped).
# The default figure size is changed globally, so later figures that do not
# set their own size are affected too.
plt.rcParams.update({"figure.figsize": [12, 8]})
feature_columns = wine_dataset['feature_names'][:-1]
pd.plotting.scatter_matrix(pd_wine_dataset[feature_columns])
plt.show()

#Transformación

##Normalización

In [None]:
# Split the column names into the feature names and the final label name,
# then select the matching feature matrix (X) and label series (Y).
*feature_cols, label_col = wine_dataset['feature_names']
X = pd_wine_dataset[feature_cols]
Y = pd_wine_dataset[label_col]

In [None]:
# Normalise the feature matrix and print it with two decimals.
# NOTE: per the scikit-learn documentation, Normalizer rescales each sample
# (row) to unit norm; it does not scale the features (columns).
scaler = Normalizer()
scaler.fit(X)
normalizer = scaler.transform(X)
np.set_printoptions(precision=2)
print(wine_dataset['feature_names'][:-1], normalizer, sep='\n')

In [None]:
# Put the normalised array back into a DataFrame labelled with the feature
# names (everything but the trailing label name) and preview it.
normalized_columns = wine_dataset['feature_names'][:-1]
new_pd_wine_data = pd.DataFrame(data=normalizer, columns=normalized_columns)
new_pd_wine_data.head()

In [None]:
# Histograms of the normalised features, drawn inside one 10x10 inch figure.
fig, ax = plt.subplots(figsize=(10, 10))
new_pd_wine_data.hist(ax=ax)
plt.show()

##BoxCox

In [None]:
# Pretty-print the feature names (all entries but the trailing label name).
pp = pprint.PrettyPrinter(indent=1)
print(pp.pformat(wine_dataset['feature_names'][:-1]))

In [None]:
# Keep the feature names around and select only those columns of the dataset;
# the final bare expression displays the resulting frame.
name_columns_features = wine_dataset['feature_names'][:-1]
data = pd_wine_dataset.loc[:, name_columns_features]
data

In [None]:
# Box-Cox power transform followed by standardisation of every feature.
# Fitting and transforming happen in one call, so `skl_boxcox` only ever
# holds the transformed array (never the fitted transformer itself).
# NOTE(review): the transformer is fitted on the full dataset, before the
# train/validation split made later in the notebook, so validation rows
# influence the fitted parameters — consider fitting on the training split only.
pt = PowerTransformer(method='box-cox', standardize=True)
skl_boxcox = pt.fit_transform(data)

In [None]:
# The labels are unchanged by the feature transform; expose them as `target`
# and wrap the transformed array in a DataFrame with the feature names.
target = Y
new_pd_wine_features = pd.DataFrame(data=skl_boxcox, columns=name_columns_features)

In [None]:
# Preview the first rows of the transformed features.
new_pd_wine_features.head()

In [None]:
# Summary statistics of the transformed features.
new_pd_wine_features.describe()

In [None]:
# Histograms of the transformed features, drawn inside one 10x10 inch figure.
fig, ax = plt.subplots(figsize=(10, 10))
new_pd_wine_features.hist(ax=ax)
plt.show()

#Fase de Modelado

In [None]:
# Hold out 20% of the rows for validation, using a fixed seed for the split.
validation_size, seed = 0.20, 7
X = new_pd_wine_features[name_columns_features].to_numpy()
X_train, X_validation, Y_train, Y_validation = train_test_split(
    X,
    target,
    test_size=validation_size,
    random_state=seed,
)

In [None]:
# Candidate classifiers, all with their default hyper-parameters.
modelos = [
    ('LoR', LogisticRegression()),
    ('LDA', LinearDiscriminantAnalysis()),
    ('K-NN', KNeighborsClassifier()),
    ('CART', DecisionTreeClassifier()),
    ('NB', GaussianNB()),
    ('SVM', SVC()),
]

results = []
names = []
seed = 3  # rebinds the seed used earlier for the train/validation split

# The splitter is seeded, so each model is scored on the same 10 shuffled folds.
kfold = KFold(n_splits=10, random_state=seed, shuffle=True)
for name, model in modelos:
    cv_results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring='accuracy')
    results.append(cv_results)
    names.append(name)
    # Mean accuracy with its standard deviation across folds, as percentages.
    print(f"{name}: {cv_results.mean()*100.0:,.2f}% ({cv_results.std()*100.0:,.2f}%)")

In [None]:
# Train a default k-nearest-neighbours classifier on the training split and
# predict the labels of the validation split.
knn = KNeighborsClassifier().fit(X_train, Y_train)
predictions = knn.predict(X_validation)

In [None]:
# Validation accuracy, confusion matrix and per-class report for the
# current `predictions`, each printed on its own line(s).
print(
    f'KNN {accuracy_score(Y_validation, predictions)*100.0:,.2f}%',
    confusion_matrix(Y_validation, predictions),
    classification_report(Y_validation, predictions),
    sep='\n',
)

In [None]:
# Train a default support-vector classifier on the training split and
# predict the labels of the validation split (overwrites `predictions`).
svm = SVC().fit(X_train, Y_train)
predictions = svm.predict(X_validation)

In [None]:
# Validation accuracy, confusion matrix and per-class report for the
# current `predictions`, each printed on its own line(s).
print(
    f'SVM {accuracy_score(Y_validation, predictions)*100.0:,.2f}%',
    confusion_matrix(Y_validation, predictions),
    classification_report(Y_validation, predictions),
    sep='\n',
)