# Enunciado

En este enlace puedes descargar un dataset que recoge diferentes características del vino tinto portugués "Vinho Verde" y su calidad. Crea un modelo de clasificación usando Naive Bayes (GaussianNB) y kNN.

In [19]:
# Importamos las librerías necesarias
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn import neighbors
from sklearn.model_selection import KFold
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [20]:
# Load the wine-quality CSV from the local "csv" folder into a DataFrame.
csv_path = "csv/winequality-red.csv"
df = pd.read_csv(csv_path)
# Bare expression so the notebook renders the DataFrame as the cell output.
df

FileNotFoundError: [Errno 2] No such file or directory: 'csv/winequality-red.csv'

## Correlación de las variables

In [None]:
# Pairwise correlation between all columns, computed once and reused for
# both the heatmap and the table shown as the cell output.
corr = df.corr()

sns.set()
# annot_kws changes the font size of the values written inside the matrix.
sns.heatmap(corr, square=True, annot=True, cmap='BuPu', annot_kws={'size': 7})
corr

## Ajuste de los nombres de las variables

In [None]:
# Give every column an identifier-friendly name. The assignment is
# positional: the n-th name below replaces the n-th existing column header.
# NOTE(review): assumes the CSV has exactly these 12 columns in this order — confirm.
column_names = [
    'fixed_acidity',
    'volatile_acidity',
    'citric_acid',
    'residual_sugar',
    'chlorides',
    'free_sulfur_dioxide',
    'total_sulfur_dioxide',
    'density',
    'pH',
    'sulphates',
    'alcohol',
    'quality',
]
df.columns = column_names
# Bare expression so the notebook shows the renamed DataFrame.
df

## Entrenamiento y test

In [None]:
# Feature matrix: the eleven measured attributes. Target: the quality score.
feature_columns = [
    'fixed_acidity',
    'volatile_acidity',
    'citric_acid',
    'residual_sugar',
    'chlorides',
    'free_sulfur_dioxide',
    'total_sulfur_dioxide',
    'density',
    'pH',
    'sulphates',
    'alcohol',
]
X = df[feature_columns]
y = df['quality']

# Hold out 33% of the rows for testing. The fixed random_state makes the
# split identical on every run, so accuracies and any hyperparameter picked
# from them stay comparable between executions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

## NaiveBayes (GaussianNB)

In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian Naive Bayes: fit() returns the estimator itself, so the model is
# created and trained on the training split in one expression.
model = GaussianNB().fit(X_train, y_train)

# Predict the held-out split; the accuracy is left as the last expression so
# the notebook displays it.
y_pred = model.predict(X_test)
accuracy_score(y_test, y_pred)

## kNN

In [None]:
# Search range for k (inclusive on both ends) and number of CV folds.
MAX_NEIGHBORDS = 20
MIN_NEIGHBORDS = 1
N_SPLITS = 5

# K-fold splitter over the training split only; shuffle=False means the
# folds are consecutive blocks of rows, not a fresh random partition.
cv = KFold(n_splits=N_SPLITS, shuffle=False)
train = pd.concat([X_train, y_train], axis=1)

# Separate features and target once instead of dropping the column per fold.
train_features = train.drop(['quality'], axis=1)
train_target = train['quality']

neighbor_range = range(MIN_NEIGHBORDS, MAX_NEIGHBORDS + 1)

# One accuracy-vs-k curve per weighting scheme.
for weights in ['uniform', 'distance']:
    total_scores = []

    for n_neighbors in neighbor_range:
        fold_accuracy = []
        knn = neighbors.KNeighborsClassifier(n_neighbors, weights=weights)
        for train_fold, test_fold in cv.split(train):
            # Train on the remaining folds and predict the held-out fold.
            knn.fit(X=train_features.iloc[train_fold],
                    y=train_target.iloc[train_fold])
            y_pred = knn.predict(X=train_features.iloc[test_fold])

            # Accuracy on the held-out fold.
            acc = accuracy_score(train_target.iloc[test_fold], y_pred)
            fold_accuracy.append(acc)

        # Mean accuracy over the folds for this value of k.
        total_scores.append(sum(fold_accuracy) / len(fold_accuracy))

    plt.plot(neighbor_range, total_scores, marker='o', label=weights)

    # Map the position of the best score back to the actual k through the
    # range, so the report stays correct when MIN_NEIGHBORDS is not 1.
    best_score = max(total_scores)
    best_k = neighbor_range[total_scores.index(best_score)]
    print('max value: {:.4f} with {} neighbors'.format(best_score, best_k))

plt.legend()
plt.show()

In [None]:
# Final model: distance-weighted kNN with 17 neighbours.
# NOTE(review): 17 is hard-coded; presumably taken from the cross-validation
# results — confirm it still matches after re-running the search.
n_neighbors = 17
weights = 'distance'

# Build the classifier and train it on the whole training split
# (fit() returns the estimator itself).
knn = neighbors.KNeighborsClassifier(
    n_neighbors=n_neighbors,
    weights=weights,
).fit(X_train, y_train)

# Score the predictions on the held-out test split.
y_pred = knn.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print('Accuracy:', acc)

## Matriz de confusión

In [None]:
# Confusion matrix of the latest predictions (y_pred) against y_test.
# The label list is passed explicitly so the matrix always has one row and
# one column per listed quality score, in this order. Without it the labels
# are inferred from the values present in y_test/y_pred, and a score missing
# from this split would make the matrix smaller than the tick labels below.
# NOTE(review): assumes quality is stored as integers in the range 3-8 — confirm.
labels = [3, 4, 5, 6, 7, 8]
classes = [str(label) for label in labels]
cm = confusion_matrix(y_test, y_pred, labels=labels)

# Rows are the true classes and columns the predicted ones.
sns.heatmap(cm, annot=True, cmap='BuPu', fmt='d', xticklabels=classes, yticklabels=classes)
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()

In [None]:
# Save the trained kNN model to a pickle file named wine_model.pkl.
import pickle

# The with-block closes the file as soon as the dump finishes, so the data
# is flushed to disk and the handle is not left open in the notebook kernel.
with open('wine_model.pkl', 'wb') as model_file:
    pickle.dump(knn, model_file)