In [382]:
# Para mas información, consulta el GitHub [https://github.com/Zygmut/Practica_3_IA]
# Autores: Palmer Pérez, Rubén; Torres Torres, Marc; Orr, Kieran Donal

# Setup
Importamos todos los paquetes necesarios para la práctica. 

In [383]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression, Perceptron
from sklearn.ensemble import RandomForestClassifier 

Creamos un conjunto de variables para facil acceso y cambio de parámetros

In [384]:
TEST_SIZE = 0.33

Definimos una variable `DEBUG` (y su método de uso) que, en caso de estar activada, nos dará información adicional a lo largo de la ejecución del proyecto 

In [385]:
DEBUG = True 

def debugPrint(string):
    if DEBUG : print(string)

# El dataset
Cargamos los datasets y les añadimos la columna `type` que nos permitara reconocer de que tipo son una vez juntados los datasets. Si `type = 0` el vino será blanco, de lo contrario sera rojo   

In [386]:
white = pd.read_csv("dataset/winequality-white.csv")
white.insert(0, 'type',np.zeros(white.shape[0], dtype=np.int64))
red = pd.read_csv("dataset/winequality-red.csv")
red.insert(0, 'type',np.ones(red.shape[0], dtype=np.int64))
df = pd.concat([white, red])
df

Unnamed: 0,type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,0,7.0,0.270,0.36,20.7,0.045,45.0,170.0,1.00100,3.00,0.45,8.8,6
1,0,6.3,0.300,0.34,1.6,0.049,14.0,132.0,0.99400,3.30,0.49,9.5,6
2,0,8.1,0.280,0.40,6.9,0.050,30.0,97.0,0.99510,3.26,0.44,10.1,6
3,0,7.2,0.230,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6
4,0,7.2,0.230,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1594,1,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5
1595,1,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2,6
1596,1,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6
1597,1,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,5


Preparamos los datos del dataset eliminando todos los posibles valores nulos y transformando los valores que no sean `np.float64` o `np.int64` 

In [387]:
# Preparacion del dataset

labelencoder = LabelEncoder()

if not df[df.isna().any(axis=1)].shape[0] == 0 :
    debugPrint("Valores nulos eliminados para el dataframe")
    
df.dropna(inplace=True) # Eliminacion de valores nullos

for column in df.columns: # Transformar posibles valores no numericos
    if not df[column].dtype in (np.float64, np.int64):
        df[column] = labelencoder.fit_transform(df[column])
        debugPrint(f"Los valores de {column} se han transformado a numericos")


### Separación del conjunto de entrenamiento, validación y testeo
Una vez obtenido nuestro dataset con sus valores procesados, creamos los 2 subconjuntos de datos que nos permitirán entrenar nuestros modelos (entrenamiento y testeo). El subconjutno de validación se creará posteriormente.

In [388]:
min_max_scaler = preprocessing.MinMaxScaler()

y = df["type"]
df.drop("type", axis=1, inplace=True)
x = min_max_scaler.fit_transform(df.values)

x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=TEST_SIZE, random_state=27)


# Modelos
Una vez preparado nuestros datasets de entrenamiento y testeo, podemos usarlos para nuestros modelos de machine learning

## Aplicación

### Regresión lineal

In [389]:
clr = LogisticRegression(random_state=0).fit(x_train, y_train)
y_hat = clr.predict(x_test)
print(classification_report(y_test, y_hat, target_names=["WHITE", "RED"]))

              precision    recall  f1-score   support

       White       0.99      0.99      0.99      1614
         Red       0.98      0.98      0.98       531

    accuracy                           0.99      2145
   macro avg       0.99      0.99      0.99      2145
weighted avg       0.99      0.99      0.99      2145



### Perceptron

In [390]:
cp = Perceptron(random_state=0).fit(x_train, y_train)
y_hat = cp.predict(x_test)
print(classification_report(y_test, y_hat, target_names=["WHITE", "RED"]))

              precision    recall  f1-score   support

       WHITE       0.89      1.00      0.94      1614
         RED       1.00      0.63      0.77       531

    accuracy                           0.91      2145
   macro avg       0.94      0.81      0.86      2145
weighted avg       0.92      0.91      0.90      2145



### Random forest

In [391]:
crf = RandomForestClassifier(random_state=0).fit(x_train, y_train)
y_hat = cp.predict(x_test)
print(classification_report(y_test, y_hat, target_names=["WHITE", "RED"]))

              precision    recall  f1-score   support

       WHITE       0.89      1.00      0.94      1614
         RED       1.00      0.63      0.77       531

    accuracy                           0.91      2145
   macro avg       0.94      0.81      0.86      2145
weighted avg       0.92      0.91      0.90      2145

