## Ejercicio de Clustering y Regression

Utilizando el dataset de **`FuelConsumptionCo2.csv`**:

**Parte 1**:
1. Realiza un **`Exploratory Data Analysis`** (**EDA**).
2. Realiza **preprocesamiento**.
3. Utilizando métodos de **clustering**, **¿existe alguna forma de "categorizar" los datos?**
4. Selecciona un número de **clusters "óptimo"** y crea una columna con la categorización dada por el clustering.
5. Teniendo el conjunto separado en diferentes "clases" o "categorías", **realiza una regresión lineal (`LinearRegression`) para cada conjunto**.
6. Recuerda hacer **`train_test_split`** para poder calcular métricas, agrega la métrica **`r^2 ajustado`**.
7. Haz el método de validación más adecuado para los datos, **solo es necesario hacer uno para cada modelo**.
8. Guarda los modelos en archivos binarios.

**Parte 2**:
1. Investiga sobre otros modelos de regresión e impleméntalos con este dataset.
2. Compara las métricas de estos nuevos modelos con el anterior (**`LinearRegression`**).
3. En caso de que el modelo lo permita, haz **tuning** al modelo usando **`GridSearchCV`**.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from datetime import datetime

# Normalizacion
from sklearn.preprocessing import MinMaxScaler

# GridSearchCV
from sklearn.model_selection import GridSearchCV

# Archivos
import pickle


# Train, Test
from sklearn.model_selection import train_test_split

# Metricas para regresiones
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

# Regresores
from sklearn.linear_model import LinearRegression

# Validacion
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import KFold

In [None]:
df = pd.read_csv("FuelConsumptionCo2.csv")

df.head(3)

In [None]:
df.info()

In [None]:
# Draw one histogram per column of the raw dataframe

for column in df.columns:

    plt.figure(figsize=(10, 6))
    sns.histplot(df[column])
    plt.show()


In [None]:
# Columna "MODELYEAR", tiene un solo valor

plt.figure(figsize = (5, 10))
sns.histplot(df["MODELYEAR"], bins = 10)
plt.show()

In [None]:
df["MODEL"].value_counts()

In [None]:
# Correlation heatmap ("MODELYEAR" is left out of the matrix).
# Only numeric columns are kept: correlation is undefined for the text
# columns, and recent pandas versions raise instead of silently dropping them.

corr_matrix = df.drop(columns="MODELYEAR").select_dtypes(include="number").corr()

sns.heatmap(data=corr_matrix, annot=True)
plt.show()

In [None]:
# Relación entre columnas

sns.pairplot(df.drop("MODELYEAR", axis = 1))

plt.show()

### PREPROCESAMIENTO

In [None]:
# Vemos si hay elementos nulos

df.isnull().sum()

In [None]:
# Remove the "MODELYEAR" column from the working dataframe

df = df.drop(columns=["MODELYEAR"])

### df_num, df_cat

In [None]:
# Numeric part of the dataframe.
# select_dtypes is the public pandas API for this; the previous call relied on
# the private DataFrame._get_numeric_data(), which may change without notice.
df_num = df.select_dtypes(include="number").copy()

df_num.head(3)

### df_cat

In [None]:
# Categorical part of the dataframe: everything that is not in df_num
df_cat = df.drop(columns=df_num.columns)

df_cat.head(3)

In [None]:
# Map every full vehicle-class label to its leading part (the text before " - ")
dict_class = {label: label.split(" - ")[0] for label in df["VEHICLECLASS"].unique()}

dict_class

In [None]:
# Transformamos la columna "VEHICLECLASS"
# Nos quedamos con las categorias definidas en "dict_class"

df_cat["VEHICLECLASS"] = df_cat["VEHICLECLASS"].map(dict_class)

df_cat.head()

In [None]:
df_cat["VEHICLECLASS"].value_counts()

In [None]:
# Merge some of the classes into broader groups

dict_class_2 = dict(SUBCOMPACT="COMPACT", MINICOMPACT="COMPACT", MINIVAN="VAN")

clase_agrupada = df_cat["VEHICLECLASS"].replace(dict_class_2)
df_cat["VEHICLECLASS"] = clase_agrupada

df_cat.head()

In [None]:
# Vemos la columna "TRANSMISSION"

df_cat["TRANSMISSION"].value_counts()

In [None]:
# Append a "0" to the bare "AV" codes so that they end in a digit as well

df_cat["TRANSMISSION"] = df_cat["TRANSMISSION"].replace({"AV": "AV0"})

df_cat["TRANSMISSION"].value_counts()

In [None]:
# Split each transmission code into its number of gears (last character) and
# its type (everything before it), stored in two columns.
# "MARCHAS" is cast to int so it is a real numeric feature for the clustering
# and scaling steps instead of a column of digit strings.
# Assumes every code ends in a digit (bare "AV" was already turned into "AV0");
# astype(int) raises otherwise.

df_cat["MARCHAS"] = df_cat["TRANSMISSION"].str[-1].astype(int)

df_cat["TRANSMISSION"] = df_cat["TRANSMISSION"].str[:-1]

In [None]:
# Aplicamos pd.dummies() a la columna "VEHICLECLASS"

df_cat = pd.concat([df_cat, pd.get_dummies(data = df_cat["VEHICLECLASS"], prefix = "VEHICLECLASS")], axis = 1)

df_cat

In [None]:
# Eliminamos las columnas que ya no nos interesan

df_cat.drop(["VEHICLECLASS", "MAKE", "MODEL"], axis = 1, inplace= True)

df_cat.head(3)

In [None]:
# One-hot encode the "TRANSMISSION" and "FUELTYPE" columns with pd.get_dummies()
# and append the resulting indicator columns to df_cat

df_cat = pd.concat([df_cat, pd.get_dummies(data = df_cat["TRANSMISSION"], prefix = "TRANSMISSION")], axis = 1)

df_cat = pd.concat([df_cat, pd.get_dummies(data = df_cat["FUELTYPE"], prefix = "FUELTYPE")], axis = 1)

df_cat.head()

In [None]:
# Drop the original "TRANSMISSION" and "FUELTYPE" columns

df_cat.drop(["TRANSMISSION", "FUELTYPE"], axis = 1, inplace = True)

df_cat.head(3)

### df_num

In [None]:
df_num.head(3)

In [None]:
# Kernel density estimate of every numeric column

for column in df_num.columns:
    plt.figure(figsize=(10, 6))
    sns.kdeplot(df_num[column])
    plt.show()

In [None]:
# Eliminamos la columna "FUELCONSUMPTION_COMB"

df_num.drop("FUELCONSUMPTION_COMB", axis = 1, inplace = True)

In [None]:
# Keep only the rows with at most 7 cylinders
df_num = df_num[df_num["CYLINDERS"] <= 7]

In [None]:
df_num.head(3)

### df

In [None]:
# Join the categorical and numeric parts back together on the row index.
# join="inner" keeps only rows present in both frames: df_num has been
# filtered, and the default outer join would bring the removed rows back
# with NaN in every numeric column.
df = pd.concat([df_cat, df_num], axis = 1, join = "inner")

In [None]:
# Acotamos el df

df = df[(df["CO2EMISSIONS"] < 450) & (df["FUELCONSUMPTION_COMB_MPG"] < 52)]

In [None]:
df

In [None]:
df.info()

In [None]:
df.reset_index(drop = True, inplace = True)

In [None]:
# Turn the curve into a straight line by taking the reciprocal.
# The column is overwritten in place and keeps its name, so from here on
# "FUELCONSUMPTION_COMB_MPG" holds 1 / MPG.
# Vectorised division replaces the per-element apply(lambda x: x**-1).

df["FUELCONSUMPTION_COMB_MPG"] = 1 / df["FUELCONSUMPTION_COMB_MPG"]

In [None]:
df.head()

In [None]:
# Cluster

from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN

### K-Means

In [None]:
# Elbow method: K-Means inertia for k = 1..10.
# random_state makes the curve reproducible between runs; n_init is set
# explicitly because its default differs across scikit-learn versions.
inercias = list()

for k in range(1, 11):
    kmeans = KMeans(n_clusters = k, n_init = 10, random_state = 42)
    kmeans.fit(df)
    inercias.append(kmeans.inertia_)

inercias

In [None]:
plt.figure(figsize = (10, 8))

plt.plot(range(1, len(inercias) + 1), inercias, marker = "x", color = "blue")

plt.xlabel("K's") 
plt.ylabel("Inercia") 

plt.show()

In [None]:
# Final K-Means clustering with 3 clusters.
# random_state / n_init are fixed so the labels are the same on every run.

kmeans = KMeans(n_clusters = 3, n_init = 10, random_state = 42)
kmeans.fit(df)

kmeans.labels_

In [None]:
# One row per centroid, one column per feature.
# df.columns is passed directly: wrapping it in a list produced MultiIndex
# columns, so df_centroides["CO2EMISSIONS"] did not return a plain Series.
df_centroides = pd.DataFrame(data = kmeans.cluster_centers_, columns = df.columns)
df_centroides

In [None]:
# Escalar Datos

x_scaler = MinMaxScaler()
X = x_scaler.fit_transform(df)

X

### DBSCAN

In [None]:
from sklearn.neighbors import NearestNeighbors

# Nearest-neighbour search on the scaled matrix; the number of neighbours is
# derived from the number of columns of X (2 * n_columns - 1)
neigh = NearestNeighbors(n_neighbors = 2*X.shape[1] - 1)
neigh.fit(X)
distances, indices = neigh.kneighbors(X)

# Plot
plt.figure(figsize = (10, 8))

# Heatmap of the k-nearest-neighbours graph as a dense n x n array.
# NOTE(review): kneighbors_graph is called with its default mode, which in
# scikit-learn is "connectivity" (0/1 entries), not distances - confirm that
# this is what the plot is meant to show.
sns.heatmap(neigh.kneighbors_graph(X).toarray())

plt.show()

In [None]:
# k-distance plot used to choose DBSCAN's eps (look for the "elbow").
# The sorted values are stored under their own name: overwriting `distances`
# turned it into a 1-D array, so running the cell a second time failed on
# the [:, 1] indexing.

plt.figure(figsize = (10, 8))

# Column 1 of the neighbour-distance matrix, after sorting each column.
# NOTE(review): the usual heuristic plots the distance to the k-th neighbour
# with k tied to DBSCAN's min_samples; confirm column 1 is the intended one.
k_distances = np.sort(distances, axis = 0)[:, 1]

plt.plot(k_distances)
plt.show()

In [None]:

# Cluster the scaled matrix X with DBSCAN (eps and min_samples are hard-coded)
dbscan = DBSCAN(eps = 1.43, min_samples = 5)
dbscan.fit(X)

plt.figure(figsize = (10, 6))

# Scatter plot of the last two columns of the scaled matrix X,
# coloured by the cluster label DBSCAN assigned to each row
sns.scatterplot(x = X[:, -1], y = X[:, -2], hue = dbscan.labels_, palette = "Accent")
plt.show()

#### Ahora tengo 2 nubes para hacer regresiones:

In [None]:
df["cluster_kmeans"] = kmeans.labels_
df["cluster_dbscan"] = dbscan.labels_

df.head(3)

In [None]:
df["cluster_dbscan"].value_counts()

In [None]:
# No tengo ningun NaN

# df[df["cluster_dbscan"] == -1].T

In [None]:
# Nos quedamos con la agrupación de DBSCAN, eliminamos K-Means

df.drop("cluster_kmeans", axis = 1, inplace = True)

In [None]:
df.to_csv("df_preprocesado.csv")

In [None]:
# Split the dataframe into one subset per DBSCAN cluster label (0 and 1)

df_0, df_1 = (df[df["cluster_dbscan"] == etiqueta] for etiqueta in (0, 1))

### REGRESION LINEAL DEL GRUPO 0

In [None]:
# Feature matrix for group 0: every column of df_0 except the target.
# NOTE(review): "cluster_dbscan" is still among the features; it is constant
# within df_0, so it carries no information but counts as a predictor in the
# adjusted R**2 - confirm whether it should be dropped too.
X0 = np.asarray (df_0.drop("CO2EMISSIONS", axis = 1))
# Target as a column vector of shape (n_samples, 1)
y0 = np.asarray(df_0["CO2EMISSIONS"]).reshape(-1, 1)

In [None]:
# Min-max scaling of the features of group 0
# NOTE(review): both scalers are fitted on the whole group before the
# train/test split, so the test rows influence the scaling.
scaler_x0 = MinMaxScaler()
X0 = scaler_x0.fit_transform(X0)

# Min-max scaling of the target of group 0
scaler_y0 = MinMaxScaler()
y0 = scaler_y0.fit_transform(y0)

In [None]:
# Guardamos los escaladores en archivos binarios

with open("escaladorX0.sav", "wb") as file:
    pickle.dump(scaler_x0, file)
    
with open("escaladory0.sav", "wb") as file:
    pickle.dump(scaler_y0, file)

In [None]:
X0.shape, y0.shape

### Train, Test

In [None]:
# 70/30 train/test split with a fixed seed
X0_train, X0_test, y0_train, y0_test = train_test_split(X0, y0, test_size = 0.30, random_state = 10)

# Each line reports the X and y shapes of the same partition
# (previously "Train" showed both X shapes and "Test" both y shapes).
print(f"Conjunto de Train: {X0_train.shape, y0_train.shape}")
print(f"Conjunto de Test: {X0_test.shape, y0_test.shape}")

### Modelo

In [None]:
model_0 = LinearRegression()
model_0.fit(X0_train, y0_train)

### Prediccion

In [None]:
yhat0 = model_0.predict(X0_test)

for i, j in zip(yhat0[:5], y0_test[:5]):
    print(f"Predicción: {i[0]} \tValor real: {j[0]}")

### Métricas

In [None]:
RAE0 = np.sum(np.abs(np.subtract(y0_test, yhat0))) / np.sum(np.abs(np.subtract(y0_test, np.mean(y0_test))))

RSE0 = np.sum(np.square(np.subtract(y0_test, yhat0))) / np.sum(np.square(np.subtract(y0_test, np.mean(y0_test))))

r2_ajustada0 = 1 - (1 - model_0.score(X0_test, y0_test))*(len(y0_test) - 1)/(len(y0_test) - X0_test.shape[1] - 1)

In [None]:
# sklearn metrics take (y_true, y_pred) in that order. MAE and MSE are
# symmetric, but R**2 is not: with the arguments swapped it was normalised
# by the variance of the predictions instead of the true values.
print(f"MAE:\t {mean_absolute_error(y0_test, yhat0)}")
print(f"MSE:\t {mean_squared_error(y0_test, yhat0)}")
print(f"R**2:\t {r2_score(y0_test, yhat0)}")
print(f"RAE:\t {RAE0}")
print(f"RSE:\t {RSE0}")
print(f"Adjusted R**2:\t {r2_ajustada0}")

In [None]:
# Table of true vs. predicted targets for the test set of group 0,
# sorted by the absolute percentage difference between them
df_pred = pd.DataFrame({"y0_test": y0_test.flatten(),
                        "yhat0": yhat0.flatten()})

diferencia = (df_pred["y0_test"] - df_pred["yhat0"]) / df_pred["y0_test"] * 100
df_pred["diferencia %"] = diferencia.abs().round(4)

df_pred = df_pred.sort_values("diferencia %")

df_pred.head(20)

In [None]:
df_pred.tail(20)

In [None]:
# Predicted vs. true targets on the test set of group 0
plt.figure(figsize = (8, 5))

sns.scatterplot(x = y0_test.flatten(), y = yhat0.flatten(), alpha = 0.5, color = "blue")

# The x axis shows y0_test, so the label says y_test (it said y_train)
plt.xlabel("Valores Reales (y_test)", size = 18)
plt.ylabel("Predicciones (yhat)", size = 18)

plt.show()

### REGRESION LINEAL DEL GRUPO 1

In [None]:
X1 = np.asarray (df_1.drop("CO2EMISSIONS", axis = 1))
y1 = np.asarray(df_1["CO2EMISSIONS"]).reshape(-1, 1)

In [None]:
# Min-max scaling of the features of group 1
# NOTE(review): both scalers are fitted on the whole group before the
# train/test split, so the test rows influence the scaling.
scaler_x1 = MinMaxScaler()
X1 = scaler_x1.fit_transform(X1)


# Min-max scaling of the target of group 1
scaler_y1 = MinMaxScaler()
y1 = scaler_y1.fit_transform(y1)

In [None]:
# Guardamos los escaladores en archivos binarios

with open("escaladorX1.sav", "wb") as file:
    pickle.dump(scaler_x1, file)
    
with open("escaladory1.sav", "wb") as file:
    pickle.dump(scaler_y1, file)

In [None]:
X1.shape, y1.shape

### Train, Test

In [None]:
# 70/30 train/test split with a fixed seed
X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, test_size = 0.30, random_state = 10)

# Each line reports the X and y shapes of the same partition
# (previously "Train" showed both X shapes and "Test" both y shapes).
print(f"Conjunto de Train: {X1_train.shape, y1_train.shape}")
print(f"Conjunto de Test: {X1_test.shape, y1_test.shape}")

### Modelo

In [None]:
model_1 = LinearRegression()
model_1.fit(X1_train, y1_train)

### Predicciones

In [None]:
yhat1 = model_1.predict(X1_test)

for i, j in zip(yhat1[:5], y1_test[:5]):
    print(f"Predicción:{i[0]} \tValor real:{j[0]}")

### Métricas

In [None]:
RAE1 = np.sum(np.abs(np.subtract(y1_test, yhat1))) / np.sum(np.abs(np.subtract(y1_test, np.mean(y1_test))))

RSE1 = np.sum(np.square(np.subtract(y1_test, yhat1))) / np.sum(np.square(np.subtract(y1_test, np.mean(y1_test))))

r2_ajustada1 = 1 - (1 - model_1.score(X1_test, y1_test))*(len(y1_test) - 1)/(len(y1_test) - X1_test.shape[1] - 1)

In [None]:
# sklearn metrics take (y_true, y_pred) in that order. MAE and MSE are
# symmetric, but R**2 is not: with the arguments swapped it was normalised
# by the variance of the predictions instead of the true values.
print(f"MAE:\t {mean_absolute_error(y1_test, yhat1)}")
print(f"MSE:\t {mean_squared_error(y1_test, yhat1)}")
print(f"R**2:\t {r2_score(y1_test, yhat1)}")
print(f"RAE:\t {RAE1}")
print(f"RSE:\t {RSE1}")
print(f"Adjusted R**2:\t {r2_ajustada1}")

In [None]:
# Table of true vs. predicted targets for the test set of group 1,
# sorted by the absolute percentage difference between them
df_pred1 = pd.DataFrame({"y1_test": y1_test.flatten(),
                         "yhat1": yhat1.flatten()})

diferencia1 = (df_pred1["y1_test"] - df_pred1["yhat1"]) / df_pred1["y1_test"] * 100
df_pred1["diferencia %"] = diferencia1.abs().round(4)

df_pred1 = df_pred1.sort_values("diferencia %")

df_pred1.head(20)

In [None]:
df_pred1.tail(20)

In [None]:
# Predicted vs. true targets on the test set of group 1
plt.figure(figsize = (8, 5))

sns.scatterplot(x = y1_test.flatten(), y = yhat1.flatten(), alpha = 0.5, color = "blue")

# The x axis shows y1_test, so the label says y_test (it said y_train)
plt.xlabel("Valores Reales (y_test)", size = 18)
plt.ylabel("Predicciones (yhat)", size = 18)

plt.show()

# Metodos de Validacion

### Modelo 0

In [None]:
%%time

# Leave-one-out validation for group 0: one model is trained per sample,
# leaving that sample out, and its prediction for it is collected in yhat0.
# Note that this rebinds X0_train / X0_test / y0_train / y0_test and yhat0.
loo0 = LeaveOneOut()
yhat0 = list()

for train_index, test_index in loo0.split(X0): 
    X0_train, X0_test = X0[train_index], X0[test_index]
    y0_train, y0_test = y0[train_index], y0[test_index]
    
    model = LinearRegression()
    model.fit(X0_train, y0_train)
    
    # Prediction for the single held-out sample
    yhat01 = model.predict(X0_test)[0]
    yhat0.append(yhat01)

In [None]:
# Leave-one-out metrics for group 0, all computed from the out-of-sample
# predictions collected in yhat0.
yhat0_loo = np.asarray(yhat0).reshape(-1, 1)

RAE0 = np.sum(np.abs(np.subtract(y0, yhat0_loo))) / np.sum(np.abs(np.subtract(y0, np.mean(y0))))

RSE0 = np.sum(np.square(np.subtract(y0, yhat0_loo))) / np.sum(np.square(np.subtract(y0, np.mean(y0))))

# Adjusted R**2 is based on the R**2 of the leave-one-out predictions.
# It used model.score(X0, y0), i.e. the model of the LAST fold scored on data
# it was trained on, which is not a validation metric.
r2_ajustada0 = 1 - (1 - r2_score(y0, yhat0_loo))*(len(y0) - 1)/(len(y0) - X0.shape[1] - 1)

In [None]:
# sklearn metrics take (y_true, y_pred) in that order. MAE and MSE are
# symmetric, but R**2 is not, so the swapped call reported a wrong value.
print(f"MAE:\t {mean_absolute_error(y0, yhat0)}")
print(f"MSE:\t {mean_squared_error(y0, yhat0)}")
print(f"R**2:\t {r2_score(y0, yhat0)}")
print(f"RAE:\t {RAE0}")
print(f"RSE:\t {RSE0}")
print(f"Adjusted R**2:\t {r2_ajustada0}")

In [None]:
### Entrenamos el modelo con todos los datos

model_0 = LinearRegression()
model_0.fit(X0, y0)

### Modelo 1

In [None]:
%%time

# Leave-one-out validation for group 1: one model is trained per sample,
# leaving that sample out, and its prediction for it is collected in yhat1.
# Note that this rebinds X1_train / X1_test / y1_train / y1_test and yhat1.
loo1 = LeaveOneOut()
yhat1 = list()

for train_index, test_index in loo1.split(X1): 
    X1_train, X1_test = X1[train_index], X1[test_index]
    y1_train, y1_test = y1[train_index], y1[test_index]

    model = LinearRegression()
    model.fit(X1_train, y1_train)    
    
    # Prediction for the single held-out sample
    yhat11 = model.predict(X1_test)[0]
    yhat1.append(yhat11)

In [None]:
# Leave-one-out metrics for group 1, all computed from the out-of-sample
# predictions collected in yhat1.
yhat1_loo = np.asarray(yhat1).reshape(-1, 1)

RAE1 = np.sum(np.abs(np.subtract(y1, yhat1_loo))) / np.sum(np.abs(np.subtract(y1, np.mean(y1))))

RSE1 = np.sum(np.square(np.subtract(y1, yhat1_loo))) / np.sum(np.square(np.subtract(y1, np.mean(y1))))

# Adjusted R**2 is based on the R**2 of the leave-one-out predictions.
# It used model.score(X1, y1), i.e. the model of the LAST fold scored on data
# it was trained on, which is not a validation metric.
r2_ajustada1 = 1 - (1 - r2_score(y1, yhat1_loo))*(len(y1) - 1)/(len(y1) - X1.shape[1] - 1)

In [None]:
# sklearn metrics take (y_true, y_pred) in that order. MAE and MSE are
# symmetric, but R**2 is not, so the swapped call reported a wrong value.
print(f"MAE:\t {mean_absolute_error(y1, yhat1)}")
print(f"MSE:\t {mean_squared_error(y1, yhat1)}")
print(f"R**2:\t {r2_score(y1, yhat1)}")
print(f"RAE:\t {RAE1}")
print(f"RSE:\t {RSE1}")
print(f"Adjusted R**2:\t {r2_ajustada1}")

In [None]:
### Entrenamos el modelo con todos los datos

model_1 = LinearRegression()
model_1.fit(X1, y1)

In [None]:
# Guardamos ambos modelos

with open("modelo_grupo0.sav", "wb") as file:
    pickle.dump(model_0, file)
with open("modelo_grupo1.sav", "wb") as file:
    pickle.dump(model_1, file)

### Parte 2

#### 1. Investiga sobre otros modelos de regresión e impleméntalos con este dataset.

In [None]:
# Regresores
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import AdaBoostRegressor

### Funcion

In [None]:
def modelos_regresion(modelos, X_train, X_test, y_train, y_test):
    """Fit each regressor and return a table with its test-set metrics.

    Parameters
    ----------
    modelos : iterable of estimators exposing fit / predict / score.
        They are fitted in place.
    X_train, X_test : 2-D arrays of features.
    y_train, y_test : targets, either 1-D or column vectors of shape (n, 1).

    Returns
    -------
    pandas.DataFrame with one row per model and the columns
    "modelo", "RAE", "RSE", "r2_ajustada", "MAE", "MSE", "r2" and
    "tiempo_segundos" (fit + predict + metrics time, in fractional seconds).
    """
    # Use 1-D targets everywhere. With y_test of shape (n, 1) and predictions
    # of shape (n,), "y_test - y_hat" broadcast to an (n, n) matrix and the
    # RAE / RSE sums were taken over every pair of samples.
    y_train = np.ravel(y_train)
    y_test = np.ravel(y_test)

    n_obs = len(y_test)
    n_features = X_test.shape[1]

    # Denominators of RAE / RSE: deviations from the mean of the test targets
    desviacion = y_test - np.mean(y_test)
    suma_abs_base = np.sum(np.abs(desviacion))
    suma_sq_base = np.sum(np.square(desviacion))

    metricas = []

    for modelo in modelos:
        inicio = datetime.now()

        # Model
        modelo.fit(X_train, y_train)

        # Predictions
        y_hat = np.ravel(modelo.predict(X_test))

        # Metrics
        residuos = y_test - y_hat
        RAE = np.sum(np.abs(residuos)) / suma_abs_base
        RSE = np.sum(np.square(residuos)) / suma_sq_base
        r2_ajustada = 1 - (1 - modelo.score(X_test, y_test)) * (n_obs - 1) / (n_obs - n_features - 1)
        MAE = mean_absolute_error(y_test, y_hat)
        MSE = mean_squared_error(y_test, y_hat)
        r2 = r2_score(y_test, y_hat)

        # total_seconds() keeps the fractional part; .seconds truncated to
        # whole seconds, so fast models were all reported as 0.
        tiempo = (datetime.now() - inicio).total_seconds()

        metricas.append([str(modelo), RAE, RSE, r2_ajustada, MAE, MSE, r2, tiempo])

    return pd.DataFrame(data = metricas,
                        columns = ["modelo", "RAE", "RSE", "r2_ajustada", "MAE", "MSE", "r2", "tiempo_segundos"])

In [None]:
modelos = [LinearRegression(),
           DecisionTreeRegressor(),
           KNeighborsRegressor(),
           RandomForestRegressor(),
           GradientBoostingRegressor(),
           AdaBoostRegressor()]

#### 2. Compara las métricas de estos nuevos modelos con el anterior (**`LinearRegression`**).

### MODELOS DE REGRESIÓN DEL GRUPO 0

In [None]:
X0 = np.asarray(df_0.drop("CO2EMISSIONS", axis = 1))
y0 = np.asarray(df_0["CO2EMISSIONS"]).reshape(-1, 1)

In [None]:
# Min-max scaling of the features of group 0
# NOTE(review): both scalers are fitted on the whole group before the
# train/test split, so the test rows influence the scaling.
scaler_x0 = MinMaxScaler()
X0 = scaler_x0.fit_transform(X0)

# Min-max scaling of the target of group 0
scaler_y0 = MinMaxScaler()
y0 = scaler_y0.fit_transform(y0)

In [None]:
X0.shape, y0.shape

### Train, Test

In [None]:
# 70/30 train/test split with a fixed seed
X0_train, X0_test, y0_train, y0_test = train_test_split(X0, y0, test_size = 0.30, random_state = 10)

# Each line reports the X and y shapes of the same partition
# (previously "Train" showed both X shapes and "Test" both y shapes).
print(f"Conjunto de Train: {X0_train.shape, y0_train.shape}")
print(f"Conjunto de Test: {X0_test.shape, y0_test.shape}")

In [None]:
modelos_regresion(modelos, X0_train, X0_test, y0_train, y0_test).sort_values(by = "r2_ajustada", ascending = False)

### MODELOS DE REGRESIÓN DEL GRUPO 1

In [None]:
X1 = np.asarray (df_1.drop("CO2EMISSIONS", axis = 1))
y1 = np.asarray(df_1["CO2EMISSIONS"]).reshape(-1, 1)

In [None]:
# Min-max scaling of the features of group 1
# NOTE(review): both scalers are fitted on the whole group before the
# train/test split, so the test rows influence the scaling.
scaler_x1 = MinMaxScaler()
X1 = scaler_x1.fit_transform(X1)

# Min-max scaling of the target of group 1
scaler_y1 = MinMaxScaler()
y1 = scaler_y1.fit_transform(y1)

In [None]:
X1.shape, y1.shape

### Train, Test

In [None]:
# 70/30 train/test split with a fixed seed
X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, test_size = 0.30, random_state = 10)

# Each line reports the X and y shapes of the same partition
# (previously "Train" showed both X shapes and "Test" both y shapes).
print(f"Conjunto de Train: {X1_train.shape, y1_train.shape}")
print(f"Conjunto de Test: {X1_test.shape, y1_test.shape}")

In [None]:
modelos_regresion(modelos, X1_train, X1_test, y1_train, y1_test).sort_values(by = "r2_ajustada", ascending = False)

#### 3. En caso de que el modelo lo permita, haz **tuning** al modelo usando **`GridSearchCV`**.

In [None]:
# Vamos a hacer un GridSearch con GradientBoostingRegressor()

In [None]:
from sklearn.model_selection import GridSearchCV
# sklearn.metrics.SCORERS was removed in recent scikit-learn releases, so
# importing it makes this cell fail. get_scorer_names() is the supported way
# to list the scorer names accepted by the `scoring` parameter.
from sklearn.metrics import get_scorer_names

In [None]:
# Hyper-parameter search for GradientBoostingRegressor on the train split of
# group 0, using 5-fold cross-validation and R**2 as the selection metric.
modelo = GradientBoostingRegressor()

params = {"loss"          : ["squared_error", "absolute_error", "huber", "quantile"],
          "learning_rate" : [0.1, 0.2, 0.5],
          "n_estimators"  : [100, 150, 200],
          "max_features"  : ["sqrt", "log2"],
          "max_depth"     : [3, 5]}

# Multi-metric scoring must be a list, tuple or dict. A set ({"r2"}) is
# rejected by the parameter validation of recent scikit-learn versions.
# A one-element list keeps the "mean_test_r2" key in cv_results_.
scorers = ["r2"]

grid_solver = GridSearchCV(estimator  = modelo   , 
                           param_grid = params   , 
                           scoring    = scorers  ,
                           cv         = 5        ,
                           refit      = "r2"     ,
                           n_jobs     = -1        )

model_result = grid_solver.fit(X0_train, y0_train.flatten())

In [None]:
print(model_result.cv_results_["mean_test_r2"].mean())

print("*"*100)

print(model_result.best_score_)
print(model_result.best_params_)

In [None]:
# Train a fresh GradientBoostingRegressor with the best parameters found by
# the grid search, this time on ALL the (scaled) data of group 0
modelo = GradientBoostingRegressor(**model_result.best_params_)
modelo.fit(X0,y0.flatten())

# Persist the fitted model to a binary file
with open("modelo_gradient_bousting0.sav", "wb") as file:
    pickle.dump(modelo, file)

In [None]:
################################################################################################################################