In [None]:
# importing the required libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import ExtraTreesRegressor
from xgboost.sklearn import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn import tree

import matplotlib.pyplot as plt
%matplotlib inline
import warnings
# Globally suppress all Python warnings from this point on
# (this includes deprecation warnings raised by the libraries imported above).
warnings.filterwarnings('ignore')

In [None]:
# Load the training dataset into a DataFrame
train_csv_path = 'Train.csv'
df = pd.read_csv(train_csv_path)

**Procesado de los datos**

In [None]:
# Preview the first five rows of the loaded data
df.head(n=5)

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [None]:
# Count how many rows each item identifier appears in
df['Item_Identifier'].value_counts()

FDW13    10
FDG33    10
NCY18     9
FDD38     9
DRE49     9
         ..
FDY43     1
FDQ60     1
FDO33     1
DRF48     1
FDC23     1
Name: Item_Identifier, Length: 1559, dtype: int64

In [None]:
# Fill in the missing values.
# The result is assigned back to the column instead of calling fillna(inplace=True)
# on df[col]: that form is a chained in-place update, which pandas deprecates and
# which leaves df unchanged when Copy-on-Write is enabled.
df['Item_Weight'] = df['Item_Weight'].fillna(df['Item_Weight'].median())  # missing Item_Weight -> column median
df['Outlet_Size'] = df['Outlet_Size'].fillna(df['Outlet_Size'].mode()[0])  # missing Outlet_Size -> column mode

In [None]:
# Group the items into broad categories derived from the identifier prefix
prefix_to_category = {'FD':'Food', 'NC':'Non-Consumable', 'DR':'Drinks'}
item_prefixes = df['Item_Identifier'].map(lambda identifier: identifier[:2])  # first two characters
df['Item_Type_Combined'] = item_prefixes.map(prefix_to_category)

df['Item_Type_Combined'].value_counts()

Food              6125
Non-Consumable    1599
Drinks             799
Name: Item_Type_Combined, dtype: int64

In [None]:
# New column: 2013 minus the year in which the outlet was established
reference_year = 2013
df['Outlet_Years'] = reference_year - df['Outlet_Establishment_Year']

# Collapse the alternative spellings of the fat-content labels into two values
fat_content_aliases = {'LF':'Low Fat', 'reg':'Regular', 'low fat':'Low Fat'}
df['Item_Fat_Content'] = df['Item_Fat_Content'].replace(fat_content_aliases)
df['Item_Fat_Content'].value_counts()

Low Fat    5517
Regular    3006
Name: Item_Fat_Content, dtype: int64

In [None]:
# Encode the categorical columns as integer codes, written back into the same columns.
# 'Outlet' starts as a plain copy of Outlet_Identifier so that it is encoded exactly once,
# by the loop below, together with the other categorical columns.
df['Outlet'] = df['Outlet_Identifier']
var_mod = ['Item_Fat_Content','Outlet_Location_Type','Outlet_Size','Item_Type_Combined','Outlet_Type','Outlet']
le = LabelEncoder()
for column in var_mod:
    # fit_transform refits the encoder on every call, so one instance can serve all columns
    df[column] = le.fit_transform(df[column])

# One-hot encode the integer-coded columns (the new columns are named <column>_<code>)
df = pd.get_dummies(df, columns=['Item_Fat_Content','Outlet_Location_Type','Outlet_Size','Outlet_Type','Item_Type_Combined','Outlet'])

In [None]:
# Remove the columns that will not be used from here on
obsolete_columns = ['Item_Type', 'Outlet_Establishment_Year', 'Item_Identifier', 'Outlet_Identifier']
df.drop(columns=obsolete_columns, inplace=True)

In [None]:
# Preview the first five rows after preprocessing
df.head(n=5)

Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP,Item_Outlet_Sales,Outlet_Years,Item_Fat_Content_0,Item_Fat_Content_1,Outlet_Location_Type_0,Outlet_Location_Type_1,Outlet_Location_Type_2,...,Outlet_0,Outlet_1,Outlet_2,Outlet_3,Outlet_4,Outlet_5,Outlet_6,Outlet_7,Outlet_8,Outlet_9
0,9.3,0.016047,249.8092,3735.138,14,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
1,5.92,0.019278,48.2692,443.4228,4,0,1,0,0,1,...,0,0,0,1,0,0,0,0,0,0
2,17.5,0.01676,141.618,2097.27,14,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
3,19.2,0.0,182.095,732.38,15,0,1,0,0,1,...,1,0,0,0,0,0,0,0,0,0
4,8.93,0.0,53.8614,994.7052,26,1,0,0,0,1,...,0,1,0,0,0,0,0,0,0,0


**PREGUNTA**

Compara los ensemble methods aprendidos (con los hiperparámetros por defecto) para estimar el valor de 'Item_Outlet_Sales'. Usa como caso base de comparación un árbol de decisión normal: Decision Tree y los 5 del Colab de Ensemble.

Utiliza el error cuadrático medio para compararlos.
¿Qué algoritmo es mejor para este caso?

In [None]:
# Separate the target from the predictors and hold out 20% of the rows for evaluation
target_column = 'Item_Outlet_Sales'
y = df[target_column]
X = df.drop(columns=[target_column])
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Baseline model: a single decision tree with default hyperparameters.
# Only the seed is fixed, so that the reported MSE is the same on every run.
tree_model = DecisionTreeRegressor(random_state=42)

In [None]:
# Train the decision tree on the training split
tree_model.fit(X=X_train, y=y_train)

In [None]:
# Predict the target for the held-out test split
tree_predictions = tree_model.predict(X=X_test)

In [None]:
# Mean squared error of the decision tree on the test split
tree_mse = mean_squared_error(y_test, tree_predictions)
# Format inside the f-string itself rather than mixing an f-prefix with %-formatting
print(f" El error cuadratico medio con Arbol de Decision es: {tree_mse:.2f}")

 El error cuadratico medio con Arbol de Decision es: 2183496.48


In [None]:
# Random forest regressor with default hyperparameters.
# Only the seed is fixed, so that the reported MSE is the same on every run.
randomforest_model = RandomForestRegressor(random_state=42)

In [None]:
# Train the random forest on the training split
randomforest_model.fit(X=X_train, y=y_train)

In [None]:
# Evaluate the random forest on the test split
randomforest_predictions = randomforest_model.predict(X_test)
randon_forest_mse = mean_squared_error(y_test, randomforest_predictions)
# The message names the model correctly ("Random Forest") and is formatted inside the f-string
print(f" El error cuadratico medio con Random Forest Regressor es: {randon_forest_mse:.2f}")

 El error cuadratico medio con Randon Forest Regressor es: 1171812.23


In [None]:
# Extra-trees regressor with default hyperparameters.
# Only the seed is fixed, so that the reported MSE is the same on every run.
extra_trees_model = ExtraTreesRegressor(random_state=42)

In [None]:
# Train the extra-trees ensemble on the training split
extra_trees_model.fit(X=X_train, y=y_train)

In [None]:
# Evaluate the extra-trees ensemble on the test split
extra_trees_predictions = extra_trees_model.predict(X_test)
extra_tree_mse = mean_squared_error(y_test, extra_trees_predictions)
# The message names the model correctly ("Extra Trees")
print(f"El error cuadratico medio con Extra Trees Regressor es: {extra_tree_mse:.2f}")

El error cuadratico medio con Estra Tree Regressor es: 1254153.46


In [None]:
# Boosting methods:
# AdaBoost on top of a DecisionTreeRegressor built with its default arguments.
# Only the seed is fixed, so that the reported MSE is the same on every run.
# NOTE(review): AdaBoostRegressor's own default base estimator is a depth-limited tree;
# confirm that passing an unrestricted tree here is intended.
adaBoost_model = AdaBoostRegressor(DecisionTreeRegressor(), random_state=42)


In [None]:
# Train AdaBoost on the training split
adaBoost_model.fit(X=X_train, y=y_train)

In [None]:
# Evaluate AdaBoost on the test split
y_prediction = adaBoost_model.predict(X_test)
mseadaBoost = mean_squared_error(y_true=y_test, y_pred=y_prediction)
adaboost_message = "El error cuadratico medio AdaBoost Regressor es: %.2f" % mseadaBoost
print(adaboost_message)

El error cuadratico medio AdaBoost Regressor es: 1407138.75


In [None]:
# Gradient tree boosting (XGBoost) with default hyperparameters.
# XGBRegressor is already imported in the notebook's import cell, so no additional
# import is needed in this cell.
xgbr = XGBRegressor()
xgbr.fit(X_train, y_train)

In [None]:
# Evaluate XGBoost on the test split
xgbr_prediction = xgbr.predict(X_test)
mse_xgbr = mean_squared_error(y_true=y_test, y_pred=xgbr_prediction)
xgboost_message = "El error cuadratico medio con XgBoost Regressor es: %.2f" % mse_xgbr
print(xgboost_message)

El error cuadratico medio con XgBoost Regressor es: 1288927.32


In [None]:
# LightGBM regressor, constructed without arguments (library default hyperparameters)
lightgbm_model = LGBMRegressor()

In [None]:
# Train LightGBM on the training split
lightgbm_model.fit(X=X_train, y=y_train)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004914 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 799
[LightGBM] [Info] Number of data points in the train set: 6818, number of used features: 29
[LightGBM] [Info] Start training from score 2202.365232


In [None]:
# Evaluate LightGBM on the test split
lightgbm_model_prediction = lightgbm_model.predict(X_test)
mse_light = mean_squared_error(y_true=y_test, y_pred=lightgbm_model_prediction)
lightgbm_message = "El error cuadratico medio con Light GBM Regressor es: %.2f" % mse_light
print(lightgbm_message)

El error cuadratico medio con Light GBM Regressor es: 1108696.33


In [None]:
# Side-by-side summary of the test-set MSE of every model trained in this notebook,
# XGBoost included, so the comparison covers the baseline tree and all five ensembles.
mse_by_model = {
    "Arbol de Decision": tree_mse,
    "Random Forest Regressor": randon_forest_mse,
    "Extra Trees Regressor": extra_tree_mse,
    "AdaBoost Regressor": mseadaBoost,
    "XgBoost Regressor": mse_xgbr,
    "Light GBM Regressor": mse_light,
}
for model_name, model_mse in mse_by_model.items():
    print(f"El error cuadratico medio con {model_name} es: {model_mse:.2f}")

El error cuadratico medio con Arbol de Decision es: 2183496.48
El error cuadratico medio con Randon Forest Regressor es: 1171812.23
El error cuadratico medio con Estra Tree Regressor es: 1254153.46
El error cuadratico medio AdaBoost Regressor es: 1407138.75
El error cuadratico medio con Light GBM Regressor es: 1108696.33


Según los resultados anteriores, escogeríamos el modelo con el MSE más bajo, que en este caso es **LightGBM**, ya que un MSE más pequeño indica un mejor rendimiento del modelo.