<a href="https://colab.research.google.com/github/bbalbo/prediccion-de-ventas/blob/main/pipeline_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [27]:
#Libraries are downloaded
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn import set_config
set_config(display='diagram')


In [5]:
df=pd.read_csv('/content/drive/MyDrive/sales_predictions.csv')
df

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.300,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.1380
1,DRC01,5.920,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.500,Low Fat,0.016760,Meat,141.6180,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.2700
3,FDX07,19.200,Regular,0.000000,Fruits and Vegetables,182.0950,OUT010,1998,,Tier 3,Grocery Store,732.3800
4,NCD19,8.930,Low Fat,0.000000,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052
...,...,...,...,...,...,...,...,...,...,...,...,...
8518,FDF22,6.865,Low Fat,0.056783,Snack Foods,214.5218,OUT013,1987,High,Tier 3,Supermarket Type1,2778.3834
8519,FDS36,8.380,Regular,0.046982,Baking Goods,108.1570,OUT045,2002,,Tier 2,Supermarket Type1,549.2850
8520,NCJ29,10.600,Low Fat,0.035186,Health and Hygiene,85.1224,OUT035,2004,Small,Tier 2,Supermarket Type1,1193.1136
8521,FDN46,7.210,Regular,0.145221,Snack Foods,103.1332,OUT018,2009,Medium,Tier 3,Supermarket Type2,1845.5976


In [6]:
df['Item_Fat_Content'].value_counts()

Low Fat    5089
Regular    2889
LF          316
reg         117
low fat     112
Name: Item_Fat_Content, dtype: int64

In [7]:
# The data type of the 'Item_MRP' is modified to float since it was an object
df['Item_MRP']=df['Item_MRP'].astype(float)

In [8]:
# Feature values of 'Item_Fat_Content' ​​are replaced to 0 and 1
df['Item_Fat_Content'].replace({'low fat':0,'LF':0,'Low Fat':0,'Regular':1,'reg':1}, inplace=True)
df['Item_Fat_Content'].value_counts()

0    5517
1    3006
Name: Item_Fat_Content, dtype: int64

In [9]:
# Columns that are not relevant for the analysis are eliminated
y=df['Item_Outlet_Sales']
X=df.drop(columns=['Item_Outlet_Sales','Item_Identifier','Outlet_Identifier','Outlet_Establishment_Year'])

In [10]:
X

Unnamed: 0,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Size,Outlet_Location_Type,Outlet_Type
0,9.300,0,0.016047,Dairy,249.8092,Medium,Tier 1,Supermarket Type1
1,5.920,1,0.019278,Soft Drinks,48.2692,Medium,Tier 3,Supermarket Type2
2,17.500,0,0.016760,Meat,141.6180,Medium,Tier 1,Supermarket Type1
3,19.200,1,0.000000,Fruits and Vegetables,182.0950,,Tier 3,Grocery Store
4,8.930,0,0.000000,Household,53.8614,High,Tier 3,Supermarket Type1
...,...,...,...,...,...,...,...,...
8518,6.865,0,0.056783,Snack Foods,214.5218,High,Tier 3,Supermarket Type1
8519,8.380,1,0.046982,Baking Goods,108.1570,,Tier 2,Supermarket Type1
8520,10.600,0,0.035186,Health and Hygiene,85.1224,Small,Tier 2,Supermarket Type1
8521,7.210,1,0.145221,Snack Foods,103.1332,Medium,Tier 3,Supermarket Type2


In [11]:
# Training and test sets are generated
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [12]:
X_train.Outlet_Size.value_counts()

Medium    2103
Small     1788
High       689
Name: Outlet_Size, dtype: int64

In [13]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6392 entries, 4776 to 7270
Data columns (total 8 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Item_Weight           5285 non-null   float64
 1   Item_Fat_Content      6392 non-null   int64  
 2   Item_Visibility       6392 non-null   float64
 3   Item_Type             6392 non-null   object 
 4   Item_MRP              6392 non-null   float64
 5   Outlet_Size           4580 non-null   object 
 6   Outlet_Location_Type  6392 non-null   object 
 7   Outlet_Type           6392 non-null   object 
dtypes: float64(3), int64(1), object(4)
memory usage: 449.4+ KB


In [14]:
#ordinales
X_train['Outlet_Size'].value_counts()

#nominales
X_train['Item_Type'].value_counts()
X_train['Outlet_Location_Type'].value_counts()
X_train['Outlet_Type'].value_counts()

Supermarket Type1    4166
Grocery Store         799
Supermarket Type3     723
Supermarket Type2     704
Name: Outlet_Type, dtype: int64

In [15]:
X_train

Unnamed: 0,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Size,Outlet_Location_Type,Outlet_Type
4776,16.350,0,0.029565,Household,256.4646,Medium,Tier 3,Supermarket Type2
7510,15.250,1,0.000000,Snack Foods,179.7660,Medium,Tier 3,Supermarket Type2
5828,12.350,1,0.158716,Meat,157.2946,Medium,Tier 1,Supermarket Type1
5327,7.975,0,0.014628,Baking Goods,82.3250,Small,Tier 2,Supermarket Type1
4810,19.350,0,0.016645,Frozen Foods,120.9098,,Tier 2,Supermarket Type1
...,...,...,...,...,...,...,...,...
5734,9.395,1,0.286345,Fruits and Vegetables,139.1838,,Tier 3,Grocery Store
5191,15.600,0,0.117575,Frozen Foods,75.6670,,Tier 2,Supermarket Type1
5390,17.600,0,0.018944,Health and Hygiene,237.3590,,Tier 2,Supermarket Type1
860,20.350,0,0.054363,Snack Foods,117.9466,,Tier 2,Supermarket Type1


In [16]:
#Selectores
cat_selector=make_column_selector(dtype_include='object')
num_selector=make_column_selector(dtype_exclude='object')
cat1_selector=make_column_selector('.*Size')
cat2_selector=make_column_selector('.*Type')

#Imputers
cat_imputer=SimpleImputer(strategy="most_frequent")
num_imputer=SimpleImputer(strategy='mean')

#Scaler
scaler=StandardScaler()

#One-Hot
ohe=OneHotEncoder(handle_unknown='ignore', sparse=True)

#Ordinal Encoder
oe=OrdinalEncoder()

In [17]:
num_pipe=make_pipeline(num_imputer,scaler)
cat1_pipe=make_pipeline(cat_imputer,oe)
cat2_pipe=make_pipeline(cat_imputer,ohe)

In [18]:
num_tuple=(num_pipe, num_selector)
cat1_tuple=(cat1_pipe, cat1_selector)
cat2_tuple=(cat2_pipe, cat2_selector)

In [19]:
col_transformer=make_column_transformer(num_tuple, cat1_tuple, cat2_tuple)

In [20]:
col_transformer.fit(X_train)
X_train_imputed=col_transformer.transform(X_train)
X_test_imputed=col_transformer.transform(X_test)

In [21]:
pd.DataFrame(X_train_imputed.toarray())

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,18,19,20,21,22,23,24,25,26,27
0,0.817249,-0.740321,-0.712775,1.828109,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1,0.556340,1.350766,-1.291052,0.603369,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2,-0.131512,1.350766,1.813319,0.244541,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
3,-1.169219,-0.740321,-1.004931,-0.952591,2.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
4,1.528819,-0.740321,-0.965484,-0.336460,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6387,-0.832409,1.350766,4.309657,-0.044657,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
6388,0.639356,-0.740321,1.008625,-1.058907,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
6389,1.113736,-0.740321,-0.920527,1.523027,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
6390,1.766009,-0.740321,-0.227755,-0.383777,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0


# Modelo de Regresión

In [30]:
reg=LinearRegression()
reg.fit(X_train_imputed,y_train)
print(reg.score(X_train_imputed,y_train))
print(reg.score(X_test_imputed,y_test))
print(np.sqrt(mean_squared_error(y_train,reg.predict(X_train_imputed))))
print(np.sqrt(mean_squared_error(y_test,reg.predict(X_test_imputed))))

0.5605748930349453
0.5658450033049149
1140.3767533231833
1094.451358110315


# Modelo de Arbol de Regresión

In [42]:
dec_tree=DecisionTreeRegressor(random_state=42, max_depth=9)
dec_tree.fit(X_train_imputed,y_train)
print(dec_tree.score(X_train_imputed,y_train))
print(dec_tree.score(X_test_imputed,y_test))
print(np.sqrt(mean_squared_error(y_train,dec_tree.predict(X_train_imputed))))
print(np.sqrt(mean_squared_error(y_test,dec_tree.predict(X_test_imputed))))

0.6636092399790507
0.5527563185347324
997.7642776197305
1110.8263407381494
