In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.preprocessing import RobustScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [2]:
# Load the training CSV; the path is relative to the notebook's working directory.
df = pd.read_csv('bigmart/train.csv')

In [3]:
# Display the loaded frame as the cell output to eyeball columns and missing values.
df

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.300,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.1380
1,DRC01,5.920,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.500,Low Fat,0.016760,Meat,141.6180,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.2700
3,FDX07,19.200,Regular,0.000000,Fruits and Vegetables,182.0950,OUT010,1998,,Tier 3,Grocery Store,732.3800
4,NCD19,8.930,Low Fat,0.000000,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052
...,...,...,...,...,...,...,...,...,...,...,...,...
8518,FDF22,6.865,Low Fat,0.056783,Snack Foods,214.5218,OUT013,1987,High,Tier 3,Supermarket Type1,2778.3834
8519,FDS36,8.380,Regular,0.046982,Baking Goods,108.1570,OUT045,2002,,Tier 2,Supermarket Type1,549.2850
8520,NCJ29,10.600,Low Fat,0.035186,Health and Hygiene,85.1224,OUT035,2004,Small,Tier 2,Supermarket Type1,1193.1136
8521,FDN46,7.210,Regular,0.145221,Snack Foods,103.1332,OUT018,2009,Medium,Tier 3,Supermarket Type2,1845.5976


In [4]:
# Target: the sales figure to be predicted.
y = df['Item_Outlet_Sales']

# Features: every remaining column except the target and the item identifier.
X = df.drop(columns=['Item_Identifier', 'Item_Outlet_Sales'])

In [5]:
# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
(
    X_train,
    X_test,
    y_train,
    y_test,
) = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
# Declare which feature columns are categorical and which are numerical.
cat_columns = [
    'Item_Fat_Content',
    'Item_Type',
    'Outlet_Identifier',
    'Outlet_Size',
    'Outlet_Location_Type',
    'Outlet_Type',
]

num_columns = [
    'Item_Weight',
    'Item_Visibility',
    'Item_MRP',
    'Outlet_Establishment_Year',
]

In [7]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder,LabelEncoder,MinMaxScaler,StandardScaler,RobustScaler
from sklearn.compose import ColumnTransformer

In [8]:
# Numeric features: fill missing values with the column mean, then scale
# with RobustScaler.
numerical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', RobustScaler()),
])

# Categorical features: fill missing values with the most frequent category,
# then one-hot encode. handle_unknown='ignore' makes the encoder output all
# zeros for a category it did not see during fit instead of raising, so
# transforming/predicting on new data cannot fail on a rare label.
categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group to its own preprocessing pipeline.
prepprocessor = ColumnTransformer([
    ('numeric', numerical_pipeline, num_columns),
    ('categorical', categorical_pipeline, cat_columns),
])

# End-to-end model: preprocessing followed by a random forest regressor.
# The forest is seeded so a re-run produces the same model and scores.
Pipe_RF = Pipeline([
    ("prep", prepprocessor),
    ("algo", RandomForestRegressor(random_state=42)),
])

In [9]:
# Fit the numeric pipeline (mean imputation + RobustScaler) on the training
# numeric columns only and keep the transformed values for inspection.
X_train_num = numerical_pipeline.fit_transform(X_train[num_columns])

In [10]:
# Show the imputed and scaled numeric training features.
X_train_num

array([[-0.50504809, -0.27888189,  0.29220849,  0.        ],
       [ 0.76266257, -0.09808687,  0.28354863,  0.17647059],
       [ 0.7030056 ,  0.32413765, -0.3511271 , -0.11764706],
       ...,
       [ 0.7030056 , -0.51855422,  1.00448639,  0.17647059],
       [ 1.11314729,  0.00345148, -0.28394569,  0.47058824],
       [ 0.51657756, -0.54730013, -0.52353896, -0.11764706]])

In [11]:
# Fit the categorical pipeline (most-frequent imputation + one-hot encoding)
# on the training categorical columns only and keep the result for inspection.
X_train_cat = categorical_pipeline.fit_transform(X_train[cat_columns])

In [12]:
# Convert the encoded categorical matrix to a dense array so its values are displayed.
X_train_cat.toarray()

array([[0., 0., 1., ..., 1., 0., 0.],
       [0., 1., 0., ..., 1., 0., 0.],
       [0., 0., 1., ..., 1., 0., 0.],
       ...,
       [0., 1., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       [1., 0., 0., ..., 1., 0., 0.]])

In [13]:
# Fit the combined column transformer on the training split and transform it.
X_train_prep = prepprocessor.fit_transform(X_train)

In [14]:
# Show the preprocessed training matrix (the recorded output is a sparse matrix summary).
X_train_prep

<6818x45 sparse matrix of type '<class 'numpy.float64'>'
	with 66286 stored elements in Compressed Sparse Row format>

In [15]:
# Transform the test split with the preprocessor that was fitted on X_train.
# Using fit_transform here would re-learn the imputation values, scaling
# statistics and one-hot categories from the test data (leakage) and could
# produce a column layout that does not match X_train_prep.
X_test_prep = prepprocessor.transform(X_test)

In [16]:
# Fit the full pipeline (preprocessing + random forest) on the raw training split.
Pipe_RF.fit(X_train,y_train)

Pipeline(steps=[('prep',
                 ColumnTransformer(transformers=[('numeric',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer()),
                                                                  ('scaler',
                                                                   RobustScaler())]),
                                                  ['Item_Weight',
                                                   'Item_Visibility',
                                                   'Item_MRP',
                                                   'Outlet_Establishment_Year']),
                                                 ('categorical',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                       

In [17]:
# Score the fitted pipeline on the held-out test split
# (for scikit-learn regressors, score() reports R^2).
Pipe_RF.score(X_test,y_test)

0.5701740362800258

In [18]:
# Score the fitted pipeline on the data it was trained on; compare with the
# test score to judge how much the model overfits.
Pipe_RF.score(X_train,y_train)

0.9393756332034153

In [19]:
# Peek at the first rows of the raw (unprocessed) training features.
X_train.head()

Unnamed: 0,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type
549,9.5,Regular,0.035206,Fruits and Vegetables,171.3448,OUT049,1999,Medium,Tier 1,Supermarket Type1
7757,18.0,Low Fat,0.047473,Household,170.5422,OUT045,2002,,Tier 2,Supermarket Type1
764,17.6,Regular,0.076122,Meat,111.7202,OUT046,1997,Small,Tier 1,Supermarket Type1
6867,8.325,Low Fat,0.029845,Fruits and Vegetables,41.6138,OUT045,2002,,Tier 2,Supermarket Type1
2716,12.85,Low Fat,0.137228,Snack Foods,155.563,OUT046,1997,Small,Tier 1,Supermarket Type1


In [20]:
# Peek at the first rows of the raw (unprocessed) test features.
X_test.head()

Unnamed: 0,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type
7503,14.3,Low Fat,0.0263,Frozen Foods,79.4302,OUT013,1987,High,Tier 3,Supermarket Type1
2957,7.93,Low Fat,0.071136,Health and Hygiene,42.7086,OUT046,1997,Small,Tier 1,Supermarket Type1
7031,14.5,Regular,0.041313,Canned,42.0454,OUT049,1999,Medium,Tier 1,Supermarket Type1
1084,,Regular,0.044767,Soft Drinks,173.7054,OUT027,1985,Medium,Tier 3,Supermarket Type3
856,10.195,Regular,0.012456,Meat,197.511,OUT035,2004,Small,Tier 2,Supermarket Type1
