# Import Library

In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import sklearn as sk


from sklearn.model_selection import train_test_split # Data Split
from sklearn.preprocessing import StandardScaler # Standarisasi
from sklearn.preprocessing import MinMaxScaler # Normalisasi
from sklearn.impute import SimpleImputer # Data Cleaning
from sklearn.preprocessing import OneHotEncoder # One Hot Encoder

# Membaca File CSV Kemudian Disimipan Kedalam Sebuah Variabel "data"

In [2]:
data = pd.read_csv('Big_Mart_Sale.csv')
data.head(600)

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.30,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.1380
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.50,Low Fat,0.016760,Meat,141.6180,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.2700
3,FDX07,19.20,Regular,0.000000,Fruits and Vegetables,182.0950,OUT010,1998,,Tier 3,Grocery Store,732.3800
4,NCD19,8.93,Low Fat,0.000000,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052
...,...,...,...,...,...,...,...,...,...,...,...,...
595,FDR08,18.70,Low Fat,0.037699,Fruits and Vegetables,110.2886,OUT045,2002,,Tier 2,Supermarket Type1,2223.7720
596,FDU04,7.93,Low Fat,0.000000,Frozen Foods,123.2414,OUT018,2009,Medium,Tier 3,Supermarket Type2,487.3656
597,NCQ50,18.75,Low Fat,0.034501,Household,211.7218,OUT017,2007,,Tier 2,Supermarket Type1,5770.4886
598,FDL46,,Low Fat,0.053795,Snack Foods,117.7466,OUT027,1985,Medium,Tier 3,Supermarket Type3,1414.1592


# Bagi dataset menjadi training set dan testing set dengan proporsi 70:30

In [3]:
X = data[['Item_Weight','Item_Visibility','Item_MRP','Item_Outlet_Sales']]
Y = data[['Item_Type']]

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3) 

In [4]:
print(f'Jumlah data training (Status) : {len(X_train)}', 
      f'Jumlah data testing (Status)  : {len(X_test)}',
      f'Jumlah data training (Tipe)   : {len(Y_train)}',
      f'Jumlah data testing (Tipe)    : {len(Y_test)}',
      sep='\n')

Jumlah data training (Status) : 5966
Jumlah data testing (Status)  : 2557
Jumlah data training (Tipe)   : 5966
Jumlah data testing (Tipe)    : 2557


# Lakukan normalisasi data pada salah satu attribute menggunakan Min Max scaler (buatlah copy dataset terlebih dahulu)

In [5]:
normalisasi = data.copy()

In [6]:
scaler = MinMaxScaler()

normalize_dataset = MinMaxScaler().fit_transform(normalisasi[['Item_Weight','Item_Visibility','Item_MRP','Item_Outlet_Sales']])

In [7]:
normalize_dataset = pd.DataFrame(normalize_dataset)
normalize_dataset.head(5)

Unnamed: 0,0,1,2,3
0,0.282525,0.048866,0.927507,0.283587
1,0.081274,0.058705,0.072068,0.031419
2,0.770765,0.051037,0.468288,0.158115
3,0.871986,0.0,0.640093,0.053555
4,0.260494,0.0,0.095805,0.073651


# Lakukan standarisasi pada dataset (buatlah copy dataset terlebih dahulu)

In [8]:
standarisasi = data.copy()

In [9]:
standarisasi.drop(['Item_Identifier','Item_Weight','Item_Fat_Content','Item_Type','Outlet_Identifier','Outlet_Establishment_Year','Outlet_Size','Outlet_Location_Type','Outlet_Type'], axis=1, inplace=True)

standarisasi

Unnamed: 0,Item_Visibility,Item_MRP,Item_Outlet_Sales
0,0.016047,249.8092,3735.1380
1,0.019278,48.2692,443.4228
2,0.016760,141.6180,2097.2700
3,0.000000,182.0950,732.3800
4,0.000000,53.8614,994.7052
...,...,...,...
8518,0.056783,214.5218,2778.3834
8519,0.046982,108.1570,549.2850
8520,0.035186,85.1224,1193.1136
8521,0.145221,103.1332,1845.5976


In [10]:
standart_std = StandardScaler()

scaled_data = standart_std.fit_transform(standarisasi)

print('Dataset sebelum scalling:',  # SEBELUM
      f'Nilai standar deviasi:\n{np.std(standarisasi)}',
      sep='\n', end='\n\n')

print('Dataset setelah scalling:',  # SESUDAH
      scaled_data[:6],
      f'Nilai standar deviasi: {np.std(scaled_data)}',
      sep='\n')

Dataset sebelum scalling:
Nilai standar deviasi:
Item_Visibility         0.051595
Item_MRP               62.271413
Item_Outlet_Sales    1706.399501
dtype: float64

Dataset setelah scalling:
[[-0.97073217  1.74745381  0.910601  ]
 [-0.90811123 -1.48902325 -1.01844035]
 [-0.95691733  0.01004021 -0.04923754]
 [-1.28175775  0.66004955 -0.84910299]
 [-1.28175775 -1.39921961 -0.69537275]
 [-1.28175775 -1.43873372 -0.95211005]]
Nilai standar deviasi: 1.0


# Lakukan Data cleaning pada data dengan nilai null

In [11]:
#Menggunakan strategy modus/most frequent

imputer_modus = SimpleImputer(strategy='most_frequent')

print(f'Jumlah record yang memiliki nilai null :')
print(data.isna().sum())

Jumlah record yang memiliki nilai null :
Item_Identifier                 0
Item_Weight                  1463
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  2410
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales               0
dtype: int64


In [12]:
data[['Item_Weight', 'Outlet_Size']] = imputer_modus.fit_transform(data[['Item_Weight', 'Outlet_Size']])
print(f'Jumlah record setelah menggunakan fungsi SimpleImputer :')
print(data.isna().sum())

Jumlah record setelah menggunakan fungsi SimpleImputer :
Item_Identifier              0
Item_Weight                  0
Item_Fat_Content             0
Item_Visibility              0
Item_Type                    0
Item_MRP                     0
Outlet_Identifier            0
Outlet_Establishment_Year    0
Outlet_Size                  0
Outlet_Location_Type         0
Outlet_Type                  0
Item_Outlet_Sales            0
dtype: int64


# Lakukan Data cleaning pada data dengan nilai duplikat

In [13]:
# Mengecek data duplikat

standarisasi.duplicated().sum()

1

In [14]:
# Melakukan drop pada data duplikat

standarisasi.drop_duplicates(inplace=True)

standarisasi.duplicated().sum()

0

# Ganti tipe data salah satu attribute angka

In [15]:
data.info() # Cek Info Dataset

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Identifier            8523 non-null   object 
 1   Item_Weight                8523 non-null   object 
 2   Item_Fat_Content           8523 non-null   object 
 3   Item_Visibility            8523 non-null   float64
 4   Item_Type                  8523 non-null   object 
 5   Item_MRP                   8523 non-null   float64
 6   Outlet_Identifier          8523 non-null   object 
 7   Outlet_Establishment_Year  8523 non-null   int64  
 8   Outlet_Size                8523 non-null   object 
 9   Outlet_Location_Type       8523 non-null   object 
 10  Outlet_Type                8523 non-null   object 
 11  Item_Outlet_Sales          8523 non-null   float64
dtypes: float64(3), int64(1), object(8)
memory usage: 799.2+ KB


In [16]:
dt = {'Outlet_Establishment_Year':float}
pd.options.display.float_format = '{:.1f}'.format
data['Outlet_Establishment_Year'] = data['Outlet_Establishment_Year'].astype('float64')

In [17]:
data.head(10)

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.0,Dairy,249.8,OUT049,1999.0,Medium,Tier 1,Supermarket Type1,3735.1
1,DRC01,5.9,Regular,0.0,Soft Drinks,48.3,OUT018,2009.0,Medium,Tier 3,Supermarket Type2,443.4
2,FDN15,17.5,Low Fat,0.0,Meat,141.6,OUT049,1999.0,Medium,Tier 1,Supermarket Type1,2097.3
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.1,OUT010,1998.0,Medium,Tier 3,Grocery Store,732.4
4,NCD19,8.9,Low Fat,0.0,Household,53.9,OUT013,1987.0,High,Tier 3,Supermarket Type1,994.7
5,FDP36,10.4,Regular,0.0,Baking Goods,51.4,OUT018,2009.0,Medium,Tier 3,Supermarket Type2,556.6
6,FDO10,13.7,Regular,0.0,Snack Foods,57.7,OUT013,1987.0,High,Tier 3,Supermarket Type1,343.6
7,FDP10,12.2,Low Fat,0.1,Snack Foods,107.8,OUT027,1985.0,Medium,Tier 3,Supermarket Type3,4022.8
8,FDH17,16.2,Regular,0.0,Frozen Foods,97.0,OUT045,2002.0,Medium,Tier 2,Supermarket Type1,1076.6
9,FDU28,19.2,Regular,0.1,Frozen Foods,187.8,OUT017,2007.0,Medium,Tier 2,Supermarket Type1,4710.5


In [18]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Identifier            8523 non-null   object 
 1   Item_Weight                8523 non-null   object 
 2   Item_Fat_Content           8523 non-null   object 
 3   Item_Visibility            8523 non-null   float64
 4   Item_Type                  8523 non-null   object 
 5   Item_MRP                   8523 non-null   float64
 6   Outlet_Identifier          8523 non-null   object 
 7   Outlet_Establishment_Year  8523 non-null   float64
 8   Outlet_Size                8523 non-null   object 
 9   Outlet_Location_Type       8523 non-null   object 
 10  Outlet_Type                8523 non-null   object 
 11  Item_Outlet_Sales          8523 non-null   float64
dtypes: float64(4), object(8)
memory usage: 799.2+ KB


# Lakukan one hot encoding pada dataset

In [19]:
data['Item_Type'].value_counts()

Fruits and Vegetables    1232
Snack Foods              1200
Household                 910
Frozen Foods              856
Dairy                     682
Canned                    649
Baking Goods              648
Health and Hygiene        520
Soft Drinks               445
Meat                      425
Breads                    251
Hard Drinks               214
Others                    169
Starchy Foods             148
Breakfast                 110
Seafood                    64
Name: Item_Type, dtype: int64

In [20]:
OneHot = data.copy()

data['Item_Type'].unique()

array(['Dairy', 'Soft Drinks', 'Meat', 'Fruits and Vegetables',
       'Household', 'Baking Goods', 'Snack Foods', 'Frozen Foods',
       'Breakfast', 'Health and Hygiene', 'Hard Drinks', 'Canned',
       'Breads', 'Starchy Foods', 'Others', 'Seafood'], dtype=object)

In [21]:
temp_dataset = pd.get_dummies(OneHot[['Item_Type']])

temp_dataset = pd.DataFrame(temp_dataset)

OneHot.drop('Item_Type', axis=1, inplace=True)

OneHot = OneHot.join(temp_dataset)
OneHot.head(10)

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,...,Item_Type_Fruits and Vegetables,Item_Type_Hard Drinks,Item_Type_Health and Hygiene,Item_Type_Household,Item_Type_Meat,Item_Type_Others,Item_Type_Seafood,Item_Type_Snack Foods,Item_Type_Soft Drinks,Item_Type_Starchy Foods
0,FDA15,9.3,Low Fat,0.0,249.8,OUT049,1999.0,Medium,Tier 1,Supermarket Type1,...,0,0,0,0,0,0,0,0,0,0
1,DRC01,5.9,Regular,0.0,48.3,OUT018,2009.0,Medium,Tier 3,Supermarket Type2,...,0,0,0,0,0,0,0,0,1,0
2,FDN15,17.5,Low Fat,0.0,141.6,OUT049,1999.0,Medium,Tier 1,Supermarket Type1,...,0,0,0,0,1,0,0,0,0,0
3,FDX07,19.2,Regular,0.0,182.1,OUT010,1998.0,Medium,Tier 3,Grocery Store,...,1,0,0,0,0,0,0,0,0,0
4,NCD19,8.9,Low Fat,0.0,53.9,OUT013,1987.0,High,Tier 3,Supermarket Type1,...,0,0,0,1,0,0,0,0,0,0
5,FDP36,10.4,Regular,0.0,51.4,OUT018,2009.0,Medium,Tier 3,Supermarket Type2,...,0,0,0,0,0,0,0,0,0,0
6,FDO10,13.7,Regular,0.0,57.7,OUT013,1987.0,High,Tier 3,Supermarket Type1,...,0,0,0,0,0,0,0,1,0,0
7,FDP10,12.2,Low Fat,0.1,107.8,OUT027,1985.0,Medium,Tier 3,Supermarket Type3,...,0,0,0,0,0,0,0,1,0,0
8,FDH17,16.2,Regular,0.0,97.0,OUT045,2002.0,Medium,Tier 2,Supermarket Type1,...,0,0,0,0,0,0,0,0,0,0
9,FDU28,19.2,Regular,0.1,187.8,OUT017,2007.0,Medium,Tier 2,Supermarket Type1,...,0,0,0,0,0,0,0,0,0,0
