In [1]:
# Import dependencies
# import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
# from sklearn.model_selection import train_test_split
# from sklearn.linear_model import LinearRegression
# from sklearn.metrics import r2_score
# import matplotlib.pyplot as plt
# import seaborn as sns
# import warnings
# warnings.filterwarnings("ignore")

In [2]:
# Load the store dataset from its CSV file into a data frame,
# then preview the first five rows.
store_df = pd.read_csv(filepath_or_buffer='Resources/store_data.csv')
store_df.head(n=5)

Unnamed: 0,id,food_category,food_department,food_family,store_sales(in millions),store_cost(in millions),unit_sales(in millions),promotion_name,brand_name,SRP,...,grocery_sqft,frozen_sqft,meat_sqft,coffee_bar,video_store,salad_bar,prepared_food,florist,cost,media_type
0,0,Breakfast Foods,Frozen Foods,Food,7.36,2.7232,4.0,Bag Stuffers,Carrington,1.84,...,18670.0,5415.0,3610.0,1.0,1.0,1.0,1.0,1.0,126.62,"Daily Paper, Radio"
1,1,Breakfast Foods,Frozen Foods,Food,5.52,2.5944,3.0,Cash Register Lottery,Carrington,1.84,...,18670.0,5415.0,3610.0,1.0,1.0,1.0,1.0,1.0,59.86,"Daily Paper, Radio"
2,2,Breakfast Foods,Frozen Foods,Food,3.68,1.3616,2.0,High Roller Savings,Carrington,1.84,...,18670.0,5415.0,3610.0,1.0,1.0,1.0,1.0,1.0,84.16,"Daily Paper, Radio"
3,3,Breakfast Foods,Frozen Foods,Food,3.68,1.1776,2.0,Cash Register Lottery,Carrington,1.84,...,18670.0,5415.0,3610.0,1.0,1.0,1.0,1.0,1.0,95.78,In-Store Coupon
4,4,Breakfast Foods,Frozen Foods,Food,4.08,1.428,3.0,Double Down Sale,Golden,1.36,...,18670.0,5415.0,3610.0,1.0,1.0,1.0,1.0,1.0,50.79,Radio


In [3]:
# Check data frame shape: .shape is a (row count, column count) tuple.
# The name `shape` is kept so any later cell that reads it still works.
shape = store_df.shape
print(f"Store dataset including {shape[0]} rows and {shape[1]} columns.")

Store dataset including 38892 rows and 29 cloumns.


In [4]:
# Check data frame columns: display the column labels of store_df.
# As the last expression in the cell, the Index object is rendered as the output.
store_df.columns

Index(['id', 'food_category', 'food_department', 'food_family',
       'store_sales(in millions)', 'store_cost(in millions)',
       'unit_sales(in millions)', 'promotion_name', 'brand_name', 'SRP',
       'gross_weight', 'net_weight', 'recyclable_package', 'low_fat',
       'units_per_case', 'store_type', 'store_city', 'store_state',
       'store_sqft', 'grocery_sqft', 'frozen_sqft', 'meat_sqft', 'coffee_bar',
       'video_store', 'salad_bar', 'prepared_food', 'florist', 'cost',
       'media_type'],
      dtype='object')

In [5]:
# Check datatypes: display the dtype of every column in store_df
# (one row per column in the rendered output).
store_df.dtypes

id                            int64
food_category                object
food_department              object
food_family                  object
store_sales(in millions)    float64
store_cost(in millions)     float64
unit_sales(in millions)     float64
promotion_name               object
brand_name                   object
SRP                         float64
gross_weight                float64
net_weight                  float64
recyclable_package          float64
low_fat                     float64
units_per_case              float64
store_type                   object
store_city                   object
store_state                  object
store_sqft                  float64
grocery_sqft                float64
frozen_sqft                 float64
meat_sqft                   float64
coffee_bar                  float64
video_store                 float64
salad_bar                   float64
prepared_food               float64
florist                     float64
cost                        float64
media_type                   object
dtype: object

In [6]:
# Count the missing (NaN / None) entries in each column of store_df.
store_df.isna().sum()

id                          0
food_category               0
food_department             0
food_family                 0
store_sales(in millions)    0
store_cost(in millions)     0
unit_sales(in millions)     0
promotion_name              0
brand_name                  0
SRP                         0
gross_weight                0
net_weight                  0
recyclable_package          0
low_fat                     0
units_per_case              0
store_type                  0
store_city                  0
store_state                 0
store_sqft                  0
grocery_sqft                0
frozen_sqft                 0
meat_sqft                   0
coffee_bar                  0
video_store                 0
salad_bar                   0
prepared_food               0
florist                     0
cost                        0
media_type                  0
dtype: int64

In [7]:
# Build the list of categorical variables: the names of all columns
# whose dtype is 'object', in their original column order.
cat = [column for column, dtype in store_df.dtypes.items() if dtype == 'object']
cat

['food_category',
 'food_department',
 'food_family',
 'promotion_name',
 'brand_name',
 'store_type',
 'store_city',
 'store_state',
 'media_type']

In [8]:
# Count the number of categorical variables, i.e. the length of the `cat`
# list of object-dtype column names.
print(f"There are {len(cat)} categorical variables in store data frame.")

There are 9 categorical variables in store data frame.


In [9]:
# Build the list of numerical variables: the names of all columns
# whose dtype is 'int64' or 'float64', in their original column order.
num = [
    column
    for column, dtype in store_df.dtypes.items()
    if dtype == 'int64' or dtype == 'float64'
]
num

['id',
 'store_sales(in millions)',
 'store_cost(in millions)',
 'unit_sales(in millions)',
 'SRP',
 'gross_weight',
 'net_weight',
 'recyclable_package',
 'low_fat',
 'units_per_case',
 'store_sqft',
 'grocery_sqft',
 'frozen_sqft',
 'meat_sqft',
 'coffee_bar',
 'video_store',
 'salad_bar',
 'prepared_food',
 'florist',
 'cost']

In [10]:
# Count the number of numerical variables, i.e. the length of the `num`
# list of int64 / float64 column names.
print(f"There are {len(num)} numerical variables in store data frame.")

There are 20 numerical variables in store data frame.


## Multiple Linear Regression

In [11]:
# Create a new data frame that contains only the store-related numerical
# variables (sales figures, floor areas, amenity flags and cost).
store_num = store_df[
    [
        'store_sales(in millions)',
        'store_cost(in millions)',
        'unit_sales(in millions)',
        'store_sqft',
        'grocery_sqft',
        'frozen_sqft',
        'meat_sqft',
        'coffee_bar',
        'video_store',
        'salad_bar',
        'prepared_food',
        'florist',
        'cost',
    ]
]
store_num

Unnamed: 0,store_sales(in millions),store_cost(in millions),unit_sales(in millions),store_sqft,grocery_sqft,frozen_sqft,meat_sqft,coffee_bar,video_store,salad_bar,prepared_food,florist,cost
0,7.36,2.7232,4.0,27694.0,18670.0,5415.0,3610.0,1.0,1.0,1.0,1.0,1.0,126.62
1,5.52,2.5944,3.0,27694.0,18670.0,5415.0,3610.0,1.0,1.0,1.0,1.0,1.0,59.86
2,3.68,1.3616,2.0,27694.0,18670.0,5415.0,3610.0,1.0,1.0,1.0,1.0,1.0,84.16
3,3.68,1.1776,2.0,27694.0,18670.0,5415.0,3610.0,1.0,1.0,1.0,1.0,1.0,95.78
4,4.08,1.4280,3.0,27694.0,18670.0,5415.0,3610.0,1.0,1.0,1.0,1.0,1.0,50.79
...,...,...,...,...,...,...,...,...,...,...,...,...,...
60421,0.99,0.4554,1.0,22478.0,15321.0,4294.0,2863.0,1.0,0.0,0.0,0.0,0.0,127.19
60422,1.21,0.4477,1.0,22478.0,15321.0,4294.0,2863.0,1.0,0.0,0.0,0.0,0.0,78.45
60423,2.76,1.3248,1.0,22478.0,15321.0,4294.0,2863.0,1.0,0.0,0.0,0.0,0.0,95.25
60424,1.60,0.4960,1.0,22478.0,15321.0,4294.0,2863.0,1.0,0.0,0.0,0.0,0.0,69.42


In [12]:
# Summary statistics (count, mean, std, min, quartiles, max) for every
# column of the store_num data frame.
store_num.describe()

Unnamed: 0,store_sales(in millions),store_cost(in millions),unit_sales(in millions),store_sqft,grocery_sqft,frozen_sqft,meat_sqft,coffee_bar,video_store,salad_bar,prepared_food,florist,cost
count,38892.0,38892.0,38892.0,38892.0,38892.0,38892.0,38892.0,38892.0,38892.0,38892.0,38892.0,38892.0,38892.0
mean,6.515973,2.6069,3.080556,27199.413427,18241.415895,5374.95107,3583.187314,0.562532,0.247583,0.482979,0.482979,0.496066,98.40235
std,3.460749,1.447546,0.832546,5957.915401,3739.762002,1733.637554,1155.5597,0.496081,0.431614,0.499717,0.499717,0.499991,30.434059
min,0.51,0.1705,1.0,20319.0,13305.0,2452.0,1635.0,0.0,0.0,0.0,0.0,0.0,50.79
25%,3.78,1.4868,3.0,21215.0,15321.0,4746.0,3164.0,0.0,0.0,0.0,0.0,0.0,69.47
50%,5.91,2.37155,3.0,27694.0,18670.0,5011.0,3340.0,1.0,0.0,0.0,0.0,0.0,98.52
75%,8.65,3.475275,4.0,30268.0,22123.0,5633.0,3755.0,1.0,0.0,1.0,1.0,1.0,126.62
max,19.9,9.7265,6.0,39696.0,24390.0,9184.0,6122.0,1.0,1.0,1.0,1.0,1.0,149.75


In [13]:
# Count unique values for each categorical variable column
# (the columns listed in `cat`).
store_df[cat].nunique()

food_category       45
food_department     22
food_family          3
promotion_name      49
brand_name         111
store_type           4
store_city          10
store_state          3
media_type          13
dtype: int64

In [16]:
# Create a new data frame that contains only the store-related categorical
# variables, then preview its first five rows.
store_cat = store_df[
    [
        'promotion_name',
        'store_type',
        'store_city',
        'store_state',
        'media_type',
    ]
]
store_cat.head(n=5)

Unnamed: 0,promotion_name,store_type,store_city,store_state,media_type
0,Bag Stuffers,Deluxe Supermarket,Salem,OR,"Daily Paper, Radio"
1,Cash Register Lottery,Deluxe Supermarket,Salem,OR,"Daily Paper, Radio"
2,High Roller Savings,Deluxe Supermarket,Salem,OR,"Daily Paper, Radio"
3,Cash Register Lottery,Deluxe Supermarket,Salem,OR,In-Store Coupon
4,Double Down Sale,Deluxe Supermarket,Salem,OR,Radio


In [None]:
# Define X and y for multiple linear regression.
# No `store_mlr` frame is defined in this notebook; the 13-column numeric frame
# whose last column (position 12) is `cost` is `store_num`, so select from it.
# Columns are picked by name so the split does not depend on column order:
# `cost` is the target and the remaining 12 numeric columns are the features.
X = store_num.drop(columns='cost')
y = store_num['cost']

In [None]:
# Create a OneHotEncoder instance that returns a dense array.
# scikit-learn 1.2 renamed the `sparse` argument to `sparse_output` and 1.4
# removed the old name, so try the current keyword first and fall back to the
# legacy one on older installations.
try:
    enc = OneHotEncoder(sparse_output=False)
except TypeError:
    enc = OneHotEncoder(sparse=False)

