In [71]:
import warnings
# NOTE(review): action='ignore' with no category filter silences every warning
# for the rest of the session, so warnings raised by later cells (from pandas,
# sklearn, etc.) will not be shown. Consider narrowing this to specific categories.
warnings.filterwarnings(action='ignore')

import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf

%matplotlib inline

# Importing the Data

In [72]:
# Load the raw dataset; the relative path is resolved against the kernel's working directory
data = pd.read_csv('Carseats.csv')

In [73]:
# Preview the first rows, then report (rows, columns) as the cell's result.
# Kept as two statements: writing `display(...), data.shape` builds a tuple whose
# first element is display()'s return value (None), which then gets echoed too.
display(data.head())
data.shape

Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban,US
0,9.5,138,73,11,276,120,Bad,42,17,Yes,Yes
1,11.22,111,48,16,260,83,Good,65,10,Yes,Yes
2,10.06,113,35,10,269,80,Medium,59,12,Yes,Yes
3,7.4,117,100,4,466,97,Medium,55,14,Yes,Yes
4,4.15,141,64,3,340,128,Bad,38,13,Yes,No


(None, (400, 11))

In [74]:
# Summarize column dtypes and non-null counts to check for missing values
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 11 columns):
Sales          400 non-null float64
CompPrice      400 non-null int64
Income         400 non-null int64
Advertising    400 non-null int64
Population     400 non-null int64
Price          400 non-null int64
ShelveLoc      400 non-null object
Age            400 non-null int64
Education      400 non-null int64
Urban          400 non-null object
US             400 non-null object
dtypes: float64(1), int64(7), object(3)
memory usage: 34.5+ KB


# Preprocessing

In [75]:
# Work on a copy so the originally loaded `data` frame is left unmodified by later steps
df = data.copy()

In [76]:
# Target: the dependent variable, 'Sales'
y = df['Sales']

# Predictors: every remaining column once 'Sales' has been removed
X = df.drop(columns = 'Sales')

In [77]:
# Train/test split (70% train / 30% test), then drop duplicate and incomplete training rows

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .3, random_state = 33)

# Clean without in-place mutation: the split hands back subsets of X, and
# reassigning a fresh frame keeps the cell safe to re-run.
X_train = X_train.drop_duplicates().dropna()

# Keep the target aligned with the surviving training rows; otherwise any row
# removed above would leave y_train with labels that X_train no longer has.
y_train = y_train.loc[X_train.index]

# Numeric Data

In [78]:
# Inspect dtypes and non-null counts of the training features
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 280 entries, 11 to 20
Data columns (total 10 columns):
CompPrice      280 non-null int64
Income         280 non-null int64
Advertising    280 non-null int64
Population     280 non-null int64
Price          280 non-null int64
ShelveLoc      280 non-null object
Age            280 non-null int64
Education      280 non-null int64
Urban          280 non-null object
US             280 non-null object
dtypes: int64(7), object(3)
memory usage: 24.1+ KB


In [79]:
# Keep every column whose dtype is not 'object' as the numeric subset of the training features
X_train_numeric = X_train.select_dtypes(exclude = 'object')

# Store the numeric column labels under their own name for reuse in later cells
X_numeric_cols = X_train_numeric.columns

In [80]:
# Confirm that only non-object columns remain after the dtype selection
X_train_numeric.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 280 entries, 11 to 20
Data columns (total 7 columns):
CompPrice      280 non-null int64
Income         280 non-null int64
Advertising    280 non-null int64
Population     280 non-null int64
Price          280 non-null int64
Age            280 non-null int64
Education      280 non-null int64
dtypes: int64(7)
memory usage: 17.5 KB


In [81]:
# Preview the first rows of the numeric training features (still in their original units)
X_train_numeric.head()

Unnamed: 0,CompPrice,Income,Advertising,Population,Price,Age,Education
11,117,94,4,503,94,50,13
214,115,115,3,48,107,73,18
361,131,25,10,183,104,56,15
98,122,77,24,382,127,36,16
358,123,96,10,71,118,69,11


In [82]:
# Perform standard scaling on our numeric data

from sklearn.preprocessing import StandardScaler
from scipy import stats  # `stats` is used by the outlier-removal cell that follows

# Instantiate a standard scaler object
ss = StandardScaler()

# fit_transform returns a bare array, so rebuild the DataFrame in a single step.
# Row labels are taken from the frame being scaled (not from X_train), so this
# cell cannot fail with a length mismatch if the two frames ever differ in rows.
X_train_numeric = pd.DataFrame(
    ss.fit_transform(X_train_numeric),
    index = X_train_numeric.index,
    columns = X_numeric_cols,
)
X_train_numeric.head()

Unnamed: 0,CompPrice,Income,Advertising,Population,Price,Age,Education
11,-0.519682,0.865873,-0.413823,1.619408,-0.952591,-0.180258,-0.386274
214,-0.648968,1.614819,-0.562949,-1.456166,-0.391427,1.260019,1.504576
361,0.385317,-1.594949,0.48093,-0.543633,-0.520926,0.195466,0.370066
98,-0.196468,0.259584,2.568687,0.801508,0.471902,-1.056948,0.748236
358,-0.131825,0.937202,0.48093,-1.300697,0.083404,1.009536,-1.142613


In [83]:
# Outlier filter: retain only the rows in which every column's |z-score| is below 2.5
abs_z = np.abs(stats.zscore(X_train_numeric))
inlier_rows = (abs_z < 2.5).all(axis = 1)
X_train_numeric = X_train_numeric[inlier_rows]