In [1]:
# Import dependencies
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from factor_analyzer import FactorAnalyzer
# from sklearn.model_selection import train_test_split
# from sklearn.linear_model import LinearRegression
# from sklearn.metrics import r2_score
import matplotlib.pyplot as plt
# import seaborn as sns
# import warnings
# warnings.filterwarnings("ignore")

In [2]:
# Read the customer CSV into a DataFrame and preview its first five rows
customer_df = pd.read_csv('Resources/customer_data.csv')
customer_df.head(5)

Unnamed: 0,id,marital_status,gender,total_children,education,member_card,occupation,houseowner,avg_cars_at home(approx),avg. yearly_income,num_children_at_home,store_city,store_state,media_type,cost,promotion_name
0,0,M,F,1.0,Partial High School,Normal,Skilled Manual,Y,1.0,$10K - $30K,1.0,Salem,OR,"Daily Paper, Radio",126.62,Bag Stuffers
1,1,M,M,0.0,Bachelors Degree,Silver,Professional,Y,4.0,$50K - $70K,0.0,Salem,OR,"Daily Paper, Radio",59.86,Cash Register Lottery
2,2,S,F,4.0,Partial High School,Normal,Manual,N,1.0,$10K - $30K,0.0,Salem,OR,"Daily Paper, Radio",84.16,High Roller Savings
3,3,M,F,2.0,High School Degree,Bronze,Manual,Y,2.0,$30K - $50K,2.0,Salem,OR,In-Store Coupon,95.78,Cash Register Lottery
4,4,M,M,0.0,Partial High School,Bronze,Skilled Manual,N,2.0,$30K - $50K,0.0,Salem,OR,Radio,50.79,Double Down Sale


In [3]:
# Report the DataFrame dimensions as (rows, columns).
# `shape` is kept as a name in case later cells read it.
shape = customer_df.shape
n_rows, n_cols = shape
print(f"Store dataset including {n_rows} rows and {n_cols} columns.")

Store dataset including 38892 rows and 16 cloumns.


In [4]:
# List the column labels of the customer DataFrame
customer_df.columns

Index(['id', 'marital_status', 'gender', 'total_children', 'education',
       'member_card', 'occupation', 'houseowner', 'avg_cars_at home(approx)',
       'avg. yearly_income', 'num_children_at_home', 'store_city',
       'store_state', 'media_type', 'cost', 'promotion_name'],
      dtype='object')

In [5]:
# Inspect the dtype pandas assigned to each column
customer_df.dtypes

id                            int64
marital_status               object
gender                       object
total_children              float64
education                    object
member_card                  object
occupation                   object
houseowner                   object
avg_cars_at home(approx)    float64
avg. yearly_income           object
num_children_at_home        float64
store_city                   object
store_state                  object
media_type                   object
cost                        float64
promotion_name               object
dtype: object

In [6]:
# Count the missing values in every column
customer_df.isna().sum()

id                          0
marital_status              0
gender                      0
total_children              0
education                   0
member_card                 0
occupation                  0
houseowner                  0
avg_cars_at home(approx)    0
avg. yearly_income          0
num_children_at_home        0
store_city                  0
store_state                 0
media_type                  0
cost                        0
promotion_name              0
dtype: int64

In [7]:
# Collect the names of the columns stored with the 'object' dtype
cat = [column for column, dtype in customer_df.dtypes.items() if dtype == 'object']
cat

['marital_status',
 'gender',
 'education',
 'member_card',
 'occupation',
 'houseowner',
 'avg. yearly_income',
 'store_city',
 'store_state',
 'media_type',
 'promotion_name']

In [8]:
# Report how many columns ended up in the categorical list
n_categorical = len(cat)
print(f"There are {n_categorical} categorical variables in customer data frame.")

There are 11 categorical variables in customer data frame.


In [9]:
# Collect the names of the columns stored as int64 or float64
int_float = [
    column
    for column, dtype in customer_df.dtypes.items()
    if dtype == 'int64' or dtype == 'float64'
]
int_float

['id',
 'total_children',
 'avg_cars_at home(approx)',
 'num_children_at_home',
 'cost']

In [10]:
# Report how many columns ended up in the numerical list
n_numerical = len(int_float)
print(f"There are {n_numerical} numerical variables in customer data frame.")

There are 5 numerical variables in customer data frame.


## Principal Component Analysis

In [11]:
# Drop the 'id' column so it is not fed into PCA as a feature
df = customer_df.drop(columns='id')
df

Unnamed: 0,marital_status,gender,total_children,education,member_card,occupation,houseowner,avg_cars_at home(approx),avg. yearly_income,num_children_at_home,store_city,store_state,media_type,cost,promotion_name
0,M,F,1.0,Partial High School,Normal,Skilled Manual,Y,1.0,$10K - $30K,1.0,Salem,OR,"Daily Paper, Radio",126.62,Bag Stuffers
1,M,M,0.0,Bachelors Degree,Silver,Professional,Y,4.0,$50K - $70K,0.0,Salem,OR,"Daily Paper, Radio",59.86,Cash Register Lottery
2,S,F,4.0,Partial High School,Normal,Manual,N,1.0,$10K - $30K,0.0,Salem,OR,"Daily Paper, Radio",84.16,High Roller Savings
3,M,F,2.0,High School Degree,Bronze,Manual,Y,2.0,$30K - $50K,2.0,Salem,OR,In-Store Coupon,95.78,Cash Register Lottery
4,M,M,0.0,Partial High School,Bronze,Skilled Manual,N,2.0,$30K - $50K,0.0,Salem,OR,Radio,50.79,Double Down Sale
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60421,S,F,2.0,High School Degree,Bronze,Professional,Y,3.0,$130K - $150K,0.0,San Francisco,CA,Cash Register Handout,127.19,Green Light Special
60422,S,F,1.0,Partial High School,Bronze,Skilled Manual,N,2.0,$50K - $70K,0.0,San Francisco,CA,"Sunday Paper, Radio",78.45,Unbeatable Price Savers
60423,M,F,1.0,Partial High School,Normal,Skilled Manual,Y,1.0,$10K - $30K,1.0,San Francisco,CA,In-Store Coupon,95.25,You Save Days
60424,S,F,2.0,High School Degree,Bronze,Skilled Manual,N,2.0,$30K - $50K,0.0,San Francisco,CA,Sunday Paper,69.42,Price Cutters


In [12]:
# One-hot encode every column listed in `cat`; the remaining columns are kept as they are.
X = pd.get_dummies(data=df, columns=cat)
print(X.shape)
X.head(10)

(38892, 107)


Unnamed: 0,total_children,avg_cars_at home(approx),num_children_at_home,cost,marital_status_M,marital_status_S,gender_F,gender_M,education_Bachelors Degree,education_Graduate Degree,...,promotion_name_Super Savers,promotion_name_Super Wallet Savers,promotion_name_Three for One,promotion_name_Tip Top Savings,promotion_name_Two Day Sale,promotion_name_Two for One,promotion_name_Unbeatable Price Savers,promotion_name_Wallet Savers,promotion_name_Weekend Markdown,promotion_name_You Save Days
0,1.0,1.0,1.0,126.62,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0.0,4.0,0.0,59.86,1,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
2,4.0,1.0,0.0,84.16,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2.0,2.0,2.0,95.78,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0.0,2.0,0.0,50.79,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
5,2.0,1.0,2.0,50.79,1,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
6,4.0,2.0,0.0,95.78,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,1.0,4.0,0.0,59.86,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
8,2.0,2.0,0.0,59.86,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,3.0,1.0,0.0,84.16,0,1,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [13]:
# Standardize every feature column with StandardScaler()
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
print(X_scaled.shape)
print(X_scaled)

(38892, 107)
[[-1.01501566 -1.06643567  0.14047524 ... -0.10976426 -0.22962677
  -0.16860121]
 [-1.68963438  1.58140755 -0.63282457 ... -0.10976426 -0.22962677
  -0.16860121]
 [ 1.0088405  -1.06643567 -0.63282457 ... -0.10976426 -0.22962677
  -0.16860121]
 ...
 [-1.01501566 -1.06643567  0.14047524 ... -0.10976426 -0.22962677
   5.93115542]
 [-0.34039694 -0.18382127 -0.63282457 ... -0.10976426 -0.22962677
  -0.16860121]
 [-1.01501566  0.69879314 -0.63282457 ... -0.10976426  4.35489293
  -0.16860121]]


### Reducing Data Dimensions Using PCA

In [17]:
# Use PCA to reduce the standardized features to four principal components.
# random_state is fixed because scikit-learn may pick the randomized SVD
# solver for an input of this size, which makes the projected values vary
# slightly from one run to the next.
# NOTE: X_pca is reassigned by the six-component cell further down.
pca_4c = PCA(n_components=4, random_state=42)
X_pca = pca_4c.fit_transform(X_scaled)
print(X_pca)

[[ 3.68562139  1.59531789  2.40913181 -2.39922419]
 [-1.85635613  3.0213307   2.14938172 -2.01324807]
 [ 4.1851425   3.05833991  0.21228478 -0.07545603]
 ...
 [ 3.30932034 -1.22203951  2.60737562  1.93253434]
 [ 0.30563267  1.03641767 -1.08026183  2.44827546]
 [-0.5796207  -0.08908818  1.28941887  0.89048813]]


In [15]:
# Create a DataFrame with one column per principal component held in X_pca
# (four components when run right after the four-component PCA cell).
# The labels are derived from the array width so they always match it,
# instead of being a hand-maintained list.
pc_labels = [f"PC {i}" for i in range(1, X_pca.shape[1] + 1)]
pcs_df = pd.DataFrame(
    data=X_pca,
    columns=pc_labels,
    index=customer_df['id']
)
print(pcs_df.shape)
pcs_df.head(10)

(38892, 4)


Unnamed: 0_level_0,PC 1,PC 2,PC 3,PC 4
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,3.685212,1.582385,2.412124,-2.407924
1,-1.856632,3.020569,2.142685,-1.999627
2,4.18475,3.056789,0.211904,-0.080596
3,-0.048458,1.828033,1.532971,-3.323134
4,1.219984,1.997386,-0.010109,-1.399632
5,-1.488147,1.972462,0.4837,-1.057631
6,0.539677,3.722839,-1.018473,-2.123583
7,0.429626,3.947936,0.283703,-1.574408
8,4.117705,3.804819,0.120067,-1.514014
9,-1.068487,3.915424,-0.271398,0.058036


In [19]:
# Share of the total variance explained by each of the four components
pca_4c.explained_variance_ratio_

array([0.0405216 , 0.03286366, 0.02998481, 0.02918852])

In [20]:
# Combined share of the variance explained by the four components
np.sum(pca_4c.explained_variance_ratio_)

0.13255858777904517

In [21]:
# Use PCA to reduce the standardized features to six principal components.
# random_state is fixed because scikit-learn may pick the randomized SVD
# solver for an input of this size, which makes the projected values vary
# slightly from one run to the next.
# NOTE: this overwrites the X_pca produced by the four-component cell above.
pca_6c = PCA(n_components=6, random_state=42)
X_pca = pca_6c.fit_transform(X_scaled)
print(X_pca)

[[ 3.68458836  1.5908736   2.39594277 -2.40188892 -1.8107102  -1.23230926]
 [-1.8565512   3.02587466  2.16543463 -2.01501705 -2.39237963 -2.35714528]
 [ 4.18444314  3.05808214  0.2116319  -0.08285162 -0.95662687 -1.51983918]
 ...
 [ 3.31452389 -1.19627191  2.65095439  1.94382341 -0.54427784  0.18666645]
 [ 0.30666804  1.03544438 -1.06729177  2.43244645  3.43559142  0.47114195]
 [-0.58055445 -0.09389386  1.28559246  0.88988758  3.06957096 -1.00398887]]


In [22]:
# Create a DataFrame with one column per principal component held in X_pca
# (six components when run right after the six-component PCA cell).
# The labels are derived from the array width so they always match it,
# instead of being a hand-maintained list.
pc_labels = [f"PC {i}" for i in range(1, X_pca.shape[1] + 1)]
pcs_df = pd.DataFrame(
    data=X_pca,
    columns=pc_labels,
    index=customer_df['id']
)
print(pcs_df.shape)
pcs_df.head(10)

(38892, 6)


Unnamed: 0_level_0,PC 1,PC 2,PC 3,PC 4,PC 5,PC 6
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,3.684588,1.590874,2.395943,-2.401889,-1.81071,-1.232309
1,-1.856551,3.025875,2.165435,-2.015017,-2.39238,-2.357145
2,4.184443,3.058082,0.211632,-0.082852,-0.956627,-1.519839
3,-0.047942,1.82702,1.518448,-3.330078,1.823852,-2.28755
4,1.22351,2.025858,0.021947,-1.449074,1.071392,-1.499643
5,-1.485004,1.999739,0.514242,-1.115865,-1.97696,-1.918102
6,0.541244,3.719691,-1.03171,-2.124022,2.234508,-2.508711
7,0.429047,3.954452,0.301835,-1.582311,-0.579014,-2.425148
8,4.117876,3.807863,0.119469,-1.511005,-1.240202,-2.838545
9,-1.06939,3.919128,-0.294257,0.069561,-1.644814,-1.318589


In [24]:
# Share of the total variance explained by each of the six components
pca_6c.explained_variance_ratio_

array([0.04052163, 0.0328642 , 0.02998494, 0.02918821, 0.02664048,
       0.01989898])

In [25]:
# Combined share of the variance explained by the six components
np.sum(pca_6c.explained_variance_ratio_)

0.17909844787177936

In [26]:
# NOTE(review): disabled factor-analysis experiment; nothing in this cell runs.
# It is the only place the FactorAnalyzer import is referenced in the visible
# notebook — either re-enable it or remove both this cell and that import.
# fa = FactorAnalyzer(n_factors=4, method='principal', rotation="varimax")
# fa.fit(X_scaled)
# print(fa.loadings_.round(2))