In [1]:
# Import dependencies
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn import preprocessing
# from sklearn.model_selection import train_test_split
# from sklearn.linear_model import LinearRegression
# from sklearn.metrics import r2_score
import matplotlib.pyplot as plt
# import seaborn as sns
# import warnings
# warnings.filterwarnings("ignore")

In [2]:
# Load the raw customer records from disk and preview the first five rows.
customer_df = pd.read_csv(filepath_or_buffer='Resources/customer_data.csv')
customer_df.head(5)

Unnamed: 0,id,marital_status,gender,total_children,education,member_card,occupation,houseowner,avg_cars_at_home_approx,avg_yearly_income,num_children_at_home,store_city,store_state,media_type,cost,promotion_name,store_sales_in_millions
0,0,M,F,1.0,Partial High School,Normal,Skilled Manual,Y,1.0,10000,1.0,Salem,OR,"Daily Paper, Radio",126.62,Bag Stuffers,7.36
1,1,M,M,0.0,Bachelors Degree,Silver,Professional,Y,4.0,50000,0.0,Salem,OR,"Daily Paper, Radio",59.86,Cash Register Lottery,5.52
2,2,S,F,4.0,Partial High School,Normal,Manual,N,1.0,10000,0.0,Salem,OR,"Daily Paper, Radio",84.16,High Roller Savings,3.68
3,3,M,F,2.0,High School Degree,Bronze,Manual,Y,2.0,30000,2.0,Salem,OR,In-Store Coupon,95.78,Cash Register Lottery,3.68
4,4,M,M,0.0,Partial High School,Bronze,Skilled Manual,N,2.0,30000,0.0,Salem,OR,Radio,50.79,Double Down Sale,4.08


In [3]:
# Report the size of the raw data frame: shape[0] is the row count,
# shape[1] the column count.
shape = customer_df.shape
print(f"Store dataset including {shape[0]} rows and {shape[1]} columns.")

Store dataset including 38892 rows and 17 cloumns.


In [4]:
# List the column labels of the raw customer data frame.
customer_df.columns

Index(['id', 'marital_status', 'gender', 'total_children', 'education',
       'member_card', 'occupation', 'houseowner', 'avg_cars_at_home_approx',
       'avg_yearly_income', 'num_children_at_home', 'store_city',
       'store_state', 'media_type', 'cost', 'promotion_name',
       'store_sales_in_millions'],
      dtype='object')

In [5]:
# Inspect each column's dtype; object-dtype columns are treated as
# categorical and int64/float64 columns as numerical in the cells below.
customer_df.dtypes

id                           int64
marital_status              object
gender                      object
total_children             float64
education                   object
member_card                 object
occupation                  object
houseowner                  object
avg_cars_at_home_approx    float64
avg_yearly_income            int64
num_children_at_home       float64
store_city                  object
store_state                 object
media_type                  object
cost                       float64
promotion_name              object
store_sales_in_millions    float64
dtype: object

In [6]:
# Count missing entries per column (isna is the canonical spelling of isnull).
customer_df.isna().sum()

id                         0
marital_status             0
gender                     0
total_children             0
education                  0
member_card                0
occupation                 0
houseowner                 0
avg_cars_at_home_approx    0
avg_yearly_income          0
num_children_at_home       0
store_city                 0
store_state                0
media_type                 0
cost                       0
promotion_name             0
store_sales_in_millions    0
dtype: int64

In [7]:
# Collect, in column order, the names of all object-dtype (text) columns.
cat = [
    column
    for column, dtype in customer_df.dtypes.items()
    if dtype == 'object'
]
cat

['marital_status',
 'gender',
 'education',
 'member_card',
 'occupation',
 'houseowner',
 'store_city',
 'store_state',
 'media_type',
 'promotion_name']

In [8]:
# Report how many object-dtype (categorical) columns were collected in `cat`.
print(f"There are {len(cat)} categorical variables in customer data frame.")

There are 10 categorical variables in customer data frame.


In [9]:
# Collect, in column order, the names of all int64 / float64 columns.
int_float = [
    column
    for column, dtype in customer_df.dtypes.items()
    if dtype == 'int64' or dtype == 'float64'
]
int_float

['id',
 'total_children',
 'avg_cars_at_home_approx',
 'avg_yearly_income',
 'num_children_at_home',
 'cost',
 'store_sales_in_millions']

In [10]:
# Report how many int64/float64 columns were collected in `int_float`
# (this count still includes the `id` column).
print(f"There are {len(int_float)} numerical variables in customer data frame.")

There are 7 numerical variables in customer data frame.


## PCA with get_dummies (one-hot encoding)

In [11]:
# Remove the row identifier so it is not fed to PCA as a feature;
# customer_df itself is left untouched.
df = customer_df.drop(columns='id')
df

Unnamed: 0,marital_status,gender,total_children,education,member_card,occupation,houseowner,avg_cars_at_home_approx,avg_yearly_income,num_children_at_home,store_city,store_state,media_type,cost,promotion_name,store_sales_in_millions
0,M,F,1.0,Partial High School,Normal,Skilled Manual,Y,1.0,10000,1.0,Salem,OR,"Daily Paper, Radio",126.62,Bag Stuffers,7.36
1,M,M,0.0,Bachelors Degree,Silver,Professional,Y,4.0,50000,0.0,Salem,OR,"Daily Paper, Radio",59.86,Cash Register Lottery,5.52
2,S,F,4.0,Partial High School,Normal,Manual,N,1.0,10000,0.0,Salem,OR,"Daily Paper, Radio",84.16,High Roller Savings,3.68
3,M,F,2.0,High School Degree,Bronze,Manual,Y,2.0,30000,2.0,Salem,OR,In-Store Coupon,95.78,Cash Register Lottery,3.68
4,M,M,0.0,Partial High School,Bronze,Skilled Manual,N,2.0,30000,0.0,Salem,OR,Radio,50.79,Double Down Sale,4.08
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38887,S,F,2.0,High School Degree,Bronze,Professional,Y,3.0,130000,0.0,San Francisco,CA,Cash Register Handout,127.19,Green Light Special,0.99
38888,S,F,1.0,Partial High School,Bronze,Skilled Manual,N,2.0,50000,0.0,San Francisco,CA,"Sunday Paper, Radio",78.45,Unbeatable Price Savers,1.21
38889,M,F,1.0,Partial High School,Normal,Skilled Manual,Y,1.0,10000,1.0,San Francisco,CA,In-Store Coupon,95.25,You Save Days,2.76
38890,S,F,2.0,High School Degree,Bronze,Skilled Manual,N,2.0,30000,0.0,San Francisco,CA,Sunday Paper,69.42,Price Cutters,1.60


In [12]:
# One-hot encode every categorical column listed in `cat`; the remaining
# (numeric) columns are carried over as they are.
X = pd.get_dummies(data=df, columns=cat)
print(X.shape)
X.head(n=10)

(38892, 101)


Unnamed: 0,total_children,avg_cars_at_home_approx,avg_yearly_income,num_children_at_home,cost,store_sales_in_millions,marital_status_M,marital_status_S,gender_F,gender_M,...,promotion_name_Super Savers,promotion_name_Super Wallet Savers,promotion_name_Three for One,promotion_name_Tip Top Savings,promotion_name_Two Day Sale,promotion_name_Two for One,promotion_name_Unbeatable Price Savers,promotion_name_Wallet Savers,promotion_name_Weekend Markdown,promotion_name_You Save Days
0,1.0,1.0,10000,1.0,126.62,7.36,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,0.0,4.0,50000,0.0,59.86,5.52,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,4.0,1.0,10000,0.0,84.16,3.68,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
3,2.0,2.0,30000,2.0,95.78,3.68,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,0.0,2.0,30000,0.0,50.79,4.08,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
5,2.0,1.0,50000,2.0,50.79,4.08,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
6,4.0,2.0,30000,0.0,95.78,5.44,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
7,1.0,4.0,50000,0.0,59.86,3.74,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
8,2.0,2.0,10000,0.0,59.86,4.08,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
9,3.0,1.0,70000,0.0,84.16,9.72,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
# Standardize the one-hot encoded features with StandardScaler
# (fit on X, then transform the same X).
X_scaled = StandardScaler().fit(X).transform(X)
print(X_scaled.shape)
print(X_scaled)

(38892, 101)
[[-1.01501566 -1.06643567 -1.0467121  ... -0.10976426 -0.22962677
  -0.16860121]
 [-1.68963438  1.58140755  0.04619831 ... -0.10976426 -0.22962677
  -0.16860121]
 [ 1.0088405  -1.06643567 -1.0467121  ... -0.10976426 -0.22962677
  -0.16860121]
 ...
 [-1.01501566 -1.06643567 -1.0467121  ... -0.10976426 -0.22962677
   5.93115542]
 [-0.34039694 -0.18382127 -0.50025689 ... -0.10976426 -0.22962677
  -0.16860121]
 [-1.01501566  0.69879314 -0.50025689 ... -0.10976426  4.35489293
  -0.16860121]]


### Four principal components with dummies dataset

In [14]:
# Project the standardized one-hot features onto four principal components.
# random_state pins the randomized SVD solver that PCA may pick for a matrix
# of this size, so repeated runs produce the same component scores.
pca_4c = PCA(n_components=4, random_state=42)
X_pca = pca_4c.fit_transform(X_scaled)
print(X_pca)

[[ 3.1681614   1.14053455  2.52339911 -2.62187847]
 [-1.03530703  3.55541432  1.49225752 -2.41417569]
 [ 4.37649301  2.02932278  0.27760862 -0.23299042]
 ...
 [ 2.06343035 -1.49264767  2.93842641  1.87131448]
 [ 1.15343215  0.58379208 -0.83749534  3.12456666]
 [-0.31838896  0.11795584  1.49140815  1.60433632]]


In [15]:
# Wrap the four principal-component scores in a DataFrame indexed by customer id.
pcs_df = pd.DataFrame(
    X_pca,
    index=customer_df['id'],
    columns=[f"PC {rank}" for rank in range(1, 5)],
)
print(pcs_df.shape)
pcs_df.head(n=10)

(38892, 4)


Unnamed: 0_level_0,PC 1,PC 2,PC 3,PC 4
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,3.168161,1.140535,2.523399,-2.621878
1,-1.035307,3.555414,1.492258,-2.414176
2,4.376493,2.029323,0.277609,-0.23299
3,0.639827,1.958227,1.645306,-2.763627
4,2.10449,1.484347,0.224778,-1.118803
5,-0.657666,2.127186,0.101953,-1.554472
6,2.056199,3.21268,-0.980532,-1.568649
7,1.868438,3.473982,0.106081,-1.681325
8,4.525178,2.778091,0.12134,-1.674833
9,0.109725,3.96688,-0.962757,-0.153066


In [16]:
# Share of total variance explained by each of the four components
# (model fitted on the one-hot encoded, standardized features).
pca_4c.explained_variance_ratio_

array([0.03791085, 0.03431798, 0.0314077 , 0.03075084])

In [17]:
# Total share of variance retained by the four components together.
pca_4c.explained_variance_ratio_.sum()

0.13438738064486416

### Six principal components with dummies dataset

In [18]:
# Project the standardized one-hot features onto six principal components.
# random_state pins the randomized SVD solver that PCA may pick for a matrix
# of this size, so repeated runs produce the same component scores.
# NOTE: X_pca is rebound here and no longer holds the four-component scores.
pca_6c = PCA(n_components=6, random_state=42)
X_pca = pca_6c.fit_transform(X_scaled)
print(X_pca)

[[ 3.16836328  1.14242586  2.54318893 -2.62704748 -1.12029239 -2.24078287]
 [-1.03378595  3.55351724  1.50386127 -2.41517091 -2.89629906  0.78600051]
 [ 4.38102452  2.0239718   0.28457693 -0.23536186 -1.29615236 -0.91934048]
 ...
 [ 2.05401793 -1.49644713  2.92333253  1.83414958  0.71990648 -2.43000099]
 [ 1.15268078  0.58743662 -0.83255642  3.1331285   1.1259121  -0.40262699]
 [-0.32158501  0.12191555  1.48761806  1.59223655 -1.14181325  1.71578008]]


In [19]:
# Wrap the six principal-component scores in a DataFrame indexed by customer id.
pcs_df = pd.DataFrame(
    X_pca,
    index=customer_df['id'],
    columns=[f"PC {rank}" for rank in range(1, 7)],
)
print(pcs_df.shape)
pcs_df.head(n=10)

(38892, 6)


Unnamed: 0_level_0,PC 1,PC 2,PC 3,PC 4,PC 5,PC 6
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,3.168363,1.142426,2.543189,-2.627047,-1.120292,-2.240783
1,-1.033786,3.553517,1.503861,-2.415171,-2.896299,0.786001
2,4.381025,2.023972,0.284577,-0.235362,-1.296152,-0.91934
3,0.633697,1.954468,1.653315,-2.76646,-1.713905,-0.755181
4,2.102527,1.477128,0.21127,-1.100097,-1.996359,0.903887
5,-0.655814,2.117625,0.08712,-1.535787,-1.868282,-0.9236
6,2.050322,3.208207,-0.973522,-1.577426,-1.719477,-0.577921
7,1.867063,3.472742,0.125314,-1.684111,-2.735732,0.907136
8,4.526532,2.775028,0.135995,-1.683459,-2.542981,-1.217364
9,0.112755,3.962394,-0.957865,-0.154011,-1.059567,-0.858081


In [20]:
# Share of total variance explained by each of the six components
# (model fitted on the one-hot encoded, standardized features).
pca_6c.explained_variance_ratio_

array([0.0379109 , 0.03431809, 0.0314082 , 0.03075242, 0.02131785,
       0.02011785])

In [21]:
# Total share of variance retained by the six components together.
pca_6c.explained_variance_ratio_.sum()

0.17582531639159432

In [22]:
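# Disabled experiment: four-factor analysis with varimax rotation on X_scaled.
# FactorAnalyzer is not imported in this notebook's import cell, so it must be
# imported before these lines are re-enabled.
# fa = FactorAnalyzer(n_factors=4, method='principal', rotation="varimax")
# fa.fit(X_scaled)
# print(fa.loadings_.round(2))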

## PCA with LabelEncoder

In [23]:
# Label-encode the categorical columns on an independent copy of `df`.
# A plain `df2 = df` only creates a second name for the same frame, so the
# loop below would overwrite the text categories in `df` itself and a later
# re-run of the get_dummies cell would silently see integer columns instead.
df2 = df.copy()

# Fit a separate encoder per column and keep each one, so the integer codes
# can be mapped back to their original labels with `inverse_transform`.
# After the loop `label_encoder` refers to the encoder of the last column.
# NOTE(review): the integer codes impose an arbitrary order on nominal
# categories; confirm that is acceptable for the PCA that follows.
label_encoders = {}
for var in cat:
    label_encoder = preprocessing.LabelEncoder()
    df2[var] = label_encoder.fit_transform(df2[var])
    label_encoders[var] = label_encoder

df2[cat].head()

Unnamed: 0,marital_status,gender,education,member_card,occupation,houseowner,store_city,store_state,media_type,promotion_name
0,0,0,4,2,4,1,5,1,3,0
1,0,1,0,3,3,1,5,1,3,6
2,1,0,4,2,2,0,5,1,3,18
3,0,0,2,0,2,1,5,1,5,6
4,0,1,4,0,4,0,5,1,7,11


In [24]:
# Check the label-encoded data frame's column dtypes and non-null counts.
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38892 entries, 0 to 38891
Data columns (total 16 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   marital_status           38892 non-null  int64  
 1   gender                   38892 non-null  int64  
 2   total_children           38892 non-null  float64
 3   education                38892 non-null  int64  
 4   member_card              38892 non-null  int64  
 5   occupation               38892 non-null  int64  
 6   houseowner               38892 non-null  int64  
 7   avg_cars_at_home_approx  38892 non-null  float64
 8   avg_yearly_income        38892 non-null  int64  
 9   num_children_at_home     38892 non-null  float64
 10  store_city               38892 non-null  int64  
 11  store_state              38892 non-null  int64  
 12  media_type               38892 non-null  int64  
 13  cost                     38892 non-null  float64
 14  promotion_name        

In [25]:
# Count the distinct values in each column of the label-encoded frame.
df2.nunique()

marital_status                2
gender                        2
total_children                6
education                     5
member_card                   4
occupation                    5
houseowner                    2
avg_cars_at_home_approx       5
avg_yearly_income             8
num_children_at_home          6
store_city                   10
store_state                   3
media_type                   13
cost                        214
promotion_name               49
store_sales_in_millions    1011
dtype: int64

In [26]:
# Export the label-encoded data frame as a new CSV file.
# NOTE(review): to_csv writes the row index as an unnamed first column by
# default — confirm that downstream readers of this file expect it.
df2.to_csv('Resources/customer_label_encoded.csv')

In [27]:
# Use the label-encoded frame as the feature matrix and preview it.
# X2 is a second name for df2, not a copy of it.
X2 = df2
print(f"There are {X2.shape[0]} rows and {X2.shape[1]} columns.")
X2.head()

There are 38892 rows and 16 columns.


Unnamed: 0,marital_status,gender,total_children,education,member_card,occupation,houseowner,avg_cars_at_home_approx,avg_yearly_income,num_children_at_home,store_city,store_state,media_type,cost,promotion_name,store_sales_in_millions
0,0,0,1.0,4,2,4,1,1.0,10000,1.0,5,1,3,126.62,0,7.36
1,0,1,0.0,0,3,3,1,4.0,50000,0.0,5,1,3,59.86,6,5.52
2,1,0,4.0,4,2,2,0,1.0,10000,0.0,5,1,3,84.16,18,3.68
3,0,0,2.0,2,0,2,1,2.0,30000,2.0,5,1,5,95.78,6,3.68
4,0,1,0.0,4,0,4,0,2.0,30000,0.0,5,1,7,50.79,11,4.08


In [28]:
# Standardize the label-encoded features with StandardScaler
# (fit on X2, then transform the same X2).
X2_scaled = StandardScaler().fit(X2).transform(X2)
print(X2_scaled.shape)
print(X2_scaled)

(38892, 16)
[[-1.01800638 -1.01273146 -1.01501566 ...  0.92718532 -1.74972536
   0.24388881]
 [-1.01800638  0.9874286  -1.68963438 ... -1.26643786 -1.33771203
  -0.28779487]
 [ 0.98231212 -1.01273146  1.0088405  ... -0.46798005 -0.51368537
  -0.81947854]
 ...
 [-1.01800638 -1.01273146 -1.01501566 ... -0.10358099  1.54638127
  -1.08532038]
 [ 0.98231212 -1.01273146 -0.34039694 ... -0.95231207 -0.10167205
  -1.42051226]
 [-1.01800638  0.9874286  -1.01501566 ... -1.01507151  1.47771238
  -0.28779487]]


### Four principal components with Label Encoded dataset

In [29]:
# Project the standardized label-encoded features onto four principal components.
# random_state pins the randomized SVD solver that PCA may select, so repeated
# runs produce the same component scores.
# NOTE: this rebinds `pca_4c`, replacing the model fitted earlier on the
# one-hot encoded features.
pca_4c = PCA(n_components=4, random_state=42)
X2_pca = pca_4c.fit_transform(X2_scaled)
print(X2_pca)

[[-1.65519423  1.72093262  0.87674336 -0.86948348]
 [ 0.57860155 -0.21671058  0.73511687 -1.98039605]
 [-2.36693961  0.14921479  0.10987164  0.21625676]
 ...
 [-1.71395776  1.61479662  1.18418447 -0.16554012]
 [-1.1959489  -1.23357227 -0.10044571 -1.23308156]
 [ 0.51034632 -0.49367954  0.37938644 -1.013116  ]]


In [30]:
# Wrap the four label-encoded principal-component scores in a DataFrame
# indexed by customer id.
pcs_df = pd.DataFrame(
    X2_pca,
    index=customer_df['id'],
    columns=[f"PC {rank}" for rank in range(1, 5)],
)
print(pcs_df.shape)
pcs_df.head(n=10)

(38892, 4)


Unnamed: 0_level_0,PC 1,PC 2,PC 3,PC 4
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,-1.655194,1.720933,0.876743,-0.869483
1,0.578602,-0.216711,0.735117,-1.980396
2,-2.36694,0.149215,0.109872,0.216257
3,0.756153,0.669109,0.575486,-1.273735
4,-1.400034,-0.166238,-0.102138,-2.132388
5,0.488535,0.106694,0.034473,-1.699331
6,-0.89712,-0.774106,-0.250885,-0.25193
7,-0.046992,-1.307682,0.143495,-1.59164
8,-2.531351,0.012152,0.038375,-1.085009
9,-0.496033,-1.370612,-0.250739,0.356828


In [31]:
# Share of total variance explained by each of the four components
# (pca_4c as refit on the label-encoded, standardized features).
pca_4c.explained_variance_ratio_

array([0.12834021, 0.12042782, 0.10494932, 0.07009337])

In [32]:
# Total share of variance retained by the four label-encoded components together.
pca_4c.explained_variance_ratio_.sum()

0.4238107150041297

### Six principal components with Label Encoded dataset

In [33]:
# Project the standardized label-encoded features onto six principal components.
# random_state pins the randomized SVD solver that PCA may select, so repeated
# runs produce the same component scores.
# NOTE: this rebinds `pca_6c`, replacing the model fitted earlier on the
# one-hot encoded features.
pca_6c = PCA(n_components=6, random_state=42)
X2_pca = pca_6c.fit_transform(X2_scaled)
print(X2_pca)

[[-1.65481098  1.7203815   0.87695117 -0.87061396 -0.93077412  1.42665615]
 [ 0.57928603 -0.21719695  0.73475097 -1.98136325 -0.72900879 -1.13251247]
 [-2.36593364  0.14787663  0.11042635  0.2154014   0.93022347  0.44605667]
 ...
 [-1.7104342   1.60969051  1.1871299  -0.16972734 -2.18221292  0.79158768]
 [-1.19333906 -1.23758315 -0.09773274 -1.23458602  0.43387697  1.12871944]
 [ 0.51473976 -0.49959963  0.38236446 -1.01746721 -0.92990481 -1.09135757]]


In [34]:
# Wrap the six label-encoded principal-component scores in a DataFrame
# indexed by customer id.
pcs_df = pd.DataFrame(
    X2_pca,
    index=customer_df['id'],
    columns=[f"PC {rank}" for rank in range(1, 7)],
)
print(pcs_df.shape)
pcs_df.head(n=10)

(38892, 6)


Unnamed: 0_level_0,PC 1,PC 2,PC 3,PC 4,PC 5,PC 6
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,-1.654811,1.720381,0.876951,-0.870614,-0.930774,1.426656
1,0.579286,-0.217197,0.734751,-1.981363,-0.729009,-1.132512
2,-2.365934,0.147877,0.110426,0.215401,0.930223,0.446057
3,0.75641,0.668552,0.57607,-1.274187,-0.52929,0.773544
4,-1.399588,-0.166627,-0.102297,-2.132543,0.068192,-0.703552
5,0.487654,0.107767,0.034115,-1.697884,0.54609,0.720693
6,-0.896729,-0.774608,-0.250675,-0.251745,1.625021,1.526219
7,-0.046277,-1.308957,0.14462,-1.591674,0.0545,-1.00832
8,-2.531236,0.011829,0.038694,-1.084583,0.853193,0.991321
9,-0.496171,-1.370512,-0.250635,0.35749,1.176794,1.237936


In [35]:
# Share of total variance explained by each of the six components
# (pca_6c as refit on the label-encoded, standardized features).
pca_6c.explained_variance_ratio_

array([0.12834032, 0.12042804, 0.10494939, 0.07009346, 0.06924507,
       0.06265723])

In [36]:
# Total share of variance retained by the six label-encoded components together.
pca_6c.explained_variance_ratio_.sum()

0.5557135163892724

### Eight principal components with Label Encoded dataset

In [37]:
# Project the standardized label-encoded features onto eight principal components.
# random_state pins the randomized SVD solver that PCA may select, so repeated
# runs produce the same component scores.
pca_8c = PCA(n_components=8, random_state=42)
X2_pca = pca_8c.fit_transform(X2_scaled)
print(X2_pca)

[[-1.65481098  1.7203815   0.87695117 ...  1.42665615  0.04200637
  -0.49825853]
 [ 0.57928603 -0.21719695  0.73475097 ... -1.13251247  0.56002348
  -0.7210563 ]
 [-2.36593364  0.14787663  0.11042635 ...  0.44605667 -0.75562876
   1.86234362]
 ...
 [-1.7104342   1.60969051  1.1871299  ...  0.79158768 -0.3239397
  -0.38094041]
 [-1.19333906 -1.23758315 -0.09773274 ...  1.12871944 -1.66175806
  -0.38555611]
 [ 0.51473976 -0.49959963  0.38236446 ... -1.09135757  0.57667296
  -0.6692594 ]]


In [38]:
# Wrap the eight label-encoded principal-component scores in a DataFrame
# indexed by customer id.
pcs_df = pd.DataFrame(
    X2_pca,
    index=customer_df['id'],
    columns=[f"PC {rank}" for rank in range(1, 9)],
)
print(pcs_df.shape)
pcs_df.head(n=10)

(38892, 8)


Unnamed: 0_level_0,PC 1,PC 2,PC 3,PC 4,PC 5,PC 6,PC 7,PC 8
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,-1.654811,1.720381,0.876951,-0.870614,-0.930774,1.426656,0.042006,-0.498259
1,0.579286,-0.217197,0.734751,-1.981363,-0.729009,-1.132512,0.560023,-0.721056
2,-2.365934,0.147877,0.110426,0.215401,0.930223,0.446057,-0.755629,1.862344
3,0.75641,0.668552,0.57607,-1.274187,-0.52929,0.773544,-0.592114,0.668218
4,-1.399588,-0.166627,-0.102297,-2.132543,0.068192,-0.703552,-0.726229,-1.546633
5,0.487654,0.107767,0.034115,-1.697884,0.54609,0.720693,-0.710433,0.294326
6,-0.896729,-0.774608,-0.250675,-0.251745,1.625021,1.526219,-1.450035,-0.364844
7,-0.046277,-1.308957,0.14462,-1.591674,0.0545,-1.00832,-0.183898,-0.33629
8,-2.531236,0.011829,0.038694,-1.084583,0.853193,0.991321,-0.961286,0.037581
9,-0.496171,-1.370512,-0.250635,0.35749,1.176794,1.237936,0.453949,0.044555


In [39]:
# Share of total variance explained by each of the eight components
# (model fitted on the label-encoded, standardized features).
pca_8c.explained_variance_ratio_

array([0.12834032, 0.12042804, 0.10494939, 0.07009346, 0.06924507,
       0.06265723, 0.06093935, 0.06069112])

In [40]:
# Total share of variance retained by the eight components together.
pca_8c.explained_variance_ratio_.sum()

0.677343991465269

### Ten principal components with Label Encoded dataset

In [41]:
# Project the standardized label-encoded features onto ten principal components.
# random_state pins the randomized SVD solver that PCA may select, so repeated
# runs produce the same component scores.
pca_10c = PCA(n_components=10, random_state=42)
X2_pca = pca_10c.fit_transform(X2_scaled)
print(X2_pca)

[[-1.65481098  1.7203815   0.87695117 ... -0.49825853  0.87857171
  -1.84813538]
 [ 0.57928603 -0.21719695  0.73475097 ... -0.7210563  -0.64498606
  -1.13636551]
 [-2.36593364  0.14787663  0.11042635 ...  1.86234362 -0.87767822
  -0.4643776 ]
 ...
 [-1.7104342   1.60969051  1.1871299  ... -0.38094041 -0.78131576
   1.04682797]
 [-1.19333906 -1.23758315 -0.09773274 ... -0.38555611 -0.80272314
   1.229035  ]
 [ 0.51473976 -0.49959963  0.38236446 ... -0.6692594  -0.0339199
   2.17225907]]


In [42]:
# Wrap the ten label-encoded principal-component scores in a DataFrame
# indexed by customer id.
pcs_df = pd.DataFrame(
    X2_pca,
    index=customer_df['id'],
    columns=[f"PC {rank}" for rank in range(1, 11)],
)
print(pcs_df.shape)
pcs_df.head(n=10)

(38892, 10)


Unnamed: 0_level_0,PC 1,PC 2,PC 3,PC 4,PC 5,PC 6,PC 7,PC 8,PC 9,PC 10
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,-1.654811,1.720381,0.876951,-0.870614,-0.930774,1.426656,0.042006,-0.498259,0.878572,-1.848135
1,0.579286,-0.217197,0.734751,-1.981363,-0.729009,-1.132512,0.560023,-0.721056,-0.644986,-1.136366
2,-2.365934,0.147877,0.110426,0.215401,0.930223,0.446057,-0.755629,1.862344,-0.877678,-0.464378
3,0.75641,0.668552,0.57607,-1.274187,-0.52929,0.773544,-0.592114,0.668218,0.032631,-0.944688
4,-1.399588,-0.166627,-0.102297,-2.132543,0.068192,-0.703552,-0.726229,-1.546633,-0.662305,-0.068449
5,0.487654,0.107767,0.034115,-1.697884,0.54609,0.720693,-0.710433,0.294326,-1.238005,0.105707
6,-0.896729,-0.774608,-0.250675,-0.251745,1.625021,1.526219,-1.450035,-0.364844,-0.268672,-0.560749
7,-0.046277,-1.308957,0.14462,-1.591674,0.0545,-1.00832,-0.183898,-0.33629,-0.62669,-1.191514
8,-2.531236,0.011829,0.038694,-1.084583,0.853193,0.991321,-0.961286,0.037581,-1.285554,-0.805653
9,-0.496171,-1.370512,-0.250635,0.35749,1.176794,1.237936,0.453949,0.044555,-0.795493,-0.64605


In [43]:
# Share of total variance explained by each of the ten components
# (model fitted on the label-encoded, standardized features).
pca_10c.explained_variance_ratio_

array([0.12834032, 0.12042804, 0.10494939, 0.07009346, 0.06924507,
       0.06265723, 0.06093935, 0.06069112, 0.05887859, 0.05554043])

In [44]:
# Total share of variance retained by the ten components together.
pca_10c.explained_variance_ratio_.sum()

0.7917630134464689