In [1]:
# Import dependencies
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn import preprocessing
# from sklearn.model_selection import train_test_split
# from sklearn.linear_model import LinearRegression
# from sklearn.metrics import r2_score
import matplotlib.pyplot as plt
# import seaborn as sns
# import warnings
# warnings.filterwarnings("ignore")

In [2]:
# Load the customer dataset from a path relative to the notebook's working
# directory and preview the first five rows.
customer_df = pd.read_csv('Resources/customer_data.csv')
customer_df.head()

Unnamed: 0,id,marital_status,gender,total_children,education,member_card,occupation,houseowner,avg_cars_at home(approx),avg. yearly_income,num_children_at_home,store_city,store_state,media_type,cost,promotion_name
0,0,M,F,1.0,Partial High School,Normal,Skilled Manual,Y,1.0,$10K - $30K,1.0,Salem,OR,"Daily Paper, Radio",126.62,Bag Stuffers
1,1,M,M,0.0,Bachelors Degree,Silver,Professional,Y,4.0,$50K - $70K,0.0,Salem,OR,"Daily Paper, Radio",59.86,Cash Register Lottery
2,2,S,F,4.0,Partial High School,Normal,Manual,N,1.0,$10K - $30K,0.0,Salem,OR,"Daily Paper, Radio",84.16,High Roller Savings
3,3,M,F,2.0,High School Degree,Bronze,Manual,Y,2.0,$30K - $50K,2.0,Salem,OR,In-Store Coupon,95.78,Cash Register Lottery
4,4,M,M,0.0,Partial High School,Bronze,Skilled Manual,N,2.0,$30K - $50K,0.0,Salem,OR,Radio,50.79,Double Down Sale


In [3]:
# Report the dimensions of the loaded frame; DataFrame.shape is a
# (rows, columns) tuple.
shape = customer_df.shape
print(f"Store dataset including {shape[0]} rows and {shape[1]} columns.")

Store dataset including 38892 rows and 16 cloumns.


In [4]:
# List the column labels of the loaded frame so the exact spellings are
# available for later selection.
customer_df.columns

Index(['id', 'marital_status', 'gender', 'total_children', 'education',
       'member_card', 'occupation', 'houseowner', 'avg_cars_at home(approx)',
       'avg. yearly_income', 'num_children_at_home', 'store_city',
       'store_state', 'media_type', 'cost', 'promotion_name'],
      dtype='object')

In [5]:
# Inspect the dtype of every column; object columns hold text and have to be
# encoded before they can be used numerically.
customer_df.dtypes

id                            int64
marital_status               object
gender                       object
total_children              float64
education                    object
member_card                  object
occupation                   object
houseowner                   object
avg_cars_at home(approx)    float64
avg. yearly_income           object
num_children_at_home        float64
store_city                   object
store_state                  object
media_type                   object
cost                        float64
promotion_name               object
dtype: object

In [6]:
# Count missing values per column: isnull() flags each missing cell and sum()
# totals the flags column by column.
customer_df.isnull().sum()

id                          0
marital_status              0
gender                      0
total_children              0
education                   0
member_card                 0
occupation                  0
houseowner                  0
avg_cars_at home(approx)    0
avg. yearly_income          0
num_children_at_home        0
store_city                  0
store_state                 0
media_type                  0
cost                        0
promotion_name              0
dtype: int64

In [7]:
# Collect the names of all object-dtype (text) columns; these are treated as
# the categorical variables in the rest of the notebook.
cat = customer_df.select_dtypes(include='object').columns.tolist()
cat

['marital_status',
 'gender',
 'education',
 'member_card',
 'occupation',
 'houseowner',
 'avg. yearly_income',
 'store_city',
 'store_state',
 'media_type',
 'promotion_name']

In [8]:
# Report how many categorical (object-dtype) columns were collected in `cat`.
print(f"There are {len(cat)} categorical variables in customer data frame.")

There are 11 categorical variables in customer data frame.


In [9]:
# Collect the names of all int64 and float64 columns, i.e. the numeric
# variables, preserving their order in the frame.
int_float = customer_df.select_dtypes(include=['int64', 'float64']).columns.tolist()
int_float

['id',
 'total_children',
 'avg_cars_at home(approx)',
 'num_children_at_home',
 'cost']

In [10]:
# Report how many numeric (int64/float64) columns were collected in
# `int_float`; note that this count includes the 'id' column.
print(f"There are {len(int_float)} numerical variables in customer data frame.")

There are 5 numerical variables in customer data frame.


## PCA with one-hot encoding (`pd.get_dummies`)

In [11]:
# Build the feature frame without 'id'; 'id' is used later only as the row
# index of the PCA result tables, not as an input feature.
df = customer_df.drop(columns='id')
df

Unnamed: 0,marital_status,gender,total_children,education,member_card,occupation,houseowner,avg_cars_at home(approx),avg. yearly_income,num_children_at_home,store_city,store_state,media_type,cost,promotion_name
0,M,F,1.0,Partial High School,Normal,Skilled Manual,Y,1.0,$10K - $30K,1.0,Salem,OR,"Daily Paper, Radio",126.62,Bag Stuffers
1,M,M,0.0,Bachelors Degree,Silver,Professional,Y,4.0,$50K - $70K,0.0,Salem,OR,"Daily Paper, Radio",59.86,Cash Register Lottery
2,S,F,4.0,Partial High School,Normal,Manual,N,1.0,$10K - $30K,0.0,Salem,OR,"Daily Paper, Radio",84.16,High Roller Savings
3,M,F,2.0,High School Degree,Bronze,Manual,Y,2.0,$30K - $50K,2.0,Salem,OR,In-Store Coupon,95.78,Cash Register Lottery
4,M,M,0.0,Partial High School,Bronze,Skilled Manual,N,2.0,$30K - $50K,0.0,Salem,OR,Radio,50.79,Double Down Sale
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60421,S,F,2.0,High School Degree,Bronze,Professional,Y,3.0,$130K - $150K,0.0,San Francisco,CA,Cash Register Handout,127.19,Green Light Special
60422,S,F,1.0,Partial High School,Bronze,Skilled Manual,N,2.0,$50K - $70K,0.0,San Francisco,CA,"Sunday Paper, Radio",78.45,Unbeatable Price Savers
60423,M,F,1.0,Partial High School,Normal,Skilled Manual,Y,1.0,$10K - $30K,1.0,San Francisco,CA,In-Store Coupon,95.25,You Save Days
60424,S,F,2.0,High School Degree,Bronze,Skilled Manual,N,2.0,$30K - $50K,0.0,San Francisco,CA,Sunday Paper,69.42,Price Cutters


In [12]:
# One-hot encode the columns listed in `cat` with get_dummies(): every
# category level becomes its own indicator column, while the remaining
# (numeric) columns are carried over unchanged.
X = pd.get_dummies(df, columns=cat)
print(X.shape)
X.head(10)

(38892, 107)


Unnamed: 0,total_children,avg_cars_at home(approx),num_children_at_home,cost,marital_status_M,marital_status_S,gender_F,gender_M,education_Bachelors Degree,education_Graduate Degree,...,promotion_name_Super Savers,promotion_name_Super Wallet Savers,promotion_name_Three for One,promotion_name_Tip Top Savings,promotion_name_Two Day Sale,promotion_name_Two for One,promotion_name_Unbeatable Price Savers,promotion_name_Wallet Savers,promotion_name_Weekend Markdown,promotion_name_You Save Days
0,1.0,1.0,1.0,126.62,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0.0,4.0,0.0,59.86,1,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
2,4.0,1.0,0.0,84.16,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2.0,2.0,2.0,95.78,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0.0,2.0,0.0,50.79,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
5,2.0,1.0,2.0,50.79,1,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
6,4.0,2.0,0.0,95.78,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,1.0,4.0,0.0,59.86,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
8,2.0,2.0,0.0,59.86,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,3.0,1.0,0.0,84.16,0,1,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [13]:
# Standardize the data with StandardScaler() so every feature has zero mean
# and unit variance before PCA; fit_transform returns a NumPy array.
X_scaled = StandardScaler().fit_transform(X)
print(X_scaled.shape)
print(X_scaled)

(38892, 107)
[[-1.01501566 -1.06643567  0.14047524 ... -0.10976426 -0.22962677
  -0.16860121]
 [-1.68963438  1.58140755 -0.63282457 ... -0.10976426 -0.22962677
  -0.16860121]
 [ 1.0088405  -1.06643567 -0.63282457 ... -0.10976426 -0.22962677
  -0.16860121]
 ...
 [-1.01501566 -1.06643567  0.14047524 ... -0.10976426 -0.22962677
   5.93115542]
 [-0.34039694 -0.18382127 -0.63282457 ... -0.10976426 -0.22962677
  -0.16860121]
 [-1.01501566  0.69879314 -0.63282457 ... -0.10976426  4.35489293
  -0.16860121]]


### Four principal components with dummies dataset

In [14]:
# Reduce the standardized one-hot data to four principal components.
# random_state is pinned because scikit-learn's default svd_solver='auto' can
# select the randomized SVD solver for data of this size; without a seed the
# components (and explained-variance figures) drift slightly between runs.
pca_4c = PCA(n_components=4, random_state=42)
X_pca = pca_4c.fit_transform(X_scaled)
print(X_pca)

[[ 3.68483082  1.59055151  2.41233944 -2.38395623]
 [-1.85529127  3.01998147  2.1339033  -1.99309495]
 [ 4.18451765  3.04851924  0.21891695 -0.07257869]
 ...
 [ 3.31664535 -1.18449052  2.61985911  1.94079872]
 [ 0.30765999  1.04696638 -1.05689901  2.39500237]
 [-0.58112561 -0.09595607  1.27269686  0.86598955]]


In [15]:
# Wrap the four principal-component scores in a DataFrame, one row per
# customer, indexed by the customer id and labelled "PC 1" .. "PC 4".
pcs_df = pd.DataFrame(
    X_pca,
    index=customer_df['id'],
    columns=[f"PC {i}" for i in range(1, 5)],
)
print(pcs_df.shape)
pcs_df.head(10)

(38892, 4)


Unnamed: 0_level_0,PC 1,PC 2,PC 3,PC 4
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,3.684831,1.590552,2.412339,-2.383956
1,-1.855291,3.019981,2.133903,-1.993095
2,4.184518,3.048519,0.218917,-0.072579
3,-0.047093,1.831128,1.520241,-3.321933
4,1.219046,2.027683,-0.041534,-1.397816
5,-1.489427,1.996317,0.450796,-1.073176
6,0.542812,3.719476,-1.025327,-2.112132
7,0.429165,3.950258,0.287527,-1.562956
8,4.119082,3.803252,0.126398,-1.491379
9,-1.0696,3.9169,-0.266647,0.083884


In [16]:
# Fraction of the total variance captured by each of the four components.
pca_4c.explained_variance_ratio_

array([0.04052162, 0.03286386, 0.02998473, 0.02918629])

In [17]:
# Total fraction of variance retained by the four components together.
pca_4c.explained_variance_ratio_.sum()

0.13255649679896372

### Six principal components with dummies dataset

In [18]:
# Reduce the standardized one-hot data to six principal components.
# random_state is pinned because scikit-learn's default svd_solver='auto' can
# select the randomized SVD solver for data of this size; without a seed the
# components (and explained-variance figures) drift slightly between runs.
pca_6c = PCA(n_components=6, random_state=42)
X_pca = pca_6c.fit_transform(X_scaled)
print(X_pca)

[[ 3.68531318  1.59094839  2.40786515 -2.38605075 -1.81004468 -0.88105993]
 [-1.8569081   3.02410063  2.14530408 -1.98765009 -2.39476765 -2.84671176]
 [ 4.18507394  3.05832829  0.21682259 -0.07231276 -0.94964848 -1.38010804]
 ...
 [ 3.31365214 -1.19994472  2.63751522  1.9013138  -0.54460303  0.75043616]
 [ 0.308215    1.0368764  -1.04370007  2.41976289  3.45554982  0.79083896]
 [-0.57834087 -0.09239611  1.27793917  0.87261828  3.08124477 -1.41424571]]


In [19]:
# Wrap the six principal-component scores in a DataFrame, one row per
# customer, indexed by the customer id and labelled "PC 1" .. "PC 6".
pcs_df = pd.DataFrame(
    X_pca,
    index=customer_df['id'],
    columns=[f"PC {i}" for i in range(1, 7)],
)
print(pcs_df.shape)
pcs_df.head(10)

(38892, 6)


Unnamed: 0_level_0,PC 1,PC 2,PC 3,PC 4,PC 5,PC 6
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,3.685313,1.590948,2.407865,-2.386051,-1.810045,-0.88106
1,-1.856908,3.024101,2.145304,-1.98765,-2.394768,-2.846712
2,4.185074,3.058328,0.216823,-0.072313,-0.949648,-1.380108
3,-0.048653,1.827783,1.521058,-3.343772,1.818051,-1.836372
4,1.21792,2.008548,-0.058343,-1.43677,1.081427,-1.918851
5,-1.490335,1.984379,0.443068,-1.101109,-1.978921,-1.980019
6,0.540115,3.716994,-1.028007,-2.140374,2.233705,-1.87623
7,0.428682,3.953867,0.292485,-1.56792,-0.593836,-2.724535
8,4.117868,3.806577,0.12476,-1.506914,-1.250919,-2.455737
9,-1.068278,3.921136,-0.280386,0.067293,-1.625409,-0.986071


In [20]:
# Fraction of the total variance captured by each of the six components.
pca_6c.explained_variance_ratio_

array([0.04052164, 0.03286423, 0.02998622, 0.02918887, 0.02664018,
       0.02015452])

In [21]:
# Total fraction of variance retained by the six components together.
pca_6c.explained_variance_ratio_.sum()

0.17935565761181163

In [26]:
# Disabled experiment: factor analysis on the standardized one-hot data.
# NOTE(review): FactorAnalyzer is not imported in the visible import cell, so
# these lines would raise NameError if uncommented as-is — add the import or
# delete this cell.
# fa = FactorAnalyzer(n_factors=4, method='principal', rotation="varimax")
# fa.fit(X_scaled)
# print(fa.loadings_.round(2))

## PCA with LabelEncoder

In [31]:
# Label-encode every categorical column: each distinct category is replaced
# by an integer code (LabelEncoder numbers the classes in sorted order).
# Work on a copy so that `df` keeps its original text values; a plain
# `df2 = df` only creates a second name for the same frame, so the loop below
# would overwrite `df` as well and make earlier cells that read `df`
# irreproducible on re-run.
label_encoder = preprocessing.LabelEncoder()
df2 = df.copy()

# The single encoder is re-fitted for each column, so after the loop it only
# remembers the classes of the last column processed.
for var in cat:
    df2[var] = label_encoder.fit_transform(df2[var])

df2[cat].head()

Unnamed: 0,marital_status,gender,education,member_card,occupation,houseowner,avg. yearly_income,store_city,store_state,media_type,promotion_name
0,0,0,4,2,4,1,0,5,1,3,0
1,0,1,0,3,3,1,5,5,1,3,6
2,1,0,4,2,2,0,0,5,1,3,18
3,0,0,2,0,2,1,4,5,1,5,6
4,0,1,4,0,4,0,4,5,1,7,11


In [33]:
# Check the label-encoded data frame information: index range, per-column
# non-null counts and dtypes (the encoded columns should now be integers).
df2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 38892 entries, 0 to 60425
Data columns (total 15 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   marital_status            38892 non-null  int64  
 1   gender                    38892 non-null  int64  
 2   total_children            38892 non-null  float64
 3   education                 38892 non-null  int64  
 4   member_card               38892 non-null  int64  
 5   occupation                38892 non-null  int64  
 6   houseowner                38892 non-null  int64  
 7   avg_cars_at home(approx)  38892 non-null  float64
 8   avg. yearly_income        38892 non-null  int64  
 9   num_children_at_home      38892 non-null  float64
 10  store_city                38892 non-null  int64  
 11  store_state               38892 non-null  int64  
 12  media_type                38892 non-null  int64  
 13  cost                      38892 non-null  float64
 14  promot

In [35]:
# Count the distinct values in each column of the label-encoded frame.
df2.nunique()

marital_status                2
gender                        2
total_children                6
education                     5
member_card                   4
occupation                    5
houseowner                    2
avg_cars_at home(approx)      5
avg. yearly_income            8
num_children_at_home          6
store_city                   10
store_state                   3
media_type                   13
cost                        214
promotion_name               49
dtype: int64

In [60]:
# Persist the label-encoded frame for reuse outside this notebook. to_csv()
# also writes the row index as an unnamed first column by default.
df2.to_csv('Resources/customer_label_encoded.csv')

In [37]:
# Use the label-encoded frame as the PCA feature matrix and show its size.
# X2 is a second name for df2 (no copy is made), so any change made through
# one name is visible through the other.
X2 = df2
print(f"There are {X2.shape[0]} rows and {X2.shape[1]} columns.")
X2.head()

(38892, 15)


Unnamed: 0,marital_status,gender,total_children,education,member_card,occupation,houseowner,avg_cars_at home(approx),avg. yearly_income,num_children_at_home,store_city,store_state,media_type,cost,promotion_name
0,0,0,1.0,4,2,4,1,1.0,0,1.0,5,1,3,126.62,0
1,0,1,0.0,0,3,3,1,4.0,5,0.0,5,1,3,59.86,6
2,1,0,4.0,4,2,2,0,1.0,0,0.0,5,1,3,84.16,18
3,0,0,2.0,2,0,2,1,2.0,4,2.0,5,1,5,95.78,6
4,0,1,0.0,4,0,4,0,2.0,4,0.0,5,1,7,50.79,11


In [38]:
# Standardize the label-encoded data with StandardScaler() so every feature
# has zero mean and unit variance before PCA; the result is a NumPy array.
X2_scaled = StandardScaler().fit_transform(X2)
print(X2_scaled.shape)
print(X2_scaled)

(38892, 15)
[[-1.01800638 -1.01273146 -1.01501566 ... -0.70586484  0.92718532
  -1.74972536]
 [-1.01800638  0.9874286  -1.68963438 ... -0.70586484 -1.26643786
  -1.33771203]
 [ 0.98231212 -1.01273146  1.0088405  ... -0.70586484 -0.46798005
  -0.51368537]
 ...
 [-1.01800638 -1.01273146 -1.01501566 ... -0.12523445 -0.10358099
   1.54638127]
 [ 0.98231212 -1.01273146 -0.34039694 ...  1.03602635 -0.95231207
  -0.10167205]
 [-1.01800638  0.9874286  -1.01501566 ...  1.61665674 -1.01507151
   1.47771238]]


### Four principal components with Label Encoded dataset

In [39]:
# Reduce the standardized label-encoded data to four principal components.
# This rebinds `pca_4c`, replacing the model fitted on the one-hot data.
# random_state is pinned because scikit-learn's default svd_solver='auto' can
# select the randomized SVD solver; without a seed the components (and
# explained-variance figures) drift slightly between runs.
pca_4c = PCA(n_components=4, random_state=42)
X2_pca = pca_4c.fit_transform(X2_scaled)
print(X2_pca)

[[ 2.62178963  0.70439663  0.7091936   0.34890765]
 [-0.78735533  0.11438336  0.75254295  1.35327845]
 [ 2.39311907 -0.97465202 -0.02354851  0.22671556]
 ...
 [ 2.6476013   0.65645823  0.9938943  -0.92369216]
 [-0.11780589 -1.61272885 -0.11179327  1.29489724]
 [-0.94149061 -0.13252371  0.45124056  0.42610043]]


In [40]:
# Wrap the four label-encoded principal-component scores in a DataFrame,
# indexed by the customer id and labelled "PC 1" .. "PC 4".
pcs_df = pd.DataFrame(
    X2_pca,
    index=customer_df['id'],
    columns=[f"PC {i}" for i in range(1, 5)],
)
print(pcs_df.shape)
pcs_df.head(10)

(38892, 4)


Unnamed: 0_level_0,PC 1,PC 2,PC 3,PC 4
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,2.62179,0.704397,0.709194,0.348908
1,-0.787355,0.114383,0.752543,1.353278
2,2.393119,-0.974652,-0.023549,0.226716
3,-0.637556,1.019851,0.585381,0.837708
4,0.549361,-0.831221,-0.178237,1.930524
5,-0.872472,0.345534,0.044087,1.759466
6,-0.198208,-1.129743,-0.240247,1.061544
7,-0.864032,-1.123528,0.188958,1.377078
8,2.401912,-1.185324,-0.100078,1.36544
9,-0.741706,-1.553906,-0.205279,0.332799


In [41]:
# Fraction of the total variance captured by each of the four components
# (for the most recently fitted `pca_4c`).
pca_4c.explained_variance_ratio_

array([0.1547902 , 0.13053787, 0.1119392 , 0.07454409])

In [42]:
# Total fraction of variance retained by the four components together.
pca_4c.explained_variance_ratio_.sum()

0.4718113637192467

### Six principal components with Label Encoded dataset

In [43]:
# Reduce the standardized label-encoded data to six principal components.
# This rebinds `pca_6c`, replacing the model fitted on the one-hot data.
# random_state is pinned because scikit-learn's default svd_solver='auto' can
# select the randomized SVD solver; without a seed the components (and
# explained-variance figures) drift slightly between runs.
pca_6c = PCA(n_components=6, random_state=42)
X2_pca = pca_6c.fit_transform(X2_scaled)
print(X2_pca)

[[ 2.62181381  0.70434102  0.70908587  0.34907036 -1.51680728  1.1464036 ]
 [-0.78725244  0.11414686  0.7520871   1.3539236  -1.51039195 -1.44835573]
 [ 2.39315658 -0.97473823 -0.02371468  0.22694181  1.61004282  1.10467443]
 ...
 [ 2.64766176  0.65631825  0.9936217  -0.92327987 -1.67743408  0.68936798]
 [-0.11780936 -1.61272178 -0.11178168  1.29489737  0.08329889  1.35648325]
 [-0.9413691  -0.13280424  0.45069707  0.42689436 -1.5626131  -1.54597401]]


In [44]:
# Wrap the six label-encoded principal-component scores in a DataFrame,
# indexed by the customer id and labelled "PC 1" .. "PC 6".
pcs_df = pd.DataFrame(
    X2_pca,
    index=customer_df['id'],
    columns=[f"PC {i}" for i in range(1, 7)],
)
print(pcs_df.shape)
pcs_df.head(10)

(38892, 6)


Unnamed: 0_level_0,PC 1,PC 2,PC 3,PC 4,PC 5,PC 6
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,2.621814,0.704341,0.709086,0.34907,-1.516807,1.146404
1,-0.787252,0.114147,0.752087,1.353924,-1.510392,-1.448356
2,2.393157,-0.974738,-0.023715,0.226942,1.610043,1.104674
3,-0.637589,1.019926,0.585525,0.837509,-1.011026,0.772578
4,0.549411,-0.831335,-0.178457,1.930834,-0.886245,-0.911183
5,-0.872508,0.345618,0.044249,1.759239,-0.220272,0.839366
6,-0.198194,-1.129775,-0.240309,1.06163,1.24265,1.819814
7,-0.864084,-1.123408,0.18919,1.376741,-0.48012,-1.071629
8,2.401902,-1.185303,-0.100036,1.365374,0.566234,1.28992
9,-0.741728,-1.553855,-0.205182,0.332657,0.784839,1.280521


In [45]:
# Fraction of the total variance captured by each of the six components
# (for the most recently fitted `pca_6c`).
pca_6c.explained_variance_ratio_

array([0.1547902 , 0.13053787, 0.11193921, 0.0745441 , 0.07110619,
       0.06663038])

In [46]:
# Total fraction of variance retained by the six components together.
pca_6c.explained_variance_ratio_.sum()

0.6095479457085751

### Eight principal components with Label Encoded dataset

In [47]:
# Reduce the standardized label-encoded data to eight principal components.
# random_state is pinned because scikit-learn's default svd_solver='auto' can
# select the randomized SVD solver; without a seed the components (and
# explained-variance figures) drift slightly between runs.
pca_8c = PCA(n_components=8, random_state=42)
X2_pca = pca_8c.fit_transform(X2_scaled)
print(X2_pca)

[[ 2.62181381  0.70434102  0.70908587 ...  1.1464036  -0.39905914
   0.91767098]
 [-0.78725244  0.11414686  0.7520871  ... -1.44835573 -0.40022045
  -0.70774425]
 [ 2.39315658 -0.97473823 -0.02371468 ...  1.10467443  1.02435071
  -0.74965317]
 ...
 [ 2.64766176  0.65631825  0.9936217  ...  0.68936798 -0.59764188
  -0.65236353]
 [-0.11780936 -1.61272178 -0.11178168 ...  1.35648325 -0.9778454
  -0.30481122]
 [-0.9413691  -0.13280424  0.45069707 ... -1.54597401  0.23718423
  -0.35726968]]


In [49]:
# Wrap the eight label-encoded principal-component scores in a DataFrame,
# indexed by the customer id and labelled "PC 1" .. "PC 8".
pcs_df = pd.DataFrame(
    X2_pca,
    index=customer_df['id'],
    columns=[f"PC {i}" for i in range(1, 9)],
)
print(pcs_df.shape)
pcs_df.head(10)

(38892, 8)


Unnamed: 0_level_0,PC 1,PC 2,PC 3,PC 4,PC 5,PC 6,PC 7,PC 8
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,2.621814,0.704341,0.709086,0.34907,-1.516807,1.146404,-0.399059,0.917671
1,-0.787252,0.114147,0.752087,1.353924,-1.510392,-1.448356,-0.40022,-0.707744
2,2.393157,-0.974738,-0.023715,0.226942,1.610043,1.104674,1.024351,-0.749653
3,-0.637589,1.019926,0.585525,0.837509,-1.011026,0.772578,0.797382,0.005069
4,0.549411,-0.831335,-0.178457,1.930834,-0.886245,-0.911183,-1.435145,-0.38303
5,-0.872508,0.345618,0.044249,1.759239,-0.220272,0.839366,0.017672,-1.121642
6,-0.198194,-1.129775,-0.240309,1.06163,1.24265,1.819814,-1.070785,0.246633
7,-0.864084,-1.123408,0.18919,1.376741,-0.48012,-1.071629,-0.199362,-0.535467
8,2.401902,-1.185303,-0.100036,1.365374,0.566234,1.28992,-0.757996,-0.871364
9,-0.741728,-1.553855,-0.205182,0.332657,0.784839,1.280521,-0.343922,-0.821349


In [50]:
# Fraction of the total variance captured by each of the eight components.
pca_8c.explained_variance_ratio_

array([0.1547902 , 0.13053787, 0.11193921, 0.0745441 , 0.07110619,
       0.06663038, 0.06417345, 0.06286401])

In [51]:
# Total fraction of variance retained by the eight components together.
pca_8c.explained_variance_ratio_.sum()

0.736585407015693

### Ten principal components with Label Encoded dataset

In [55]:
# Reduce the standardized label-encoded data to ten principal components.
# random_state is pinned because scikit-learn's default svd_solver='auto' can
# select the randomized SVD solver; without a seed the components (and
# explained-variance figures) drift slightly between runs.
pca_10c = PCA(n_components=10, random_state=42)
X2_pca = pca_10c.fit_transform(X2_scaled)
print(X2_pca)

[[ 2.62181381  0.70434102  0.70908587 ...  0.91767098 -1.79772742
   0.29858137]
 [-0.78725244  0.11414686  0.7520871  ... -0.70774425 -1.15206707
   0.20980085]
 [ 2.39315658 -0.97473823 -0.02371468 ... -0.74965317 -0.5468746
  -0.14368663]
 ...
 [ 2.64766176  0.65631825  0.9936217  ... -0.65236353  0.98510732
   0.35236277]
 [-0.11780936 -1.61272178 -0.11178168 ... -0.30481122  1.10828942
   0.72358319]
 [-0.9413691  -0.13280424  0.45069707 ... -0.35726968  2.07115452
   0.15588065]]


In [56]:
# Wrap the ten label-encoded principal-component scores in a DataFrame,
# indexed by the customer id and labelled "PC 1" .. "PC 10".
pcs_df = pd.DataFrame(
    X2_pca,
    index=customer_df['id'],
    columns=[f"PC {i}" for i in range(1, 11)],
)
print(pcs_df.shape)
pcs_df.head(10)

(38892, 10)


Unnamed: 0_level_0,PC 1,PC 2,PC 3,PC 4,PC 5,PC 6,PC 7,PC 8,PC 9,PC 10
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,2.621814,0.704341,0.709086,0.34907,-1.516807,1.146404,-0.399059,0.917671,-1.797727,0.298581
1,-0.787252,0.114147,0.752087,1.353924,-1.510392,-1.448356,-0.40022,-0.707744,-1.152067,0.209801
2,2.393157,-0.974738,-0.023715,0.226942,1.610043,1.104674,1.024351,-0.749653,-0.546875,-0.143687
3,-0.637589,1.019926,0.585525,0.837509,-1.011026,0.772578,0.797382,0.005069,-1.091099,0.201491
4,0.549411,-0.831335,-0.178457,1.930834,-0.886245,-0.911183,-1.435145,-0.38303,-0.217122,2.01072
5,-0.872508,0.345618,0.044249,1.759239,-0.220272,0.839366,0.017672,-1.121642,-0.083808,1.809556
6,-0.198194,-1.129775,-0.240309,1.06163,1.24265,1.819814,-1.070785,0.246633,-0.570727,0.114441
7,-0.864084,-1.123408,0.18919,1.376741,-0.48012,-1.071629,-0.199362,-0.535467,-1.250417,-0.856042
8,2.401902,-1.185303,-0.100036,1.365374,0.566234,1.28992,-0.757996,-0.871364,-0.86256,0.476581
9,-0.741728,-1.553855,-0.205182,0.332657,0.784839,1.280521,-0.343922,-0.821349,-0.586948,0.345194


In [58]:
# Fraction of the total variance captured by each of the ten components.
pca_10c.explained_variance_ratio_

array([0.1547902 , 0.13053787, 0.11193921, 0.0745441 , 0.07110619,
       0.06663038, 0.06417345, 0.06286401, 0.05925847, 0.052667  ])

In [59]:
# Total fraction of variance retained by the ten components together.
pca_10c.explained_variance_ratio_.sum()

0.8485108736368976