## Execute PCA Analysis

In [26]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from pandas import DataFrame
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler, LabelEncoder

# Shared preprocessing objects, reused by the encoding and scaling cells below.
le = LabelEncoder()
# NOTE: the variable is spelled `scalar`, but it holds a MinMaxScaler instance.
scalar = MinMaxScaler()
# Switch matplotlib to seaborn's default styling.
sns.set()

#### Define functions

In [27]:
def run_pca(_df: DataFrame, _comp_max: int) -> np.ndarray:
    """Return the cumulative explained-variance ratio of a PCA fitted on ``_df``.

    Args:
        _df: Feature matrix the PCA is fitted on.
        _comp_max: Number of principal components to compute; forwarded to
            ``PCA`` as ``n_components``.

    Returns:
        1-D NumPy array whose entry at position ``k`` (0-based) is the fraction
        of total variance explained by the first ``k + 1`` components.
    """
    # Only the fitted attributes are read, so fit() is sufficient; the projected
    # data that fit_transform() additionally builds was being discarded.
    _pca = PCA(n_components=_comp_max).fit(_df)
    return _pca.explained_variance_ratio_.cumsum()

#### Read Dataset from pickle file

In [28]:
# Load the master dataset from a local pickle file.
# NOTE(review): unpickling can execute arbitrary code, so this file must come from a trusted source.
_master: DataFrame = pd.read_pickle('./data/master.pickle')
# Print (rows, columns) as a quick sanity check.
print(_master.shape)

(98855, 17)


#### Remove the 'JobSatisfaction' column before PCA analysis

In [29]:
# Keep 'JobSatisfaction' out of the PCA inputs; it is re-attached to the
# component scores in a later cell.
_master_features: DataFrame = _master.drop(columns=['JobSatisfaction'])
# One column fewer than _master is expected here.
print(_master_features.shape)

(98855, 16)


#### Encode labels for the categorical variables (to calculate Euclidean distance)

In [30]:
# Integer-encode every column independently. The single `le` instance is
# re-fitted for each column, so after this cell it only remembers the classes
# of the last column and cannot be used to invert the earlier ones.
# NOTE(review): LabelEncoder numbers classes in sorted order, which may not
# match the natural order of ordinal answers — confirm this is acceptable.
_master_encoded: DataFrame = _master_features.apply(le.fit_transform)
# Summarise dtypes and non-null counts of the encoded frame.
_master_encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 98855 entries, 0 to 98854
Data columns (total 16 columns):
Student               98855 non-null int32
FormalEducation       98855 non-null int32
EducationParents      98855 non-null int32
Employment            98855 non-null int32
CompanySize           98855 non-null int32
CareerSatisfaction    98855 non-null int32
LastNewJob            98855 non-null int32
HoursComputer         98855 non-null int32
HoursOutside          98855 non-null int32
AIDangerous           98855 non-null int32
AIInteresting         98855 non-null int32
AIResponsible         98855 non-null int32
AIFuture              98855 non-null int32
Exercise              98855 non-null int32
ConvertedSalary       98855 non-null int32
Studied_Software      98855 non-null int32
dtypes: int32(16)
memory usage: 6.0 MB


#### Scale the features with a Min-Max scaler (to calculate Euclidean distance)

In [31]:
# Rescale every encoded column with the shared MinMaxScaler.
_master_array = scalar.fit_transform(_master_encoded)
# fit_transform returns a bare array, so restore both the column labels and the
# row index of the source frame; keeping the index lets later joins align by row.
_master_scaled = pd.DataFrame(
    _master_array,
    columns=list(_master_encoded.columns),
    index=_master_encoded.index,
)

    Student  FormalEducation  EducationParents  Employment  CompanySize  \
0       0.5            0.125          0.111111         0.8        0.375   
1       0.5            0.125          0.111111         0.2        0.875   
2       0.5            0.000          1.000000         0.2        0.375   
3       0.5            0.125          0.444444         0.2        0.125   
4       1.0            0.500          0.444444         0.2        0.875   
5       0.5            0.125          0.888889         0.2        0.000   
6       0.5            0.500          0.333333         0.2        0.875   
7       0.5            0.125          0.666667         0.2        0.000   
8       0.5            0.500          0.444444         0.2        0.125   
9       0.5            0.125          1.000000         0.2        0.500   
10      0.5            0.375          1.000000         0.2        0.250   
11      0.5            0.125          1.000000         0.2        0.125   
12      0.5            0.

#### Running PCA analysis (with maximum 11 components)

In [32]:
# Cumulative explained-variance ratio for the first 11 principal components.
# NOTE(review): the input has 16 features, so capping at 11 does not show the
# full variance curve — consider evaluating all components before choosing.
variance_ratio = run_pca(_master_scaled, 11)
print(variance_ratio)

[0.39182654 0.52452168 0.58731344 0.64919591 0.69426582 0.73577576
 0.77400896 0.80947128 0.84267016 0.87151946 0.89881251]


#### Optimum # of components = 11

The first 11 components explain close to 90% (about 0.899) of the variance in the data, so we retain 11 components without much loss of the variation in the data. Note that 11 was also the largest number of components evaluated above; 10 components explain about 87%.

In [33]:
# Number of principal components to keep (see the cumulative variance printed above).
n_pcs = 11
pca_2 = PCA(n_components=n_pcs)
components_2 = pca_2.fit_transform(_master_scaled)

# Derive the labels from n_pcs so the column count always matches the
# projection, and reuse _master's index so the concat below aligns row-for-row
# even when _master does not carry a default RangeIndex.
components_df = pd.DataFrame(
    data=components_2,
    columns=[f'PC{k}' for k in range(1, n_pcs + 1)],
    index=_master.index,
)

# Re-attach the column that was dropped before running PCA.
components_df_merged = pd.concat([components_df, _master[['JobSatisfaction']]], axis=1)
components_df_merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 98855 entries, 0 to 98854
Data columns (total 12 columns):
PC1                98855 non-null float64
PC2                98855 non-null float64
PC3                98855 non-null float64
PC4                98855 non-null float64
PC5                98855 non-null float64
PC6                98855 non-null float64
PC7                98855 non-null float64
PC8                98855 non-null float64
PC9                98855 non-null float64
PC10               98855 non-null float64
PC11               98855 non-null float64
JobSatisfaction    98855 non-null category
dtypes: category(1), float64(11)
memory usage: 8.4 MB


#### Identify the dominant feature of each component (largest absolute loading)

In [34]:
# Map each principal component to the input feature whose loading has the
# greatest absolute value in that component.
features = _master_scaled.columns.tolist()

_dict = {
    'PC' + str(pc_no): features[col_pos]
    for pc_no, col_pos in enumerate(
        np.abs(pca_2.components_[:n_pcs]).argmax(axis=1), start=1
    )
}
print(_dict)

{'PC1': 'AIFuture', 'PC2': 'Studied_Software', 'PC3': 'CompanySize', 'PC4': 'CompanySize', 'PC5': 'LastNewJob', 'PC6': 'EducationParents', 'PC7': 'AIInteresting', 'PC8': 'AIDangerous', 'PC9': 'FormalEducation', 'PC10': 'AIFuture', 'PC11': 'Student'}


#### Store the PCA data-frame as pickle file

In [35]:
# Persist the merged frame (component scores plus 'JobSatisfaction') to disk.
components_df_merged.to_pickle('./data/master-pca.pickle')
# Confirmation message only; it does not verify the written file.
print('DataFrame stored in pickle file...')

DataFrame stored in pickle file...
