# Intuition Behind PCA

PCA (Principal Component Analysis) refers to choosing those components that have the maximum variance explanation
i.e. those features which cause the maximum effect to the output.

We can see the explained variance ratio in python using sklearn.decomposition.PCA.explained_variance_ratio_

In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('data.csv')

In [3]:
df.shape

(32561, 15)

In [4]:
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K


In [5]:
df[df == '?'] = np.nan

In [6]:
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,,77053,HS-grad,9,Widowed,,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,,186061,Some-college,10,Widowed,,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K


In [7]:
for col in df.columns[df.isnull().sum() != 0].tolist():
    df[col].fillna(df[col].mode()[0], inplace = True)

In [8]:
df.isnull().sum()

age               0
workclass         0
fnlwgt            0
education         0
education.num     0
marital.status    0
occupation        0
relationship      0
race              0
sex               0
capital.gain      0
capital.loss      0
hours.per.week    0
native.country    0
income            0
dtype: int64

In [9]:
x = df.drop('income', axis = 1)
x.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country
0,90,Private,77053,HS-grad,9,Widowed,Prof-specialty,Not-in-family,White,Female,0,4356,40,United-States
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States
2,66,Private,186061,Some-college,10,Widowed,Prof-specialty,Unmarried,Black,Female,0,4356,40,United-States
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States


In [10]:
y = df['income']
y.head()

0    <=50K
1    <=50K
2    <=50K
3    <=50K
4    <=50K
Name: income, dtype: object

In [11]:
from sklearn import preprocessing

In [12]:
encode_cols = [
    'workclass',
    'education',
    'marital.status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'native.country'
]

In [13]:
for col in encode_cols:
    x[col] = preprocessing.LabelEncoder().fit_transform(x[col])

In [15]:
x.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country
0,90,3,77053,11,9,6,9,1,4,0,0,4356,40,38
1,82,3,132870,11,9,6,3,1,4,0,0,4356,18,38
2,66,3,186061,15,10,6,9,4,2,0,0,4356,40,38
3,54,3,140359,5,4,0,6,4,4,0,0,3900,40,38
4,41,3,264663,15,10,5,9,3,4,0,0,3900,40,38


In [16]:
x = pd.DataFrame(preprocessing.StandardScaler().fit_transform(x), columns = x.columns)

In [17]:
x.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country
0,3.769612,-0.085296,-1.067997,0.181332,-0.42006,2.24948,0.720237,-0.277805,0.393668,-1.422331,-0.14592,10.593507,-0.035429,0.26137
1,3.183112,-0.085296,-0.539169,0.181332,-0.42006,2.24948,-0.790092,-0.277805,0.393668,-1.422331,-0.14592,10.593507,-1.817204,0.26137
2,2.01011,-0.085296,-0.03522,1.214869,-0.03136,2.24948,0.720237,1.589322,-1.962621,-1.422331,-0.14592,10.593507,-0.035429,0.26137
3,1.130359,-0.085296,-0.468215,-1.368974,-2.363558,-1.734058,-0.034928,1.589322,0.393668,-1.422331,-0.14592,9.461864,-0.035429,0.26137
4,0.177296,-0.085296,0.709482,1.214869,-0.03136,1.585557,0.720237,0.966947,0.393668,-1.422331,-0.14592,9.461864,-0.035429,0.26137


In [60]:
def calc_acc(nf, x, y):
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import accuracy_score
    
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 50)
    
    model = LogisticRegression()
    model.fit(x_train, y_train)
    
    print(f'{nf} featured model accuracy: {round(accuracy_score(y_test, model.predict(x_test)) * 100, 2)}')

In [61]:
calc_acc(len(x.columns.tolist()), x, y)

14 featured model accuracy: 82.86


In [62]:
from sklearn.decomposition import PCA
pca = PCA()
pca.fit(x)

PCA()

In [63]:
pca.explained_variance_ratio_

array([0.14783223, 0.10161135, 0.08056461, 0.07870396, 0.07427552,
       0.07329118, 0.07027089, 0.06772609, 0.06487451, 0.06119792,
       0.0608425 , 0.0486531 , 0.04276666, 0.02738948])

In [64]:
# last three columns have the least explained variance

In [65]:
# 11 featured model
x_11 = x.iloc[0:,:-3]
calc_acc(len(x_11.columns.tolist()), x_11, y)

11 featured model accuracy: 82.58


In [66]:
# 12 featured model
x_12 = x.iloc[0:,:-2]
calc_acc(len(x_12.columns.tolist()), x_12, y)

12 featured model accuracy: 82.88


In [67]:
# 13 featured model
x_13 = x.iloc[0:,:-1]
calc_acc(len(x_13.columns.tolist()), x_13, y)

13 featured model accuracy: 82.84
