#### PCA on random data

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler 
from sklearn.decomposition import PCA
%matplotlib inline

In [2]:
# Seed NumPy's global RNG so the same random data is generated on every run.
np.random.seed(5)
# 3000 rows x 10 columns of uniform samples drawn from [0, 1)
random_data = np.random.rand(3000,10)

In [3]:
# Sanity check: the array should be 3000 rows by 10 columns.
random_data.shape

(3000, 10)

In [4]:
# Wrap the array in a DataFrame; no column names are given, so the columns
# get the default integer labels 0..9.
df = pd.DataFrame(random_data)

In [5]:
# Preview the first 7 rows of the raw random data.
df.head(7)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.221993,0.870732,0.206719,0.918611,0.488411,0.611744,0.765908,0.518418,0.296801,0.187721
1,0.080741,0.73844,0.441309,0.15831,0.879937,0.274086,0.414235,0.29608,0.628788,0.579838
2,0.599929,0.265819,0.284686,0.253588,0.327564,0.144164,0.165613,0.963931,0.960227,0.188415
3,0.024307,0.204556,0.699844,0.779515,0.022933,0.577663,0.001642,0.515473,0.639795,0.985624
4,0.259098,0.802497,0.870483,0.92275,0.002214,0.469488,0.981469,0.398945,0.813732,0.546456
5,0.770854,0.484931,0.029112,0.086526,0.111454,0.251245,0.964915,0.631766,0.81666,0.566082
6,0.635356,0.811902,0.926683,0.912627,0.824811,0.094203,0.361048,0.035509,0.546358,0.796143


In [6]:
# The DataFrame keeps the array's dimensions (3000 rows, 10 columns).
df.shape

(3000, 10)

* The correlations between the variables are very small, as expected for independently generated random columns

In [7]:
# Pairwise correlation matrix of the 10 columns (pandas defaults to Pearson).
# Off-diagonal values near 0 mean the columns carry no shared information.
df.corr()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,1.0,0.001752,-0.045251,0.019213,0.01892,-0.023418,0.012631,-0.011487,-0.015242,0.004333
1,0.001752,1.0,0.013402,0.001151,0.01131,0.005903,0.008109,0.016488,0.031241,0.002828
2,-0.045251,0.013402,1.0,0.014618,0.027499,0.007999,0.032764,0.008283,-0.008956,-0.029783
3,0.019213,0.001151,0.014618,1.0,-0.012533,0.00176,-0.020751,-0.000322,-0.000973,-0.000565
4,0.01892,0.01131,0.027499,-0.012533,1.0,0.009004,-0.007245,-0.016318,-0.001239,0.004558
5,-0.023418,0.005903,0.007999,0.00176,0.009004,1.0,-0.00663,0.024819,-0.005915,-0.034567
6,0.012631,0.008109,0.032764,-0.020751,-0.007245,-0.00663,1.0,0.008201,0.005458,-0.009867
7,-0.011487,0.016488,0.008283,-0.000322,-0.016318,0.024819,0.008201,1.0,-0.011453,0.033049
8,-0.015242,0.031241,-0.008956,-0.000973,-0.001239,-0.005915,0.005458,-0.011453,1.0,-0.000219
9,0.004333,0.002828,-0.029783,-0.000565,0.004558,-0.034567,-0.009867,0.033049,-0.000219,1.0


In [8]:
# Confirm every column is numeric (float64) before fitting PCA.
df.dtypes

0    float64
1    float64
2    float64
3    float64
4    float64
5    float64
6    float64
7    float64
8    float64
9    float64
dtype: object

* When `n_components` is a float between 0 and 1, it sets the fraction of the total variance the PCA model must retain; PCA then keeps the smallest number of components that reaches it

In [9]:
pca = PCA(n_components=0.9) # fraction (not percentage) of total variance to retain: 0.9 = 90%

In [10]:
# Fit PCA on all 10 columns; fitting determines how many components are
# needed to reach the 0.9 variance target set above.
# NOTE(review): StandardScaler is imported but not applied in the cells shown;
# that is harmless here because every column is on the same [0, 1) scale.
pca.fit(df)

PCA(n_components=0.9)

The number of components the PCA model selected to reach the 90% variance target

In [11]:
# Number of components actually kept by the fitted model.
pca.n_components_

9

In [12]:
# fit() does not modify its input: df still holds the original 10 columns.
df.head(7)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.221993,0.870732,0.206719,0.918611,0.488411,0.611744,0.765908,0.518418,0.296801,0.187721
1,0.080741,0.73844,0.441309,0.15831,0.879937,0.274086,0.414235,0.29608,0.628788,0.579838
2,0.599929,0.265819,0.284686,0.253588,0.327564,0.144164,0.165613,0.963931,0.960227,0.188415
3,0.024307,0.204556,0.699844,0.779515,0.022933,0.577663,0.001642,0.515473,0.639795,0.985624
4,0.259098,0.802497,0.870483,0.92275,0.002214,0.469488,0.981469,0.398945,0.813732,0.546456
5,0.770854,0.484931,0.029112,0.086526,0.111454,0.251245,0.964915,0.631766,0.81666,0.566082
6,0.635356,0.811902,0.926683,0.912627,0.824811,0.094203,0.361048,0.035509,0.546358,0.796143


In [13]:
# transform() returns a new NumPy array with one column per retained
# component; df itself is left unchanged by this call.
pca.transform(df)

array([[ 0.23944303,  0.02668961,  0.07576588, ..., -0.04602755,
        -0.23009503, -0.20273925],
       [ 0.15849998, -0.1430348 , -0.40232322, ..., -0.00711375,
        -0.27551516, -0.04929153],
       [-0.18237165,  0.25637175, -0.08697834, ...,  0.19576772,
         0.08761661,  0.8251828 ],
       ...,
       [ 0.17595869,  0.35003565, -0.18536301, ..., -0.3340929 ,
         0.04369709,  0.15629183],
       [-0.08456247,  0.17890488,  0.23294004, ..., -0.14453248,
         0.54493584,  0.02365874],
       [ 0.13739893,  0.03523713,  0.09132946, ..., -0.23149361,
         0.13186785, -0.1050173 ]])

In [14]:
def transform_with_pca(pca, df, columns):
    """Replace ``columns`` of ``df`` with their PCA projection, in place.

    The selected columns are projected with the already-fitted ``pca``, the
    projected values are stored in new columns named ``component_0``,
    ``component_1``, ... and the original ``columns`` are removed from ``df``.
    Columns of ``df`` that are not listed in ``columns`` are kept as they are.

    Parameters
    ----------
    pca : fitted estimator
        Must expose ``transform(X)`` and the ``n_components_`` attribute.
    df : pandas.DataFrame
        Frame to modify. It is mutated in place; nothing is copied.
    columns : list-like of column labels
        Columns of ``df`` to feed to ``pca.transform`` and then drop.

    Returns
    -------
    list of str
        Names of the component columns that were added to ``df``.
    """
    # np.asarray keeps the positional slicing below valid even when the
    # estimator is configured to return a DataFrame instead of an ndarray.
    transformed_data = np.asarray(pca.transform(df[columns]))

    tcols = ['component_' + str(i) for i in range(pca.n_components_)]
    print ('components:',tcols)

    # Drop the inputs before adding the components, so that an input column
    # that happens to be called 'component_<i>' is not overwritten and then
    # deleted together with the freshly computed values.
    df.drop(columns, inplace=True, axis=1)

    # Assign raw arrays rather than Series: a Series would be aligned on index
    # labels, which yields NaN whenever df does not carry a default 0..n-1
    # index (e.g. after filtering, shuffling or a train/test split).
    for i, col in enumerate(tcols):
        df[col] = transformed_data[:, i]

    return tcols

In [15]:
# Replaces the 10 original columns of df with the component columns, in place.
# Not idempotent: after this runs df no longer holds the columns the PCA was
# fitted on, so re-running the cell without recreating df will fail.
transform_with_pca(pca,df, df.columns)

components: ['component_0', 'component_1', 'component_2', 'component_3', 'component_4', 'component_5', 'component_6', 'component_7', 'component_8']


['component_0',
 'component_1',
 'component_2',
 'component_3',
 'component_4',
 'component_5',
 'component_6',
 'component_7',
 'component_8']

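* Not much reduction is possible with random datasets: the columns are uncorrelated, so each component explains roughly 10% of the variance and 9 of the 10 are needed to reach 90%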

In [16]:
# df after the in-place transformation: 9 component columns instead of 10 features.
df.head(7)

Unnamed: 0,component_0,component_1,component_2,component_3,component_4,component_5,component_6,component_7,component_8
0,0.239443,0.02669,0.075766,0.112855,-0.345108,0.151339,-0.046028,-0.230095,-0.202739
1,0.1585,-0.143035,-0.402323,-0.040042,0.421051,-0.363973,-0.007114,-0.275515,-0.049292
2,-0.182372,0.256372,-0.086978,0.296469,0.072542,-0.172577,0.195768,0.087617,0.825183
3,0.048936,0.753947,0.088073,-0.094737,-0.048718,-0.432731,-0.496847,0.134142,-0.114109
4,0.345275,0.341535,-0.277603,0.415368,-0.151116,0.487913,-0.404048,0.122857,-0.34009
5,-0.426701,0.285982,-0.267762,0.353966,0.252331,0.352886,0.553405,0.157512,0.062357
6,-0.169758,-0.465306,-0.323727,0.033462,-0.025855,-0.027212,-0.751668,-0.195584,-0.245309
