In [1]:
import pandas as pd
import numpy as np
import os
from matplotlib import pyplot as plt
import math
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA 

# Case #1: Congressional Voting Data

After you've downloaded the data from the repository, go ahead and load it with Pandas

In [29]:
# Load the congressional voting records from the CSV (path is relative to this notebook).
votes = pd.read_csv('../CSV/votes.csv')

In [30]:
# Preview the first rows to check the column layout and the raw 'y'/'n' vote values.
votes.head()

Unnamed: 0.1,Unnamed: 0,Class,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16
0,1,republican,n,y,n,y,y,y,n,n,n,y,,y,y,y,n,y
1,2,republican,n,y,n,y,y,y,n,n,n,n,n,y,y,y,n,
2,3,democrat,,y,y,,y,y,n,n,n,n,y,n,y,y,n,n
3,4,democrat,n,y,y,n,,y,n,n,n,n,y,n,y,n,n,y
4,5,democrat,y,y,y,n,y,y,n,n,n,n,y,,y,y,y,y


Next, let's define the x and y variables: 

In [32]:
# Encode every vote column (position 2 onward) as 1 for 'y' and 0 for anything else.
# The removed `.ix` indexer is replaced by a positional slice of the column index.
# NOTE: missing votes (NaN) do not equal 'y', so they are encoded as 0, the same
# as an explicit 'n'.
# NOTE: this cell overwrites `votes` in place; re-running it after the columns are
# already numeric would turn every value into 0, so reload `votes` before re-running.
for col in votes.columns[2:]:
    votes[col] = np.where(votes[col] == 'y', 1, 0)

In [62]:
# Feature matrix: every column from position 2 onward (the encoded vote columns).
# Target: the column at position 1 (the party label shown as `Class` in the preview).
# `.iloc` is the positional replacement for the removed `.ix` indexer.
X = votes.iloc[:, 2:].values
y = votes.iloc[:, 1].values

In [63]:
# Standardize each feature column (StandardScaler centers to zero mean and
# scales to unit variance) so that all features contribute on the same scale.
X_standard = StandardScaler().fit_transform(X)

Next, create the covariance matrix from the standardized x-values and decompose these values to find the eigenvalues and eigenvectors

In [64]:
# np.cov treats each row as a variable, so transpose to make every feature
# column of X_standard one variable.
cov_mat = np.cov(X_standard.T)
# eigenVectors[:, i] is the eigenvector that belongs to eigenValues[i].
# np.linalg.eig does not return the eigenvalues in sorted order.
eigenValues, eigenVectors = np.linalg.eig(cov_mat)

Now, let's check the eigenvalues: 

In [65]:
# Display the eigenvalues in the order np.linalg.eig returned them (not sorted).
eigenValues

array([ 7.102243  ,  1.44867882,  1.1998064 ,  0.88126251,  0.8248454 ,
        0.7380054 ,  0.14086407,  0.21188391,  0.61036665,  0.27622346,
        0.31858994,  0.35967328,  0.41340487,  0.54030753,  0.51431461,
        0.45639653])

And the eigenvectors: 

In [66]:
# Display the eigenvector matrix; column i pairs with eigenValues[i].
eigenVectors

array([[-0.18803934, -0.19357121,  0.07328808,  0.51751543,  0.35845006,
         0.62089874, -0.05713493, -0.00747574,  0.15602045, -0.03613906,
        -0.07010134, -0.20296072,  0.12290505,  0.07348099,  0.20525446,
        -0.04760145],
       [ 0.05231849, -0.5860793 , -0.23045319,  0.37516036,  0.12646973,
        -0.55398342,  0.01855198, -0.10910015,  0.10486865,  0.0799821 ,
         0.0801356 , -0.13259865, -0.06581724,  0.07925709, -0.00808044,
         0.26963501],
       [-0.29660826, -0.08013044, -0.16884543, -0.01308925,  0.10990706,
         0.03068335, -0.2133564 , -0.06665988, -0.17958811, -0.23883742,
         0.56681358,  0.45295072,  0.08181081, -0.3596669 ,  0.23464645,
         0.08418781],
       [ 0.31510691,  0.1422561 ,  0.07340739,  0.13936979, -0.00770719,
        -0.05450439, -0.39727071,  0.4526401 ,  0.17904898, -0.55964437,
         0.01205996,  0.0780124 , -0.11191782,  0.25270932,  0.01034961,
         0.24886662],
       [ 0.33401889, -0.0385968 , -0

To find the principal components, form the eigenpairs (eigenvalue, eigenvector) and sort them from highest to lowest eigenvalue.

In [43]:
# Pair each eigenvalue magnitude with its eigenvector (column i of eigenVectors).
eig_pairs = [(np.abs(eigenValues[i]), eigenVectors[:, i]) for i in range(len(eigenValues))]
# Sort on the eigenvalue only, largest first. A plain tuple sort would fall through
# to comparing the ndarray eigenvectors whenever two eigenvalues tie, which raises
# ValueError ("truth value of an array ... is ambiguous").
eig_pairs.sort(key=lambda pair: pair[0], reverse=True)
for pair in eig_pairs:
    print(pair[0])

7.10224299609
1.44867881863
1.19980639643
0.881262509772
0.824845400306
0.738005397126
0.610366646038
0.540307526332
0.5143146149
0.456396531035
0.413404865073
0.359673279653
0.31858993662
0.276223460272
0.211883908313
0.140864072849


Now, calculate the explained variance. Recall the methods we learned in lesson 2.2!

In [45]:
# Share of the total variance carried by each eigenvalue, in percent, largest first.
tot = sum(eigenValues)
exp_var = [
    (eig_val / tot) * 100
    for eig_val in sorted(eigenValues, reverse=True)
]
# Running total: entry k is the variance explained by the first k + 1 components.
cum_sum_var = np.cumsum(exp_var)
print(cum_sum_var)

[ 44.286975    53.32040327  60.80195465  66.29718352  71.44061604
  76.04254625  79.84856815  83.21772715  86.42480392  89.27072481
  91.8485655   94.09135577  96.07796544  97.80039334  99.12162345 100.        ]


Now, conduct the PCA using scikit-learn.

In [47]:
# Keep all 16 components (one per vote column), so nothing is discarded here.
PCA_sk = PCA(n_components=16)
# Fit on the standardized features and project them onto the principal components.
Y_sk = PCA_sk.fit_transform(X_standard)

# Case #2: Airport Delays

In [12]:
# Load the airport operations data (path is relative to this notebook) and
# preview the first rows to check the column layout.
airport = pd.read_csv('../CSV/Airport_operations.csv')
airport.head()

Unnamed: 0,airport,year,departures for metric computation,arrivals for metric computation,percent on-time gate departures,percent on-time airport departures,percent on-time gate arrivals,average_gate_departure_delay,average_taxi_out_time,average taxi out delay,average airport departure delay,average airborne delay,average taxi in delay,average block delay,average gate arrival delay
0,ABQ,2004,53971,53818,0.803,0.7809,0.7921,10.38,9.89,2.43,12.1,2.46,0.83,2.55,10.87
1,ABQ,2005,51829,51877,0.814,0.7922,0.8001,9.6,9.79,2.29,11.2,2.26,0.89,2.34,10.24
2,ABQ,2006,49682,51199,0.7983,0.7756,0.7746,10.84,9.89,2.16,12.33,2.12,0.84,2.66,11.82
3,ABQ,2007,53255,53611,0.8005,0.7704,0.7647,11.29,10.34,2.4,12.95,2.19,1.29,3.06,12.71
4,ABQ,2008,49589,49512,0.8103,0.7844,0.7875,10.79,10.41,2.41,12.32,1.82,1.03,2.79,11.48


First, let's define the x and y variables: the airport code is going to be our "y" variable (the label), and the numeric operations and delay columns are our "x" variables.

In [54]:
# Feature matrix: every column from position 2 onward (the numeric operations and
# delay metrics, skipping the airport code and the year).
# Target: the column at position 0 (the airport code).
# `.iloc` is the positional replacement for the removed `.ix` indexer.
X = airport.iloc[:, 2:].values
y = airport.iloc[:, 0].values

Then, standardize the x variable for analysis

In [55]:
# Standardize each feature column (StandardScaler centers to zero mean and
# scales to unit variance); the raw metrics are on very different scales
# (counts in the tens of thousands vs. fractions below 1).
X_standard = StandardScaler().fit_transform(X)

Next, create the covariance matrix from the standardized x-values and decompose these values to find the eigenvalues and eigenvectors

In [56]:
# np.cov treats each row as a variable, so transpose to make every feature
# column of X_standard one variable.
cov_mat = np.cov(X_standard.T)
# eigenVectors[:, i] is the eigenvector that belongs to eigenValues[i].
# np.linalg.eig does not return the eigenvalues in sorted order.
eigenValues, eigenVectors = np.linalg.eig(cov_mat)

Then, check your eigenvalues and eigenvectors:

In [57]:
# Display the eigenvalues in the order np.linalg.eig returned them (not guaranteed sorted).
eigenValues

array([  6.43554591e+00,   4.30034272e+00,   8.46168848e-01,
         5.11389160e-01,   4.68106591e-01,   1.67784858e-01,
         1.20373902e-01,   8.75752619e-02,   6.42089840e-02,
         1.02498362e-02,   2.85997229e-03,   1.66983552e-04,
         7.03167500e-04])

In [58]:
# Display the eigenvector matrix; column i pairs with eigenValues[i].
eigenVectors

array([[  1.15534165e-01,  -4.14065994e-01,  -3.99975407e-01,
         -1.44386242e-01,   1.88812357e-01,  -2.89427568e-01,
          5.58777294e-02,  -8.74632562e-02,   3.56070223e-02,
          2.10712103e-02,  -1.29886455e-02,  -7.07882813e-01,
         -3.47029233e-02],
       [  1.15117360e-01,  -4.13745773e-01,  -4.01992814e-01,
         -1.46930754e-01,   1.86961849e-01,  -2.90096332e-01,
          6.01701522e-02,  -1.00170466e-01,   3.50827364e-02,
          3.13868217e-02,   1.01203490e-02,   7.04169491e-01,
          3.79124127e-02],
       [ -3.16692368e-01,  -2.50781656e-01,   2.54173262e-01,
          1.26763507e-01,   7.27315049e-03,  -6.41665658e-02,
          3.29583557e-01,   2.19208711e-02,   3.30107052e-01,
          1.46329054e-01,  -6.79379109e-01,   2.14540200e-02,
         -2.24646995e-01],
       [ -3.76648424e-01,  -7.51268241e-02,   9.06533172e-02,
          1.18107796e-01,   2.02346218e-01,  -4.71651244e-02,
          3.84319806e-01,   2.70677467e-01,   2.477

To find the principal components, form the eigenpairs (eigenvalue, eigenvector) and sort them from highest to lowest eigenvalue.

In [59]:
# Pair each eigenvalue magnitude with its eigenvector (column i of eigenVectors).
eig_pairs = [(np.abs(eigenValues[i]), eigenVectors[:, i]) for i in range(len(eigenValues))]
# Sort on the eigenvalue only, largest first. A plain tuple sort would fall through
# to comparing the ndarray eigenvectors whenever two eigenvalues tie, which raises
# ValueError ("truth value of an array ... is ambiguous").
eig_pairs.sort(key=lambda pair: pair[0], reverse=True)
for pair in eig_pairs:
    print(pair[0])

6.43554590921
4.30034271686
0.84616884844
0.511389159623
0.468106590922
0.167784858101
0.120373901809
0.0875752619392
0.0642089840261
0.0102498362064
0.002859972289
0.000703167500326
0.000166983552113


Next, calculate the explained variance.

In [60]:
# Share of the total variance carried by each eigenvalue, in percent, largest first.
tot = sum(eigenValues)
var_exp = [(i / tot)*100 for i in sorted(eigenValues, reverse=True)]
# Running total: entry k is the variance explained by the first k + 1 components.
cum_var_exp = np.cumsum(var_exp)
# Use the print() call form, as the other cells in this notebook do; the bare
# `print x` statement is a SyntaxError on Python 3.
print(cum_var_exp)

[  49.44533581   82.48556156   88.98681312   92.91589841   96.51243674
   97.80155483   98.72640691   99.39926175   99.89258972   99.97134086
   99.99331449   99.99871704  100.        ]


Finally, conduct the PCA - use the results above to guide your selection of "n" components.

In [61]:
# Keep 7 components: the cumulative explained variance printed above reaches
# roughly 98.7% at the seventh component.
PCA_sk = PCA(n_components=7)
# Fit on the standardized features and project them onto the principal components.
Y_sk = PCA_sk.fit_transform(X_standard)