In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import math
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA 
from sklearn import metrics


%matplotlib inline



# Case #1: Congressional Voting Data

After you've downloaded the data from the repository, go ahead and load it with Pandas

In [11]:
votes = pd.read_csv('../assets/datasets/votes.csv')

In [12]:
votes.head()
print len(votes)
votes=votes.dropna(axis=0)
votes.head()
len(votes)

435


232

Next, let's define the x and y variables: 

In [13]:
x = votes.ix[:,2:14].values
y = votes.ix[:,1].values

In [14]:
xStand = StandardScaler().fit_transform(x)

ValueError: could not convert string to float: n

Next, create the covariance matrix from the standardized x-values and decompose these values to find the eigenvalues and eigenvectors

In [7]:
covMat1 = np.cov(x_standard.T)
eigenValues, eigenVectors = np.linalg.eig(covMat1)

Now, let's check the eigenvalues: 

In [8]:
print(eigenValues)

[ 5.58370916  1.33352682  1.05883875  0.80154671  0.72224813  0.5652465
  0.53378181  0.17360955  0.37261933  0.25322516  0.3350424   0.29425545]


And the eigenvectors: 

In [9]:
print(eigenVectors)

[[-0.22862459  0.18532368 -0.20982618  0.41395079  0.78519601 -0.0193294
  -0.21120426  0.06346635  0.05764349 -0.05367063 -0.15473539  0.04812159]
 [ 0.05838316  0.63955834 -0.04441978  0.58786967 -0.43399521 -0.16292162
   0.0516227   0.01218162  0.0180619   0.09311362  0.00384379 -0.11520818]
 [-0.33766356  0.12644984  0.1638092  -0.02044933 -0.03356182  0.29105654
  -0.30711087  0.19535167  0.22167337  0.09485647  0.74617203  0.09753429]
 [ 0.35226746 -0.11055164 -0.07011938  0.15388749  0.02316103 -0.32060211
   0.04575822  0.30535204 -0.21587818 -0.0517604   0.23826338  0.72827198]
 [ 0.37443542  0.06300629  0.02012467  0.06028777  0.18616264  0.1443653
  -0.11814924 -0.73292363 -0.21402395  0.35614678  0.26092959  0.06650948]
 [ 0.2917094   0.15476312  0.32907494 -0.09194025 -0.10117607  0.20829174
  -0.7270428   0.15452295 -0.05605682 -0.0651421  -0.38441795  0.11382899]
 [-0.33461692 -0.15360238  0.01691192  0.01130203 -0.05623873 -0.36175381
  -0.28295359  0.11059562 -0.70484

To find the principal componants, find the eigenpairs, and sort them from highest to lowest. 

In [17]:
eigenPairs = [(np.abs(eigenValues[i]), eigenVectors[:,i]) for i in range(len(eigenValues))]
eigenPairs.sort()
eigenPairs.reverse()
for i in eig_pairs:
    print(i[0])

5.58370915682
1.33352682037
1.05883875373
0.801546710026
0.722248126307
0.565246499752
0.533781814492
0.372619329093
0.335042401025
0.294255448966
0.253225162928
0.173609546074


Now, calculate the explained variance. Recall the methods we learned in lesson 2.2!

In [12]:
totalEigen = sum(eigenValues)
varExpl = [(i / totalEigen)*100 for i in sorted(eigenValues, reverse=True)]

In [13]:
print(varExpl)

[46.423942031820388, 11.087177012241135, 8.8033720137957374, 6.664200615926001, 6.0048982148868095, 4.6995590209273006, 4.4379560821769664, 3.0980227744506159, 2.7856015717418652, 2.4464916638166279, 2.105358634303931, 1.4434203639126424]


Now, calculate the explained variance and the Cumulative explained variance

In [43]:
cvarex = np.cumsum(varExpl)

In [44]:
print(cvarex)

[ 47.57620567  81.76648321  88.80868815  92.9094282   96.80559213
  98.19321523  98.92422853  99.53016479  99.96782207  99.99273407
  99.99860957 100.        ]


Now, conduct the PCA using scikit learn

In [74]:
PCA_set = PCA(n_components=5)
X = PCA_set.fit_transform(xStand)

# Case #2: Airport Delays

In [5]:
airports = pd.read_csv('../assets/datasets/airport_operations.csv')

In [78]:
airports.head()

Unnamed: 0,airport,year,departures for metric computation,arrivals for metric computation,percent on-time gate departures,percent on-time airport departures,percent on-time gate arrivals,average_gate_departure_delay,average_taxi_out_time,average taxi out delay,average airport departure delay,average airborne delay,average taxi in delay,average block delay,average gate arrival delay
0,ABQ,2004,53971,53818,0.803,0.7809,0.7921,10.38,9.89,2.43,12.1,2.46,0.83,2.55,10.87
1,ABQ,2005,51829,51877,0.814,0.7922,0.8001,9.6,9.79,2.29,11.2,2.26,0.89,2.34,10.24
2,ABQ,2006,49682,51199,0.7983,0.7756,0.7746,10.84,9.89,2.16,12.33,2.12,0.84,2.66,11.82
3,ABQ,2007,53255,53611,0.8005,0.7704,0.7647,11.29,10.34,2.4,12.95,2.19,1.29,3.06,12.71
4,ABQ,2008,49589,49512,0.8103,0.7844,0.7875,10.79,10.41,2.41,12.32,1.82,1.03,2.79,11.48


First, let's define the x and y variables: Airport is going to be our target variable.

In [7]:
x2 = airports.ix[:,2:14].values
y2 = airports.ix[:,0].values

Then, standardize the x variable for analysis

In [17]:
xStand = StandardScaler().fit_transform(x2)

Next, create the covariance matrix from the standardized x-values and decompose these values to find the eigenvalues and eigenvectors

In [9]:
covMat = np.cov(xStand.T)
eigenValues2, eigenVectors2 = np.linalg.eig(covMat)

Then, check your eigenvalues and eigenvectors:

In [10]:
print(eigenValues2)
print(eigenVectors2)

[  5.71594128e+00   4.10771763e+00   8.46070622e-01   4.92674626e-01
   4.68096266e-01   1.66713004e-01   8.78260266e-02   7.27989129e-02
   5.25813963e-02   2.99299918e-03   1.67049793e-04   7.05899593e-04]
[[ -1.92844661e-01  -3.85272521e-01  -4.00175037e-01  -1.64039585e-01
    1.86296653e-01   2.78156147e-01   7.86539360e-02   3.17041164e-02
    7.52767812e-02  -1.14438376e-02  -7.07864735e-01  -3.44682322e-02]
 [ -1.92353674e-01  -3.85057896e-01  -4.02190449e-01  -1.66802678e-01
    1.84403875e-01   2.78351867e-01   9.04981279e-02   3.77083630e-02
    8.09001581e-02   1.35190021e-02   7.04235896e-01   3.71215770e-02]
 [  2.87689649e-01  -3.33455724e-01   2.51323774e-01   8.13357968e-02
    7.50865080e-03   2.37735910e-02  -5.78177983e-02   1.41563250e-02
    4.95309257e-01  -6.57324120e-01   2.23495764e-02  -2.32792117e-01]
 [  3.80590914e-01  -1.72431188e-01   8.79462334e-02   7.06458258e-02
    2.02464908e-01   3.45123821e-03  -3.15914078e-01   9.30547916e-02
    4.84973886e-01 

To find the principal componants, find the eigenpairs, and sort them from highest to lowest. 

In [11]:
eigenPairs2 = [(np.abs(eigenValues2[i]), eigenVectors2[:,i]) for i in range(len(eigenValues2))]
eigenPairs2.sort()
eigenPairs2.reverse()
for i in eigenPairs2:
    print(i[0])

5.71594128131
4.10771763057
0.846070621527
0.492674626426
0.468096266456
0.166713003709
0.0878260265801
0.0727989128724
0.0525813962709
0.0029929991831
0.000705899593355
0.000167049792639


Next, Calculate the explained variance

In [12]:
totalEigen = sum(eigenValues2)
varExpl = [(i / totalEigen)*100 for i in sorted(eigenValues2, reverse=True)]
cumulvarExpl = np.cumsum(varExpl)

In [13]:
print(cumulvarExpl)

[ 47.57620567  81.76648321  88.80868815  92.9094282   96.80559213
  98.19321523  98.92422853  99.53016479  99.96782207  99.99273407
  99.99860957 100.        ]


Finally, conduct the PCA - use the results above to guide your selection of n components

In [76]:
airports_pca = PCA(n_components=3)
airports_pca.fit(xStand)
X = airports_pca.transform(xStand)