In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import math
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA 
from sklearn import metrics


%matplotlib inline

# Case #1: Congressional Voting Data

After you've downloaded the data from the repository, go ahead and load it with pandas.

In [2]:
# Load the voting records; the path is relative to the notebook's working directory.
votes = pd.read_csv('../assets/datasets/votes.csv')

In [3]:
# Print the row count before cleaning, drop every row that contains a missing
# value, then leave the new row count as the cell's displayed result.
# `votes` is rebound on purpose so that later cells keep working; re-running
# this cell is harmless because dropping NaNs from a clean frame changes nothing.
print(len(votes))
votes = votes.dropna(axis=0)
len(votes)

435


232

Next, let's define the x and y variables: 

In [4]:
# Preview the cleaned frame before picking the feature and label columns.
votes.head()

Unnamed: 0.1,Unnamed: 0,Class,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16
5,6,democrat,n,y,y,n,y,y,n,n,n,n,n,n,y,y,y,y
8,9,republican,n,y,n,y,y,y,n,n,n,n,n,y,y,y,n,y
19,20,democrat,y,y,y,n,n,n,y,y,y,n,y,n,n,n,y,y
23,24,democrat,y,y,y,n,n,n,y,y,y,n,n,n,n,n,y,y
25,26,democrat,y,n,y,n,n,n,y,y,y,y,n,n,n,n,y,y


In [5]:
def vcheck(x):
    """Translate one vote cell into a number: "y" -> 1, "n" -> 0.

    The value is compared through its string form. Anything that is neither
    "y" nor "n" yields None.
    """
    return {"y": 1, "n": 0}.get(str(x))
    
# Encode every cell of the frame with vcheck. Cells that are not "y"/"n"
# (for example the Class labels) come back as missing values, so the labels
# must still be read from `votes`, not from `v2`.
v2 = votes.applymap(vcheck)


In [6]:
# Inspect the encoded frame: vote columns are now 0/1, non-vote columns are empty.
v2.head()

Unnamed: 0.1,Unnamed: 0,Class,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16
5,,,0,1,1,0,1,1,0,0,0,0,0,0,1,1,1,1
8,,,0,1,0,1,1,1,0,0,0,0,0,1,1,1,0,1
19,,,1,1,1,0,0,0,1,1,1,0,1,0,0,0,1,1
23,,,1,1,1,0,0,0,1,1,1,0,0,0,0,0,1,1
25,,,1,0,1,0,0,0,1,1,1,1,0,0,0,0,1,1


In [7]:
# Positional selection: the feature matrix comes from column positions 2-13 of
# the encoded frame, the labels from column position 1 of the original frame.
# `.iloc` replaces `.ix`, which pandas deprecated and later removed; for these
# integer positions on string-labelled columns the selection is the same.
# NOTE(review): 2:14 keeps 12 columns - confirm the remaining vote columns
# were meant to be left out.
x = v2.iloc[:, 2:14].values
y = votes.iloc[:, 1].values

In [8]:
# Standardize the vote feature matrix before computing its covariance.
xStand = StandardScaler().fit_transform(x)



Next, create the covariance matrix from the standardized x-values and decompose these values to find the eigenvalues and eigenvectors

In [11]:
# Covariance between the features (the columns of xStand are the variables),
# followed by the eigendecomposition of that matrix.
covMat1 = np.cov(xStand, rowvar=False)
eigenValues, eigenVectors = np.linalg.eig(covMat1)

Now, let's check the eigenvalues: 

In [12]:
# Eigenvalues are shown in the order np.linalg.eig returned them (not sorted).
print(eigenValues)

[ 5.98641626  1.34559587  1.00591093  0.84131011  0.70467454  0.54652567
  0.10793467  0.18802968  0.22236937  0.33057064  0.30421838  0.46839193]


And the eigenvectors: 

In [13]:
# Column i of this matrix is the eigenvector paired with eigenValues[i].
print(eigenVectors)

[[  2.16447105e-01   2.00372933e-01   1.52623519e-01  -6.57735838e-01
   -5.88278228e-01   1.35833663e-01  -1.76507800e-02   6.91594282e-02
   -2.24380469e-02  -1.93655097e-01   2.22339165e-01   5.81948994e-03]
 [ -5.14298479e-02   6.60344281e-01   3.36196928e-02  -3.78806441e-01
    6.06148287e-01  -1.05574514e-01   2.84934924e-03   3.12284380e-02
    9.10218410e-02  -4.00342191e-02  -5.15066761e-02  -1.56812269e-01]
 [  3.29292550e-01   5.59615306e-02  -1.11199233e-01   4.27399041e-03
    1.08087655e-01   4.96004063e-01  -1.59380800e-01   8.72626424e-02
    4.06772084e-02   7.07326775e-01   2.82839744e-01  -5.80265831e-02]
 [ -3.45762494e-01  -1.36473623e-01   7.13202167e-02  -1.71190461e-01
    1.22031337e-02  -2.71169015e-01  -3.30050667e-01  -1.19967267e-01
   -6.34919025e-01   2.21114187e-01   2.50686749e-01  -3.39949795e-01]
 [ -3.77129953e-01   2.96144319e-02  -4.06062368e-02  -1.11437673e-01
   -1.29007859e-01   1.17807013e-01   8.16638279e-01   9.65715638e-02
   -1.16078325e-

To find the principal components, find the eigenpairs, and sort them from highest to lowest. 

In [18]:
# Pair each eigenvalue magnitude with its eigenvector (column i of eigenVectors).
eigenPairs = [(np.abs(eigenValues[i]), eigenVectors[:, i]) for i in range(len(eigenValues))]
# Sort on the eigenvalue only, largest first. A bare tuple sort falls back to
# comparing the ndarray eigenvectors whenever two eigenvalues are equal, and
# that comparison raises "truth value of an array ... is ambiguous".
eigenPairs.sort(key=lambda pair: pair[0], reverse=True)
for pair in eigenPairs:
    print(pair[0])

5.98641626143
1.34559586978
1.00591092861
0.841310113295
0.704674536358
0.546525670617
0.468391933877
0.330570641824
0.304218377388
0.222369372326
0.188029678945
0.107934667498


Now, calculate the explained variance. Recall the methods we learned in lesson 2.2!

In [19]:
# Share of the total variance carried by each eigenvalue, as a percentage,
# ordered from the largest eigenvalue to the smallest.
totalEigen = sum(eigenValues)
varExpl = []
for eigVal in sorted(eigenValues, reverse=True):
    varExpl.append((eigVal / totalEigen) * 100)

In [20]:
# Percent of variance explained per component, largest first.
print(varExpl)

[49.67177285888792, 11.164965729833389, 8.3464592136806353, 6.9806981383280631, 5.84697621762521, 4.5347496376634355, 3.8864416927284582, 2.7428814030625133, 2.5242257606522074, 1.8450906970989365, 1.5601600515885756, 0.89557859885066493]


Now, use the explained variance to calculate the cumulative explained variance.

In [21]:
# Running total of the per-component percentages in varExpl.
cvarex = np.cumsum(varExpl)

In [22]:
# Entry k is the percent of variance covered by the first k+1 components.
print(cvarex)

[  49.67177286   60.83673859   69.1831978    76.16389594   82.01087216
   86.5456218    90.43206349   93.17494489   95.69917065   97.54426135
   99.1044214   100.        ]


Now, conduct the PCA using scikit-learn.

In [23]:
# Fit a five-component PCA on the standardized votes and project the data.
# The cumulative explained variance printed above is about 82% at five components.
PCA_set = PCA(n_components=5)
X = PCA_set.fit_transform(xStand)

# Case #2: Airport Delays

In [24]:
# Load the airport operations table; the path is relative to the notebook's working directory.
airports = pd.read_csv('../assets/datasets/airport_operations.csv')

In [25]:
# Preview the first rows to see which columns are identifiers and which are metrics.
airports.head()

Unnamed: 0,airport,year,departures for metric computation,arrivals for metric computation,percent on-time gate departures,percent on-time airport departures,percent on-time gate arrivals,average_gate_departure_delay,average_taxi_out_time,average taxi out delay,average airport departure delay,average airborne delay,average taxi in delay,average block delay,average gate arrival delay
0,ABQ,2004,53971,53818,0.803,0.7809,0.7921,10.38,9.89,2.43,12.1,2.46,0.83,2.55,10.87
1,ABQ,2005,51829,51877,0.814,0.7922,0.8001,9.6,9.79,2.29,11.2,2.26,0.89,2.34,10.24
2,ABQ,2006,49682,51199,0.7983,0.7756,0.7746,10.84,9.89,2.16,12.33,2.12,0.84,2.66,11.82
3,ABQ,2007,53255,53611,0.8005,0.7704,0.7647,11.29,10.34,2.4,12.95,2.19,1.29,3.06,12.71
4,ABQ,2008,49589,49512,0.8103,0.7844,0.7875,10.79,10.41,2.41,12.32,1.82,1.03,2.79,11.48


First, let's define the x and y variables: Airport is going to be our target variable.

In [26]:
# Positional selection: features from column positions 2-13, target from
# column position 0 (described as the airport in the accompanying text).
# `.iloc` replaces `.ix`, which pandas deprecated and later removed; for these
# integer positions on string-labelled columns the selection is the same.
# NOTE(review): 2:14 keeps 12 columns - confirm the trailing column was meant
# to be excluded.
x2 = airports.iloc[:, 2:14].values
y2 = airports.iloc[:, 0].values

Then, standardize the x variable for analysis

In [27]:
# Standardize the airport features.
# Note: this rebinds xStand, replacing the standardized votes matrix from Case #1.
xStand = StandardScaler().fit_transform(x2)

Next, create the covariance matrix from the standardized x-values and decompose these values to find the eigenvalues and eigenvectors

In [28]:
# Covariance between the airport features (the columns of xStand are the
# variables), followed by the eigendecomposition of that matrix.
covMat = np.cov(xStand, rowvar=False)
eigenValues2, eigenVectors2 = np.linalg.eig(covMat)

Then, check your eigenvalues and eigenvectors:

In [29]:
# Eigenvalues appear in the order np.linalg.eig returned them (not sorted);
# column i of eigenVectors2 is the eigenvector paired with eigenValues2[i].
print(eigenValues2)
print(eigenVectors2)

[  5.71594128e+00   4.10771763e+00   8.46070622e-01   4.92674626e-01
   4.68096266e-01   1.66713004e-01   8.78260266e-02   7.27989129e-02
   5.25813963e-02   2.99299918e-03   1.67049793e-04   7.05899593e-04]
[[ -1.92844661e-01  -3.85272521e-01  -4.00175037e-01  -1.64039585e-01
    1.86296653e-01   2.78156147e-01   7.86539360e-02   3.17041164e-02
    7.52767812e-02  -1.14438376e-02  -7.07864735e-01  -3.44682322e-02]
 [ -1.92353674e-01  -3.85057896e-01  -4.02190449e-01  -1.66802678e-01
    1.84403875e-01   2.78351867e-01   9.04981279e-02   3.77083630e-02
    8.09001581e-02   1.35190021e-02   7.04235896e-01   3.71215770e-02]
 [  2.87689649e-01  -3.33455724e-01   2.51323774e-01   8.13357968e-02
    7.50865080e-03   2.37735910e-02  -5.78177983e-02   1.41563250e-02
    4.95309257e-01  -6.57324120e-01   2.23495764e-02  -2.32792117e-01]
 [  3.80590914e-01  -1.72431188e-01   8.79462334e-02   7.06458258e-02
    2.02464908e-01   3.45123821e-03  -3.15914078e-01   9.30547916e-02
    4.84973886e-01 

To find the principal components, find the eigenpairs, and sort them from highest to lowest. 

In [30]:
# Pair each eigenvalue magnitude with its eigenvector (column i of eigenVectors2).
eigenPairs2 = [(np.abs(eigenValues2[i]), eigenVectors2[:, i]) for i in range(len(eigenValues2))]
# Sort on the eigenvalue only, largest first. A bare tuple sort falls back to
# comparing the ndarray eigenvectors whenever two eigenvalues are equal, and
# that comparison raises "truth value of an array ... is ambiguous".
eigenPairs2.sort(key=lambda pair: pair[0], reverse=True)
for pair in eigenPairs2:
    print(pair[0])

5.71594128131
4.10771763057
0.846070621527
0.492674626426
0.468096266456
0.166713003709
0.0878260265801
0.0727989128724
0.0525813962709
0.0029929991831
0.000705899593354
0.000167049792638


Next, calculate the explained variance and the cumulative explained variance.

In [31]:
# Percentage of total variance per airport eigenvalue (largest first) and the
# running total of those percentages.
# Note: totalEigen and varExpl are rebound here, replacing the Case #1 values.
totalEigen = sum(eigenValues2)
varExpl = []
for eigVal in sorted(eigenValues2, reverse=True):
    varExpl.append((eigVal / totalEigen) * 100)
cumulvarExpl = np.cumsum(varExpl)

In [32]:
# Entry k is the percent of variance covered by the first k+1 components.
print(cumulvarExpl)

[ 47.57620567  81.76648321  88.80868815  92.9094282   96.80559213
  98.19321523  98.92422853  99.53016479  99.96782207  99.99273407
  99.99860957 100.        ]


Finally, conduct the PCA - use the results above to guide your selection of n components

In [34]:
# Fit a three-component PCA on the standardized airport features, project the
# same data onto those components, and display the projection.
airports_pca = PCA(n_components=3).fit(xStand)
X = airports_pca.transform(xStand)

X

array([[-2.08669639,  0.42693771, -0.07114673],
       [-2.44053404,  0.29445167,  0.01200508],
       [-2.03388475,  0.63058527,  0.08050235],
       ..., 
       [ 4.53001134,  6.48010081,  1.17827769],
       [ 3.95447158,  6.4383177 ,  1.92397082],
       [ 4.46838891,  6.34309961,  1.33541883]])