The purpose of this notebook is to apply PCA/ICA to the verbose (large) data collection for the singleton weaselball case. The results will tell us which data to feed into the Markov chain.

In [21]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier

## Get rows to skip (Downsample)

In [22]:
import csv
# Count the lines of the raw CSV by streaming over the file, so the whole
# file is never held in memory and the handle is closed once counting is done.
with open('../../data/10-07-2018_09-47-33_long.csv') as data_file:
    row_count = sum(1 for _ in data_file)
row_count

1134717

In [24]:
# Downsampling factor: only every SAMPLE-th row of the file is kept.
SAMPLE = 250

# There appears to be something wrong with the data collection where extra rows
# are given for no reason, so the first two thirds of the file are clipped out.
first_kept_row = int((2.0 / 3.0) * row_count)

# Row numbers for read_csv to skip: every row in the clipped region plus every
# row that is not a multiple of SAMPLE. The range runs 100 rows past row_count,
# as in the original cell. Building the list in one pass lists each row number
# exactly once instead of appending the clipped rows a second time.
skip_rows = [
    i
    for i in range(row_count + 100)
    if i < first_kept_row or i % SAMPLE != 0
]

## Import data

In [117]:
# Load the raw CSV, leaving out every row number listed in skip_rows.
# header=None: no line is treated as a header, so columns get integer labels.
df = pd.read_csv(
    '../../data/10-07-2018_09-47-33_long.csv',
    sep=',',
    header=None,
    index_col=False,
    skiprows=skip_rows,
)
df.shape

(8277904, 40)

In [118]:
# Preview the first rows of the freshly loaded frame.
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,30,31,32,33,34,35,36,37,38,39
0,8209 267000000,48.0,0.0,0.348368,0.032544,2.2545,0.338572,0.027191,0.041,-0.885893,...,-3.0472,-4.37102,4.07534,21.4933,61.6821,-129.158,1.0,1.0,0.0,
1,8209 517000000,48.0,0.0,0.304632,-0.002705,2.2502,0.298493,-0.011255,0.041,-0.90101,...,-0.409053,-5.47509,2.79871,-18.0717,-54.975,-2.41614,1.0,1.0,0.0,
2,8209 767000000,48.0,0.0,0.296372,-0.097195,2.27751,0.291532,-0.106351,0.041,1.13975,...,-3.32071,-10.3622,4.82058,13.6629,-55.8134,-2.61094,1.0,1.0,0.0,
3,8210 17000000,48.0,0.0,0.259804,-0.236097,2.24678,0.249329,-0.240273,0.041,-1.15415,...,-4.77342,-13.5122,3.49727,7.17984,-17.301,-25.6303,1.0,1.0,0.0,
4,8210 267000000,48.0,0.0,0.254483,-0.355668,2.24703,0.250686,-0.365134,0.041,0.826869,...,-3.76355,-6.80746,2.19545,30.6324,18.8031,-35.7307,1.0,1.0,0.0,


In [119]:
# Remove the columns that are not used in the analysis.
# NOTE(review): judging by the preview of the loaded frame, these look like a
# timestamp (0), constant values (1, 36-38) and an empty trailing column (39)
# -- confirm against the data format.
useless_columns = [0, 1, 36, 37, 38, 39]
df = df.drop(columns=useless_columns)
df.head()

Unnamed: 0,2,3,4,5,6,7,8,9,10,11,...,26,27,28,29,30,31,32,33,34,35
0,0.0,0.348368,0.032544,2.2545,0.338572,0.027191,0.041,-0.885893,-0.885893,0.945808,...,0.067895,3.56425,-4.09517,-1.56143,-3.0472,-4.37102,4.07534,21.4933,61.6821,-129.158
1,0.0,0.304632,-0.002705,2.2502,0.298493,-0.011255,0.041,-0.90101,-0.90101,-2.32247,...,0.184853,0.55579,-0.317537,1.85807,-0.409053,-5.47509,2.79871,-18.0717,-54.975,-2.41614
2,0.0,0.296372,-0.097195,2.27751,0.291532,-0.106351,0.041,1.13975,1.13975,-2.6907,...,-0.410996,0.794314,0.49204,-1.68216,-3.32071,-10.3622,4.82058,13.6629,-55.8134,-2.61094
3,0.0,0.259804,-0.236097,2.24678,0.249329,-0.240273,0.041,-1.15415,-1.15415,2.84016,...,0.580768,-0.398561,-0.81468,0.392415,-4.77342,-13.5122,3.49727,7.17984,-17.301,-25.6303
4,0.0,0.254483,-0.355668,2.24703,0.250686,-0.365134,0.041,0.826869,0.826869,-2.99834,...,-0.301732,0.619249,1.27,1.28842,-3.76355,-6.80746,2.19545,30.6324,18.8031,-35.7307


In [120]:
# Drop the final row of the frame, selected by its index label.
# NOTE(review): the reason is not recorded here -- presumably the last line of
# the CSV is incomplete; confirm.
df = df.drop(index=df.index[-1])

In [121]:
# Find NaNs
def nans(df):
    """Return the rows of ``df`` that contain at least one null value."""
    has_null = df.isna().any(axis=1)
    return df[has_null]


nans(df)

Unnamed: 0,2,3,4,5,6,7,8,9,10,11,...,26,27,28,29,30,31,32,33,34,35


In [122]:
# Mean-center the data: subtract each column's mean from that column.
column_means = df.mean()
df = df - column_means
df.head()

Unnamed: 0,2,3,4,5,6,7,8,9,10,11,...,26,27,28,29,30,31,32,33,34,35
0,0.0,0.352125,0.049922,2.343159,0.342194,0.04457,-6.786441e-07,-0.882409,-0.882409,0.940654,...,0.067911,3.567122,-4.17272,-1.560504,-2.981006,1.475069,4.06956,21.423727,65.107146,-129.088527
1,0.0,0.308389,0.014672,2.338859,0.302115,0.006124,-6.786441e-07,-0.897526,-0.897526,-2.327624,...,0.18487,0.558662,-0.395087,1.858996,-0.342859,0.370999,2.79293,-18.141273,-51.549954,-2.346667
2,0.0,0.300129,-0.079817,2.366169,0.295154,-0.088972,-6.786441e-07,1.143234,1.143234,-2.695854,...,-0.410979,0.797186,0.41449,-1.681234,-3.254516,-4.516111,4.8148,13.593327,-52.388354,-2.541467
3,0.0,0.263561,-0.21872,2.335439,0.252951,-0.222894,-6.786441e-07,-1.150666,-1.150666,2.835006,...,0.580785,-0.395689,-0.89223,0.393341,-4.707226,-7.666111,3.49149,7.110267,-13.875954,-25.560827
4,0.0,0.25824,-0.338291,2.335689,0.254308,-0.347755,-6.786441e-07,0.830353,0.830353,-3.003494,...,-0.301715,0.622121,1.19245,1.289346,-3.697356,-0.961371,2.18967,30.562827,22.228146,-35.661227


## PCA

In [115]:
from sklearn.decomposition import PCA

In [123]:
# Fit a PCA with default settings on the mean-centered frame.
# NOTE(review): the recorded run of this cell ended in a MemoryError. The frame
# loaded above reports 8,277,904 rows, more than the 1,134,717 lines counted in
# the file, so the downsampling may not have been applied as intended -- confirm
# the size of df before re-running.
pca = PCA().fit(df)
pca

MemoryError: 

In [None]:
# Plot the singular value of each principal component, in component order.
fig, ax = plt.subplots(figsize=(8, 8))
ax.plot(pca.singular_values_)
ax.grid()
ax.set_title('PCA of Data')

It can be seen that the first 5 components should be sufficient to represent our data.

In [None]:
# Number of leading components to summarize.
N = 6

# Share of the summed singular values that the first N components account for.
# NOTE(review): this is a ratio of singular values, not of explained variance
# (which is based on squared singular values, see pca.explained_variance_ratio_)
# -- confirm which measure is intended.
singular_values = pca.singular_values_
top_share = singular_values[:N].sum() / singular_values.sum()
print("The top {} components make up {}".format(N, top_share))