# Principal Component Analysis (PCA) Transformation and Data Visualization
PCA offers the possibility of reducing higher dimensionality into 

In [1]:
#plotly imports 
import plotly as py 
import plotly.graph_objs as go
from plotly import __version__

#use this format for working locally 
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot, plot_mpl
# Switch plotly's offline notebook rendering on (connected mode).
init_notebook_mode(connected=True)
# Record which plotly release produced the figures in this notebook.
print('Plotly version: %s' % __version__)


#Data Containers & Essential Libraries
import data_prep as dp
import pandas as pd
import numpy as np
import pickle as pkl

# Alternate visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline


from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA

Plotly version: 3.3.0


In [2]:
# Load the feature table and labels through the project's data_prep helper.
data, target = dp.load_agg_Xy(
    path='./data/',
    strat_cat='attack_cat',
    sample_size=0.05,
)

# Restore the pickled column transformer.
# NOTE(review): pickle.load can execute arbitrary code -- only use this on a
# pickle file that comes from a trusted source.
with open('ct_ohe_ssc_xyagg.pkl', 'rb') as pkl_file:
    col_trans = pkl.load(pkl_file)


Columns (1,3,47) have mixed types. Specify dtype option on import or set low_memory=False.


Columns (3,39,47) have mixed types. Specify dtype option on import or set low_memory=False.



In [3]:
# Column transformer followed by a 3-component PCA, wrapped in one pipeline.
pca = PCA(n_components=3, random_state=0)
pipeline_steps = [
    ('col_transform', col_trans),
    ('pca', pca),
]
ct_pca = Pipeline(pipeline_steps)

# Fit both steps on the loaded data and keep the 3-column projection.
X_pca = ct_pca.fit_transform(data)

# Cell output: fraction of variance explained by each retained component.
fitted_pca = ct_pca.named_steps['pca']
fitted_pca.explained_variance_ratio_


Data with input dtype int64, float64 were all converted to float64 by StandardScaler.


Data with input dtype int64, float64 were all converted to float64 by StandardScaler.



array([0.25511532, 0.12779854, 0.09251875])

In [8]:
# Subsample the PCA projection separately for attack and normal rows; the
# resulting arrays are what the 3-D scatter plots.
SAMPLE_SIZE = 2000                      # upper bound on plotted points per class
sample_rng = np.random.RandomState(0)   # dedicated seeded generator -> same subsample every run

# Boolean mask: True for every row whose label is not 'normal'.
atk_msk = (target != 'normal')
attack_rows = np.sum(atk_msk)
norm_rows = X_pca.shape[0] - attack_rows

# Sampling without replacement cannot draw more rows than a class contains,
# so cap the request at the class size instead of raising ValueError.
atk_idx = sample_rng.choice(attack_rows, size=min(SAMPLE_SIZE, attack_rows), replace=False)
norm_idx = sample_rng.choice(norm_rows, size=min(SAMPLE_SIZE, norm_rows), replace=False)

atk_dat = X_pca[atk_msk][atk_idx]
norm_dat = X_pca[~atk_msk][norm_idx]

In [9]:
# Graphing 3-D reduced data
# Marker styles: attack points are orange and mostly transparent, normal
# traffic is grey and more opaque.
marker_pos = dict(size=3, symbol='circle',
                  color='rgb(255, 127, 14)',
                  opacity=0.3)
marker_neg = dict(size=3, symbol='circle',
                  color='rgb(127, 127, 127)',
                  opacity=0.8)

trace1 = go.Scatter3d(x=atk_dat[:, 0], y=atk_dat[:, 1], z=atk_dat[:, 2],
                      mode='markers',
                      marker=marker_pos,
                      name='Cyberattack examples')

trace2 = go.Scatter3d(x=norm_dat[:, 0], y=norm_dat[:, 1], z=norm_dat[:, 2],
                      mode='markers',
                      marker=marker_neg,
                      name='Normal Traffic')

# Kept in `traces`, not `data`: `data` is the dataset loaded earlier in the
# notebook and the PCA cell must still be able to re-run after this one.
traces = [trace1, trace2]

axis_font = dict(size=18)
layout = go.Layout(
    # Top margin leaves room for the figure title.
    margin=dict(l=0, r=0, b=0, t=40),
    title='Network Traffic Data Reduced to 3 Components',
    # Scatter3d traces are drawn in a 3-D scene, so their axis titles are
    # configured under `scene`; top-level xaxis/yaxis only affect 2-D plots.
    scene=dict(
        xaxis=dict(title='PC1', titlefont=axis_font),
        yaxis=dict(title='PC2', titlefont=axis_font),
        zaxis=dict(title='PC3', titlefont=axis_font),
    ),
)
fig = go.Figure(data=traces, layout=layout)
py.offline.iplot(fig, filename='simple-3d-scatter')