In [1]:
## Import the required python utilities
from plotly.offline import init_notebook_mode, iplot
import matplotlib.pyplot as plt
import plotly.graph_objs as go
import pandas as pd
import numpy as np

## Import sklearn important modules
from sklearn.decomposition import PCA, SparsePCA, MiniBatchSparsePCA, KernelPCA, IncrementalPCA
from sklearn.decomposition import TruncatedSVD, FastICA, NMF, FactorAnalysis
from sklearn.manifold import TSNE

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.preprocessing import MinMaxScaler
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans

# Initialise plotly's offline notebook mode before any iplot() call in the cells below.
init_notebook_mode(connected=True)

In [2]:
# Read the training set, keep the label aside, and drop the label/identifier
# columns so that `train` holds only the feature columns.
train = pd.read_csv('train.csv')
target = train['target']
train = train.drop(columns=["target", "ID"])

print("Rows: {}, Columns: {}".format(*train.shape))
train.head()

Rows: 4459, Columns: 4991


Unnamed: 0,48df886f9,0deb4b6a8,34b15f335,a8cb14b00,2f0771a37,30347e683,d08d1fbe3,6ee66e115,20aa07010,dc5a8f1d8,...,3ecc09859,9281abeea,8675bec0b,3a13ed79a,f677d4d13,71b203550,137efaa80,fb36b89d9,7e293fbaf,9fc776466
0,0.0,0,0.0,0,0,0,0,0,0.0,0.0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
1,0.0,0,0.0,0,0,0,0,0,2200000.0,0.0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
2,0.0,0,0.0,0,0,0,0,0,0.0,0.0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
3,0.0,0,0.0,0,0,0,0,0,0.0,0.0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
4,0.0,0,0.0,0,0,0,0,0,2000000.0,0.0,...,0.0,0.0,0.0,0,0,0,0,0,0,0


In [3]:
# One row per feature: describe() statistics, with the feature name moved
# out of the index into a regular column called 'columns'.
feature_df = (
    train.describe()
    .T
    .reset_index()
    .rename(columns={'index': 'columns'})
)

# Number of distinct values reported by value_counts() for each feature.
feature_df['distinct_vals'] = [
    len(train[name].value_counts()) for name in feature_df['columns']
]

# Variance of each feature as computed by np.var.
feature_df['column_var'] = [
    np.var(train[name]) for name in feature_df['columns']
]

# Pearson correlation between each feature and the target
# (off-diagonal entry of the 2x2 correlation matrix).
feature_df['target_corr'] = [
    np.corrcoef(target, train[name])[0][1] for name in feature_df['columns']
]
feature_df.head()

Unnamed: 0,columns,count,mean,std,min,25%,50%,75%,max,distinct_vals,column_var,target_corr
0,48df886f9,4459.0,14654.930101,389329.8,0.0,0.0,0.0,0.0,20000000.0,32,151543700000.0,0.010188
1,0deb4b6a8,4459.0,1390.894819,64283.02,0.0,0.0,0.0,0.0,4000000.0,5,4131381000.0,0.013805
2,34b15f335,4459.0,26722.450922,569965.2,0.0,0.0,0.0,0.0,20000000.0,29,324787500000.0,0.014694
3,a8cb14b00,4459.0,4530.163714,235912.4,0.0,0.0,0.0,0.0,14800000.0,3,55642180000.0,-0.002917
4,2f0771a37,4459.0,26409.95739,1514730.0,0.0,0.0,0.0,0.0,100000000.0,6,2293893000000.0,0.016647


In [4]:
# Number of features whose variance is exactly zero
int((feature_df['column_var'].astype(float) == 0.0).sum())

256

In [5]:
# Order the features by variance, then min-max rescale the variance column
# so the smallest value maps to 0 and the largest to 1.
feature_df = feature_df.sort_values('column_var', ascending=True)
feature_df['column_var'] = feature_df['column_var'].pipe(
    lambda v: (v - v.min()) / (v.max() - v.min())
)

# Line plot of the rescaled variance, one point per feature.
trace1 = go.Scatter(
    x=feature_df['columns'],
    y=feature_df['column_var'],
    opacity=0.75,
    marker={'color': "red"},
)
layout = {'height': 400, 'title': 'Feature Variance', 'legend': {'orientation': "h"}}
fig = go.Figure(data=[trace1], layout=layout)
iplot(fig)

# Min-max scaled copy of the feature matrix, used by the decomposition cells.
standardized_train = MinMaxScaler().fit_transform(train.values)

In [6]:
# Histogram of the column_var values at or below 0.01.
trace1 = go.Histogram(
    x=feature_df.loc[feature_df['column_var'] <= 0.01, 'column_var'],
    opacity=0.45,
    marker={'color': "red"},
)
layout = {
    'height': 400,
    'title': 'Distribution of Variable Variance <= 0.01',
    'legend': {'orientation': "h"},
}
fig = go.Figure(data=[trace1], layout=layout)
iplot(fig)

# Histogram of the column_var values above 0.01.
trace1 = go.Histogram(
    x=feature_df.loc[feature_df['column_var'] > 0.01, 'column_var'],
    opacity=0.45,
    marker={'color': "red"},
)
layout = {
    'height': 400,
    'title': 'Distribution of Variable Variance > 0.01',
    'legend': {'orientation': "h"},
}
fig = go.Figure(data=[trace1], layout=layout)
iplot(fig)

In [7]:
# Histogram of each feature's correlation with the target.
trace1 = go.Histogram(
    x=feature_df['target_corr'],
    opacity=0.45,
    marker={'color': "green"},
)
layout = {
    'height': 400,
    'title': 'Distribution of correlation with target',
    'legend': {'orientation': "h"},
}
fig = go.Figure(data=[trace1], layout=layout)
iplot(fig)

In [8]:
# Per-feature mean and the feature-by-feature covariance matrix of the scaled data.
mean_vec = np.mean(standardized_train, 0)
cov_matrix = np.cov(standardized_train.T)

# A covariance matrix is symmetric, so use the symmetric solver: eigh returns
# real eigenvalues/eigenvectors, whereas the general-purpose eig can yield
# complex values with spurious imaginary parts on a large matrix.
# Note: eigh returns the eigenvalues in ascending order; any consumer that
# needs them from largest to smallest must sort them itself.
eig_vals, eig_vecs = np.linalg.eigh(cov_matrix)

In [9]:
# Create a list of (eigenvalue, eigenvector) tuples
eig_pairs = [(np.abs(eig_vals[i]), eig_vecs[:, i]) for i in range(len(eig_vals))]

# Sort the eigenvalue, eigenvector pair from high to low
eig_pairs.sort(key=lambda pair: pair[0], reverse=True)

# The decomposition of a symmetric matrix may still come back as complex
# numbers with negligible imaginary parts. Drop those up front so that the
# sort order and the total are computed on plain real numbers.
eig_vals_real = np.sort(np.real(eig_vals))[::-1]

# Calculation of Explained Variance from the eigenvalues
tot = eig_vals_real.sum()

# Individual explained variance (percent), largest component first
var_exp = [(val / tot) * 100 for val in eig_vals_real]
var_exp_real = [float(v) for v in var_exp]

# Cumulative explained variance (percent)
cum_var_exp = np.cumsum(var_exp)
cum_exp_real = [float(v) for v in cum_var_exp]

## plot the variance and cumulative variance
# The values belong to principal components ranked by eigenvalue, not to the
# original feature columns, so the x-axis is the component rank (1 = largest).
component_rank = list(range(1, len(var_exp_real) + 1))
trace1 = go.Scatter(x=component_rank, y=var_exp_real, name="Individual Variance",
                    opacity=0.75, marker=dict(color="red"))
trace2 = go.Scatter(x=component_rank, y=cum_exp_real, name="Cumulative Variance",
                    opacity=0.75, marker=dict(color="blue"))
layout = dict(height=400, title='Variance Explained by Variables',
              xaxis=dict(title='Principal component (ranked by eigenvalue)'),
              yaxis=dict(title='Explained variance (%)'),
              legend=dict(orientation="h", x=0, y=1.2))
fig = go.Figure(data=[trace1, trace2], layout=layout)
iplot(fig)

In [10]:
def _get_number_components(model, threshold):
    component_variance = model.explained_variance_ratio_
    explained_variance = 0.0
    components = 0

    for var in component_variance:
        explained_variance += var
        components += 1
        if(explained_variance >= threshold):
            break
    return components

### Fit an unrestricted PCA on the scaled data, then count how many leading
### components are needed for the cumulative explained-variance ratio to reach 0.85
pca = PCA()
train_pca = pca.fit_transform(standardized_train)
components = _get_number_components(pca, 0.85)
components

780

In [11]:
def plot_3_components(x_trans, title, labels=None):
    """Show an interactive 3-D scatter of the first three columns of ``x_trans``.

    Parameters
    ----------
    x_trans : 2-D array
        Indexed as ``x_trans[:, k]``; columns 0, 1 and 2 become the x, y and
        z coordinates. Points are coloured by column 1.
    title : str
        Figure title.
    labels : sequence, optional
        Per-point hover text. When omitted, the notebook-level ``target``
        is used, as before.
    """
    hover_text = target if labels is None else labels
    # The trace ``name`` must be a single string; the per-point values belong
    # in ``text`` only, so no ``name`` is set here.
    trace = go.Scatter3d(x=x_trans[:,0], y=x_trans[:,1], z = x_trans[:,2],
                          mode = 'markers', text = hover_text, showlegend = False,
                          marker = dict(size = 8, color=x_trans[:,1], colorscale ='Rainbow',
                          line = dict(width = 1, color = '#fefefe'), opacity = 0.7))
    layout = go.Layout(title = title, showlegend= True)
    fig = dict(data=[trace], layout=layout)
    iplot(fig)

def plot_2_components(x_trans, title, labels=None):
    """Show an interactive 2-D scatter of the first two columns of ``x_trans``.

    Parameters
    ----------
    x_trans : 2-D array
        Indexed as ``x_trans[:, k]``; column 0 is plotted on the x-axis and
        column 1 on the y-axis. Points are coloured by column 1.
    title : str
        Figure title.
    labels : sequence, optional
        Per-point hover text. When omitted, the notebook-level ``target``
        is used, as before.
    """
    hover_text = target if labels is None else labels
    # The trace ``name`` must be a single string; the per-point values belong
    # in ``text`` only, so no ``name`` is set here.
    trace = go.Scatter(x=x_trans[:,0], y=x_trans[:,1], mode='markers',
        text = hover_text, showlegend = False,
        marker = dict(size = 8, color=x_trans[:,1], line = dict(width = 1, color = '#fefefe'), opacity = 0.7))
    layout = go.Layout(title = title, hovermode= 'closest',
        xaxis= dict(title= 'First Component',
            ticklen = 5, zeroline= False, gridwidth= 2),
        yaxis=dict(title= 'Second Component',
            ticklen = 5, gridwidth = 2), showlegend= True)
    fig = dict(data=[trace], layout=layout)
    iplot(fig)

In [12]:
### Implement PCA
# With a truncated number of components the default solver choice may be the
# randomized SVD, whose output depends on the random state. Pin it so the
# projection (and the plots below) are the same on every run.
obj = model = PCA(n_components = components, random_state = 42)
X_pca = obj.fit_transform(standardized_train)

## Visualize the Components
plot_3_components(X_pca, 'PCA - First Three Components (Zoom In to view)')
plot_2_components(X_pca, 'PCA - First Two Components')