In [1]:
# source: https://www.kaggle.com/tilii7/dimensionality-reduction-pca-tsne

In [2]:
import warnings
with warnings.catch_warnings():
    warnings.filterwarnings('ignore')
    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt
    %matplotlib inline
    import matplotlib.cm as cm
    from sklearn.decomposition import PCA

In [3]:
print('\nLoading files ...')
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
X = train.drop(['id', 'target'], axis=1).values
y = train['target'].values.astype(np.int8)
target_names = np.unique(y)
print('\nThere are %d unique target valuess in this dataset:' % (len(target_names)), target_names)


Loading files ...
('\nThere are 2 unique target valuess in this dataset:', array([0, 1], dtype=int8))


Principal Component Analysis identifies the combination of components (directions in the feature space) that account for the most variance in the data. This is useful for visualizing high dimensional data using only components that account for most of the variance in data.

In [4]:
from sklearn.preprocessing import MinMaxScaler

# Creating dummy variables for categorical data

def scale_data(X, scaler=None):
    if not scaler:
        scaler = MinMaxScaler(feature_range=(-1, 1))
        scaler.fit(X)
    X = scaler.transform(X)
    return X, scaler

X = train.drop(['id', 'target'], axis=1)
test.drop(['id'], axis=1, inplace=True)
n_train = X.shape[0]
train_test = pd.concat((X, test)).reset_index(drop=True)
col_to_drop = X.columns[X.columns.str.endswith('_cat')]
col_to_dummify = X.columns[X.columns.str.endswith('_cat')].astype(str).tolist()

for col in col_to_dummify:
    dummy = pd.get_dummies(train_test[col].astype('category'))
    columns = dummy.columns.astype(str).tolist()
    columns = [col + '_' + w for w in columns]
    dummy.columns = columns
    train_test = pd.concat((train_test, dummy), axis=1)

train_test.drop(col_to_dummify, axis=1, inplace=True)

# Scaling the data

train_test_scaled, scaler = scale_data(train_test)
X = np.array(train_test_scaled[:n_train, :])
test = np.array(train_test_scaled[n_train:, :])
print('\n Shape of processed train data:', X.shape)
print(' Shape of processed test data:', test.shape)

('\n Shape of processed train data:', (595212L, 227L))
(' Shape of processed test data:', (892816L, 227L))


In [5]:
# Doing PCA
n_comp = 20
print('\nRunning PCA again ...')
pca = PCA(n_components=n_comp, svd_solver='full', random_state=1001)
X_pca = pca.fit_transform(X)
print('Explained variance: %.4f' % pca.explained_variance_ratio_.sum())

print('Individual variance contributions:')
for j in range(n_comp):
    print(pca.explained_variance_ratio_[j])

plt.figure(1, figsize=(10, 10))

for color, i, target_name in zip(colors, [0, 1], target_names):
    plt.scatter(X_pca[y == i, 0], X_pca[y == i, 1], color=color, s=1,
                alpha=.8, label=target_name, marker='.')
plt.legend(loc='best', shadow=False, scatterpoints=3)
plt.title(
        "Scatter plot of the training data projected on the 1st "
        "and 2nd principal components")
plt.xlabel("Principal axis 1 - Explains %.1f %% of the variance" % (
        pca.explained_variance_ratio_[0] * 100.0))
plt.ylabel("Principal axis 2 - Explains %.1f %% of the variance" % (
        pca.explained_variance_ratio_[1] * 100.0))

plt.show()


Running PCA again ...
Explained variance: 0.6231
Individual variance contributions:
0.074084070101
0.0618868203631
0.0559539511796
0.0427607611418
0.0359086048879
0.0345707250307
0.0314658500583
0.0285582078293
0.0255109028989
0.0252506880295
0.0245827886253
0.0241400322075
0.0238642769192
0.0232114066023
0.021035114187
0.0209135028964
0.020064977449
0.0189478800073
0.0162507118409
0.014169058539


NameError: name 'colors' is not defined

<matplotlib.figure.Figure at 0x1b183d30>

It is visible from this plot that there are very few clearly defined clusters of 1s and there is no clear separation
between 0s and 1s.