In [1]:
# MSDS 422, Section 58, Assignment 5, Alan Kessler
# Python 3.5 on Mac OS 10.13.5 edited in Atom
# Demonstrates use of PCA

import pandas as pd
import numpy as np
import time
import os.path
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import urllib.request as ur

# Random number seed, passed as random_state to the random forest and to both
# PCA fits so repeated runs use the same random state
seed = 527

# DATA PREPARATION

# Download the data from an alternative source.
# mldata.org appears to be down (July 8, 2018)
# Each file is fetched to a temporary name and renamed only once the transfer
# has finished, so an interrupted download is retried on the next run instead
# of leaving a truncated CSV that passes the existence check.
for url, path in (
        ('https://pjreddie.com/media/files/mnist_train.csv', 'train.csv'),
        ('https://pjreddie.com/media/files/mnist_test.csv', 'test.csv')):
    if not os.path.isfile(path):
        partial = path + '.part'
        ur.urlretrieve(url, partial)
        os.replace(partial, path)

# Create variable names to use: the label column followed by 784 zero-padded
# feature names (var001 ... var784)
names = ['label'] + ['var{:03d}'.format(i) for i in range(1, 785)]

# Load the training and testing CSVs, labelling the columns with `names`
train, test = (pd.read_csv(csv_path, names=names)
               for csv_path in ('train.csv', 'test.csv'))

# Separate the features (every column except 'label') from the target and
# expose both as numpy arrays for sklearn
X_train = train.drop('label', axis='columns').values
X_test = test.drop('label', axis='columns').values
y_train = train['label'].values
y_test = test['label'].values

# Results accumulator: one list of values per experiment, keyed by its label
metrics = dict()

# Column labels for the summary table.  The trailing lowercase 'time' column
# holds a clock reading that is used to sort the rows and is dropped before
# the table is saved.
names = ['F1 Score', 'Time', 'Number of Variables', 'time']

# A single random forest configuration shared by every approach
rf = RandomForestClassifier(random_state=seed, max_features='sqrt')

# NOTE: time.clock() was removed in Python 3.8.  All timing below uses
# time.perf_counter(), which measures elapsed time and is monotonic, so its
# readings can also be used to order the result rows.


def _timed_rf_run(features_train, features_test):
    """Fit the shared random forest, predict the test split, and time it.

    Relies on the module-level ``rf``, ``y_train`` and ``y_test``.

    Args:
        features_train: training feature matrix passed to ``rf.fit``.
        features_test: testing feature matrix passed to ``predict``.

    Returns:
        Tuple ``(predictions, score, duration)``: the test predictions, their
        micro-averaged F1 score against ``y_test``, and the elapsed seconds
        spent fitting, predicting and scoring.
    """
    started = time.perf_counter()
    test_predictions = rf.fit(features_train, y_train).predict(features_test)
    f1 = f1_score(y_test, test_predictions, average='micro')
    return test_predictions, f1, time.perf_counter() - started


# RANDOM FOREST WITHOUT PCA

predictions, score, duration = _timed_rf_run(X_train, X_test)

# Store the results; the trailing clock reading is only used to order rows
metrics['RF - No PCA'] = [score, duration, X_train.shape[1],
                          time.perf_counter()]

# PCA TRAINING AND TESTING COMBINED

# Start timer
start = time.perf_counter()

# Concatenate training and testing data
X = np.concatenate([X_train, X_test]).astype(float)

# Scale the data prior to PCA (fits and transforms at once)
X = StandardScaler().fit_transform(X)

# Keep enough components to explain at least 95% of the variance of the data
X = PCA(n_components=0.95, random_state=seed).fit_transform(X)

# Split the components back into training and testing rows.  The boundary is
# taken from X_train instead of hard-coding the 60000/10000 row counts.
# Assumption is that this should go with PCA timing
n_train = X_train.shape[0]
X_train_pca = X[:n_train]
X_test_pca = X[n_train:]

# Record the elapsed time
duration = time.perf_counter() - start

# Capture metrics (f1 score does not apply for PCA)
metrics['PCA'] = [float('NaN'), duration, X_train.shape[1],
                  time.perf_counter()]

# RANDOM FOREST WITH PCA

predictions, score, duration = _timed_rf_run(X_train_pca, X_test_pca)

# Store the results in a dictionary
metrics['RF - PCA'] = [score, duration, X_train_pca.shape[1],
                       time.perf_counter()]

# PCA BASED ON TRAINING

# Start timer
start = time.perf_counter()

# Fit the scaler on the training data only (converted to float once)
X_train_float = X_train.astype(float)
std_scaler = StandardScaler().fit(X_train_float)

# Transform training and testing data based on scaler
X_train_std = std_scaler.transform(X_train_float)
X_test_std = std_scaler.transform(X_test.astype(float))

# Define PCA to explain at least 95% of variance of training data
pca = PCA(n_components=0.95, random_state=seed).fit(X_train_std)

# Generate components on training and testing data
# Assumption is that this should go with PCA timing
X_train_pca = pca.transform(X_train_std)
X_test_pca = pca.transform(X_test_std)

# Record the elapsed time
duration = time.perf_counter() - start

# Capture metrics (f1 score does not apply for PCA)
metrics['PCA - Fixed'] = [float('NaN'), duration, X_train.shape[1],
                          time.perf_counter()]

# RANDOM FOREST WITH PCA (FIXED PCA)

predictions, score, duration = _timed_rf_run(X_train_pca, X_test_pca)

# Store the results in a dictionary
metrics['RF - PCA (Fixed)'] = [score, duration, X_train_pca.shape[1],
                               time.perf_counter()]

# RANDOM FOREST WITH FIRST 350 VARIABLES

predictions, score, duration = _timed_rf_run(X_train[:, :350],
                                             X_test[:, :350])

# Store the results in a dictionary
metrics['RF - First 350'] = [score, duration, 350, time.perf_counter()]

# Convert metrics dictionary to dataframe for display
results_summary = pd.DataFrame.from_dict(metrics, orient='index')
results_summary.columns = names

# Order the rows by when each experiment finished, then drop that helper
# column so only the reported metrics remain
results_summary = (results_summary
                   .sort_values(by=['time'])
                   .drop(['time'], axis=1))
results_summary.to_csv('results_summary.csv')


In [2]:
# Display the summary table (F1 score, time and number of variables per run)
results_summary

Unnamed: 0,F1 Score,Time,Number of Variables
RF - No PCA,0.9477,4.106164,784
PCA,,32.015627,784
RF - PCA,0.8827,15.128917,332
PCA - Fixed,,32.186825,784
RF - PCA (Fixed),0.8834,16.261314,331
RF - First 350,0.8676,2.895794,350
