In [None]:
%matplotlib inline
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
import os

<h1>PCA Cloud Prediction Invocation Template</h1>
<h4>Invoke SageMaker Prediction Service</h4>

In [None]:
import boto3
import re
from sagemaker import get_execution_role
import sagemaker

In [None]:
# Attach to a real-time endpoint by name (assumes it is already deployed — TODO confirm).
# NOTE(review): RealTimePredictor is the SageMaker Python SDK v1 class name;
# confirm the installed SDK version before upgrading the library.
endpoint_name = 'pca-biketrain-v1'
predictor = sagemaker.predictor.RealTimePredictor(endpoint=endpoint_name)

In [None]:
from sagemaker.predictor import csv_serializer, json_deserializer

# Requests are serialized as CSV and responses are deserialized from JSON.
predictor.content_type = 'text/csv'
predictor.serializer = csv_serializer
predictor.deserializer = json_deserializer

In [None]:
# The raw numeric features 'temp', 'atemp', 'humidity' and 'windspeed' are not used directly.
# They are replaced by the components (new features) generated by PCA, which are
# appended to this list further down and used for model training and testing.
columns = ['count', 'season', 'holiday', 'workingday', 'weather','year', 'month', 'day', 'dayofweek','hour']

# Numeric columns that are sent to the PCA endpoint and then dropped.
# (The variable name keeps its original spelling because other cells reference it.)
colums_for_pca = ['temp','atemp','humidity','windspeed']

In [None]:
# Load the training and test frames
# (file names suggest the numeric features are already normalized — TODO confirm).
df = pd.read_csv('train_normalized.csv')
df_test = pd.read_csv('test_normalized.csv')

In [None]:
# Preview the first two rows of the training frame.
df.head(2)

In [None]:
# Preview the first two rows of the test frame.
df_test.head(2)

In [None]:
# Preview only the columns that will be sent to the PCA endpoint.
df[colums_for_pca].head()

In [None]:
# Small sample for a smoke test of the endpoint: the first rows of the PCA
# input columns as a NumPy array (.values is used in place of the older
# DataFrame.as_matrix() call).
test = df[colums_for_pca].head().values

In [None]:
# Invoke the endpoint on the sample rows.
result = predictor.predict(test)

In [None]:
# Inspect the raw (deserialized) response.
result

In [None]:
# Pull the 'projection' value out of every entry in the response's 'projections' list.
l = [values['projection'] for values in result['projections']]

In [None]:
# Inspect the extracted projections.
l

In [None]:
# Tabulate the projections: one row per entry of the list.
df_temp = pd.DataFrame(l)

In [None]:
# Display the projections as a table.
df_temp

In [None]:
# For large number of predictions, we can split the input data and
# Query the prediction service.
# array_split is convenient to specify how many splits are needed
def get_projection(arr_features, n_splits=100, verbose=True):
    """Project feature rows through the prediction endpoint in batches.

    The input is divided with ``np.array_split`` and every non-empty piece is
    sent to the notebook-level ``predictor`` in its own request, so a large
    input is never posted in a single call.

    Parameters
    ----------
    arr_features : array-like
        Rows to project; they are split along the first axis.
    n_splits : int, optional
        Number of pieces requested from ``np.array_split``. Defaults to 100,
        the value that used to be hard-coded.
    verbose : bool, optional
        When True (the default, matching the previous behavior) the shape of
        each batch is printed before it is sent.

    Returns
    -------
    list
        The ``'projection'`` value of every entry found in each response's
        ``'projections'`` list, concatenated in the order the batches were sent.
    """
    projections = []
    for arr in np.array_split(arr_features, n_splits):
        # array_split yields empty pieces when there are fewer rows than
        # splits; there is nothing to send for those.
        if arr.shape[0] == 0:
            continue
        if verbose:
            print (arr.shape)
        result = predictor.predict(arr)
        projections.extend(values['projection'] for values in result['projections'])
    return projections
        

In [None]:
def replace_features(predictor, df, colums_for_pca):
    """Replace the PCA input columns of ``df`` with projected components, in place.

    The values of ``colums_for_pca`` are passed to ``get_projection``; the
    result is added to ``df`` as columns ``component_0``, ``component_1``, ...
    and the original ``colums_for_pca`` columns are then dropped.

    Parameters
    ----------
    predictor :
        Kept for interface compatibility. This function does not use it
        directly; ``get_projection`` is called with the feature array only.
    df : pandas.DataFrame
        Frame to modify in place.
    colums_for_pca : list of str
        Names of the columns to project and then remove.

    Returns
    -------
    list of str
        Names of the component columns that were added.

    Raises
    ------
    ValueError
        If the number of projections differs from the number of rows in ``df``.
    """
    arr_features = df[colums_for_pca].values

    projections = get_projection(arr_features)
    df_projection = pd.DataFrame(projections)

    if len(df_projection) != len(df):
        raise ValueError(
            'expected %d projections, got %d' % (len(df), len(df_projection)))

    # New column names
    tcols = ['component_' + str(i) for i in range(df_projection.shape[1])]
    df_projection.columns = tcols
    print ('components:',tcols)

    # Assign by position, not by index label: df_projection always carries a
    # fresh 0..n-1 index, so a label-aligned assignment would produce NaN or
    # misplaced values whenever df's index is anything else (e.g. shuffled).
    for col in tcols:
        df[col] = df_projection[col].values

    df.drop(colums_for_pca, inplace=True, axis=1)

    return tcols

In [None]:
# Training frame as it looks before the PCA input columns are replaced.
df.head(2)

In [None]:
# Replace the PCA input columns of the training frame with endpoint-generated
# components. df is modified in place; the returned component names are kept.
new_cols = replace_features(predictor,df,colums_for_pca)

In [None]:
# Same replacement for the test frame (modified in place); the returned
# component names are only displayed.
replace_features(predictor,df_test,colums_for_pca)

In [None]:
# Add the component names to the output column list. The membership check
# keeps this cell safe to re-run: without it every extra execution would
# append the same names again and duplicate columns in the written files.
for col in new_cols:
    if col not in columns:
        columns.append(col)

In [None]:
# Column order used when writing the training and validation files.
columns

In [None]:
## Training, Validation and Test Set
### Target Variable as first column followed by input features
### Training, Validation files do not have a column header

In [None]:
# Training = 70% of the data
# Validation = 30% of the data
# Randomize the dataset; the fixed seed makes the shuffle reproducible.
np.random.seed(5)
# Shuffle row positions rather than index labels: .iloc is positional, so
# feeding it labels is only correct for a default 0..n-1 index. For that
# default index this produces exactly the same ordering as before.
l = list(range(len(df)))
np.random.shuffle(l)
df = df.iloc[l]

In [None]:
rows = df.shape[0]
train = int(.7 * rows)
# The validation file receives every row after the training slice, so its size
# is the remainder. int(.3 * rows) can be one short because both products are
# truncated independently.
test = rows - train

In [None]:
# Show the total row count, the training-set size and the value of `test`.
rows, train, test

In [None]:
# Write Training Set
df[:train].to_csv('bike_train_pca.csv'
                          ,index=False,header=False
                          ,columns=columns)

In [None]:
# Write Validation Set
df[train:].to_csv('bike_validation_pca.csv'
                          ,index=False,header=False
                          ,columns=columns)

In [None]:
# Test Data has only input features
df_test.to_csv('bike_test_pca.csv',index=False)

In [None]:
# Write Column List
with open('bike_train_column_list_pca.txt','w') as f:
    f.write(','.join(columns))