In [2]:
import numpy as np
import pandas

def return_dataframe(filename):
    df = pandas.read_csv(filename)
    return df
    

In [13]:
def normalize_features(df):
    """
    Normalize the features in the data set.
    """
    mu = df.mean()
    sigma = df.std()
    
    if (sigma == 0).any():
        raise Exception("One or more features had the same value for all samples, and thus could " + \
                         "not be normalized. Please do not include features with only a single value " + \
                         "in your model.")
    df_normalized = (df - df.mean()) / df.std()

    return df_normalized, mu, sigma


In [4]:
def compute_cost(features, values, theta):
    """
    Compute the cost function given a set of features / values, 
    and the values for our thetas.
    """
    
    m = len(values)
    
    # the actual formula: subracting the infromation from the predicted to the actual value
    sum_of_square_errors = np.square(np.dot(features, theta) - values).sum()
    cost = sum_of_square_errors / (2*m)

    return cost

In [5]:
def gradient_descent(features, values, theta, alpha, num_iterations):
    """
    Perform gradient descent given a data set with an arbitrary number of features.
    """
    
    m = len(values)
    cost_history = []

    for i in range(num_iterations):
        pred_values = np.dot(features, theta)
        theta = theta - alpha/m * np.dot((pred_values - values), features)
        cost = compute_cost(features, values, theta)
        cost_history.append(cost)
    
    return theta, pandas.Series(cost_history)

In [16]:
def calculate_r_squared(data, prediction):
    SSE = ((data - predictions)**2).sum()
    SSTO = ((data - np.mean(data))**2).sum()
    r_squared = 1 - (SSE/SSTO)
    return r_squared


In [19]:
from ggplot import *

"""
In this question, you need to:
1) implement the compute_cost() and gradient_descent() procedures
2) Select features (in the predictions procedure) and make predictions.

"""

def predictions(filename):
    '''
    The NYC turnstile data is stored in a pandas dataframe called weather_turnstile.
    Using the information stored in the dataframe, let's predict the ridership of
    the NYC subway using linear regression with gradient descent.
        
    Your prediction should have a R^2 value of 0.40 or better.
    You need to experiment using various input features contained in the dataframe. 
    We recommend that you don't use the EXITSn_hourly feature as an input to the 
    linear model because we cannot use it as a predictor: we cannot use exits 
    counts as a way to predict entry counts. 
    '''
    df = return_dataframe(filename)
    
    # Select Features (try different features!)
    features = df[['rain', 'precipi', 'Hour', 'meantempi']]
    
    # Add UNIT to features using dummy variables
    dummy_units = pandas.get_dummies(df['UNIT'], prefix='unit')
    features = features.join(dummy_units)
    
    # Values
    values = df['ENTRIESn_hourly']
    m = len(values)

    features, mu, sigma = normalize_features(features)
    features['ones'] = np.ones(m) # Add a column of 1s (y intercept)
    
    # Convert features and values to numpy arrays
    features_array = np.array(features)
    values_array = np.array(values)

    # Set values for alpha, number of iterations.
    alpha = 0.1 
    num_iterations = 75 

    # Initialize theta, perform gradient descent
    theta_gradient_descent = np.zeros(len(features.columns))
    theta_gradient_descent, cost_history = gradient_descent(features_array, 
                                                            values_array, 
                                                            theta_gradient_descent, 
                                                            alpha, 
                                                            num_iterations)
        
    predictions = np.dot(features_array, theta_gradient_descent)

predictions('turnstile_data_master_with_weather.csv')


0            0.0
1          217.0
2          890.0
3         2451.0
4         4400.0
5         3372.0
6            0.0
7           42.0
8           50.0
9          316.0
10         633.0
11         639.0
12           0.0
13           0.0
14           0.0
15           0.0
16           0.0
17           0.0
18           0.0
19           0.0
20           0.0
21           0.0
22           0.0
23           0.0
24           0.0
25           0.0
26           1.0
27           0.0
28           0.0
29           0.0
           ...  
131921       6.0
131922     192.0
131923     350.0
131924      25.0
131925       0.0
131926      30.0
131927     208.0
131928     107.0
131929     139.0
131930     331.0
131931     275.0
131932     133.0
131933       7.0
131934     128.0
131935     142.0
131936       0.0
131937     248.0
131938      62.0
131939     116.0
131940      68.0
131941       7.0
131942      80.0
131943     195.0
131944      18.0
131945       0.0
131946      19.0
131947     158.0
131948      54