In [None]:
import numpy as np
import pandas
import statsmodels.api as sm

In [4]:
def linear_regression(features, values):
    """Fit an ordinary-least-squares model that includes an intercept term.

    Parameters
    ----------
    features : pandas.DataFrame or array-like
        Explanatory variables, one column per feature. A constant column is
        added with ``sm.add_constant`` before fitting.
    values : pandas.Series or array-like
        Observed response, one entry per row of ``features``.

    Returns
    -------
    intercept
        The first fitted parameter (the coefficient of the added constant).
    params
        The remaining fitted parameters, in the column order of ``features``.
        This is a label-indexed Series when the fit produced one, otherwise
        an array slice.

    Notes
    -----
    NOTE(review): the first parameter is treated as the intercept, which
    relies on ``sm.add_constant`` actually prepending a column. Confirm the
    behaviour when ``features`` already contains a constant column.
    """
    design = sm.add_constant(features)
    results = sm.OLS(values, design).fit()
    fitted = results.params

    # Select by position explicitly. With a label-indexed Series, a bare
    # integer key (`fitted[0]`) is treated as a label lookup by newer pandas
    # and fails when the labels are column names.
    if hasattr(fitted, "iloc"):
        return fitted.iloc[0], fitted.iloc[1:]
    return fitted[0], fitted[1:]


In [23]:
def predictions(intercept, features, params):
    """Return the linear-model output ``intercept + features . params``.

    ``features`` and ``params`` are combined with ``np.dot``; the intercept
    is then added to the result.
    """
    weighted_sum = np.dot(features, params)
    return intercept + weighted_sum

In [16]:
def compute_r_squared(data, predictions):
    """Compute the coefficient of determination of ``predictions`` for ``data``.

    The result is ``1 - SS_res / SS_tot``, where ``SS_res`` is the sum of
    squared differences between ``data`` and ``predictions`` and ``SS_tot``
    is the sum of squared differences between ``data`` and its mean.
    """
    residuals = data - predictions
    deviations = data - np.mean(data)

    ss_residual = (residuals ** 2).sum()
    ss_total = (deviations ** 2).sum()

    return 1 - (ss_residual / ss_total)

In [19]:
################################ MODIFY THIS SECTION #####################################
# Select features. You should modify this section to try different features!             #
# Currently selected: rain, precipi, Hour, meantempi, maxtempi, meanwindspdi,            #
# meanpressurei, and UNIT (as a dummy).                                                  #
# See this page for more info about dummy variables:                                     #
# http://pandas.pydata.org/pandas-docs/stable/generated/pandas.get_dummies.html          #
##########################################################################################
dataframe = pandas.read_csv('3-turnstile_data_master_with_weather.csv')

# Columns taken from the data as-is and used directly as regressors.
feature_columns = ['rain', 'precipi', 'Hour', 'meantempi', 'maxtempi', 'meanwindspdi', 'meanpressurei']
# One indicator column per distinct UNIT value, named 'unit_<value>'.
dummy_units = pandas.get_dummies(dataframe['UNIT'], prefix='unit')
features = dataframe[feature_columns].join(dummy_units)

# Values
values = dataframe['ENTRIESn_hourly']

In [20]:
# Fit the model; linear_regression returns the tuple (intercept, params).
lms = linear_regression(features, values)
# Show everything after the intercept: a 1-tuple holding the fitted coefficients.
lms[1:]

(fog               118.781835
 rain              -13.750008
 precipi            -9.702977
 Hour               67.415458
 meantempi         -36.139719
 maxtempi           27.077071
 meanwindspdi       21.269070
 meanpressurei    -274.503936
 unit_R001        2445.942015
 unit_R002        -615.529645
 unit_R003       -1311.882330
 unit_R004        -990.236894
 unit_R005       -1002.250182
 unit_R006        -930.897654
 unit_R007       -1152.721592
 unit_R008       -1117.904374
 unit_R009       -1188.096638
 unit_R010        3047.584786
 unit_R011        6532.486601
 unit_R012        5965.952543
 unit_R013         982.662916
 unit_R014        2503.689590
 unit_R015         638.930190
 unit_R016        -551.087019
 unit_R017        2727.033321
 unit_R018        4432.891413
 unit_R019        1398.268754
 unit_R020        4998.186777
 unit_R021        2965.743264
 unit_R022        7121.154108
                     ...     
 unit_R450       -1013.876926
 unit_R451        -588.850397
 unit_R452

In [30]:
# Unpack the fit result, which is the pair (intercept, params).
intercept, params = lms
# NOTE: this assignment rebinds the name `predictions`, so after this cell it
# refers to the computed array and no longer to the function of the same name.
predictions = intercept + np.dot(features, params)
# Preview the predictions at positions 1 through 4.
predictions[1:5]

array([ 3588.62346134,  3858.28529493,  4127.94712853,  4397.60896213])

In [32]:
# Score the fit: compare the observed values against the predictions computed above.
r_squared = compute_r_squared(values, predictions)
# Display the resulting R^2 value as the cell output.
r_squared

0.4594291407105473