In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
import glob
import itertools
import sklearn
from sklearn.model_selection import train_test_split
import datetime as dt
from sklearn.linear_model import LinearRegression, Ridge, LogisticRegression
from sklearn import metrics
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor

In [14]:
divvy_agg = pd.read_csv('divvy_agg_data.csv')

In [16]:
divvy_agg.head()

Unnamed: 0,station_id,date,hour,trips_departed,trips_arrived,Air Temperature,Wet Bulb Temperature,Humidity,Rain Intensity,Interval Rain,...,Solar Radiation,Heading,Battery Life,day_of_week,month,day_of_month,latitude,longitude,dpcapacity,online_date
0,2,2016-01-01,0,0.0,0.0,-2.923333,-4.2,66.0,0.0,0.0,...,2.666667,356.0,13.066667,4.0,1.0,1.0,41.872638,-87.623979,35,2015-05-08
1,2,2016-01-01,1,0.0,0.0,-3.72,-4.8,67.333333,0.0,0.0,...,3.0,356.0,12.9,4.0,1.0,1.0,41.872638,-87.623979,35,2015-05-08
2,2,2016-01-01,2,0.0,0.0,-4.566667,-5.6,65.0,0.0,0.0,...,3.0,356.5,13.066667,4.0,1.0,1.0,41.872638,-87.623979,35,2015-05-08
3,2,2016-01-01,3,0.0,0.0,-5.393333,-6.35,67.0,0.0,0.0,...,3.0,356.0,12.866667,4.0,1.0,1.0,41.872638,-87.623979,35,2015-05-08
4,2,2016-01-01,4,0.0,0.0,-6.066667,-6.9,69.0,0.0,0.0,...,3.0,356.0,12.933333,4.0,1.0,1.0,41.872638,-87.623979,35,2015-05-08


## Creating Time based Indicators
We first create time based features that can be used to calculate statistics on station activity:

    1) trips/sma = Number of Trips(departures or arrivals) over the average number of departures 
    over the last N hours
    2) Momentum = % change in departures or arrivals between hours (in this case last hour and prior hour)
    3) ema = exponential moving average = average number of departures or arrivals in the 
    last N hours but weighted more heavily towards the most recent hours

In [17]:
def trips_over_sma(trips, n):
    sma = trips.rolling(window=n).mean()
    trips_sma = trips.shift(1)/sma - 1
    return trips_sma

def momentum(trips, n):
    mom = trips/trips.shift(n) - 1
    return mom

def ema(trips, n):
    return trips.ewm(span=n).mean()

## Deriving the Predictor features
These in turn are then used to derive features for both departures and arrivals.  I also include bike changes (departures - arrivals)
along with previous hour weather data as they could potentially be predictive of departures and arrivals. Finally a few
indicators that measure departures and arrivals as a percentage of total capacity are also calculated to normalize for
big and small stations.

In [18]:
def create_time_features(divvy_agg, start, finish):
    stations = divvy_agg['station_id'].unique()
    indicators_data = pd.DataFrame()
    print ('Feature Engineering Stations Completed:')
    
    for station in stations[start:finish]:
        print(','+ str(station), end='')
        
        data = divvy_agg
        data.is_copy = False
        data = data.loc[data['station_id'] == station,:]
        
        ##first creating weather features
        data['temp_last_hour'] = data['Air Temperature'].shift(1)
        data['interval_rain_last_hour'] = data['Interval Rain'].shift(1)
        data['humidity_last_hour'] = data['Humidity'].shift(1)
        data['rain_intensity_last_hour'] = data['Rain Intensity'].shift(1)
        
        
        ##now creating features based on station activity
        data['departed_trips_sma'] = trips_over_sma(data['trips_departed'].shift(1), 4)
        data['departed_momentum'] = momentum(data['trips_departed'].shift(1), 2)
        data['departed_ema_n'] = ema(data['trips_departed'].shift(1), 4)
        data['departed_last_hour'] = data['trips_departed'].shift(1)
        data['departed_last_hour_yesterday'] = data['trips_departed'].shift(24)

        data['arrived_trips_sma'] = trips_over_sma(data['trips_arrived'].shift(1), 4)
        data['arrived_momentum'] = momentum(data['trips_arrived'].shift(1), 2)
        data['arrived_ema_n'] = ema(data['trips_arrived'].shift(1), 4)
        data['arrived_last_hour'] = data['trips_arrived'].shift(1)
        data['arrived_last_hour_yesterday'] = data['trips_arrived'].shift(24)

        
        data['net_bike_change'] = data['trips_arrived'] - data['trips_departed']
        data['bike_change_momentum'] = momentum(data['net_bike_change'].shift(1), 2)
        data['bike_change_ema_n'] = ema(data['net_bike_change'].shift(1), 3)
        data['bike_change_last_hour'] = data['net_bike_change'].shift(1)
        
        data['departures_over_capacity'] = data['departed_last_hour']/data['dpcapacity']
        data['arrivals_over_capacity'] = data['arrived_last_hour']/data['dpcapacity']
        data['bike_change_over_capacity'] = data['bike_change_last_hour']/data['dpcapacity']

            
        indicators_data = indicators_data.append(data, ignore_index=True)
        indicators_data['departed_momentum'].replace(np.inf, 1, inplace=True)
        indicators_data['arrived_momentum'].replace(np.inf, 1, inplace=True)
        indicators_data['bike_change_momentum'].replace(np.inf, 1, inplace=True)
        indicators_data['departed_trips_sma'].replace(np.inf, 1, inplace=True)
        indicators_data['arrived_trips_sma'].replace(np.inf, 1, inplace=True)
                                
    return indicators_data

In [19]:
data_with_time_features_1 = create_time_features(divvy_agg,0,100)
data_with_time_features_2 = create_time_features(divvy_agg,100,200)
data_with_time_features_3 = create_time_features(divvy_agg,200,300)
data_with_time_features_4 = create_time_features(divvy_agg,300,400)
data_with_time_features_5 = create_time_features(divvy_agg,400,500)
data_with_time_features_6 = create_time_features(divvy_agg,500,581)

,2

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/s

,3,4,5,6,7,9,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,66,67,68,69,71,72,73,74,75,76,77,80,81,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,140,141,142,143,144,145,146,147,148,149,150,152,153,154,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,188,190,191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209,210,211,212,213,214,215,216,217,218,219,220,222,223,224,225,226,227,228,229,230,231,232,233,234,236,237,238,239,240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255,256,257,258,259,260,261,262,263,264,265,267,268,270,271,272,273,274,275,276,277,278,279,280,281,282,283,284,285,286,287,288,289,290,291,292,293,294,295,296,

In [20]:
data_with_time_features = data_with_time_features_1.append(data_with_time_features_2, ignore_index=True)
data_with_time_features = data_with_time_features.append(data_with_time_features_3, ignore_index=True)
data_with_time_features = data_with_time_features.append(data_with_time_features_4, ignore_index=True)
data_with_time_features = data_with_time_features.append(data_with_time_features_5, ignore_index=True)
data_with_time_features = data_with_time_features.append(data_with_time_features_6, ignore_index=True)

In [21]:
'''
data_with_time_features['departures_over_capacity'] = data_with_time_features['departed_last_hour']/data_with_time_features['dpcapacity']
data_with_time_features['arrivals_over_capacity'] = data_with_time_features['arrived_last_hour']/data_with_time_features['dpcapacity']
data_with_time_features['bike_change_over_capacity'] = data_with_time_features['bike_change_last_hour']/data_with_time_features['dpcapacity']
'''

## filtering for only the relevant features we want to use

In [23]:
features_to_keep = ['station_id','date','hour','trips_departed', 'trips_arrived', 'Air Temperature','Humidity'
                    ,'Rain Intensity','Wind Speed','Max Wind Speed'
                   ,'Interval Rain','latitude','longitude','Total Rain','Precipitation Type','dpcapacity'
                    ,'day_of_week','month','day_of_month'
                   ,'temp_last_hour','interval_rain_last_hour','humidity_last_hour','rain_intensity_last_hour'
                    ,'departed_trips_sma', 'departed_momentum', 'departed_ema_n',
                    'departed_last_hour', 'departed_last_hour_yesterday',
                    'arrived_trips_sma', 'arrived_momentum', 'arrived_ema_n',
                    'arrived_last_hour', 'arrived_last_hour_yesterday',
                    'net_bike_change', 'bike_change_ema_n', 'bike_change_last_hour',
                    'departures_over_capacity', 'arrivals_over_capacity','bike_change_over_capacity']

Let's look at the final dataset with the derived features

In [24]:
data_with_time_features_final = data_with_time_features.loc[data_with_time_features['date']>'2016-01-02']
data_with_time_features_final = data_with_time_features_final.filter(items=features_to_keep)
data_with_time_features_final = data_with_time_features_final.fillna(0)
data_with_time_features_final.head()

Unnamed: 0,station_id,date,hour,trips_departed,trips_arrived,Air Temperature,Humidity,Rain Intensity,Wind Speed,Interval Rain,...,arrived_momentum,arrived_ema_n,arrived_last_hour,arrived_last_hour_yesterday,net_bike_change,bike_change_ema_n,bike_change_last_hour,departures_over_capacity,arrivals_over_capacity,bike_change_over_capacity
48,2,2016-01-03,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,-0.008789,0.0,0.0,0.0,0.0
49,2,2016-01-03,1,0.0,0.0,-2.456667,69.333333,0.0,3.9,0.0,...,0.0,0.0,0.0,0.0,0.0,-0.004395,0.0,0.0,0.0,0.0
50,2,2016-01-03,2,0.0,0.0,-2.82,70.666667,0.0,2.533333,0.0,...,0.0,0.0,0.0,0.0,0.0,-0.002197,0.0,0.0,0.0,0.0
51,2,2016-01-03,3,0.0,0.0,-3.06,70.0,0.0,2.633333,0.0,...,0.0,0.0,0.0,0.0,0.0,-0.001099,0.0,0.0,0.0,0.0
52,2,2016-01-03,4,0.0,0.0,-3.053333,68.333333,0.0,3.533333,0.0,...,0.0,0.0,0.0,0.0,0.0,-0.000549,0.0,0.0,0.0,0.0


In [25]:
data_with_time_features_final.describe()

Unnamed: 0,station_id,hour,trips_departed,trips_arrived,Air Temperature,Humidity,Rain Intensity,Wind Speed,Interval Rain,latitude,...,arrived_momentum,arrived_ema_n,arrived_last_hour,arrived_last_hour_yesterday,net_bike_change,bike_change_ema_n,bike_change_last_hour,departures_over_capacity,arrivals_over_capacity,bike_change_over_capacity
count,9700776.0,9700776.0,9700776.0,9700776.0,9700776.0,9700776.0,9700776.0,9700776.0,9700776.0,9700776.0,...,9700776.0,9700776.0,9700776.0,9700776.0,9700776.0,9700776.0,9700776.0,9700776.0,9700776.0,9700776.0
mean,302.6733,11.5,0.7635876,0.7636243,11.38817,65.63681,0.160601,2.888842,0.1443156,41.88819,...,0.08032177,0.7636334,0.7636263,0.7637045,3.66981e-05,3.692132e-05,3.690426e-05,0.03794305,0.03802347,8.042538e-05
std,174.3764,6.922187,2.519022,2.574031,10.55793,20.03322,1.482422,4.557347,0.8775919,0.06482263,...,0.8721453,2.038082,2.574031,2.574039,1.857332,1.251559,1.857333,0.1022696,0.1039946,0.08509159
min,2.0,0.0,0.0,0.0,-18.92,0.0,0.0,0.0,0.0,41.73665,...,-1.0,0.0,0.0,0.0,-119.0,-62.41798,-119.0,0.0,0.0,-5.173913
25%,154.0,5.75,0.0,0.0,2.92,56.0,0.0,1.7,0.0,41.85378,...,0.0,0.001480169,0.0,0.0,0.0,-0.08630426,0.0,0.0,0.0,0.0
50%,300.0,11.5,0.0,0.0,11.29,68.33333,0.0,2.633333,0.0,41.88824,...,0.0,0.1484383,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,451.0,17.25,1.0,1.0,20.86667,79.66667,0.0,3.766667,0.0,41.9319,...,0.0,0.7276999,1.0,1.0,0.0,0.08213845,0.0,0.04347826,0.03703704,0.0
max,620.0,23.0,181.0,208.0,33.92667,100.0,96.0,506.05,17.88333,42.064,...,72.0,159.757,208.0,208.0,101.0,67.918,101.0,6.0,5.0,4.066667


In [28]:
data_with_time_features_final.to_csv('divvy_agg_features.csv',index=False)