In [1]:
# Import packages
import numpy as np
import pandas as pd
import datetime as dt
# For dates
from datetime import datetime as dta
import matplotlib.pyplot as plt

In [2]:
# Notebook parameters: the year and the station whose data is processed,
# plus the extension shared by both input files.
which_year = '2018'
station_num = '3045'
file_format = '.csv'

# Input file names follow the patterns
#   Station<station_num>-<which_year><file_format>
#   Weather<which_year><file_format>
rides_filename = ''.join(['Station', station_num, '-', which_year, file_format])
weather_filename = ''.join(['Weather', which_year, file_format])

# Both names are relative, so the files are read from the working directory.
rides_df = pd.read_csv(rides_filename)
weather_df = pd.read_csv(weather_filename)

In [3]:
# Keep only the weather rows in which every one of these measurements is present.
weather_required_cols = [
    'apparentTemperature',
    'temperature',
    'precipProbability',
    'precipIntensity',
    'pressure',
    'windBearing',
    'windSpeed',
]
weather_df = weather_df.dropna(subset=weather_required_cols)

In [4]:
# Helper functions that derive extra features: 0/1 precipitation flags, weekday names, a weekday/weekend flag, and cleaned holiday names
def is_rain(x):
    """Return 1 if the precipitation type ``x`` equals 'rain', otherwise 0."""
    return 1 if x == 'rain' else 0

def is_snow(x):
    """Return 1 if the precipitation type ``x`` is 'sleet' or 'snow', otherwise 0."""
    return int(x in ('sleet', 'snow'))
        
def precYN(x):
    """Return 1 if ``x`` is any of 'sleet', 'snow' or 'rain', otherwise 0."""
    precipitation_kinds = ('sleet', 'snow', 'rain')
    if x not in precipitation_kinds:
        return 0
    return 1
    
def day_week(x):
    """Map a weekday number to its English name (0 -> "Monday", ..., 6 -> "Sunday").

    Any value that does not compare equal to one of 0..6 yields None.
    """
    day_names = ("Monday", "Tuesday", "Wednesday", "Thursday",
                 "Friday", "Saturday", "Sunday")
    for number, day_name in enumerate(day_names):
        if x == number:
            return day_name
    return None
    
def is_weekday(x):
    """Return 0 when ``x`` is 'Saturday' or 'Sunday', and 1 for any other value."""
    return 0 if x in ('Saturday', 'Sunday') else 1

def clean_holidays(x):
    """Remove the ' (Observed)' marker from a holiday name.

    Parameters
    ----------
    x : str or any
        Holiday label, e.g. "Veterans Day (Observed)". Non-string values
        (for example NaN) are returned unchanged.

    Returns
    -------
    The label with every ' (Observed)' occurrence removed; ``x`` itself when
    it does not contain the marker or is not a string.
    """
    obs_string = ' (Observed)'
    # Non-strings cannot be searched with `in`; pass them through untouched.
    if not isinstance(x, str):
        return x
    # str.strip(chars) treats its argument as a *set of characters* to trim
    # from both ends, so it would also eat legitimate leading/trailing letters
    # (e.g. "Presidents (Observed)" -> "President"). Remove the exact
    # substring instead.
    return x.replace(obs_string, '')

In [5]:
# --- Join keys for the weather table ----------------------------------------
# 'hour' is parsed as datetimes and replaced by its hour-of-day component;
# 'date' is the datetime-parsed 'date_time' column.
weather_df['hour'] = pd.DatetimeIndex(weather_df['hour']).hour
weather_df['date'] = pd.to_datetime(weather_df['date_time'])

# --- 0/1 precipitation indicators, all derived from 'precipType' ------------
for flag_name, flag_func in [('snow', is_snow), ('rain', is_rain), ('precYN', precYN)]:
    weather_df[flag_name] = weather_df['precipType'].map(flag_func)

# --- Weekday features --------------------------------------------------------
# Weekday name, one dummy column per weekday name, and a weekday/weekend flag.
weather_df['weekday_dummy'] = weather_df['weekday'].map(day_week)
temp_dummy_week = pd.get_dummies(weather_df['weekday_dummy'])
weather_df = pd.concat([weather_df, temp_dummy_week], axis=1)
weather_df['is_weekday'] = weather_df['weekday_dummy'].map(is_weekday)

# --- Holiday features (currently disabled) -----------------------------------
# clean holidays
# weather_df['which_holiday'] = weather_df['which_holiday'].apply(clean_holidays)
# set dummy variables for holidays
# temp_dummy_holiday = pd.get_dummies(weather_df['which_holiday'])
# weather_df = pd.concat([weather_df, temp_dummy_holiday], axis=1)

# --- Discard the column that is not needed -----------------------------------
weather_df = weather_df.drop(columns=['Unnamed: 0'])

In [6]:
# Categorise each hour of the day into one of five named periods.
# `bins` lists the first hour of every period; np.digitize returns i such that
# bins[i-1] <= hour < bins[i], which gives:
#   0-5 -> night, 6-9 -> rushM, 10-15 -> midday, 16-19 -> rushE, 20+ -> evening
bins = [0, 6, 10, 16, 20]
names = ['night', 'rushM', 'midday', 'rushE', 'evening']
temp_d = dict(enumerate(names, 1))

# Look the period name up with Series.map. Compared with
# np.vectorize(temp_d.get) this also works on an empty frame (vectorize cannot
# infer an output type from zero elements) and leaves an unmapped bin index
# (0, i.e. an hour below bins[0]) as a missing value rather than the string 'None'.
period_index = pd.Series(np.digitize(weather_df['hour'], bins), index=weather_df.index)
weather_df['time_day'] = period_index.map(temp_d)

# One dummy column per period name, appended to the weather table.
temp_dummy_day = pd.get_dummies(weather_df['time_day'])
weather_df = pd.concat([weather_df, temp_dummy_day], axis=1)

In [7]:
# Prepare the rides table for the join: 'hour' is a copy of 'start_time_hour',
# 'date' is 'start_time_date' parsed as datetimes, and the 'Unnamed: 0'
# column is discarded.
rides_df['hour'] = rides_df.loc[:, 'start_time_hour']
rides_df['date'] = pd.to_datetime(rides_df.loc[:, 'start_time_date'])
rides_df = rides_df.drop(columns=['Unnamed: 0'])


In [8]:
# Combine rides and weather with an inner join on ('date', 'hour'); rows
# without a partner in the other table are left out of the result.
new_df = rides_df.merge(weather_df, how='inner', on=['date', 'hour'])
new_df

Unnamed: 0,count,timestamp,start_time_date,start_time_hour,hour,date,apparentTemperature,date_time,day,humidity,...,rushE,rushM,Friday,Monday,Saturday,Sunday,Thursday,Tuesday,Wednesday,night
0,0,2018-01-01 00:00:00,2018-01-01,0,0,2018-01-01,9.12,2018-01-01T00:00:00,1,0.55,...,0,0,0,1,0,0,0,0,0,1
1,0,2018-01-01 00:00:00,2018-01-01,0,0,2018-01-01,8.83,2018-01-01T00:00:00,1,0.55,...,0,0,0,1,0,0,0,0,0,1
2,0,2018-01-01 00:00:00,2018-01-01,0,0,2018-01-01,8.40,2018-01-01T00:00:00,1,0.56,...,0,0,0,1,0,0,0,0,0,1
3,0,2018-01-01 00:00:00,2018-01-01,0,0,2018-01-01,7.77,2018-01-01T00:00:00,1,0.58,...,0,0,0,1,0,0,0,0,0,1
4,0,2018-01-01 00:00:00,2018-01-01,0,0,2018-01-01,7.28,2018-01-01T00:00:00,1,0.59,...,0,0,0,1,0,0,0,0,0,1
5,0,2018-01-01 00:00:00,2018-01-01,0,0,2018-01-01,6.92,2018-01-01T00:00:00,1,0.60,...,0,0,0,1,0,0,0,0,0,1
6,0,2018-01-01 00:00:00,2018-01-01,0,0,2018-01-01,6.41,2018-01-01T00:00:00,1,0.62,...,0,1,0,1,0,0,0,0,0,1
7,0,2018-01-01 00:00:00,2018-01-01,0,0,2018-01-01,6.09,2018-01-01T00:00:00,1,0.63,...,0,1,0,1,0,0,0,0,0,1
8,0,2018-01-01 00:00:00,2018-01-01,0,0,2018-01-01,6.72,2018-01-01T00:00:00,1,0.63,...,0,1,0,1,0,0,0,0,0,1
9,0,2018-01-01 00:00:00,2018-01-01,0,0,2018-01-01,3.52,2018-01-01T00:00:00,1,0.60,...,0,1,0,1,0,0,0,0,0,1


In [9]:
# Display the merged rides/weather table.
new_df

Unnamed: 0,count,timestamp,start_time_date,start_time_hour,hour,date,apparentTemperature,date_time,day,humidity,...,Thursday,Tuesday,Wednesday,is_weekday,time_day,evening,midday,night,rushE,rushM
0,0,2018-01-01 00:00:00,2018-01-01,0,0,2018-01-01,9.12,2018-01-01T00:00:00,1,0.55,...,0,0,0,1,night,0,0,1,0,0
1,1,2018-01-01 01:00:00,2018-01-01,1,1,2018-01-01,8.83,2018-01-01T00:00:00,1,0.55,...,0,0,0,1,night,0,0,1,0,0
2,0,2018-01-01 02:00:00,2018-01-01,2,2,2018-01-01,8.40,2018-01-01T00:00:00,1,0.56,...,0,0,0,1,night,0,0,1,0,0
3,0,2018-01-01 03:00:00,2018-01-01,3,3,2018-01-01,7.77,2018-01-01T00:00:00,1,0.58,...,0,0,0,1,night,0,0,1,0,0
4,0,2018-01-01 04:00:00,2018-01-01,4,4,2018-01-01,7.28,2018-01-01T00:00:00,1,0.59,...,0,0,0,1,night,0,0,1,0,0
5,0,2018-01-01 05:00:00,2018-01-01,5,5,2018-01-01,6.92,2018-01-01T00:00:00,1,0.60,...,0,0,0,1,night,0,0,1,0,0
6,0,2018-01-01 06:00:00,2018-01-01,6,6,2018-01-01,6.41,2018-01-01T00:00:00,1,0.62,...,0,0,0,1,rushM,0,0,0,0,1
7,1,2018-01-01 07:00:00,2018-01-01,7,7,2018-01-01,6.09,2018-01-01T00:00:00,1,0.63,...,0,0,0,1,rushM,0,0,0,0,1
8,0,2018-01-01 08:00:00,2018-01-01,8,8,2018-01-01,6.72,2018-01-01T00:00:00,1,0.63,...,0,0,0,1,rushM,0,0,0,0,1
9,0,2018-01-01 09:00:00,2018-01-01,9,9,2018-01-01,3.52,2018-01-01T00:00:00,1,0.60,...,0,0,0,1,rushM,0,0,0,0,1


In [10]:
# Analysis of strike data
if which_year == '2016': 
    start_from_temp = "24/10/2016"
    end_at_temp = "14/11/2016"
    start_from = dta.strptime(start_from_temp, "%d/%m/%Y")
    end_at = dta.strptime(end_at_temp, "%d/%m/%Y")
    strike_df  = new_df[(pd.to_datetime(new_df['date_time']) > start_from) & 
                      (pd.to_datetime(new_df['date_time']) <= end_at)].reset_index()
    strike_trips = strike_df.groupby(['start_time_date'])['count'].sum()
    thisMax = strike_trips.max()
    
    plot_fig = False
    if plot_fig == True: 
        # this is why we drop the weird week
        import matplotlib.pyplot as plt
        %matplotlib inline
        f = plt.figure()
        plt.plot(strike_trips, 'o', color = (178/255, 34/255, 34/255))
        plt.xlabel('dates')
        plt.ylabel('nTrips')
        ax = plt.gca()
        x_labels = ax.get_xticks()
        ax.set_xticklabels(['25T', '26W','27T', '28F', '29S', '30S', '31M',
                    '1T', '2W', '3T', '4F', '5S', '6S', '7M', 
                    '8T', '9W', '10T', '11F', '12S', '13S', '13M'])
        plt.plot([6.5, 6.5], [0, 140], color='k', linestyle='--', linewidth=2)
        plt.plot([13.5, 13.5], [0, 140], color='k', linestyle='--', linewidth=2)
        plt.ylim([0, thisMax+10])
        plt.show()
        f.savefig("strike.pdf", bbox_inches = 'tight')
    
    strike_start_temp = "1/11/2016"
    strike_end_temp = "7/11/2016"
    strike_start = dta.strptime(strike_start_temp, "%d/%m/%Y")
    strike_end = dta.strptime(strike_end_temp, "%d/%m/%Y")
    new_df  = new_df[(pd.to_datetime(new_df['date_time']) < strike_start) | 
                      (pd.to_datetime(new_df['date_time']) > strike_end)].reset_index() 
    new_df.drop(['index'], axis=1, inplace = True)    

In [11]:
# Write the merged table to 'FullStation<station_num><which_year><file_format>'
# using to_csv's default options.
result_filename = ''.join(['FullStation', station_num, which_year, file_format])
new_df.to_csv(result_filename)

In [12]:
# Display the table that was written to result_filename.
new_df


Unnamed: 0,count,timestamp,start_time_date,start_time_hour,hour,date,apparentTemperature,date_time,day,humidity,...,Thursday,Tuesday,Wednesday,is_weekday,time_day,evening,midday,night,rushE,rushM
0,0,2018-01-01 00:00:00,2018-01-01,0,0,2018-01-01,9.12,2018-01-01T00:00:00,1,0.55,...,0,0,0,1,night,0,0,1,0,0
1,1,2018-01-01 01:00:00,2018-01-01,1,1,2018-01-01,8.83,2018-01-01T00:00:00,1,0.55,...,0,0,0,1,night,0,0,1,0,0
2,0,2018-01-01 02:00:00,2018-01-01,2,2,2018-01-01,8.40,2018-01-01T00:00:00,1,0.56,...,0,0,0,1,night,0,0,1,0,0
3,0,2018-01-01 03:00:00,2018-01-01,3,3,2018-01-01,7.77,2018-01-01T00:00:00,1,0.58,...,0,0,0,1,night,0,0,1,0,0
4,0,2018-01-01 04:00:00,2018-01-01,4,4,2018-01-01,7.28,2018-01-01T00:00:00,1,0.59,...,0,0,0,1,night,0,0,1,0,0
5,0,2018-01-01 05:00:00,2018-01-01,5,5,2018-01-01,6.92,2018-01-01T00:00:00,1,0.60,...,0,0,0,1,night,0,0,1,0,0
6,0,2018-01-01 06:00:00,2018-01-01,6,6,2018-01-01,6.41,2018-01-01T00:00:00,1,0.62,...,0,0,0,1,rushM,0,0,0,0,1
7,1,2018-01-01 07:00:00,2018-01-01,7,7,2018-01-01,6.09,2018-01-01T00:00:00,1,0.63,...,0,0,0,1,rushM,0,0,0,0,1
8,0,2018-01-01 08:00:00,2018-01-01,8,8,2018-01-01,6.72,2018-01-01T00:00:00,1,0.63,...,0,0,0,1,rushM,0,0,0,0,1
9,0,2018-01-01 09:00:00,2018-01-01,9,9,2018-01-01,3.52,2018-01-01T00:00:00,1,0.60,...,0,0,0,1,rushM,0,0,0,0,1


In [13]:
# Display the rides table, including the 'hour' and 'date' columns added for the merge.
rides_df

Unnamed: 0,count,timestamp,start_time_date,start_time_hour,hour,date
0,0,2018-01-01 00:00:00,2018-01-01,0,0,2018-01-01
1,1,2018-01-01 01:00:00,2018-01-01,1,1,2018-01-01
2,0,2018-01-01 02:00:00,2018-01-01,2,2,2018-01-01
3,0,2018-01-01 03:00:00,2018-01-01,3,3,2018-01-01
4,0,2018-01-01 04:00:00,2018-01-01,4,4,2018-01-01
5,0,2018-01-01 05:00:00,2018-01-01,5,5,2018-01-01
6,0,2018-01-01 06:00:00,2018-01-01,6,6,2018-01-01
7,1,2018-01-01 07:00:00,2018-01-01,7,7,2018-01-01
8,0,2018-01-01 08:00:00,2018-01-01,8,8,2018-01-01
9,0,2018-01-01 09:00:00,2018-01-01,9,9,2018-01-01
