In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import pyplot
import warnings
warnings.filterwarnings("ignore")
from math import radians, cos, sin, asin, sqrt
from sklearn import linear_model
from datetime import timedelta
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

In [3]:
#Reading data file 
data = pd.read_csv('df_mod.csv')
data.head()

Unnamed: 0,trip_id,bike_id,start_station,end_station,trip_route_category,start_time,end_time,start_lat,start_lon,end_lat,end_lon,plan_duration,passholder_type
0,59256078,6417,4159,3042,One Way,21/11/2017 11:42:00 AM,21/11/2017 12:37:00 PM,34.145901,-118.11653,34.049301,-118.2388,0,Walk-up
1,60223612,6453,4159,4138,One Way,27/11/2017 8:13:00 AM,27/11/2017 8:41:00 AM,34.145901,-118.11653,34.145691,-118.14823,0,Walk-up
2,58576402,6650,4159,4138,One Way,17/11/2017 4:39:00 PM,17/11/2017 4:57:00 PM,34.145901,-118.11653,34.145691,-118.14823,30,Monthly Pass
3,51293790,12327,4159,4138,One Way,11/10/2017 16:18,11/10/2017 16:33,34.145901,-118.11653,34.145691,-118.14823,30,Monthly Pass
4,50594191,6000,4159,4138,One Way,8/10/2017 11:51,8/10/2017 12:12,34.145901,-118.11653,34.145691,-118.14823,0,Walk-up


In [4]:
#Creating a new column Date from the column "start_time" of the dataframe 
data["Date"] = pd.to_datetime(data["start_time"])

In [5]:
#Creating a copy of data since loading data takes a lot of time
data1 = data.copy()

In [6]:
#Dropping null values
data1 = data1.dropna() 
# Converting float values to interger
data1["end_station"] = data1["end_station"].astype(int)  
data1 = data1.set_index("Date") #setting index to Date column
data1 = data1.sort_index() #Arranging dates 

In [7]:
data1.head()

Unnamed: 0_level_0,trip_id,bike_id,start_station,end_station,trip_route_category,start_time,end_time,start_lat,start_lon,end_lat,end_lon,plan_duration,passholder_type
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2016-01-08 00:12:00,4767292,6127,3034,3055,One Way,1/8/2016 0:12,1/8/2016 0:49,34.042061,-118.26338,34.044159,-118.25158,0,Walk-up
2016-01-08 00:13:00,4767291,6026,3034,3055,One Way,1/8/2016 0:13,1/8/2016 0:49,34.042061,-118.26338,34.044159,-118.25158,0,Walk-up
2016-01-08 00:18:00,4767290,5990,3021,3022,One Way,1/8/2016 0:18,1/8/2016 0:43,34.045609,-118.23703,34.04607,-118.23309,30,Monthly Pass
2016-01-08 00:23:00,4767289,6585,3063,3064,One Way,1/8/2016 0:23,1/8/2016 1:46,34.049198,-118.25283,34.04681,-118.25698,0,Walk-up
2016-01-08 00:25:00,4767288,6069,3063,3064,One Way,1/8/2016 0:25,1/8/2016 1:46,34.049198,-118.25283,34.04681,-118.25698,0,Walk-up


In [8]:
#length of end station is 142 and length of start station is 140
end_stn = list(set(data1.end_station))
start_stn = list(set(data1.start_station))
"""As length is not same of these two list, a search method is used to identify which element 
is missing from the start station, hence the next chunk of code is to find
the station number not in the start station list"""
remove_stn = []
for i in range(len(end_stn)):
    if end_stn[i] not in start_stn:
        remove_stn.append(end_stn[i])
#Updating end station list
end_stn = [stn for stn in end_stn if stn not in remove_stn]

In [9]:
#Removing rows with station number 4110,4143 and 4118 as it is discarded from the analysis
data1 = data1[data1['end_station']!=4110]
data1 = data1[data1['end_station']!=4118]
data1 = data1[data1['start_station']!=4143]

In [10]:
#Setting plan_duration = 0 for passholder_type = Walk-up
data1['plan_duration'].loc[data1['passholder_type'] == "Walk-up"] = 0

#Setting annual pass = flex as both of them are the same
data1['passholder_type'].loc[data1['passholder_type'] == "Annual Pass"] = "Flex Pass"

In [11]:
#Calculating trip duration in minutes
data1['start_time']= pd.to_datetime(data1['start_time']) 
data1['end_time']= pd.to_datetime(data1['end_time']) 
data1['trip_duration_mins'] = (data1.end_time - data1.start_time)/ timedelta(minutes=1)

In [12]:
#Creating a copy of data since it takes time to load data
data2 = data1.copy()

In [13]:
#Create dummy variables for passholder type
one_hot_pass = pd.get_dummies(data2['passholder_type']).rename(columns={
            'Flex Pass': 'annual',
            'Monthly Pass': 'monthly',
            'One Day Pass': 'one_day',
            'Walk-up':'walk_up'})

#Create dummy variables for trip route category
one_hot_trip_type = pd.get_dummies(data2['trip_route_category']).rename(columns={
            'Round Trip': 'round_trip',
            'One Way': 'one_way'})

In [14]:
data2 = pd.concat([data2, one_hot_trip_type,one_hot_pass], axis=1, sort=False)

In [16]:
data2.to_csv('pricing_optimization.csv')

In [None]:
import googlemaps

#Setting up my google api key
apiKey = 'XXXXXXXXXXXXXXXXXXXXXXXXXX'
gmaps = googlemaps.Client(key=apiKey)

#Entering latitude and longitude to be used
start_loc = "34.04739000000001,-118.21884900000035"
end_loc = "34.04652000000171,-118.23741100000254"

#Calculating distance by setting the travel mode = bicycling
result = gmaps.distance_matrix(start_loc, end_loc, mode="bicycling")

#Saving Distance and duration in the variables
dist = result["rows"][0]["elements"][0]["distance"]["value"]
dur = result["rows"][0]["elements"][0]["duration"]["value"]
dur = dur/60.0

#printing results
print(" Cycling Distance(meters): ", dist)
print(" Expected Cycling Time(min): ", dur)
print(result)

In [17]:
unique_coordinates = data2[data2['round_trip']==0]
unique_coordinates = data2[['start_lat', 'start_lon', 'end_lat','end_lon']].drop_duplicates()
print(unique_coordinates.shape)

(13641, 4)


In [26]:
def distance_calc(df):
    start_loc = (df['start_lat'], df['start_lon'])
    end_loc = (df['end_lat'], df['end_lon'])
    #Setting up my google api key
    apiKey = 'AIzaSyBbF1n4Nlhwv1gc07THKfvp-IBkEcuJV7k'
    gmaps = googlemaps.Client(key=apiKey)
    result = gmaps.distance_matrix(start_loc, end_loc, mode="bicycling")
    dist = result["rows"][0]["elements"][0]["distance"]["value"]
    return dist

In [20]:
import googlemaps

#Dropping rows where coordinates are 0,0 
unique_coordinates = unique_coordinates.loc[(unique_coordinates!=0).any(axis=1)]

#Calculating distance
unique_coordinates['distance'] = unique_coordinates.head(5).apply (lambda row: distance_calc(row),axis=1)
unique_coordinates.head(10)

Unnamed: 0_level_0,start_lat,start_lon,end_lat,end_lon,distance
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2016-01-08 00:12:00,34.042061,-118.26338,34.044159,-118.25158,1550.0
2016-01-08 00:18:00,34.045609,-118.23703,34.04607,-118.23309,367.0
2016-01-08 00:23:00,34.049198,-118.25283,34.04681,-118.25698,629.0
2016-01-08 00:42:00,34.058319,-118.24609,34.058319,-118.24609,0.0
2016-01-08 00:51:00,34.04607,-118.23309,34.046612,-118.26273,3960.0
2016-01-08 00:54:00,34.045609,-118.23703,34.042061,-118.26338,
2016-01-08 01:08:00,34.042061,-118.26338,34.04417,-118.26117,
2016-01-08 01:11:00,34.044159,-118.25158,34.04855,-118.25905,
2016-01-08 01:31:00,34.0532,-118.25095,34.0532,-118.25095,
2016-01-08 01:33:00,34.04417,-118.26117,34.039982,-118.2664,


In [21]:
#Calculating Distance traveled 
def calc_dist(lon1, lat1, lon2, lat2):
    """
    Calculate the great circle distance between two points 
    on the earth (specified in decimal degrees)
    """
    # convert decimal degrees to radians 
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])

    # haversine formula 
    dlon = lon2 - lon1 
    dlat = lat2 - lat1 
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * asin(sqrt(a)) 
    r = 6371 # Radius of earth in kilometers. Use 3956 for miles
    return c * r

In [None]:
data2 = data2[(data2['trip_duration_mins']>=0) & (data2['trip_duration_mins']<50)]
data2 = data2[data2['Distance'] <50]
data2.shape

In [None]:
month = 'month'
week = 'week'
day = 'day'
hour = 'hour'
def grouping(data,col):
    '''
    Stacking dataset on the basis of start_station
    Args: data = dataframe
          col = column name/ time frame
    Returns: Stacked dataframe with index resetted
    '''
    df = data.copy()
    df["Year"]=df.start_time.dt.year
    if col.lower() == 'month':
        col = col.capitalize()
        df[col] = df.start_time.dt.month
    elif col.lower() == 'week':
        col = col.capitalize()
        df[col] = df.start_time.dt.week
    elif col.lower() == 'day':
        col = col.capitalize()
        df[col] = df.start_time.dt.day
    elif col.lower() == 'hour':
        col = col.capitalize()
        df[col] = df.start_time.dt.hour
    else:
        return print('Unable to extract time from index!!')
    df_sum = df.groupby(by=["start_station","Year",col]).sum()[['Distance', 'trip_duration_mins']]
    df_count = df.groupby(by=["start_station","Year",col]).count()[['trip_id']]
    df = pd.concat([df_sum, df_count], axis=1)
    df = df.reset_index()
    return df

In [None]:
data3 = grouping(data2,month)
#data3 = grouping(common,week)

In [None]:
data3.head(20)

In [None]:
#Setting offset for continuity in months
data3.loc[data3['Year']==2017, 'Month'] = data3['Month']+12
data3.loc[data3['Year']==2018, 'Month'] = data3['Month']+24

In [None]:
data3 = data3[data3['start_station']!=4276]

In [None]:
station = list(set(data3.start_station))
r_squared = []
#Looping over all the stations to fit linear regressions individually
for i in range(len(station)):
    stn = station[i]
    df = data3.loc[data3['start_station']==stn]
    x = df[['Distance','trip_duration_mins']]
    y = df["trip_id"]
    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
    print(X_train)
    reg = linear_model.LinearRegression()
    print(stn)
    est2 = reg.fit(X_train,y_train)
    ypred = est2.predict(x) #predicting y values on test dataset
    score = r2_score(y,ypred) #Calculating r-squared value on the test data
    r_squared.append(score)
r_squared

In [None]:
df = data3.loc[data3['start_station']==4273]
x = df[['Distance','trip_duration_mins']]
y = df["trip_id"]
X_train, X_test, y_train, y_test = train_test_split( #Splitting data into train & test
            x, y, test_size=0.33, random_state=42)
reg = linear_model.LinearRegression()
est2 = reg.fit(X_train,y_train)
ypred = est2.predict(X_test) #predicting y values on test dataset
score = r2_score(y_test,ypred) #Calculating r-squared value on the test data
#r_squared.append(score)
score

In [None]:
#Checking whether there is time series in the data
import matplotlib.pyplot as plt
plt.plot(df['Week'],df['trip_id'])
plt.show()