In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import pyplot
import warnings
warnings.filterwarnings("ignore")
from math import radians, cos, sin, asin, sqrt

In [11]:
"""Load data and create the time stamped index""" 
def load_data(path):
    ext = path.split(".")[-1].lower()
    if ext == "csv":
        data = pd.read_csv(path)
    elif ext == "xlsx":
        data = pd.read_excel(path)
    
    data["Date"] = pd.to_datetime(data["start_time"])
    data = data.set_index("Date")
    data = data.sort_index()
    data = data.dropna()
    data["end_station"] = [int(stn) for stn in data.end_station]   ## Converting float values to interger. 
    return data

In [12]:
"""Adding Dummy Variable and dropping station with no entries"""
def add_variables(data):
    df = data
    start_stn = list(set(df.start_station))
    end_stn = list(set(df.end_station))
    """ length of end station is 142 and length of start station is 140"""

    """As length is not same of these two list, a search method is used to identify which element 
    is missing from the start station, hence the next chunk of code is to find
    the station number not in the start station list"""
    remove_stn = []
    for i in range(len(end_stn)):
        if end_stn[i] not in start_stn:
            remove_stn.append(end_stn[i])
    """Updating end station list"""   
    end_stn = [stn for stn in end_stn if stn not in remove_stn]

    """Removing rows with station number 4110 and 4118 as it is discarded from the analysis"""
    index_df =[]
    for i in range(len(df.end_station)):
        if df.end_station[i] == remove_stn[0] or df.end_station[i] == remove_stn[1]:
            index_df.append(df.index[i])
    data = df.drop(index_df)
    
    from datetime import timedelta
    #Setting plan_duration = 0 for passholder_type = Walk-up
    data['plan_duration'].loc[data['passholder_type'] == "Walk-up"] = 0

    #Setting annual pass = flex as both of them are the same
    data['passholder_type'].loc[data['passholder_type'] == "Annual Pass"] = "Flex Pass"

    #Calculating trip duration in minutes
    data['start_time']= pd.to_datetime(data['start_time']) 
    data['end_time']= pd.to_datetime(data['end_time']) 
    data['trip_duration_mins'] = (data.end_time - data.start_time)/ timedelta(minutes=1)
    
    #Create dummy variables for passholder type
    one_hot_pass = pd.get_dummies(data['passholder_type']).rename(columns={
            'Flex Pass': 'annual',
            'Monthly Pass': 'monthly',
            'One Day Pass': 'one_day',
            'Walk-up':'walk_up'})

    #Create dummy variables for trip route category
    one_hot_trip_type = pd.get_dummies(data['trip_route_category']).rename(columns={
            'Round Trip': 'round_trip',
            'One Way': 'one_way'})
    #Calculating Distance traveled 
    data["Distance"]=""
    def haversine(lon1, lat1, lon2, lat2):
        """
        Calculate the great circle distance between two points 
        on the earth (specified in decimal degrees)
        """
        # convert decimal degrees to radians 
        lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])

        # haversine formula 
        dlon = lon2 - lon1 
        dlat = lat2 - lat1 
        a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
        c = 2 * asin(sqrt(a)) 
        r = 6371 # Radius of earth in kilometers. Use 3956 for miles
        return c * r

    vfunc = np.vectorize(haversine)
    data.Distance= vfunc(data.start_lon,data.start_lat,data.end_lon,data.end_lat)
    
    data = pd.concat([data , one_hot_pass, one_hot_trip_type], axis=1)

    return data

In [13]:
"""Creating Regression Data"""
def regression_data(data):
    
    def grouping_by_month(data, yr, offset):
        df = data.copy()
        df.index = df.index.year
        df = df.loc[yr]
        df["Date"] = pd.to_datetime(df["start_time"])
        df = df.set_index("Date")
        df["Month"] = df.index.month + offset

        df_sum = df.groupby(by=["start_station","Month",]).sum()[['Distance', 'trip_duration_mins', 'annual', 'monthly',
           'one_day', 'walk_up', 'one_way', 'round_trip']]
        df_count = df.groupby(by=["start_station","Month",]).count()[['trip_id']]

        df = pd.concat([df_sum, df_count], axis=1)

        return df
    
    df2016 = grouping_by_month(data, 2016, 0)
    df2017 = grouping_by_month(data, 2017, 12)
    df2018 = grouping_by_month(data, 2018, 24)
    col = list(df2016.columns.values)
    
    data_16_17 = pd.merge(df2016, df2017, left_index=True, right_index=True, how="outer", on= col )
    final_data = pd.merge(data_16_17, df2018, left_index=True, right_index=True, how="outer", on= col )
    
    return final_data


In [14]:
def network(data):
    """Now creating a matrix of 140 x 140 with each station number"""
    matrix = np.zeros([140,140])
    station = list(set(data.start_station))
    strt_end_station = np.stack([data.start_station , data.end_station], axis= 0)

    for i in range(len(strt_end_station[0])):
        index1 = station.index(strt_end_station[0][i])
        index2 = station.index(strt_end_station[1][i])
        matrix[index1][index2] +=1

    station_matrix = pd.DataFrame(matrix, index= station, columns=station)
    
    return station_matrix

In [123]:
def linear_regression(data, station_list):
    station = station_list
    reg_coef = {"Demand_in_Stn": ['Distance','trip_duration_mins','annual','monthly','one_day','walk_up',
                             'one_way','round_trip','Time_line'] }
    for i in range(len(station)):
        stn = station[i]
        df = data.loc[stn]
        x = df.drop(["trip_id"], axis=1)
        x["Time_line"] = x.index
        y = df["trip_id"]
        reg = linear_model.LinearRegression()
        reg.fit(x,y)
        reg_coef[stn] =  reg.coef_
        coef_df = pd.DataFrame(reg_coef).T
        coef_df.reset_index()
        coef_df.columns = list(coef_df.iloc[0])
        coef_df = coef_df.drop(['Demand_in_Stn'], axis=0 )
        
    return coef_df

In [8]:
# Loading the LA Bike data
data = load_data("LABikeData.xlsx")

In [15]:
# Adding dummy variables and removing stations with the missing information
data1 = add_variables(data)

In [16]:
# Creating Linear regression dataset. 
data_reg = regression_data(data1)

In [126]:
# Linear regression 
station = list(set(data.start_station))

"""Data Frame with the coefficient of features listed in column head for all 140 stations"""
coef = linear_regression(data_reg, station)

In [125]:
coef

Unnamed: 0,Distance,trip_duration_mins,annual,monthly,one_day,walk_up,one_way,round_trip,Time_line
3074,-5.51347e-16,-7.0235e-16,0.333333,0.333333,0.333333,0.333333,0.666667,0.666667,1.76244e-15
3075,-1.363e-15,2.27538e-19,0.333333,0.333333,0.333333,0.333333,0.666667,0.666667,1.96789e-15
3076,-1.35139e-15,1.35233e-17,0.333333,0.333333,0.333333,0.333333,0.666667,0.666667,-8.33656e-17
3077,-6.49861e-16,2.73583e-16,0.333333,0.333333,0.333333,0.333333,0.666667,0.666667,-1.34812e-16
3078,6.5006e-15,-1.97657e-15,0.333333,0.333333,0.333333,0.333333,0.666667,0.666667,7.14903e-16
3079,-2.98584e-16,-2.72251e-18,0.333333,0.333333,0.333333,0.333333,0.666667,0.666667,-1.49279e-16
3080,-5.20085e-16,-3.1123e-18,0.333333,0.333333,0.333333,0.333333,0.666667,0.666667,7.7016e-17
3081,-1.15429e-16,9.82194e-19,0.333333,0.333333,0.333333,0.333333,0.666667,0.666667,8.66638e-16
3082,-1.23807e-15,1.94486e-18,0.333333,0.333333,0.333333,0.333333,0.666667,0.666667,2.08371e-15
4108,-5.008e-18,-8.10916e-18,0.420108,0.348294,0.186715,0.348294,0.651706,0.651706,7.34268e-16


In [6]:
# data["Year"] = data.index.year
# data["Month"] = data.index.month
# data["Week"] = data.index.week
# data["Weekday Name"] = data.index.weekday_name
# data["time"] = data.index.time

In [5]:
# #Reading json file which includes address and capabilities of different station
# #Parsing out relevant fields
# df = pd.read_json("LABike_data.json", orient='columns')
# station_properties = pd.read_json( (df['features']).to_json(),orient='index')
# station_details = pd.read_json(station_properties['properties'].to_json(), orient = 'index')
# station_info = station_details[['kioskId','addressStreet','name','bikesAvailable','totalDocks','docksAvailable']]

In [15]:
# station_details.columns

In [16]:
# plt.bar(station_details.index, station_details.bikesAvailable)

In [17]:
# data.to_csv("LABike_data_with_time_index.csv")

In [18]:
# daily_data = data.groupby(by="Weekday Name").count()
# pyplot.figure(num=None, figsize=(4, 3), dpi=250, facecolor='w', edgecolor='k')
# plt.bar(daily_data.index,daily_data.trip_id, color = "green")
# plt.ylabel("Total Count of Trips")
# plt.xlabel("Days")
# plt.title("Trips Per Day")
# plt.rc("xtick", labelsize=5)
# plt.rc("ytick", labelsize=5)

In [19]:
# weekly_data = data.groupby(by="Week").count()
# pyplot.figure(num=None, figsize=(4, 3), dpi=250, facecolor='w', edgecolor='k')
# plt.bar(weekly_data.index,weekly_data.trip_id, color = "magenta")
# plt.ylabel("Total Count of Trips")
# plt.xlabel("Weeks")
# plt.title("Trips Per Week")
# plt.rc("xtick", labelsize=10)
# plt.rc("ytick", labelsize=10)

In [20]:
# monthly_data = data.groupby(by="Month").count()
# pyplot.figure(num=None, figsize=(4, 3), dpi=250, facecolor='w', edgecolor='k')
# plt.bar(monthly_data.index,monthly_data.trip_id, color = "cyan")
# plt.ylabel("Total Count of Trips")
# plt.xlabel("Months")
# plt.title("Trips Per Month")
# plt.rc("xtick", labelsize=10)
# plt.rc("ytick", labelsize=10)

In [21]:
# yearly_data = data.groupby(by="Year").count()
# pyplot.figure(num=None, figsize=(4, 3), dpi=250, facecolor='w', edgecolor='k')
# plt.bar(yearly_data.index, yearly_data.trip_id, color= "grey")
# plt.ylabel("Total Count of Trips")
# plt.xlabel("Year")
# plt.title("Trips Per Year")
# plt.rc("xtick", labelsize=6)
# plt.rc("ytick", labelsize=10)

In [22]:
# passholder_type_data = data.groupby(by="passholder_type").count()
# pyplot.figure(num=None, figsize=(4, 3), dpi=250, facecolor='w', edgecolor='k')
# plt.bar(passholder_type_data.index, passholder_type_data.trip_id, color= "blue")
# plt.ylabel("Total Count of Trips")
# plt.xlabel("Type of pass")
# plt.title("Trips Per pass")
# plt.rc("xtick", labelsize=10)
# plt.rc("ytick", labelsize=10)

In [23]:
# start_station_data = data.groupby(by="start_station").count()
# start_station_data.index = [str(ind) for ind in start_station_data.index]
# start_station_data = start_station_data.sort_values(by=["trip_id"], ascending= False)
# first_30 = start_station_data.iloc[:30]
# pyplot.figure(num=None, figsize=(4, 3), dpi=250, facecolor='w', edgecolor='k')
# plt.bar(first_30.index, first_30.trip_id, color= "pink")
# plt.ylabel("Total Count of Trips")
# plt.xlabel("Starting Station (top 30)")
# plt.title("Trips Per per station")
# plt.rc("xtick", labelsize=2)
# plt.rc("ytick", labelsize=10)

In [24]:
# time_data = data.groupby(by="time").count()
# pyplot.figure(num=None, figsize=(4, 3), dpi=250, facecolor='w', edgecolor='k')
# plt.scatter(time_data.index, time_data.trip_id, color= "brown", marker=".")
# plt.ylabel("Total Count of Trips")
# plt.xlabel("Time")
# plt.title("Trips Per hr")
# plt.rc("xtick", labelsize=10)
# plt.rc("ytick", labelsize=10)

In [25]:
# trip_data = data.groupby(by="trip_route_category").count()
# pyplot.figure(num=None, figsize=(4, 3), dpi=250, facecolor='w', edgecolor='k')
# plt.bar(trip_data.index, trip_data.trip_id,color= "yellow")
# plt.ylabel("Total Count of Trips")
# plt.xlabel("Type of Trip")
# plt.title("Trips Per types of trip")
# plt.rc("xtick", labelsize=10)
# plt.rc("ytick", labelsize=10)