In [170]:
import pandas as pd
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

In [3]:
# List the files available in the sibling data directory.
! ls ../data

apr_2024.csv may_2024.csv


In [9]:
# Show the first two lines of the file to inspect the column names and one sample row.
! head -2 ../data/apr_2024.csv

Departure,Return,Bike,Electric bike,Departure station,Return station,Membership type,Covered distance (m),Duration (sec.),Departure temperature (C),Return temperature (C),Stopover duration (sec.),Number of stopovers
2024-05-01 0:00,2024-05-01 0:00,30250,TRUE,0002 Burrard Station (Melville & Dunsmuir),0266 St Catherines & 7th,365 Corporate Plus,4322,1064,7,11,0,0


In [66]:
def convert_to_int(df, columns):
    """Coerce the given columns to integers, dropping rows that cannot be converted.

    Each column named in ``columns`` is parsed with
    ``pd.to_numeric(errors='coerce')``, so unparseable values become NaN.
    Rows holding NaN in any of those columns are removed, and the columns are
    then cast to ``int``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is modified in place (columns are reassigned and
        rows are dropped).
    columns : iterable of str
        Names of the columns to convert.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for convenience.
    """
    # Use the argument rather than a notebook-level global, and materialise it
    # once because it is iterated several times below.
    columns = list(columns)

    for col in columns:
        df[col] = pd.to_numeric(df[col], errors='coerce')

    df.dropna(subset=columns, inplace=True)

    for col in columns:
        # Fractional values are truncated by the integer cast.
        df[col] = df[col].astype(int)
    return df

In [87]:
# Load the raw CSV and preview the first two rows.
df = pd.read_csv("../data/apr_2024.csv")
df.head(2)

Unnamed: 0,Departure,Return,Bike,Electric bike,Departure station,Return station,Membership type,Covered distance (m),Duration (sec.),Departure temperature (C),Return temperature (C),Stopover duration (sec.),Number of stopovers
0,2024-05-01 0:00,2024-05-01 0:00,30250.0,True,0002 Burrard Station (Melville & Dunsmuir),0266 St Catherines & 7th,365 Corporate Plus,4322.0,1064,7,11,0,0
1,2024-05-01 0:00,2024-05-01 0:00,30643.0,True,0310 Jervis & Robson,0190 Melville & Bute,365 Day Pass Plus,501.0,108,5,9,0,0


In [88]:
def map_membership_type(membership_type):
    """Collapse a raw membership label into a coarse category.

    Missing values map to 'Unknown'. Otherwise the label is checked against
    an ordered list of substring rules and the first match wins; labels that
    match no rule map to 'Other'.
    """
    if pd.isna(membership_type):
        return 'Unknown'

    # Order matters: earlier rules take precedence over later ones.
    substring_rules = (
        (('365',), 'Annual Pass'),
        (('30 Day', 'Monthly'), 'Monthly Pass'),
        (('Community Pass',), 'Community Pass'),
        (('Pay Per Ride',), 'Pay Per Ride'),
    )
    for needles, category in substring_rules:
        if any(needle in membership_type for needle in needles):
            return category
    return 'Other'

In [162]:
# Build the modelling table from the raw trip export: clean the numeric
# columns, derive time and membership features, filter on trip duration, and
# keep only the model inputs plus the `duration_min` target.
df = pd.read_csv("../data/apr_2024.csv")

columns_to_convert = ['Bike', 'Covered distance (m)']

# Exclusive upper bound on trip length in minutes.
# 270 is 99.5 percentile of Apr 2024 data
MAX_DURATION_MIN = 270

df = convert_to_int(df, columns_to_convert)
df['Departure'] = pd.to_datetime(df['Departure'], errors='coerce', format='%Y-%m-%d %H:%M')
# errors='coerce' turns unparseable timestamps into NaT, which would surface
# as a NaN departure_hour and break the model fit; drop such rows up front.
# .copy() yields an independent frame so the column assignments below do not
# operate on a slice of the previous one.
df = df.dropna(subset=['Departure']).copy()

df['departure_day'] = df['Departure'].dt.date
df['departure_hour'] = df['Departure'].dt.hour
df['membership_cat'] = df['Membership type'].apply(map_membership_type)

# One-hot encode the membership category and store every flag as 0/1.
df = pd.get_dummies(df, columns=["membership_cat"])
boolean_columns = [x for x in df.columns if x.startswith("membership_cat")] + ["Electric bike"]
df[boolean_columns] = df[boolean_columns].astype(int)
df["duration_min"] = (df["Duration (sec.)"]/60).astype(int)

# Columns that are not used as model features.
unused_columns = ["Departure", "Return",
                  "Return station", "Return temperature (C)",
                  "Number of stopovers", "Stopover duration (sec.)",
                  "Membership type", "Covered distance (m)"]
# drop for now, might use in future version
future_columns = ["Departure station", "departure_day", "Bike", "Duration (sec.)"]

# Filter and drop without inplace=True so the cell always rebinds `df` to a
# fresh frame instead of mutating a filtered slice.
df = (
    df.query("duration_min > 0 and duration_min < @MAX_DURATION_MIN")
      .drop(columns=unused_columns + future_columns)
)

In [163]:
# Preview the first 100 rows of the prepared table.
df.head(100)

Unnamed: 0,Electric bike,Departure temperature (C),departure_hour,membership_cat_Annual Pass,membership_cat_Community Pass,membership_cat_Monthly Pass,membership_cat_Other,membership_cat_Pay Per Ride,membership_cat_Unknown,duration_min
0,1,7,0,1,0,0,0,0,0,17
1,1,5,0,1,0,0,0,0,0,1
2,0,10,0,1,0,0,0,0,0,2
3,0,9,0,1,0,0,0,0,0,6
4,1,6,0,0,0,1,0,0,0,8
...,...,...,...,...,...,...,...,...,...,...
98,0,11,23,1,0,0,0,0,0,10
99,0,12,23,0,0,0,0,1,0,14
100,1,7,23,0,0,0,1,0,0,15
101,1,7,22,0,0,0,0,1,0,7


In [164]:
# Features are every column except the target.
X = df.drop(["duration_min"], axis=1)
# Double brackets keep the target as a single-column DataFrame.
y = df[["duration_min"]]

In [166]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [167]:
# Check the dimensions of the train/test splits.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((74024, 9), (18507, 9), (74024, 1), (18507, 1))

In [168]:
# Fit a linear regression with default settings on the training split.
model = LinearRegression()
model.fit(X_train, y_train)

In [171]:
# Predict on the held-out split and score the predictions against y_test.
y_pred = model.predict(X_test)

# The target is duration_min, so MAE is in minutes and MSE in minutes squared.
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"Mean Absolute Error: {mae}")
print(f"R^2 Score: {r2}")

Mean Squared Error: 368.65356515651763
Mean Absolute Error: 11.868483483644455
R^2 Score: 0.15915447558325857
