In [1]:
#| echo: false
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import make_column_selector, ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import Lasso, LogisticRegression, Ridge

In [2]:
#| echo: false
# Read Data
b_data = pd.read_csv("https://github.com/bnkessler/DataScienceMachineLearningProject/blob/main/Data/CTA_-_Ridership_-_Bus_Routes_-_Daily_Totals_by_Route_20231020.csv?raw=true")
t_data = pd.read_csv("https://github.com/bnkessler/DataScienceMachineLearningProject/blob/main/Data/CTA_-_Ridership_-__L__Station_Entries_-_Daily_Totals_20231020.csv?raw=true")
a_data = pd.read_csv("/Users/ben/Documents/GitHub/DataScienceMachineLearningProject/Data/Traffic_Crashes_-_Crashes.csv")

In [3]:
# Joining bus and train data
t_data["transit"] = t_data["date"].map(lambda x: "train")
b_data["transit"] = b_data["date"].map(lambda x: "bus")

b_data["rides"] = b_data["rides"].apply(lambda x: x.replace(",", ""))
b_data["rides"] = b_data["rides"].astype(int)

t_data["rides"] = t_data["rides"].apply(lambda x: x.replace(",", ""))
t_data["rides"] = t_data["rides"].astype(int)

b_dr = b_data[["date", "daytype", "rides", "transit"]].groupby("date").sum("rides").reset_index()
t_dr = t_data[["date", "daytype", "rides", "transit"]].groupby("date").sum("rides").reset_index()

b_data = pd.merge(b_dr, b_data[["date", "daytype", "transit"]].drop_duplicates(), on="date")
t_data = pd.merge(t_dr, t_data[["date", "daytype", "transit"]].drop_duplicates(), on = "date")

joined_data = pd.concat([b_data, t_data])

joined_data["daytype"] = joined_data["daytype"].replace({"A": "Saturday", "U": "Sunday/Holiday", "W": "Weekday"})
joined_data["year"] = joined_data["date"].map(lambda x: x[6:10])
joined_data["year"] = joined_data["year"].astype(int)
joined_data["month"] = joined_data["date"].map(lambda x: x[0:2])
joined_data["month"] = joined_data["month"].astype(int)

In [4]:
a_data_clean = a_data[["CRASH_RECORD_ID", "CRASH_DATE", "TRAFFIC_CONTROL_DEVICE", "POSTED_SPEED_LIMIT", "WEATHER_CONDITION", "LIGHTING_CONDITION", "TRAFFICWAY_TYPE", "ROAD_DEFECT", "ROADWAY_SURFACE_COND", "CRASH_DAY_OF_WEEK", "CRASH_MONTH"]]

a_data_clean["CRASH_YEAR"] = a_data_clean["CRASH_DATE"].map(lambda x: x[6:10])
a_data_clean["CRASH_YEAR"] = a_data_clean["CRASH_YEAR"].astype(int)
a_data_clean["CRASH_DATE"] = a_data_clean["CRASH_DATE"].map(lambda x: x[0:10])
a_data_clean["CRASH_DATE"] = a_data_clean["CRASH_DATE"].astype(str)

a_data_clean["CRASH_DAY_OF_WEEK"] = a_data_clean["CRASH_DAY_OF_WEEK"].astype(str)
a_data_clean["CRASH_DAY_OF_WEEK"] = a_data_clean["CRASH_DAY_OF_WEEK"].replace({"1": "Sunday/Holiday", "2": "Weekday", "3": "Weekday", "4": "Weekday", "5": "Weekday", "6": "Weekday", "7": "Saturday"})

num_accidents = a_data_clean[["CRASH_DATE", "CRASH_RECORD_ID"]].groupby("CRASH_DATE").count().reset_index()
num_accidents.columns = ["CRASH_DATE", "NUM_ACCIDENTS"]
a_data_clean = pd.merge(a_data_clean, num_accidents, on = "CRASH_DATE")

a_data_filtered = a_data_clean[a_data_clean["CRASH_YEAR"] >= 2017]
joined_data_filtered = joined_data[joined_data['year'] >= 2017]

merged_data = pd.merge(a_data_filtered, joined_data_filtered[["date", "transit", "rides", "daytype"]], left_on='CRASH_DATE', right_on='date', how='inner')
merged_data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  a_data_clean["CRASH_YEAR"] = a_data_clean["CRASH_DATE"].map(lambda x: x[6:10])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  a_data_clean["CRASH_YEAR"] = a_data_clean["CRASH_YEAR"].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  a_data_clean["CRASH_DATE"] = a_data_clean["CRASH_DATE"].ma

Unnamed: 0,CRASH_RECORD_ID,CRASH_DATE,TRAFFIC_CONTROL_DEVICE,POSTED_SPEED_LIMIT,WEATHER_CONDITION,LIGHTING_CONDITION,TRAFFICWAY_TYPE,ROAD_DEFECT,ROADWAY_SURFACE_COND,CRASH_DAY_OF_WEEK,CRASH_MONTH,CRASH_YEAR,NUM_ACCIDENTS,date,transit,rides,daytype
0,9a29d74f70c39201a6c1ccd252169a04b0d128489dd77d...,06/30/2023,NO CONTROLS,15,CLEAR,DARKNESS,PARKING LOT,NO DEFECTS,DRY,Weekday,6,2023,339,06/30/2023,bus,471465,Weekday
1,9a29d74f70c39201a6c1ccd252169a04b0d128489dd77d...,06/30/2023,NO CONTROLS,15,CLEAR,DARKNESS,PARKING LOT,NO DEFECTS,DRY,Weekday,6,2023,339,06/30/2023,train,321367,Weekday
2,0569bdafd9409427eff8b552e94278356635ddb88b8965...,06/30/2023,TRAFFIC SIGNAL,30,CLEAR,"DARKNESS, LIGHTED ROAD",OTHER,NO DEFECTS,DRY,Weekday,6,2023,339,06/30/2023,bus,471465,Weekday
3,0569bdafd9409427eff8b552e94278356635ddb88b8965...,06/30/2023,TRAFFIC SIGNAL,30,CLEAR,"DARKNESS, LIGHTED ROAD",OTHER,NO DEFECTS,DRY,Weekday,6,2023,339,06/30/2023,train,321367,Weekday
4,2353d700d24baf088bfd184193134d97559492951dfb2c...,06/30/2023,UNKNOWN,30,CLEAR,"DARKNESS, LIGHTED ROAD",FOUR WAY,NO DEFECTS,DRY,Weekday,6,2023,339,06/30/2023,bus,471465,Weekday
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1367155,3549c74f4db93e8e75a857adf0a1557a31dab5f1a39e18...,01/01/2017,NO CONTROLS,35,CLEAR,"DARKNESS, LIGHTED ROAD",NOT DIVIDED,NO DEFECTS,DRY,Sunday/Holiday,1,2017,138,01/01/2017,train,237792,Sunday/Holiday
1367156,53ec4bb593f0a5f45e24e3d5d029d2e6cb7be03a4a184e...,01/01/2017,TRAFFIC SIGNAL,30,CLEAR,"DARKNESS, LIGHTED ROAD",ONE-WAY,NO DEFECTS,DRY,Sunday/Holiday,1,2017,138,01/01/2017,bus,277228,Sunday/Holiday
1367157,53ec4bb593f0a5f45e24e3d5d029d2e6cb7be03a4a184e...,01/01/2017,TRAFFIC SIGNAL,30,CLEAR,"DARKNESS, LIGHTED ROAD",ONE-WAY,NO DEFECTS,DRY,Sunday/Holiday,1,2017,138,01/01/2017,train,237792,Sunday/Holiday
1367158,8b9326183843f152a5a573b92533a4bf8d7e3a9ab56e01...,01/01/2017,UNKNOWN,30,UNKNOWN,DARKNESS,NOT DIVIDED,UNKNOWN,UNKNOWN,Sunday/Holiday,1,2017,138,01/01/2017,bus,277228,Sunday/Holiday


In [80]:
merged_data.head()

Unnamed: 0,CRASH_RECORD_ID,CRASH_DATE,TRAFFIC_CONTROL_DEVICE,POSTED_SPEED_LIMIT,WEATHER_CONDITION,LIGHTING_CONDITION,TRAFFICWAY_TYPE,ROAD_DEFECT,ROADWAY_SURFACE_COND,CRASH_DAY_OF_WEEK,CRASH_MONTH,CRASH_YEAR,NUM_ACCIDENTS,date,transit,rides,daytype
0,9a29d74f70c39201a6c1ccd252169a04b0d128489dd77d...,06/30/2023,NO CONTROLS,15,CLEAR,DARKNESS,PARKING LOT,NO DEFECTS,DRY,Weekday,6,2023,339,06/30/2023,bus,471465,Weekday
1,9a29d74f70c39201a6c1ccd252169a04b0d128489dd77d...,06/30/2023,NO CONTROLS,15,CLEAR,DARKNESS,PARKING LOT,NO DEFECTS,DRY,Weekday,6,2023,339,06/30/2023,train,321367,Weekday
2,0569bdafd9409427eff8b552e94278356635ddb88b8965...,06/30/2023,TRAFFIC SIGNAL,30,CLEAR,"DARKNESS, LIGHTED ROAD",OTHER,NO DEFECTS,DRY,Weekday,6,2023,339,06/30/2023,bus,471465,Weekday
3,0569bdafd9409427eff8b552e94278356635ddb88b8965...,06/30/2023,TRAFFIC SIGNAL,30,CLEAR,"DARKNESS, LIGHTED ROAD",OTHER,NO DEFECTS,DRY,Weekday,6,2023,339,06/30/2023,train,321367,Weekday
4,2353d700d24baf088bfd184193134d97559492951dfb2c...,06/30/2023,UNKNOWN,30,CLEAR,"DARKNESS, LIGHTED ROAD",FOUR WAY,NO DEFECTS,DRY,Weekday,6,2023,339,06/30/2023,bus,471465,Weekday


In [5]:
X = merged_data.drop(["CRASH_RECORD_ID", "CRASH_DATE", "CRASH_MONTH", "CRASH_YEAR", "NUM_ACCIDENTS", "date", "daytype"], axis = 1)
y = merged_data["NUM_ACCIDENTS"]

ct = ColumnTransformer([
    ("dummify", OneHotEncoder(sparse_output = False, handle_unknown='ignore', drop="first"), make_column_selector(dtype_include=object)),
    ("standardize", StandardScaler(), make_column_selector(dtype_include=np.number))
])

In [93]:
# Ridge Regression
my_pipeline = Pipeline(
    [
        ("Preprocessing", ct),
        ("Ridge Regression", Ridge())
    ]
)

degrees = {"Ridge Regression__alpha": [1e2, 1e3, 1e4, 1e5, 1e6, 1e7, 1e8, 1e9, 1e10]}

gscv = GridSearchCV(my_pipeline, degrees, cv = 5, scoring='r2')
gscv_fitted = gscv.fit(X, y)
test_scores = gscv_fitted.cv_results_["mean_test_score"]
gscv_fitted.best_estimator_

In [94]:
gscv_fitted.best_estimator_

In [95]:
gscv_fitted.cv_results_["mean_test_score"]

array([-0.08567924, -0.08447924, -0.08033195, -0.08126421, -0.10367944,
       -0.15394666, -0.16935482, -0.17118401, -0.1713703 ])

Ridge gives below zero r^2 values, probably not the good model

In [8]:
# Ridge Regression
my_pipeline = Pipeline(
    [
        ("Preprocessing", ct),
        ("Lasso Regression", Lasso(max_iter = 1000))
    ]
)

degrees = {"Lasso Regression__alpha": [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3, 1e4, 1e5]}

gscv = GridSearchCV(my_pipeline, degrees, cv = 5, scoring='r2')
gscv_fitted = gscv.fit(X, y)
test_scores = gscv_fitted.cv_results_["mean_test_score"]
gscv_fitted.best_estimator_

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


In [10]:
gscv_fitted.best_score_

-0.08560808786145954

In [None]:
my_pipeline = Pipeline(
    [
        ("Preprocessing", ct),
        ("Logistic Regression", LogisticRegression(solver="saga", C=5, l1_ratio=.001, penalty = "elasticnet", max_iter = 1000))
    ]
)

degrees = {"Logistic Regression__l1_ratio": [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 0, 1]}

gscv = GridSearchCV(my_pipeline, degrees, cv = 5, scoring='r2')
gscv_fitted = gscv.fit(X, y)
test_scores = gscv_fitted.cv_results_["mean_test_score"]
gscv_fitted.best_estimator_ 

In [None]:
from sklearn.neighbors import KNeighborsRegressor
my_pipeline = Pipeline(
    [
        ("Preprocessing", ct),
        ("KNN", KNeighborsRegressor())
    ]
)

degrees = {"KNN__n_neighbors": list(range(0,11))}

gscv = GridSearchCV(my_pipeline, degrees, cv = 5, scoring='r2')
gscv_fitted = gscv.fit(X, y)
test_scores = gscv_fitted.cv_results_["mean_test_score"]
gscv_fitted.best_estimator_ 