In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# Mount the user's Google Drive into the Colab filesystem at /content/drive so the
# project folder and its data files can be read and written like local files.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [82]:
import pandas as pd
import warnings
# NOTE(review): this installs a blanket "ignore" filter, so every warning category is
# silenced for the rest of the session -- including pandas' SettingWithCopyWarning and
# scikit-learn convergence warnings. Consider narrowing it to specific categories.
warnings.filterwarnings('ignore')
import numpy as np

In [4]:
import os

# Folder (relative to "My Drive") that holds this project's notebooks and the data/ directory.
GOOGLE_DRIVE_PATH_AFTER_MYDRIVE = 'MadData Hackathon/'
# Anchor the path at the mount point passed to drive.mount() so it resolves regardless of
# the kernel's current working directory (the relative 'drive/...' form only works when
# the working directory happens to be /content).
GOOGLE_DRIVE_PATH = os.path.join('/content/drive', 'My Drive', GOOGLE_DRIVE_PATH_AFTER_MYDRIVE)
# Sanity check: listing the folder confirms that the mount succeeded and the path is right.
print(os.listdir(GOOGLE_DRIVE_PATH))

['data', 'Data Preprocessing + Initial EDA.ipynb', 'Machine Learning Model.ipynb']


### Utility Functions

In [5]:
def one_hot_encode(df, col_name):
    """Replace a categorical column with one-hot indicator columns.

    Rows whose value in ``col_name`` is missing are removed first. One indicator
    column per remaining category is appended to the right of the frame, named
    ``<col_name>_<category>``, and the original column is dropped.

    Args:
        df: Input DataFrame; it is not modified.
        col_name: Name of the categorical column to encode.

    Returns:
        A new DataFrame without ``col_name`` and with the indicator columns appended.
    """
    # Only rows that actually have a category can be encoded.
    rows_with_value = df.dropna(subset=[col_name])

    # One indicator column per distinct category, prefixed with the source column name.
    indicators = pd.get_dummies(rows_with_value[col_name], prefix=col_name)

    # Append the indicators on the right, then discard the now-redundant source column.
    return pd.concat([rows_with_value, indicators], axis=1).drop(columns=col_name)

## Load Dataframe

In [12]:
# Location of the crash dataset inside the project folder on Drive
file_path = os.path.join(GOOGLE_DRIVE_PATH, 'data/data_with_zipcode.csv')

# Load the CSV into a pandas DataFrame; df_og is kept as the untouched original
df_og = pd.read_csv(filepath_or_buffer=file_path)



In [28]:
# Columns that are not used as model features; removed from a copy so df_og stays intact
columns_to_remove = [
    "cluster",
    "DEVICE_CONDITION",
    "CRASH_DATE",
    "DOORING_I",
    "STREET_DIRECTION",
    "LOCATION",
    "ALIGNMENT",
    "PRIM_CONTRIBUTORY_CAUSE",
    "INTERSECTION_RELATED_I",
    "STREET_NO",
    "STREET_NAME",
]
df = df_og.drop(columns=columns_to_remove)


## Feature Extraction + Data Preprocessing

First, convert the categorical variables into one-hot encodings.

In [29]:
# One-hot encode each categorical feature in turn (same order as listed, so the
# resulting indicator columns are appended in that order).
for categorical_col in [
    "TRAFFIC_CONTROL_DEVICE",
    "ROADWAY_SURFACE_COND",
    "WEATHER_CONDITION",
    "LIGHTING_CONDITION",
    "AM_PM",
    "TRAFFICWAY_TYPE",
]:
    df = one_hot_encode(df, categorical_col)

In [30]:
# Drop the indicator columns that only encode the "UNKNOWN" category
unknown_indicator_cols = [
    "LIGHTING_CONDITION_UNKNOWN",
    "ROADWAY_SURFACE_COND_UNKNOWN",
    "WEATHER_CONDITION_UNKNOWN",
    "TRAFFICWAY_TYPE_UNKNOWN",
]
df = df.drop(columns=unknown_indicator_cols)

#### Add the target column (DAMAGE, the value the models predict)

In [31]:
# Ordinal encoding of the damage bracket: a larger number means a costlier crash
damage_dict = {"OVER $1,500":3, '$501 - $1,500':2, '$500 OR LESS':1}
# Take DAMAGE out of the frame, translate its labels through the dictionary
# (labels missing from the dictionary become NaN) ...
column_to_move = df.pop('DAMAGE').map(damage_dict)
# ... and append it again so that it ends up as the last column of the DataFrame
df['DAMAGE'] = column_to_move

In [39]:
# Keep only complete rows: any row with a missing value in any column is discarded
df = df.dropna(axis=0, how='any')
                                     

# MACHINE LEARNING

In [41]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Build the train/validation/test split in this cell so it runs on a fresh kernel;
# previously X_train / y_train were only defined by a cell further down the notebook.
# Features are every column except the last; the label (DAMAGE) is the last column.
X = df.iloc[:, :-1]
y = df.iloc[:, -1]
# 80/20 (train+val vs test), then 75/25 of the remainder -> 60/20/20 overall.
# Same test sizes and seed as the model-comparison cell, so both cells see identical splits.
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=42)

# Baseline: logistic regression with default hyper-parameters, fitted on the training split
model = LogisticRegression()
model.fit(X_train, y_train)


LogisticRegression()

In [43]:
from sklearn.metrics import accuracy_score

# Score the fitted model on the validation split (fraction of exactly matching labels)
y_val_pred = model.predict(X_val)
accuracy = accuracy_score(y_true=y_val, y_pred=y_val_pred)
accuracy

0.6042900049094637

In [44]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

# Split the data into training, validation, and test sets.
# Features are every column except the last; the label (DAMAGE) is the last column.
X = df.iloc[:, :-1]
y = df.iloc[:, -1]
# 80/20 (train+val vs test), then 75/25 of the remainder -> 60/20/20 overall.
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=42)

# Candidate models, all with default hyper-parameters. The random forest is seeded so
# that its cross-validation scores are reproducible from run to run.
models = [
    LogisticRegression(),
    GaussianNB(),
    RandomForestClassifier(random_state=42),
    xgb.XGBClassifier(),
]

# Loop through the models and perform cross-validation for each.
# The loop variable is deliberately not called `model`, so an already-fitted `model`
# from another cell is not overwritten by the last candidate.
results = []
for candidate in models:
    # Perform 5-fold cross-validation on the train+validation portion only;
    # the test split is not touched here.
    scores = cross_val_score(candidate, X_train_val, y_train_val, cv=5)
    # Record the mean and standard deviation of the fold scores
    results.append({
        'model': type(candidate).__name__,
        'mean_score': scores.mean(),
        'std_score': scores.std(),
    })

# Convert the results list to a DataFrame
results_df = pd.DataFrame(results)

# Display the results
print(results_df)

                    model  mean_score  std_score
0      LogisticRegression    0.607346   0.000006
1              GaussianNB    0.370861   0.008214
2  RandomForestClassifier    0.613537   0.000820
3           XGBClassifier    0.609387   0.000173


## Make Predictions for Financial Cost incurred due to crashes in different Chicago Zipcodes

### Train RandomForestClassifier Model

In [63]:
# Features are every column except the last; the label (DAMAGE) is the last column.
X = df.iloc[:, :-1]
y = df.iloc[:, -1]
# Single 80/20 train/test split (no separate validation set for the final model).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Seed the forest: its predictions feed the per-zipcode table that is exported below,
# so an unseeded model would produce a different export on every run.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

RandomForestClassifier()

In [64]:
# Score the fitted forest on the held-out test split (fraction of exactly matching labels)
y_test_pred = model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_test_pred)
accuracy

0.6155240130533977

In [None]:
# Save model
# `dump` was never imported anywhere in the notebook, so this cell raised NameError;
# the import lives in the cell to keep it self-contained.
from joblib import dump

# Writes the fitted estimator to the current working directory.
dump(model, 'cost_frm_crash_estimator.joblib')

In [65]:
# Get Predictions for all of Chicago

In [71]:
# Predict a damage class for every row of X. X is the full feature matrix that was split
# into X_train / X_test above, so this includes the rows the model was fitted on.
model_pred = model.predict(X)

In [77]:
# Take an explicit copy of the zipcode column: assigning a new column to a bare slice of
# `df` is the chained-assignment pattern that pandas flags with SettingWithCopyWarning.
new_df = df[["zipcode"]].copy()
# model_pred has one entry per row of df (it was predicted from df's feature columns),
# so it lines up with new_df row by row.
new_df["cost_pred"] = model_pred


In [87]:
# How many rows each zipcode has, broadcast back onto every row of that zipcode
zip_series = new_df['zipcode']
counts = zip_series.value_counts()
new_df['count'] = zip_series.map(counts)

new_df

Unnamed: 0,zipcode,cost_pred,count
0,60696,3,20324
1,60630,3,15635
2,60644,3,28934
3,60652,2,26812
4,60622,3,19632
...,...,...,...
694033,60612,2,21941
694034,60406,3,3957
694035,60660,3,12084
694036,60605,3,15237


In [80]:
# Average predicted damage class within each zipcode (Series indexed by zipcode)
zipcode_cost = new_df.groupby('zipcode')['cost_pred'].mean()

# Turn the Series into a one-column DataFrame and move 'zipcode' from the index
# back into an ordinary column
zipcode_cost_df = zipcode_cost.to_frame().reset_index()
zipcode_cost_df

Unnamed: 0,zipcode,cost_pred
0,60406,2.65858
1,60605,2.547352
2,60607,2.561621
3,60608,2.556634
4,60609,2.674964
5,60611,2.470811
6,60612,2.561232
7,60613,2.469791
8,60614,2.457492
9,60615,2.594656


In [90]:
# Per-zipcode aggregation of the predicted damage class; the result is what gets exported.
# NOTE(review): 'count' holds, for every row, the number of rows in that row's zipcode
# (value_counts mapped back onto the rows), so it is identical for all rows of a group.
# The weights are therefore uniform within each group and the "weighted" average below
# equals the plain per-zipcode mean of cost_pred.
# group by 'zipcode' and aggregate using weighted average
weighted_avg = new_df.groupby('zipcode').agg(
    cost_pred_weighted_avg=('cost_pred', lambda x: np.average(x, weights=new_df.loc[x.index, 'count'])),
    # NOTE(review): summing a per-row count of n over the n rows of a group yields n**2,
    # not the number of crashes in the zipcode.
    total_count=('count', 'sum')
)

# reset index
weighted_avg = weighted_avg.reset_index()

# calculate the average weighted cost
# NOTE(review): this divides the per-zipcode mean by total_count (n**2, see above), so the
# exported value shrinks with the square of the zipcode's row count -- confirm that this
# is the intended metric before relying on the CSV.
weighted_avg['avg_weighted_cost'] = weighted_avg['cost_pred_weighted_avg'] / weighted_avg['total_count']

# drop unnecessary columns
weighted_avg = weighted_avg[['zipcode', 'avg_weighted_cost']]





In [92]:
# Write the per-zipcode table next to the input data on Drive, without the index column
file_path_to_save = os.path.join(GOOGLE_DRIVE_PATH, 'data/zipcode_and_crash_cost.csv')
weighted_avg.to_csv(path_or_buf=file_path_to_save, index=False)

In [94]:
# Inspect the column dtypes of the exported table
weighted_avg.dtypes

zipcode                int64
avg_weighted_cost    float64
dtype: object