# Final Project
Abigail Allen  
Josh Urry  
Trevor Jex

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Read the crash case CSV export into the working DataFrame
csv_path = "Motor_Vehicle_Crashes_-_Case_Information__Three_Year_Window.csv"
df = pd.read_csv(csv_path)

In [None]:
# Preview the first five rows of the raw data
df.head(5)

In [None]:
# Number of rows (samples) in the dataset
df.shape[0]

## Preprocessing

In [None]:
# Count the missing values in each column
df.isnull().sum()

Let's drop the "DOT Reference Marker Location" variable, since the majority of accidents are missing it. For Municipality, we could either remove the rows where it is blank, or drop that variable as well and just use the County.

In [None]:
# Remove the marker-location column (mostly missing) from df in place
df.drop(columns=["DOT Reference Marker Location"], inplace=True)

Let's figure out how to handle the date variables.

In [None]:
# Distinct values present in the Year column
df["Year"].unique()

We could recode these as 0, 1, 2, 3, and 4, giving a numeric variable that represents "years since 2017".

In [None]:
# Encode Year as "years since 2017" (2017 -> 0, ..., 2021 -> 4).
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on df.Year: that chained in-place call operates on
# an intermediate Series, which pandas 2.2+ warns about and which leaves df
# unchanged when Copy-on-Write is enabled.
year_offsets = {2017: 0, 2018: 1, 2019: 2, 2020: 3, 2021: 4}
df["Year"] = df["Year"].replace(year_offsets)

Let's convert time and date to date time objects and extract the minutes, hours, month, and day

In [None]:
# Parse the Time and Date columns into datetime values.
# `infer_datetime_format` is no longer passed: it is deprecated since pandas 2.0
# (a strict version of that inference is now the default) and only triggers a
# warning.
df["Time"] = pd.to_datetime(df["Time"])
df["Date"] = pd.to_datetime(df["Date"])

In [None]:
# Derive calendar and clock features from the parsed Date and Time columns
date_fields = df["Date"].dt
time_fields = df["Time"].dt
df["month"] = date_fields.month
df["day"] = date_fields.day
df["hour"] = time_fields.hour
df["minute"] = time_fields.minute  # likely to be dropped later

In [None]:
# The raw Date and Time columns are no longer needed once their parts are extracted
df.drop(columns=["Date", "Time"], inplace=True)

In [None]:
# Convert day of the week to numeric codes (Sunday = 0 ... Saturday = 6).
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on df["Day of Week"]: that chained in-place call
# operates on an intermediate Series, which pandas 2.2+ warns about and which
# leaves df unchanged when Copy-on-Write is enabled.
weekday_codes = {
    "Sunday": 0,
    "Monday": 1,
    "Tuesday": 2,
    "Wednesday": 3,
    "Thursday": 4,
    "Friday": 5,
    "Saturday": 6,
}
df["Day of Week"] = df["Day of Week"].replace(weekday_codes)

In [None]:
# Preview the first five rows after the date/time preprocessing
df.head(5)

Let's figure out the target variable

In [None]:
# Distinct values of the Event Descriptor column
pd.unique(df["Event Descriptor"])

In [None]:
# Distinct values of the Crash Descriptor column
pd.unique(df["Crash Descriptor"])

Maybe we just use the classes of the "Crash Descriptor" variable as the target? That would definitely be a lot easier; the "Event Descriptor" variable seems like it would be pretty messy to deal with.

## Data Visualization

### Accidents over time (a chart that counts the `Time` column has to run before `Time` is dropped)

In [None]:
# Line plot of the number of accidents recorded for each Year value.
# Rows are counted with size() rather than by counting the "Time" column:
# "Time" is dropped during preprocessing, so selecting it here raises a KeyError
# when the notebook is run top to bottom. Note that size() counts every row,
# including any whose Time was missing.
accidents_per_year = df.groupby('Year').size()
ax = accidents_per_year.plot()
ax.set_xlabel('Year')
ax.set_ylabel('Number of Accidents')
ax.set_title('Number of Accidents over Time')
plt.show()

----------------------------------------------------------------------------

In [None]:
# Bar chart of the number of accidents for each Year value, in ascending Year order
counts = df['Year'].value_counts().sort_index()
ax = counts.plot.bar()
ax.set(xlabel='Year', ylabel='Count', title='Accidents per Year')
plt.show()

In [None]:
# Bar chart of the number of accidents for each day of the week, ordered by day code
counts = df['Day of Week'].value_counts().sort_index()
ax = counts.plot(kind='bar')
ax.set_xlabel('Week Day')
ax.set_ylabel('Count')
ax.set_title('Accidents per Week Day')
plt.show()

In [None]:
# Bar chart of the 10 counties with the most accidents
# (value_counts sorts by frequency, so head(10) keeps the largest counts)
counts = df['County Name'].value_counts().head(10)
ax = counts.plot.bar()
ax.set(xlabel='County', ylabel='Count', title='Accidents per County')
plt.show()

In [None]:
# Bar chart of the 10 most frequent event descriptors
counts = df['Event Descriptor'].value_counts().head(10)
ax = counts.plot(kind='bar')
ax.set_xlabel('Event Descriptor')
ax.set_ylabel('Count')
ax.set_title('Event Descriptor Counts')
plt.show()

In [None]:
# Bar chart of the number of accidents for each crash descriptor
counts = df['Crash Descriptor'].value_counts()
ax = counts.plot.bar()
ax.set(xlabel='Accident Descriptor', ylabel='Count', title='Accident Descriptor Counts')
plt.show()

In [None]:
# Heatmap of crash-descriptor counts for each county.
# pd.crosstab counts the rows for every (County Name, Crash Descriptor) pair and
# puts 0 where a pair never occurs. This replaces a pivot_table call that
# referenced `data`, a name not defined in the cells above (NameError), and that
# passed 'Crash Descriptor' as both `values` and `columns`.
matrix = pd.crosstab(df['County Name'], df['Crash Descriptor'])

sns.heatmap(matrix, cmap='Blues')
plt.title('Crash Descriptor Counts by County')
plt.xlabel('Crash Descriptor')
plt.ylabel('County')
plt.show()

In [None]:
# Stacked bar charts of lighting, weather, road surface, and traffic control device,
# with each bar split by 'Collision Type Descriptor'
conditions = [
    'Lighting Conditions',
    'Weather Conditions',
    'Road Surface Conditions',
    'Traffic Control Device',
]

# Build one pivot table and draw one figure per condition column
for condition in conditions:
    # Count the non-null 'Year' entries for each (condition value, collision type) pair
    pivot_table = df.pivot_table(
        values='Year',
        index=condition,
        columns='Collision Type Descriptor',
        aggfunc='count',
    )
    ax = pivot_table.plot.bar(stacked=True, figsize=(10, 6))
    ax.set(
        title=f'Distribution of Crash Types by {condition}',
        xlabel=condition,
        ylabel='Number of Accidents',
    )
    plt.show()

Visualizations for the machine learning portion: confusion matrix, precision-recall curve, and possibly a decision tree if applicable (Abbey can do).