# Analysis of Traffic Collision Patterns: Time, Weather and Road Factors

### Step 1: TorontoCollisions Data Loading from CSV to python dataframe: 

In [3]:
#import required libraries
import pandas as pd
import csv

import mysql.connector
from mysql.connector import errorcode
#!pip install tabulate
#from tabulate import tabulate

import matplotlib.pyplot as plt

#!pip install SQLAlchemy pymysql
from sqlalchemy import create_engine

import re

In [5]:
# TorontoCollisions
url_TorontoCollisions = "https://raw.githubusercontent.com/anithamonica/DATA604_TrafficIncidents/main/datasets/TorontoCollisions.csv"
colNamesColsnToronto = ['Month', 'Year', 'RoadAlignment', 'Weather',  
              'NoofCollisions', 'NoofVehicles', 'NoofPersons', 'NoofInjured', 'NoofFatalities']
dfToronto = pd.read_csv(url_TorontoCollisions, skiprows=3, names = colNamesColsnToronto)
display(dfToronto.head())


HTTPError: HTTP Error 404: Not Found

In [11]:
print("----------------------------------------------------------------------------")
print("\033[1m"+"Data Analysis of TorontoCollisions"+"\033[0m")
print("----------------------------------------------------------------------------")

#display shape, columns, and data types
print("1.\tShape of the TorontoCollisions dataset:", dfToronto.shape)
print("2.\tNumber of records or rows of the DataFrame:", dfToronto.shape[0])
print("3.\tColumns and Data types of each column:\n", dfToronto.dtypes)
dfTorontoLength = len(dfToronto)

----------------------------------------------------------------------------
[1mData Analysis of CollisionByWeatherAndRoad[0m
----------------------------------------------------------------------------
1.	Shape of the CollisionByWeatherAndRoad dataset: (11232, 9)
2.	Number of records or rows of the DataFrame: 11232
3.	Columns and Data types of each column:
 Month             object
Year               int64
RoadAlignment     object
Weather           object
NoofCollisions     int64
NoofVehicles       int64
NoofPersons        int64
NoofInjured        int64
NoofFatalities     int64
dtype: object


In [13]:
#Inspecting data for TorontoCollisions Dataframe
missingDataSum = dfToronto.isna().sum()
missingDataPercentage = (dfToronto.isnull().mean() * 100).round(2)
missingData = pd.DataFrame({
    "Missing Count": missingDataSum,
    "Missing Percentage": missingDataPercentage
})

pd.options.display.float_format = '{:.2f}'.format
print("\n\033[1m"+"Missing Count per column:"+"\033[0m")
#print(tabulate(missingData, headers='keys', tablefmt='fancy_grid'))
display(missingData)

print("Unique values")
print("----------------------------------------------------------------------------")
print(dfToronto['Month'].unique())
print(dfToronto['Year'].unique())
print(dfToronto['RoadAlignment'].unique())
print(dfToronto['Weather'].unique())


[1mMissing Count per column:[0m


Unnamed: 0,Missing Count,Missing Percentage
Month,0,0.0
Year,0,0.0
RoadAlignment,0,0.0
Weather,0,0.0
NoofCollisions,0,0.0
NoofVehicles,0,0.0
NoofPersons,0,0.0
NoofInjured,0,0.0
NoofFatalities,0,0.0


Unique values
----------------------------------------------------------------------------
['January' 'February' 'March' 'April' 'May' 'June' 'July' 'August'
 'September' 'October' 'November' 'December' 'Unknown']
[2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021]
['Straight and level' 'Straight with gradient' 'Curved and level'
 'Curved with gradient' 'Top of hill or gradient'
 'Bottom of hill or gradient' 'Other' 'Unknown']
['Clear and sunny' 'Overcast, cloudy but no precipitation' 'Raining'
 'Snowing, not including drifting snow' 'Freezing rain, sleet, hail'
 'Visibility limitation' 'Strong wind' 'Other' 'Unknown']


### Step 2: TorontoCollisions Data Cleaning and preprocessing 

In [15]:
print(len(dfToronto[(dfToronto['NoofCollisions'] == 0
      & (dfToronto['NoofInjured'] != 0 |
         (dfToronto['NoofFatalities'] != 0)))]))

3079


In [17]:
#Drop invalid records
#----------------------------------------------------------------------------

#Below are the records to be dropped
print("Below are the records to be dropped")
print("----------------------------------------------------------------------------")
print("Count of records with NoofCollisions = 0 is:", len(dfToronto[dfToronto['NoofCollisions'] == 0]))

#Drop the records with collisionCount = 0 => invalid records
dfTorontoLength = len(dfToronto)
dfToronto = dfToronto[dfToronto['NoofCollisions'] != 0]
newdfTorontoLen = len(dfToronto)
deleteCount = dfTorontoLength - newdfTorontoLen
print(f"Deleted {deleteCount} records where NoofCollisions != 0")
print("Number of records after deleting with collisions count != 0 criteria is:", newdfTorontoLen)
#display(dfToronto[dfToronto['NoofCollisions'] == 0])

Below are the records to be dropped
----------------------------------------------------------------------------
Count of records with NoofCollisions = 0 is: 3079
Deleted 3079 records where NoofCollisions != 0
Number of records after deleting with collisions count != 0 criteria is: 8153


In [29]:
#Impute missing values
#----------------------------------------------------------------------------

ambiguousValues = [
    'Choice is other than the preceding values',
    'Unknown',
    'Jurisdiction does not provide this data element'
]

#Below are the records to be imputed
print("\nBelow are the records to be imputed")
print("----------------------------------------------------------------------------")
print("Count of records with Month = 'Unknown' is :", len(dfToronto[dfToronto['Month'] == 'Unknown']))
print("Count of records with RoadAlignment = 'Unknown' is:", dfToronto['RoadAlignment'].isin(ambiguousValues).sum())
print("Count of records with Weather = 'Unknown' is:", dfToronto['Weather'].isin(ambiguousValues).sum())


#Impute Month with most frequent value of Month
modeMonth = dfToronto['Month'].mode()[0]
dfToronto['Month'] = dfToronto['Month'].apply(lambda x: modeMonth if x == 'Unknown' else x)
print("\nImputed Month with it's mode: ", modeMonth)

#Impute RoadClass with most frequent value of RoadClass
modeRoadAlignment = dfToronto['RoadAlignment'].mode()[0]
dfToronto['RoadAlignment'] = dfToronto['RoadAlignment'].apply(lambda x: modeRoadAlignment if x == 'Unknown' else x)
print("Imputed RoadAlignment with it's mode: ", modeRoadAlignment)

#Impute Pedestrains with most frequent value of Pedestrains
modeWeather = dfToronto['Weather'].mode()[0]
dfToronto['Weather'] = dfToronto['Weather'].apply(lambda x: modeWeather if x == 'Unknown' else x)
print("Imputed Weather with it's mode: ", modeWeather)      


print("\nCount of records with Month = 'Unknown' is :", len(dfToronto[dfToronto['Month'] == 'Unknown']))
print("Count of records with RoadAlignment = 'Unknown' is:", dfToronto['RoadAlignment'].isin(ambiguousValues).sum())
print("Count of records with Weather = 'Unknown' is:", dfToronto['Weather'].isin(ambiguousValues).sum())
print("\n",dfToronto['Month'].unique())
print(dfToronto['Year'].unique())
print(dfToronto['RoadAlignment'].unique())
print(dfToronto['Weather'].unique())


Below are the records to be imputed
----------------------------------------------------------------------------
Count of records with Month = 'Unknown' is : 39
Count of records with RoadAlignment = 'Unknown' is: 1119
Count of records with Weather = 'Unknown' is: 1018

Imputed Month with it's mode:  December
Imputed RoadAlignment with it's mode:  Straight and level
Imputed Weather with it's mode:  Clear and sunny

Count of records with Month = 'Unknown' is : 0
Count of records with RoadAlignment = 'Unknown' is: 0
Count of records with Weather = 'Unknown' is: 0

 ['January' 'February' 'March' 'April' 'May' 'June' 'July' 'August'
 'September' 'October' 'November' 'December']
[2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021]
['Straight and level' 'Straight with gradient' 'Curved and level'
 'Curved with gradient' 'Top of hill or gradient'
 'Bottom of hill or gradient' 'Other']
['Clear and sunny' 'Overcast, cloudy but no precipitation' 'Raining'
 'Snowing, not including drifti

In [31]:
#Introduce new Column : Season
#----------------------------------------------------------------------------

# Mapping from month name to season
monthToSeason = {
    'December': 'Winter', 'January': 'Winter', 'February': 'Winter',
    'March': 'Spring', 'April': 'Spring', 'May': 'Spring',
    'June': 'Summer', 'July': 'Summer', 'August': 'Summer',
    'September': 'Fall', 'October': 'Fall', 'November': 'Fall'
}
dfToronto['Season'] = dfToronto['Month'].map(monthToSeason)
print("Column 'Season' is inserted")
display(dfToronto.head(5))

Column 'Season' is inserted


Unnamed: 0,Month,Year,RoadAlignment,Weather,NoofCollisions,NoofVehicles,NoofPersons,NoofInjured,NoofFatalities,Season
0,January,2010,Straight and level,Clear and sunny,4166,7746,11765,5726,60,Winter
1,January,2010,Straight and level,"Overcast, cloudy but no precipitation",874,1536,2206,1153,13,Winter
2,January,2010,Straight and level,Raining,425,723,1113,565,3,Winter
3,January,2010,Straight and level,"Snowing, not including drifting snow",928,1537,2297,1275,8,Winter
4,January,2010,Straight and level,"Freezing rain, sleet, hail",74,109,165,102,2,Winter


### Step 3: Import cleaned TorontoCollisions dataset to MySQL database

In [None]:
USER = "student"
DB   = "student"
    
# attempt a connection
myconnection = mysql.connector.connect(user=USER, 
                                       password='Bi3KSjqgrNOOL',
                                       host='127.0.0.1', 
                                       port=3306,
                                       database=DB,
                                       allow_local_infile=True)
myconnection

In [None]:
# Create table TorontoCollisions
queryCreate = '''CREATE TABLE student.TorontoCollisions(
    CollisionMonth varchar(15) NOT NULL,
    CollisionYear int, 
    RoadAlignment varchar(25),
    Weather varchar(25),
    NoofCollisions int,
    NoofVehicles int,
    NoofPersons int,
    NoofInjured int,
    NoofFatalities int,
    Season varchar(10));'''

createCursor = myconnection.cursor()
try:
    createCursor.execute(queryCreate)
    print("TorontoCollisions table created successfully!")
except mysql.connector.Error as err:
    if err.errno == errorcode.ER_TABLE_EXISTS_ERROR:
        print("Error! Table TorontoCollisions is already created.")
    else:
        print(err.msg)

createCursor.close()

In [None]:
# Insert into table TorontoCollisions
insertCursor = myconnection.cursor()

try:
    myconnection.start_transaction()

    for i, currentRow in dfToronto.iterrows():
        insertCommand = "INSERT INTO `TorontoCollisions` VALUES (" + "%s," * (len(currentRow) - 1) + "%s)"
        insertCursor.execute(insertCommand, tuple(currentRow))

    myconnection.commit()
    print("TorontoCollisions: Inserted records successfully")

except Exception as e:
    myconnection.rollback()
    print("Transaction rolled back due to error:", e)

finally:
    insertCursor.close()

In [None]:
read_cursor = myconnection.cursor(buffered=True, dictionary=True)
query_string = ("SELECT COUNT(*) FROM TorontoCollisions;")
read_cursor.execute(query_string)

for (library_value) in read_cursor:
    print(library_value)
read_cursor.close()
myconnection.close()

### Step 4: TorontoCollisions data analysis and visualization

 - Do Toronto’s peak hours/months/day of weeks differ from overall patterns?
 - Compare weather-impacted months (do Toronto’s weather collisions align with national averages?)
 - Are Toronto’s urban road class collisions match national distributions?




In [None]:
#Which season impact collision frequency and severity? 
queryMonthly = '''SELECT 
    Weather,
    RoadAlignment,
    SUM(NoofCollisions) AS TotalCollisions,
    ROUND(SUM(NoofInjured) * 1.0 / SUM(NoofCollisions), 2) AS AvgInjuriesPerCollision,
    ROUND(SUM(NoofFatalities) * 1.0 / SUM(NoofCollisions), 4) AS FatalityRate
FROM 
    student.TorontoCollisions
WHERE 
    NoofCollisions > 0
GROUP BY 
    Weather, RoadAlignment
ORDER BY 
    FatalityRate DESC, AvgInjuriesPerCollision DESC;'''
result = pd.read_sql_query(queryMonthly, engine)
display(result)

# Pivot so each SpeedLimit shows Urban and Rural side-by-side
pivot_df = result.pivot(index='SpeedValue', columns='RoadClass', values='InjurySeverity')

# Plot
pivot_df.plot(kind='bar', figsize=(14, 6))
plt.title('Injury Severity by Speed Limit and Road Class')
plt.ylabel('Injury Severity')
plt.xticks(rotation=45)
plt.ylim(bottom=1.0) 
plt.legend(title='Road Class', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()

In [None]:
engine.dispose()