# Analysis of Traffic Collision Patterns: Time, Weather and Road Factors

### Step 1: CollisionByWeatherAndRoad Data Loading from CSV to python dataframe: 

In [1]:
#import required libraries
import pandas as pd
import csv

import mysql.connector
from mysql.connector import errorcode
#!pip install tabulate
#from tabulate import tabulate

import matplotlib.pyplot as plt

#!pip install SQLAlchemy pymysql
from sqlalchemy import create_engine

import re

In [2]:
# CollisionByWeatherAndRoad
url_CollisionByWeatherAndRoad = "https://raw.githubusercontent.com/anithamonica/DATA604_TrafficIncidents/main/datasets/CollisionByWeatherAndRoadAlignment.csv"
colNamesColsnByWeatherAndRoad = ['Month', 'Year', 'RoadAlignment', 'Weather',  
              'NoofCollisions', 'NoofVehicles', 'NoofPersons', 'NoofInjured', 'NoofFatalities']
dfByWeatherRoad = pd.read_csv(url_CollisionByWeatherAndRoad, skiprows=3, names = colNamesColsnByWeatherAndRoad)
display(dfByWeatherRoad.head())


Unnamed: 0,Month,Year,RoadAlignment,Weather,NoofCollisions,NoofVehicles,NoofPersons,NoofInjured,NoofFatalities
0,January,2010,Straight and level,Clear and sunny,4166,7746,11765,5726,60
1,January,2010,Straight and level,"Overcast, cloudy but no precipitation",874,1536,2206,1153,13
2,January,2010,Straight and level,Raining,425,723,1113,565,3
3,January,2010,Straight and level,"Snowing, not including drifting snow",928,1537,2297,1275,8
4,January,2010,Straight and level,"Freezing rain, sleet, hail",74,109,165,102,2


In [3]:
print("----------------------------------------------------------------------------")
print("\033[1m"+"Data Analysis of CollisionByWeatherAndRoad"+"\033[0m")
print("----------------------------------------------------------------------------")

#display shape, columns, and data types
print("1.\tShape of the CollisionByWeatherAndRoad dataset:", dfByWeatherRoad.shape)
print("2.\tNumber of records or rows of the DataFrame:", dfByWeatherRoad.shape[0])
print("3.\tColumns and Data types of each column:\n", dfByWeatherRoad.dtypes)
dfByWeatherRoadLength = len(dfByWeatherRoad)

----------------------------------------------------------------------------
[1mData Analysis of CollisionByWeatherAndRoad[0m
----------------------------------------------------------------------------
1.	Shape of the CollisionByWeatherAndRoad dataset: (11232, 9)
2.	Number of records or rows of the DataFrame: 11232
3.	Columns and Data types of each column:
 Month             object
Year               int64
RoadAlignment     object
Weather           object
NoofCollisions     int64
NoofVehicles       int64
NoofPersons        int64
NoofInjured        int64
NoofFatalities     int64
dtype: object


In [4]:
#Inspecting data for CollisionByWeatherAndRoad Dataframe
missingDataSum = dfByWeatherRoad.isna().sum()
missingDataPercentage = (dfByWeatherRoad.isnull().mean() * 100).round(2)
missingData = pd.DataFrame({
    "Missing Count": missingDataSum,
    "Missing Percentage": missingDataPercentage
})

pd.options.display.float_format = '{:.2f}'.format
print("\n\033[1m"+"Missing Count per column:"+"\033[0m")
#print(tabulate(missingData, headers='keys', tablefmt='fancy_grid'))
display(missingData)

print("Unique values")
print("----------------------------------------------------------------------------")
print(dfByWeatherRoad['Month'].unique())
print(dfByWeatherRoad['Year'].unique())
print(dfByWeatherRoad['RoadAlignment'].unique())
print(dfByWeatherRoad['Weather'].unique())


[1mMissing Count per column:[0m


Unnamed: 0,Missing Count,Missing Percentage
Month,0,0.0
Year,0,0.0
RoadAlignment,0,0.0
Weather,0,0.0
NoofCollisions,0,0.0
NoofVehicles,0,0.0
NoofPersons,0,0.0
NoofInjured,0,0.0
NoofFatalities,0,0.0


Unique values
----------------------------------------------------------------------------
['January' 'February' 'March' 'April' 'May' 'June' 'July' 'August'
 'September' 'October' 'November' 'December' 'Unknown']
[2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021]
['Straight and level' 'Straight with gradient' 'Curved and level'
 'Curved with gradient' 'Top of hill or gradient'
 'Bottom of hill or gradient' 'Other' 'Unknown']
['Clear and sunny' 'Overcast, cloudy but no precipitation' 'Raining'
 'Snowing, not including drifting snow' 'Freezing rain, sleet, hail'
 'Visibility limitation' 'Strong wind' 'Other' 'Unknown']


### Step 2: CollisionByWeatherAndRoad Data Cleaning and preprocessing 

In [5]:
print(len(dfByWeatherRoad[(dfByWeatherRoad['NoofCollisions'] == 0
      & (dfByWeatherRoad['NoofInjured'] != 0 |
         (dfByWeatherRoad['NoofFatalities'] != 0)))]))

3079


In [6]:
#Drop invalid records
#----------------------------------------------------------------------------

#Below are the records to be dropped
print("Below are the records to be dropped")
print("----------------------------------------------------------------------------")
print("Count of records with NoofCollisions = 0 is:", len(dfByWeatherRoad[dfByWeatherRoad['NoofCollisions'] == 0]))

#Drop the records with collisionCount = 0 => invalid records
dfByWeatherRoadLength = len(dfByWeatherRoad)
dfByWeatherRoad = dfByWeatherRoad[dfByWeatherRoad['NoofCollisions'] != 0]
newdfByWeatherRoadLen = len(dfByWeatherRoad)
deleteCount = dfByWeatherRoadLength - newdfByWeatherRoadLen
print(f"Deleted {deleteCount} records where NoofCollisions != 0")
print("Number of records after deleting with collisions count != 0 criteria is:", newdfByWeatherRoadLen)
#display(dfByWeatherRoad[dfByWeatherRoad['NoofCollisions'] == 0])

Below are the records to be dropped
----------------------------------------------------------------------------
Count of records with NoofCollisions = 0 is: 3079
Deleted 3079 records where NoofCollisions != 0
Number of records after deleting with collisions count != 0 criteria is: 8153


In [7]:
#Impute missing values
#----------------------------------------------------------------------------

ambiguousValues = [
    'Choice is other than the preceding values',
    'Unknown',
    'Jurisdiction does not provide this data element'
]

#Below are the records to be imputed
print("\nBelow are the records to be imputed")
print("----------------------------------------------------------------------------")
print("Count of records with Month = 'Unknown' is :", len(dfByWeatherRoad[dfByWeatherRoad['Month'] == 'Unknown']))
print("Count of records with RoadAlignment = 'Unknown' is:", dfByWeatherRoad['RoadAlignment'].isin(ambiguousValues).sum())
print("Count of records with Weather = 'Unknown' is:", dfByWeatherRoad['Weather'].isin(ambiguousValues).sum())


#Impute Month with most frequent value of Month
modeMonth = dfByWeatherRoad['Month'].mode()[0]
dfByWeatherRoad['Month'] = dfByWeatherRoad['Month'].apply(lambda x: modeMonth if x == 'Unknown' else x)
print("\nImputed Month with it's mode: ", modeMonth)

#Impute RoadClass with most frequent value of RoadClass
modeRoadAlignment = dfByWeatherRoad['RoadAlignment'].mode()[0]
dfByWeatherRoad['RoadAlignment'] = dfByWeatherRoad['RoadAlignment'].apply(lambda x: modeRoadAlignment if x == 'Unknown' else x)
print("Imputed RoadAlignment with it's mode: ", modeRoadAlignment)

#Impute Pedestrains with most frequent value of Pedestrains
modeWeather = dfByWeatherRoad['Weather'].mode()[0]
dfByWeatherRoad['Weather'] = dfByWeatherRoad['Weather'].apply(lambda x: modeWeather if x == 'Unknown' else x)
print("Imputed Weather with it's mode: ", modeWeather)      


print("\nCount of records with Month = 'Unknown' is :", len(dfByWeatherRoad[dfByWeatherRoad['Month'] == 'Unknown']))
print("Count of records with RoadAlignment = 'Unknown' is:", dfByWeatherRoad['RoadAlignment'].isin(ambiguousValues).sum())
print("Count of records with Weather = 'Unknown' is:", dfByWeatherRoad['Weather'].isin(ambiguousValues).sum())
print("\n",dfByWeatherRoad['Month'].unique())
print(dfByWeatherRoad['Year'].unique())
print(dfByWeatherRoad['RoadAlignment'].unique())
print(dfByWeatherRoad['Weather'].unique())


Below are the records to be imputed
----------------------------------------------------------------------------
Count of records with Month = 'Unknown' is : 39
Count of records with RoadAlignment = 'Unknown' is: 1119
Count of records with Weather = 'Unknown' is: 1018

Imputed Month with it's mode:  December
Imputed RoadAlignment with it's mode:  Straight and level
Imputed Weather with it's mode:  Clear and sunny

Count of records with Month = 'Unknown' is : 0
Count of records with RoadAlignment = 'Unknown' is: 0
Count of records with Weather = 'Unknown' is: 0

 ['January' 'February' 'March' 'April' 'May' 'June' 'July' 'August'
 'September' 'October' 'November' 'December']
[2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021]
['Straight and level' 'Straight with gradient' 'Curved and level'
 'Curved with gradient' 'Top of hill or gradient'
 'Bottom of hill or gradient' 'Other']
['Clear and sunny' 'Overcast, cloudy but no precipitation' 'Raining'
 'Snowing, not including drifti

In [8]:
#Introduce new Column : Season
#----------------------------------------------------------------------------

# Mapping from month name to season
monthToSeason = {
    'December': 'Winter', 'January': 'Winter', 'February': 'Winter',
    'March': 'Spring', 'April': 'Spring', 'May': 'Spring',
    'June': 'Summer', 'July': 'Summer', 'August': 'Summer',
    'September': 'Fall', 'October': 'Fall', 'November': 'Fall'
}
dfByWeatherRoad['Season'] = dfByWeatherRoad['Month'].map(monthToSeason)
print("Column 'Season' is inserted")
display(dfByWeatherRoad.head(5))

Column 'Season' is inserted


Unnamed: 0,Month,Year,RoadAlignment,Weather,NoofCollisions,NoofVehicles,NoofPersons,NoofInjured,NoofFatalities,Season
0,January,2010,Straight and level,Clear and sunny,4166,7746,11765,5726,60,Winter
1,January,2010,Straight and level,"Overcast, cloudy but no precipitation",874,1536,2206,1153,13,Winter
2,January,2010,Straight and level,Raining,425,723,1113,565,3,Winter
3,January,2010,Straight and level,"Snowing, not including drifting snow",928,1537,2297,1275,8,Winter
4,January,2010,Straight and level,"Freezing rain, sleet, hail",74,109,165,102,2,Winter


### Step 3: Import cleaned CollisionByWeatherAndRoad dataset to MySQL database

In [9]:
USER = "student"
DB   = "student"
    
# attempt a connection
myconnection = mysql.connector.connect(user=USER, 
                                       password='Bi3KSjqgrNOOL',
                                       host='127.0.0.1', 
                                       port=3306,
                                       database=DB,
                                       allow_local_infile=True)
myconnection

<mysql.connector.connection.MySQLConnection at 0x7a01058ee8d0>

In [14]:
# Create table CollisionByWeatherAndRoad
queryCreate = '''CREATE TABLE student.CollisionByWeatherAndRoad(
    CollisionMonth varchar(15) NOT NULL,
    CollisionYear int, 
    RoadAlignment varchar(50),
    Weather varchar(75),
    NoofCollisions int,
    NoofVehicles int,
    NoofPersons int,
    NoofInjured int,
    NoofFatalities int,
    Season varchar(10));'''

createCursor = myconnection.cursor()
try:
    createCursor.execute(queryCreate)
    print("CollisionByWeatherAndRoad table created successfully!")
except mysql.connector.Error as err:
    if err.errno == errorcode.ER_TABLE_EXISTS_ERROR:
        print("Error! Table CollisionByWeatherAndRoad is already created.")
    else:
        print(err.msg)

createCursor.close()

CollisionByWeatherAndRoad table created successfully!


True

In [15]:
# Insert into table CollisionByWeatherAndRoad
insertCursor = myconnection.cursor()

try:
    myconnection.start_transaction()

    for i, currentRow in dfByWeatherRoad.iterrows():
        insertCommand = "INSERT INTO `CollisionByWeatherAndRoad` VALUES (" + "%s," * (len(currentRow) - 1) + "%s)"
        insertCursor.execute(insertCommand, tuple(currentRow))

    myconnection.commit()
    print("CollisionByWeatherAndRoad: Inserted records successfully")

except Exception as e:
    myconnection.rollback()
    print("Transaction rolled back due to error:", e)

finally:
    insertCursor.close()

CollisionByWeatherAndRoad: Inserted records successfully


In [16]:
read_cursor = myconnection.cursor(buffered=True, dictionary=True)
query_string = ("SELECT COUNT(*) FROM CollisionByWeatherAndRoad;")
read_cursor.execute(query_string)

for (library_value) in read_cursor:
    print(library_value)
read_cursor.close()
myconnection.close()

{'COUNT(*)': 8153}


### Step 4: CollisionByWeatherAndRoad data analysis and visualization

 - How do weather and road alignment influence collision outcomes?



In [20]:
USER = "student"
DB   = "student"
PWD = "Bi3KSjqgrNOOL"

engine = create_engine(
    f"mysql+mysqlconnector://{USER}:{PWD}@127.0.0.1:3306/{DB}"
)

#Which season impact collision frequency and severity? 
queryMonthly = '''SELECT 
    Weather,
    RoadAlignment,
    SUM(NoofCollisions) AS TotalCollisions,
    ROUND(SUM(NoofInjured) * 1.0 / SUM(NoofCollisions), 2) AS AvgInjuriesPerCollision,
    ROUND(SUM(NoofFatalities) * 1.0 / SUM(NoofCollisions), 4) AS FatalityRate
FROM 
    student.CollisionByWeatherAndRoad
WHERE 
    NoofCollisions > 0
GROUP BY 
    Weather, RoadAlignment
ORDER BY 
    FatalityRate DESC, AvgInjuriesPerCollision DESC;'''
result = pd.read_sql_query(queryMonthly, engine)
display(result)
'''
# Pivot so each SpeedLimit shows Urban and Rural side-by-side
pivot_df = result.pivot(index='SpeedValue', columns='RoadClass', values='InjurySeverity')

# Plot
pivot_df.plot(kind='bar', figsize=(14, 6))
plt.title('Injury Severity by Speed Limit and Road Class')
plt.ylabel('Injury Severity')
plt.xticks(rotation=45)
plt.ylim(bottom=1.0) 
plt.legend(title='Road Class', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()
'''

Unnamed: 0,Weather,RoadAlignment,TotalCollisions,AvgInjuriesPerCollision,FatalityRate
0,Strong wind,Other,11.0,1.0,0.18
1,Other,Straight with gradient,184.0,1.16,0.07
2,Other,Other,56.0,1.11,0.05
3,Other,Curved with gradient,80.0,1.41,0.05
4,Visibility limitation,Top of hill or gradient,403.0,1.4,0.04
5,Strong wind,Bottom of hill or gradient,90.0,1.44,0.04
6,Clear and sunny,Curved with gradient,29617.0,1.31,0.04
7,"Overcast, cloudy but no precipitation",Curved with gradient,8457.0,1.33,0.04
8,Visibility limitation,Straight with gradient,1765.0,1.39,0.04
9,Clear and sunny,Curved and level,64692.0,1.28,0.04


"\n# Pivot so each SpeedLimit shows Urban and Rural side-by-side\npivot_df = result.pivot(index='SpeedValue', columns='RoadClass', values='InjurySeverity')\n\n# Plot\npivot_df.plot(kind='bar', figsize=(14, 6))\nplt.title('Injury Severity by Speed Limit and Road Class')\nplt.ylabel('Injury Severity')\nplt.xticks(rotation=45)\nplt.ylim(bottom=1.0) \nplt.legend(title='Road Class', bbox_to_anchor=(1.05, 1), loc='upper left')\nplt.tight_layout()\nplt.show()\n"

In [21]:
engine.dispose()