# Analysis of Traffic Collision Patterns: Time, Weather and Road Factors

This notebook generates the Toronto collisions summary CSV file from the dataset downloaded from the Toronto collisions open database.

### Step 1: Load the Toronto collisions data from CSV into a pandas DataFrame

In [189]:
#import required libraries
import pandas as pd
import csv

In [191]:
# Load only the columns required for the analysis from the CSV file downloaded
# from the Toronto collisions open dataset, renaming them on the way in.
csvLocalURL = '/Users/anithajoseph/Documents/UofC/DATA604/Project/Datasets/TorontoCollisionsOriginal.csv'

# Source CSV header -> column name used in the rest of this notebook.
columnMap = {
    # 'OCC_DATE': 'Date',  # not currently loaded
    'OCC_YEAR': 'Year',
    'OCC_MONTH': 'Month',
    'OCC_DOW': 'DayofWeek',
    'OCC_HOUR': 'CollisionHour',
    'FATALITIES': 'NoofFatalities',
    'INJURY_COLLISIONS': 'IsInjuryCollsn',
    'PEDESTRIAN': 'IsPedestrian',
}

currentRow = []

# newline='' is what the csv module asks for when it is handed a file object,
# so that line breaks embedded in quoted fields are parsed correctly.
with open(csvLocalURL, 'r', newline='') as file:
    reader = csv.DictReader(file)

    # Fail early with a clear message instead of silently filling a missing
    # (or renamed) source column with None.
    availableColumns = reader.fieldnames or []
    missingColumns = [name for name in columnMap if name not in availableColumns]
    if missingColumns:
        raise KeyError(f"Columns missing from {csvLocalURL}: {missingColumns}")

    for row in reader:
        currentRow.append({
            newName: row.get(sourceName)
            for sourceName, newName in columnMap.items()
        })

# Passing columns= keeps the expected schema even when the file has no data rows.
torontoDF = pd.DataFrame(currentRow, columns=list(columnMap.values()))

# The csv module yields every value as a string; cast the numeric columns
# before the frame is displayed or used.
torontoDF = torontoDF.astype({
    "Year": int,
    "CollisionHour": int,
    "NoofFatalities": int,
})

display(torontoDF.head())


Unnamed: 0,Year,Month,DayofWeek,CollisionHour,NoofFatalities,IsInjuryCollsn,IsPedestrian
0,2014,January,Wednesday,17,0,NO,NO
1,2014,January,Wednesday,14,0,NO,NO
2,2014,January,Wednesday,2,0,YES,NO
3,2014,January,Wednesday,3,0,NO,NO
4,2014,January,Wednesday,5,0,YES,NO


In [192]:
#function to map CollisionHour to a HourRange
def categorize_time(hour):
    """Map an hour of the day to the name of its three-hour range.

    The day is split into eight contiguous three-hour bins starting at
    midnight, so fractional hours (e.g. 2.5) fall into a bin instead of
    slipping between two integer bounds.

    Parameters
    ----------
    hour : int or float
        Hour of the day; valid values satisfy 0 <= hour < 24.

    Returns
    -------
    str
        One of "Late Night", "Early Morning", "Morning Rush", "Mid Morning",
        "Early Afternoon", "Late Afternoon", "Evening", "Night", or
        "Invalid Hour" when the value is outside 0 <= hour < 24 (NaN included,
        because every comparison with NaN is False).
    """
    hour_ranges = (
        "Late Night",       # 0-2
        "Early Morning",    # 3-5
        "Morning Rush",     # 6-8
        "Mid Morning",      # 9-11
        "Early Afternoon",  # 12-14
        "Late Afternoon",   # 15-17
        "Evening",          # 18-20
        "Night",            # 21-23
    )
    if 0 <= hour < 24:
        # Floor division by the bin width gives the index of the bin.
        return hour_ranges[int(hour // 3)]
    return "Invalid Hour"
# Label every collision with the named range that its CollisionHour falls into.
hourRangeLabels = torontoDF["CollisionHour"].map(categorize_time)
torontoDF["HourRange"] = hourRangeLabels

# Preview the first rows with the new column.
display(torontoDF.head())

Unnamed: 0,Year,Month,DayofWeek,CollisionHour,NoofFatalities,IsInjuryCollsn,IsPedestrian,HourRange
0,2014,January,Wednesday,17,0,NO,NO,Late Afternoon
1,2014,January,Wednesday,14,0,NO,NO,Early Afternoon
2,2014,January,Wednesday,2,0,YES,NO,Late Night
3,2014,January,Wednesday,3,0,NO,NO,Early Morning
4,2014,January,Wednesday,5,0,YES,NO,Early Morning


In [195]:
# Identify duplicate rows.
# NOTE(review): the frame carries no collision identifier among its columns, so
# rows flagged here are presumably distinct collisions that happen to share
# every loaded value, not repeated records - confirm before dropping any.
duplicates = torontoDF[torontoDF.duplicated()]
print("Duplicate records: ", len(duplicates))

# Summarise the collisions for every (Year, Month, DayofWeek, HourRange)
# combination, i.e. one hour range on one weekday within one month of one year.
group_cols = ["Year", "Month", "DayofWeek", "HourRange"]

# Columns produced for each group:
#   CountofCollisions       - number of collision records in the group
#   NoofFatalities          - total number of fatalities in the group
#   CountofInjuryCollsn     - records whose IsInjuryCollsn is "YES" (any case)
#   CountOfPedestrianCollsn - records whose IsPedestrian is "YES" (any case)
#   CountofFatalityCollsn   - records with at least one fatality
#
# CountofFatalityCollsn has to be counted per record inside the aggregation:
# deriving it afterwards from the summed NoofFatalities can only yield 0 or 1
# per group, however many fatal collisions the group contains.
newTorontoDF = torontoDF.groupby(group_cols).agg(
    CountofCollisions=("NoofFatalities", "count"),
    NoofFatalities=("NoofFatalities", "sum"),
    CountofInjuryCollsn=("IsInjuryCollsn", lambda x: (x.str.upper() == "YES").sum()),
    CountOfPedestrianCollsn=("IsPedestrian", lambda x: (x.str.upper() == "YES").sum()),
    CountofFatalityCollsn=("NoofFatalities", lambda x: (x > 0).sum())
).reset_index()

display(newTorontoDF.head())

# Row counts before and after aggregation.
print("Lengh of torontoDF: ", len(torontoDF))
print("Lengh of newTorontoDF: ", len(newTorontoDF))

Duplicate records:  712055


Unnamed: 0,Year,Month,DayofWeek,HourRange,CountofCollisions,NoofFatalities,CountofInjuryCollsn,CountOfPedestrianCollsn,CountofFatalityCollsn
0,2014,April,Friday,Early Afternoon,133,0,11,5,0
1,2014,April,Friday,Early Morning,9,0,4,1,0
2,2014,April,Friday,Evening,77,0,10,1,0
3,2014,April,Friday,Late Afternoon,176,0,21,4,0
4,2014,April,Friday,Late Night,11,0,3,0,0


Lengh of torontoDF:  772516
Lengh of newTorontoDF:  7895


Unnamed: 0,Year,Month,DayofWeek,HourRange,CountofCollisions,NoofFatalities,CountofInjuryCollsn,CountOfPedestrianCollsn,CountofFatalityCollsn
7315,2024,October,Thursday,Late Night,20,4,2,1,1


In [131]:
# Write the aggregated summary DataFrame to a CSV file, leaving out the index.
summaryCsvName = "TorontoCollisionsSummary.csv"
newTorontoDF.to_csv(summaryCsvName, index=False)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 772516 entries, 0 to 772515
Data columns (total 8 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   Date            772516 non-null  object
 1   Year            772516 non-null  int64 
 2   Month           772516 non-null  object
 3   DayofWeek       772516 non-null  object
 4   CollisionHour   772516 non-null  int64 
 5   NoofFatalities  772516 non-null  int64 
 6   IsInjuryCollsn  772516 non-null  object
 7   IsPedestrian    772516 non-null  object
dtypes: int64(3), object(5)
memory usage: 47.2+ MB
None


Unnamed: 0,Date,Year,Month,DayofWeek,CollisionHour,IsInjuryCollsn,IsPedestrian,NoofFatalities
0,2014-01-01 5:00,2014,January,Wednesday,0,NO,NO,0
1,2014-01-01 5:00,2014,January,Wednesday,0,YES,NO,0
2,2014-01-01 5:00,2014,January,Wednesday,1,NO,NO,0
3,2014-01-01 5:00,2014,January,Wednesday,2,NO,NO,0
4,2014-01-01 5:00,2014,January,Wednesday,2,YES,NO,0


Lengh of torontoDF:  772516
Lengh of newDF:  164492
