# Tfl Accident Stats

*by Anas Razak*

Transport for London (tfl). accident stats 2019-2021

## Fetching data from API

import packages

In [9]:
# importing libraries; requests, time, json
import requests
import time
import json

In [10]:
def acc_stats_year(year):
    '''
    Each query is made for a specific year. Luckily, the url has the same format, thus we can write as in below. We use requests.get to make the query to the API server.
    '''
    url = 'https://api.tfl.gov.uk/AccidentStats/' + str(year)
    r = requests.get(url)
    data = r.json()

    '''
    In this function, we want to generate 3 separate json format files of data.
    accident_{year} : contains the general info of each accident (list1)
    accident_{year}_casualties : contains descriptions of individuals involved in each accident (list2)
    accident_{year}_vehicles : contains descriptions of vehicles involved in each accident (list3)
    '''
    list1 = []
    list2 = []
    list3 = []

    # collecting real-time time before the loop starts
    t1 = time.perf_counter()

    # type(data) is a list, whose elements are accident occurs during that year. We iterate throughout the list to collect and distribute into appropriate groups.
    for accident in data:
        accident_casualties, accident_vehicles = accident['casualties'], accident['vehicles']

        n, m = len(accident_casualties), len(accident_vehicles)

        # removing unnecessary part of the data.
        for i in range(n):
            del accident_casualties[i]['$type']
        
        for j in range(m):
            del accident_vehicles[j]['$type']

        # defining dictionaries that corresponds to each group of data.
        info1 = {
            'id': accident['id'],
            'lat': accident['lat'],
            'lon': accident['lon'],
            'location': accident['location'],
            'date': accident['date'],
            'severity': accident['severity'],
            'borough' : accident['borough']
        }
        
        list1.append(info1)

        info2 = {
            'id': accident['id'],
            'casualties': accident_casualties
        }

        list2.append(info2)

        info3 = {
            'id': accident['id'],
            'vehicles': accident_vehicles
        }

        list3.append(info3)

        # putting a resting time between each query, to avoid data rate limit and traffic.
        time.sleep(r.elapsed.total_seconds())

        print(f"Got accident {accident['id']} info in {r.elapsed.total_seconds()}")
    
    # collecting the real-time of when the loop finished
    t2 = time.perf_counter()

    print(f"Finished looping for {t2-t1} seconds")

    # converting list1,list2,list3 into json format files
    with open(f'accident_{year}.json', 'w') as f:
        json.dump(list1, f, indent=2)

    with open(f'accident_{year}_casualties.json', 'w') as h:
        json.dump(list2, h, indent=2)
    
    with open(f'accident_{year}_vehicles.json', 'w') as k:
        json.dump(list3, k, indent=2)

In [4]:
acc_stats_year(2019)

Got accident 345906 info in 0.222191
Got accident 345907 info in 0.222191
Got accident 345908 info in 0.222191
Got accident 345909 info in 0.222191
Got accident 345910 info in 0.222191
Got accident 345911 info in 0.222191
Got accident 345912 info in 0.222191
Got accident 345913 info in 0.222191
Got accident 345914 info in 0.222191
Got accident 345915 info in 0.222191
Got accident 345916 info in 0.222191
Got accident 345917 info in 0.222191
Got accident 345918 info in 0.222191
Got accident 345919 info in 0.222191
Got accident 345920 info in 0.222191
Got accident 345921 info in 0.222191
Got accident 345922 info in 0.222191
Got accident 345923 info in 0.222191
Got accident 345924 info in 0.222191
Got accident 345925 info in 0.222191
Got accident 345926 info in 0.222191
Got accident 345927 info in 0.222191
Got accident 345928 info in 0.222191
Got accident 345929 info in 0.222191
Got accident 345930 info in 0.222191
Got accident 345931 info in 0.222191
Got accident 345932 info in 0.222191
G

In [10]:
acc_stats_year(2018)

Got accident 320652 info in 4.181643
Got accident 320653 info in 4.181643
Got accident 320654 info in 4.181643
Got accident 320655 info in 4.181643
Got accident 320656 info in 4.181643
Got accident 320657 info in 4.181643
Got accident 320658 info in 4.181643
Got accident 320659 info in 4.181643
Got accident 320660 info in 4.181643
Got accident 320661 info in 4.181643
Got accident 320662 info in 4.181643
Got accident 320663 info in 4.181643
Got accident 320664 info in 4.181643
Got accident 320665 info in 4.181643
Got accident 320666 info in 4.181643
Got accident 320667 info in 4.181643
Got accident 320668 info in 4.181643
Got accident 320669 info in 4.181643
Got accident 320670 info in 4.181643
Got accident 320671 info in 4.181643
Got accident 320672 info in 4.181643
Got accident 320673 info in 4.181643
Got accident 320674 info in 4.181643
Got accident 320675 info in 4.181643
Got accident 320676 info in 4.181643
Got accident 320677 info in 4.181643
Got accident 320678 info in 4.181643
G

In [6]:
acc_stats_year(2017)

Got accident 295194 info in 0.104026
Got accident 295195 info in 0.104026
Got accident 295196 info in 0.104026
Got accident 295197 info in 0.104026
Got accident 295198 info in 0.104026
Got accident 295199 info in 0.104026
Got accident 295200 info in 0.104026
Got accident 295201 info in 0.104026
Got accident 295202 info in 0.104026
Got accident 295203 info in 0.104026
Got accident 295204 info in 0.104026
Got accident 295205 info in 0.104026
Got accident 295206 info in 0.104026
Got accident 295207 info in 0.104026
Got accident 295208 info in 0.104026
Got accident 295209 info in 0.104026
Got accident 295210 info in 0.104026
Got accident 295211 info in 0.104026
Got accident 295212 info in 0.104026
Got accident 295213 info in 0.104026
Got accident 295214 info in 0.104026
Got accident 295215 info in 0.104026
Got accident 295216 info in 0.104026
Got accident 295217 info in 0.104026
Got accident 295218 info in 0.104026
Got accident 295219 info in 0.104026
Got accident 295220 info in 0.104026
G

In [13]:
acc_stats_year(2016)

Got accident 269566 info in 0.226533
Got accident 269567 info in 0.226533
Got accident 269568 info in 0.226533
Got accident 269569 info in 0.226533
Got accident 269570 info in 0.226533
Got accident 269571 info in 0.226533
Got accident 269572 info in 0.226533
Got accident 269573 info in 0.226533
Got accident 269574 info in 0.226533
Got accident 269575 info in 0.226533
Got accident 269576 info in 0.226533
Got accident 269577 info in 0.226533
Got accident 269578 info in 0.226533
Got accident 269579 info in 0.226533
Got accident 269580 info in 0.226533
Got accident 269581 info in 0.226533
Got accident 269582 info in 0.226533
Got accident 269583 info in 0.226533
Got accident 269584 info in 0.226533
Got accident 269585 info in 0.226533
Got accident 269586 info in 0.226533
Got accident 269587 info in 0.226533
Got accident 269588 info in 0.226533
Got accident 269589 info in 0.226533
Got accident 269590 info in 0.226533
Got accident 269591 info in 0.226533
Got accident 269592 info in 0.226533
G

## Data Pre-processing

Before we go to visualizations or modelling, the data must go through some "cleaning" processes. This includes checking the validity of the data, missing values and also formatting special features like datetime and geospatial data. To do that, we will use the library pandas that allows us to process the data as an object called DataFrame. DataFrame is a table-like format and it is easy to comprehend. It gives a good rough idea how the data looks like, and it also provides great tools to engineer the data.  

In [14]:
import pandas as pd
import numpy as np
import datetime

### Converting json into DataFrame

Pandas has a simple method to convert json format into DataFrame.

In [21]:
# convert accident_{year}.json to a dataframe
t16 = pd.read_json("accident_2016.json")
t17 = pd.read_json("accident_2017.json")
t18 = pd.read_json("accident_2018.json")
t19 = pd.read_json("accident_2019.json")

# combining all 4 into a single data frame 
df_acc = pd.concat([t16, t17, t18, t19])
df_acc.head()

Unnamed: 0,id,lat,lon,location,date,severity,borough
0,269566,51.505949,-0.127106,Whitehall junction with Great Scotland Yard,2016-06-16 23:14:00+00:00,Slight,City of Westminster
1,269567,51.510148,-0.188325,Bayswater Road junction with Caroline Close,2016-06-16 09:48:00+00:00,Slight,City of Westminster
2,269568,51.510515,-0.182978,Bayswater Road junction with Leinster Terrace,2016-06-05 12:10:00+00:00,Slight,City of Westminster
3,269569,51.508983,-0.136925,Piccadilly 20 metres north east junction with ...,2016-06-11 04:20:00+00:00,Slight,City of Westminster
4,269570,51.520539,-0.191516,Harrow Road junction with Bourne Terrace,2016-06-17 19:00:00+00:00,Slight,City of Westminster


The casualties data and vehicles data contain nested dictionaries. So, we may need to tackle them differently. In particular, wa want to flatten the nested dictionary by using json_normalize method. Before that we need to deserialized our json files.

In [34]:
with open('accident_2016_casualties.json') as f:
    dc16 = json.load(f)
with open('accident_2017_casualties.json') as f:
    dc17 = json.load(f)
with open('accident_2018_casualties.json') as f:
    dc18 = json.load(f)
with open('accident_2019_casualties.json') as f:
    dc19 = json.load(f)


t16c = pd.json_normalize(dc16, record_path=["casualties"], meta=["id"])
t17c = pd.json_normalize(dc17, record_path=["casualties"], meta=["id"])
t18c = pd.json_normalize(dc18, record_path=["casualties"], meta=["id"])
t19c = pd.json_normalize(dc19, record_path=["casualties"], meta=["id"])

df_cas = pd.concat([t16c, t17c, t18c, t19c])
df_cas.shape


(246870, 6)

In [36]:
with open('accident_2016_vehicles.json') as f:
    dv16 = json.load(f)
with open('accident_2017_vehicles.json') as f:
    dv17 = json.load(f)
with open('accident_2018_vehicles.json') as f:
    dv18 = json.load(f)
with open('accident_2019_vehicles.json') as f:
    dv19 = json.load(f)


t16v = pd.json_normalize(dv16, record_path=["vehicles"], meta=["id"])
t17v = pd.json_normalize(dv17, record_path=["vehicles"], meta=["id"])
t18v = pd.json_normalize(dv18, record_path=["vehicles"], meta=["id"])
t19v = pd.json_normalize(dv19, record_path=["vehicles"], meta=["id"])

df_veh = pd.concat([t16v, t17v, t18v, t19v])
df_veh.shape

(367796, 2)