# Task 2
This notebook uses the `requests` library to scrape the planecrashinfo website. All of the required information is available on the yearly listing pages, so there is no need to request each individual crash page; doing so could have placed excessive load on the source website and gotten the scraper blocked by the server.

In [3]:
import sqlite3 as sq
from urllib.parse import urljoin

import requests
import tabulate
from bs4 import BeautifulSoup
from IPython.display import HTML, display

# Site root, and the database index page that the scrape starts from.
base_url = 'http://www.planecrashinfo.com'
database_url = f'{base_url}/database.htm'

In [4]:
# Fetch the database index page.
# The timeout keeps the cell from hanging forever on an unresponsive server,
# and raise_for_status() surfaces an HTTP error here instead of letting an
# error page be parsed as if it were the index.
r = requests.get(database_url, timeout=30)
r.raise_for_status()

In [5]:
# Build a parse tree of the index page using the 'html.parser' backend.
soup = BeautifulSoup(markup=r.text, features='html.parser')

In [6]:
# Map each numeric link label on the index page (a year) to the URL of the
# page it points at: {'1920': {'url': ...}, ...}.
crashes = {}
# href=True skips anchors that carry no link target (e.g. named anchors),
# which would otherwise raise KeyError on link['href'].
for link in soup.find_all('a', href=True):
    # get_text() always returns a string; .string is None whenever the anchor
    # has more than one child node, which made .strip() raise AttributeError.
    label = link.get_text(strip=True)
    if label.isnumeric():
        crashes[label] = {'url': link['href'].strip()}

In [7]:
# Download every listing page collected in `crashes` and store its HTML under
# the 'data' key next to its URL.
# One Session is shared by all requests so the connection to the host can be reused.
with requests.Session() as session:
    for entry in crashes.values():
        # Resolve the link relative to the index page it came from, so relative,
        # root-relative and absolute hrefs all produce a valid URL.
        page_url = urljoin(database_url, entry['url'])
        # Timeout: do not hang forever on a stalled connection.
        response = session.get(page_url, timeout=30)
        # Fail loudly rather than store an error page as if it were data.
        response.raise_for_status()
        entry['data'] = response.text

In [8]:
def _cell_fragments(cell, count=2):
    """Return the first `count` text fragments of a table cell, each stripped.

    The cell's text nodes are joined with '|' and split again, so each fragment
    is one piece of text inside the cell. Missing fragments are returned as
    None so that a sparsely filled cell cannot raise IndexError.
    """
    fragments = [part.strip() for part in cell.get_text(separator='|').strip().split('|')]
    fragments += [None] * (count - len(fragments))
    return fragments[:count]


def _leading_int(text):
    """Return the value before the first '/' in `text` as an int, or None if it is not a number."""
    head = text.split('/')[0].strip()
    try:
        return int(head)
    except ValueError:
        # e.g. a placeholder for an unknown count
        return None


# One tuple per crash, in the column order of the `crashes` table:
# (date, year, location, operator, aircraft type, registration, fatalities)
crashdata = []

for year, page in crashes.items():
    # A separate name keeps the index-page `soup` from the earlier cell intact.
    page_soup = BeautifulSoup(page['data'], 'html.parser')
    # The first row of the listing table is the header row; skip it.
    for row in page_soup.table.find_all('tr')[1:]:
        cells = row.find_all('td')
        if len(cells) < 4:
            # Too few columns to be a crash row; skip it instead of raising IndexError.
            continue

        location, operator = _cell_fragments(cells[1])
        aircraft_type, registration = _cell_fragments(cells[2])
        crashdata.append((
            cells[0].get_text().strip(),      # date
            year,                             # label of the listing page the row came from
            location,
            operator,
            aircraft_type,
            registration,
            _leading_int(cells[3].get_text()),  # fatalities: the part before the first '/'
        ))

In [9]:
# Store the scraped rows in a local SQLite file so the analysis below queries
# the local copy instead of scraping the site again (no obnoxious scraping).
conn = sq.connect(database='../crashes.db')
c = conn.cursor()

In [10]:
# Rebuild the table from scratch so that re-running this cell never duplicates rows.
# `with conn` commits when the block succeeds and rolls back if any statement
# raises, so the data is persisted here rather than only by the final cell.
with conn:
    c.execute('DROP TABLE IF EXISTS crashes;')
    # Identifiers use double quotes; single quotes denote string literals in SQL.
    c.execute('''
        CREATE TABLE crashes (
            "date" varchar,
            "year" int,
            "location" varchar,
            "operator" varchar,
            "aircraft_type" varchar,
            "registration" varchar,
            "fatalities" int
        )
    ''')
    # Naming the columns keeps the insert correct even if the table gains columns later.
    c.executemany(
        '''
        INSERT INTO crashes
            ("date", "year", "location", "operator", "aircraft_type", "registration", "fatalities")
        VALUES (?,?,?,?,?,?,?)
        ''',
        crashdata,
    )

<sqlite3.Cursor at 0x7feb74d5d1f0>

In [11]:
# Sanity check: show the first row returned by the query.
c.execute('select * from crashes limit 10').fetchone()

('17 Sep 1908',
 1920,
 'Fort Myer, Virginia',
 'Military - U.S. Army',
 'Wright Flyer III',
 '?',
 1)

In [12]:
# Total fatalities for 1920-2016.
# Rows stored under year 1920 can carry earlier dates (the sample row is from
# 1908), so that bucket is additionally filtered on the date text.
fatality_rows = c.execute('''
    SELECT '1920-2016', SUM(fatalities) FROM crashes
    WHERE (year = 1920 AND date LIKE '%1920%') -- deal with dates earlier than 1920
    OR year BETWEEN 1921 AND 2016
''').fetchall()

fatality_table = tabulate.tabulate(
    fatality_rows,
    headers=('Time Period', 'Total Fatalities'),
    tablefmt='html',
)
display(HTML(fatality_table))

Time Period,Total Fatalities
1920-2016,113666


In [13]:
# The three operators with the highest number of recorded incidents
# (a row count per operator, not a rate).
operator_rows = c.execute('''
    SELECT operator, count(*) FROM crashes
    GROUP BY operator
    ORDER BY count(*) DESC
    LIMIT 3
''').fetchall()

operator_table = tabulate.tabulate(
    operator_rows,
    headers=('Operator', 'Incidents'),
    tablefmt='html',
)
display(HTML(operator_table))

Operator,Incidents
Aeroflot,260
Military - U.S. Air Force,177
Air France,72


In [14]:
# The year with the highest number of incidents.
year_rows = c.execute('''
    SELECT year, count(*) FROM crashes
    GROUP BY year
    ORDER BY count(*) DESC
    LIMIT 1
''').fetchall()

year_table = tabulate.tabulate(
    year_rows,
    headers=('Year', 'Incidents'),
    tablefmt='html',
)
display(HTML(year_table))

Year,Incidents
1972,105


In [15]:
# Persist any pending changes, and release the database file even if the
# commit itself fails.
try:
    conn.commit()
finally:
    conn.close()