## Plane Crash Data Exploration and Analysis

#### Budhajit Roy Chanamthabam

In [1]:
# import necessary libraries
#https://www.kaggle.com/budhajit/plane-crash-information-dataset

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import requests
import json
from bs4 import BeautifulSoup
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Position of each field's value in the flat list of <td> cells of a
# crash-detail page. Values sit in the odd-numbered cells 3..27, in this order.
_CRASH_FIELD_CELLS = {
    'date': 3,
    'time': 5,
    'location': 7,
    'operator': 9,
    'flight_number': 11,
    'route': 13,
    'aircraft_type': 15,
    'registration': 17,
    'cn_ln': 19,
    'aboard': 21,
    'fatalities': 23,
    'ground': 25,
    'summary': 27,
}


def set_data(year, dates, timeout=30):
    """Scrape every crash-detail page of one year into column-oriented lists.

    Detail pages are fetched from
    ``http://www.planecrashinfo.com/<year>/<year>-<k>.htm`` for
    ``k = 1 .. len(dates)``.

    Parameters
    ----------
    year : int or str
        Year segment used to build the page URLs.
    dates : sequence
        One entry per crash listed for ``year``. Only its length is used:
        it determines how many detail pages are requested.
    timeout : float, optional
        Seconds to wait for each HTTP request before giving up (default 30).

    Returns
    -------
    dict
        Maps each of the 13 column names ('date', 'time', ..., 'summary') to a
        list of cell texts, one element per detail page. All lists are empty
        when ``dates`` is empty.

    Raises
    ------
    requests.HTTPError
        If a detail page responds with an HTTP error status.
    IndexError
        If a page contains fewer table cells than the field layout needs.
    """
    final_data = {field: [] for field in _CRASH_FIELD_CELLS}
    highest_cell = max(_CRASH_FIELD_CELLS.values())

    for page_number in range(1, len(dates) + 1):
        url1 = "http://www.planecrashinfo.com/{0}/{0}-{1}.htm".format(year, page_number)

        # A timeout keeps one stalled connection from hanging the whole scrape;
        # raise_for_status stops an error page from being parsed as crash data.
        r1 = requests.get(url1, timeout=timeout)
        r1.raise_for_status()

        # Name the parser explicitly so the parse tree does not depend on which
        # optional parsers happen to be installed.
        # NOTE(review): if the cell offsets were calibrated against another
        # parser (e.g. lxml), confirm they still line up with "html.parser".
        soup1 = BeautifulSoup(r1.text, "html.parser")
        cells = [td.text for td in soup1.find_all('td')]

        if len(cells) <= highest_cell:
            raise IndexError(
                "expected at least {} <td> cells at {}, found {}".format(
                    highest_cell + 1, url1, len(cells)
                )
            )

        # add the data details to their corresponding lists
        for field, cell_index in _CRASH_FIELD_CELLS.items():
            final_data[field].append(cells[cell_index])

    return final_data

In [4]:
# Scrape every yearly index page and combine the per-year tables into final_df.
# Each yearly page is expected to hold one anchor per crash; set_data() then
# fetches the matching crash-detail pages.

FIRST_YEAR = 1920
LAST_YEAR = 2019          # inclusive
REQUEST_TIMEOUT = 30      # seconds per HTTP request

yearly_frames = []
for year in range(FIRST_YEAR, LAST_YEAR + 1):
    url = "http://www.planecrashinfo.com/{0}/{0}.htm".format(year)

    # Fail fast on a stalled connection or an HTTP error page instead of
    # silently parsing whatever came back.
    r = requests.get(url, timeout=REQUEST_TIMEOUT)
    r.raise_for_status()

    # Explicit parser: the result no longer depends on which optional parsers
    # are installed.
    soup = BeautifulSoup(r.text, "html.parser")

    # One entry per anchor text; the final anchor is dropped.
    # NOTE(review): presumably the last link is page navigation rather than a
    # crash entry -- confirm against the site.
    dates = [link.text for link in soup.find_all('a')][:-1]

    yearly_frames.append(pd.DataFrame(set_data(year, dates)))

# Concatenate once instead of re-copying the growing frame on every iteration.
# Row labels are left as the per-year 0..n-1 values, exactly as before.
final_df = pd.concat(yearly_frames)
    

In [5]:
# Dimensions of the combined crash table as (rows, columns).
final_df.shape

(5242, 13)

In [7]:
# Summarise the index range, per-column non-null counts, and dtypes.
final_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5242 entries, 0 to 0
Data columns (total 13 columns):
date             5242 non-null object
time             5242 non-null object
location         5242 non-null object
operator         5242 non-null object
flight_number    5242 non-null object
route            5242 non-null object
aircraft_type    5242 non-null object
registration     5242 non-null object
cn_ln            5242 non-null object
aboard           5242 non-null object
fatalities       5242 non-null object
ground           5242 non-null object
summary          5242 non-null object
dtypes: object(13)
memory usage: 573.3+ KB


In [8]:
# Preview the first rows of the combined table.
final_df.head()

Unnamed: 0,date,time,location,operator,flight_number,route,aircraft_type,registration,cn_ln,aboard,fatalities,ground,summary
0,"September 17, 1908",17:18,"Fort Myer, Virginia",Military - U.S. Army,?,Demonstration,Wright Flyer III,?,1,2 (passengers:1 crew:1),1 (passengers:1 crew:0),0,"During a demonstration flight, a U.S. Army fly..."
1,"September 07, 1909",?,"Juvisy-sur-Orge, France",?,?,Air show,Wright Byplane,SC1,?,1 (passengers:0 crew:1),1 (passengers:0 crew:0),0,Eugene Lefebvre was the first pilot to ever be...
2,"July 12, 1912",06:30,"Atlantic City, New Jersey",Military - U.S. Navy,?,Test flight,Dirigible,?,?,5 (passengers:0 crew:5),5 (passengers:0 crew:5),0,First U.S. dirigible Akron exploded just offsh...
3,"August 06, 1913",?,"Victoria, British Columbia, Canada",Private,?,?,Curtiss seaplane,?,?,1 (passengers:0 crew:1),1 (passengers:0 crew:1),0,The first fatal airplane accident in Canada oc...
4,"September 09, 1913",c 18:30,Over the North Sea,Military - German Navy,?,?,Zeppelin L-1 (airship),?,?,20 (passengers:? crew:?),14 (passengers:? crew:?),0,The airship flew into a thunderstorm and encou...


In [9]:
# Preview the last rows of the combined table.
final_df.tail()

Unnamed: 0,date,time,location,operator,flight_number,route,aircraft_type,registration,cn_ln,aboard,fatalities,ground,summary
15,"September 28, 2018",1010,"Chuuk, Micronesia",Air Niugini,?,Pohnpei - Chuuk,Boeing 737-8BK,P2-PXE,33024/1688,47 (passengers:35 crew:12),1 (passengers:1 crew:0),0,The aircraft was approaching for a landing at ...
16,"October 29, 2018",631,"Off Jakarta, Indonesia",Lion Air,610,Jakarta - Pangkal Pinang,Boeing 737-MAX 8,PK-LQP,43000/7058,189 (passengers:181 crew:8),189 (passengers:181 crew:8),0,"The airliner crashed into the Jakarta Sea, 13 ..."
17,"November 06, 2018",253,"Georgetown, Guyana",Fly Jamaica Airways,?,Georgetown - Toronto,Boeing 757-N23,N524AT,30233/895,128 (passengers:120 crew:8),1 (passengers:1 crew:0),0,"After taking off and reaching FL200, the crew ..."
18,"November 18, 2018",2300,"Near Mandan, North Dakota",Metro Area Ambulance Services,?,Bismark - Sloulin Field,Cessna 441 Conquest II,N441CX,441-0305,3 (passengers:2 crew:1),3 (passengers:2 crew:1),0,The air ambulance en route to pick up a patien...
0,"January 14, 2019",830,"Karaj, Iran",Saha Air,?,Bishkek - Payam,Boeing 707-3J9C,EP-CPP,21128/917,16 (passengers:13 crew:3),15 (passengers:13 crew:2),0,The cargo plane was operated by the Iranian Ai...


In [11]:
# Row labels restart at 0 for every year after concatenation; replace them with
# one continuous index and discard the old labels (drop=True).
final_df = final_df.reset_index(drop=True)
# Persist the table as CSV; index=False keeps the row labels out of the file.
final_df.to_csv("plane_crash_data.csv", index = False)

In [12]:
# List the column labels of the combined table.
final_df.columns

Index(['date', 'time', 'location', 'operator', 'flight_number', 'route',
       'aircraft_type', 'registration', 'cn_ln', 'aboard', 'fatalities',
       'ground', 'summary'],
      dtype='object')