## Plane Crash Data Exploration and Analysis

#### Budhajit Roy Chanamthabam

In [1]:
# import necessary libraries
#https://www.kaggle.com/budhajit/plane-crash-information-dataset

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import requests
import json
from bs4 import BeautifulSoup
import warnings
import re
import datetime
warnings.filterwarnings('ignore')

In [2]:
def set_data(year, dates):
    """Scrape the detail page of every crash listed for ``year``.

    One page is requested per entry in ``dates``. Only ``len(dates)`` is
    used: page ``n`` (counting from 1) is fetched from
    ``http://www.planecrashinfo.com/<year>/<year>-<n>.htm``.

    Parameters
    ----------
    year : int or str
        Year whose crash pages are fetched.
    dates : sequence
        Entries collected from the year's index page; only its length is used.

    Returns
    -------
    dict
        Maps each column name (``date``, ``time``, ..., ``summary``) to a list
        with one raw text value per crash page, in page order.

    Raises
    ------
    IndexError
        If a page contains fewer ``<td>`` cells than the layout read below.
    requests.exceptions.RequestException
        On connection errors, or when a page does not answer within the timeout.
    """
    # Column order matters: the value for the k-th column (k = 0, 1, ...) is
    # read from the <td> cell at position 3 + 2*k, i.e. cells 3, 5, ..., 27.
    columns = [
        'date', 'time', 'location', 'operator', 'flight_number', 'route',
        'aircraft_type', 'registration', 'cn_ln', 'aboard', 'fatalities',
        'ground', 'summary',
    ]
    final_data = {name: [] for name in columns}

    for i in range(len(dates)):
        url1 = "http://www.planecrashinfo.com/"+str(year)+"/"+str(year)+"-"+str(i+1)+".htm"
        # Without a timeout a single stalled connection would block the whole scrape.
        r1 = requests.get(url1, timeout=30)

        # NOTE(review): no parser is named, so BeautifulSoup picks whichever one is installed.
        soup1 = BeautifulSoup(r1.text)
        cells = [td.text for td in soup1.find_all('td')]

        # add the data details to their corresponding lists
        for offset, name in enumerate(columns):
            final_data[name].append(cells[3 + 2 * offset])

    return final_data

In [4]:
# setting up the data for each year and creating the dataframe

# One dataframe per year is collected here and concatenated once at the end,
# instead of re-copying the growing frame on every iteration.
yearly_frames = []

for year in range(1920,2020):
    url = "http://www.planecrashinfo.com/"+str(year)+"/"+str(year)+".htm"
    # Without a timeout a single stalled connection would block the whole scrape.
    r = requests.get(url, timeout=30)
    soup = BeautifulSoup(r.text)
    # Text of every anchor on the year's index page; the last anchor is dropped
    # (presumably a navigation link rather than a crash entry - TODO confirm).
    dates = [link.text for link in soup.find_all('a')][:-1]
    yearly_frames.append(pd.DataFrame(set_data(year, dates)))

# Each yearly frame keeps its own 0-based index, exactly as before.
final_df = pd.concat(yearly_frames)
    

In [5]:
final_df.shape

(5242, 13)

In [7]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5242 entries, 0 to 0
Data columns (total 13 columns):
date             5242 non-null object
time             5242 non-null object
location         5242 non-null object
operator         5242 non-null object
flight_number    5242 non-null object
route            5242 non-null object
aircraft_type    5242 non-null object
registration     5242 non-null object
cn_ln            5242 non-null object
aboard           5242 non-null object
fatalities       5242 non-null object
ground           5242 non-null object
summary          5242 non-null object
dtypes: object(13)
memory usage: 573.3+ KB


In [8]:
final_df.head()

Unnamed: 0,date,time,location,operator,flight_number,route,aircraft_type,registration,cn_ln,aboard,fatalities,ground,summary
0,"September 17, 1908",17:18,"Fort Myer, Virginia",Military - U.S. Army,?,Demonstration,Wright Flyer III,?,1,2 (passengers:1 crew:1),1 (passengers:1 crew:0),0,"During a demonstration flight, a U.S. Army fly..."
1,"September 07, 1909",?,"Juvisy-sur-Orge, France",?,?,Air show,Wright Byplane,SC1,?,1 (passengers:0 crew:1),1 (passengers:0 crew:0),0,Eugene Lefebvre was the first pilot to ever be...
2,"July 12, 1912",06:30,"Atlantic City, New Jersey",Military - U.S. Navy,?,Test flight,Dirigible,?,?,5 (passengers:0 crew:5),5 (passengers:0 crew:5),0,First U.S. dirigible Akron exploded just offsh...
3,"August 06, 1913",?,"Victoria, British Columbia, Canada",Private,?,?,Curtiss seaplane,?,?,1 (passengers:0 crew:1),1 (passengers:0 crew:1),0,The first fatal airplane accident in Canada oc...
4,"September 09, 1913",c 18:30,Over the North Sea,Military - German Navy,?,?,Zeppelin L-1 (airship),?,?,20 (passengers:? crew:?),14 (passengers:? crew:?),0,The airship flew into a thunderstorm and encou...


In [9]:
final_df.tail()

Unnamed: 0,date,time,location,operator,flight_number,route,aircraft_type,registration,cn_ln,aboard,fatalities,ground,summary
15,"September 28, 2018",1010,"Chuuk, Micronesia",Air Niugini,?,Pohnpei - Chuuk,Boeing 737-8BK,P2-PXE,33024/1688,47 (passengers:35 crew:12),1 (passengers:1 crew:0),0,The aircraft was approaching for a landing at ...
16,"October 29, 2018",631,"Off Jakarta, Indonesia",Lion Air,610,Jakarta - Pangkal Pinang,Boeing 737-MAX 8,PK-LQP,43000/7058,189 (passengers:181 crew:8),189 (passengers:181 crew:8),0,"The airliner crashed into the Jakarta Sea, 13 ..."
17,"November 06, 2018",253,"Georgetown, Guyana",Fly Jamaica Airways,?,Georgetown - Toronto,Boeing 757-N23,N524AT,30233/895,128 (passengers:120 crew:8),1 (passengers:1 crew:0),0,"After taking off and reaching FL200, the crew ..."
18,"November 18, 2018",2300,"Near Mandan, North Dakota",Metro Area Ambulance Services,?,Bismark - Sloulin Field,Cessna 441 Conquest II,N441CX,441-0305,3 (passengers:2 crew:1),3 (passengers:2 crew:1),0,The air ambulance en route to pick up a patien...
0,"January 14, 2019",830,"Karaj, Iran",Saha Air,?,Bishkek - Payam,Boeing 707-3J9C,EP-CPP,21128/917,16 (passengers:13 crew:3),15 (passengers:13 crew:2),0,The cargo plane was operated by the Iranian Ai...


In [11]:
# The yearly frames were concatenated with their own 0-based indices, so
# rebuild one running index and discard the old one.
final_df = final_df.reset_index(drop=True)
# index=False keeps the index out of the CSV file.
final_df.to_csv("plane_crash_data.csv", index = False)

In [12]:
final_df.columns

Index(['date', 'time', 'location', 'operator', 'flight_number', 'route',
       'aircraft_type', 'registration', 'cn_ln', 'aboard', 'fatalities',
       'ground', 'summary'],
      dtype='object')

## Read data from CSV

In [2]:
df = pd.read_csv("plane_crash_data.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5242 entries, 0 to 5241
Data columns (total 13 columns):
date             5242 non-null object
time             5242 non-null object
location         5242 non-null object
operator         5242 non-null object
flight_number    5242 non-null object
route            5242 non-null object
aircraft_type    5242 non-null object
registration     5242 non-null object
cn_ln            5242 non-null object
aboard           5242 non-null object
fatalities       5242 non-null object
ground           5242 non-null object
summary          5242 non-null object
dtypes: object(13)
memory usage: 532.5+ KB


In [3]:
df.describe()

Unnamed: 0,date,time,location,operator,flight_number,route,aircraft_type,registration,cn_ln,aboard,fatalities,ground,summary
count,5242,5242,5242,5242,5242,5242,5242,5242,5242,5242,5242,5242,5242
unique,4742,1287,4327,2633,808,3529,2557,4888,3781,982,871,52,4876
top,"February 28, 1973",?,"Moscow, Russia",Aeroflot,?,?,Douglas DC-3,?,?,2 (passengers:0 crew:2),1 (passengers:0 crew:1),0,?
freq,4,1917,16,217,4043,1338,310,321,1092,224,272,4964,210


In [4]:
df.tail()

Unnamed: 0,date,time,location,operator,flight_number,route,aircraft_type,registration,cn_ln,aboard,fatalities,ground,summary
5237,"September 28, 2018",1010,"Chuuk, Micronesia",Air Niugini,?,Pohnpei - Chuuk,Boeing 737-8BK,P2-PXE,33024/1688,47 (passengers:35 crew:12),1 (passengers:1 crew:0),0,The aircraft was approaching for a landing at ...
5238,"October 29, 2018",631,"Off Jakarta, Indonesia",Lion Air,610,Jakarta - Pangkal Pinang,Boeing 737-MAX 8,PK-LQP,43000/7058,189 (passengers:181 crew:8),189 (passengers:181 crew:8),0,"The airliner crashed into the Jakarta Sea, 13 ..."
5239,"November 06, 2018",253,"Georgetown, Guyana",Fly Jamaica Airways,?,Georgetown - Toronto,Boeing 757-N23,N524AT,30233/895,128 (passengers:120 crew:8),1 (passengers:1 crew:0),0,"After taking off and reaching FL200, the crew ..."
5240,"November 18, 2018",2300,"Near Mandan, North Dakota",Metro Area Ambulance Services,?,Bismark - Sloulin Field,Cessna 441 Conquest II,N441CX,441-0305,3 (passengers:2 crew:1),3 (passengers:2 crew:1),0,The air ambulance en route to pick up a patien...
5241,"January 14, 2019",830,"Karaj, Iran",Saha Air,?,Bishkek - Payam,Boeing 707-3J9C,EP-CPP,21128/917,16 (passengers:13 crew:3),15 (passengers:13 crew:2),0,The cargo plane was operated by the Iranian Ai...


- Convert the date column into a proper date format.
- Issues with the time data:
    - the time data contains the character 'c' in 322 observations.
    - the time data has missing colons.
    - the time data uses "?" for missing values.
    - it contains many other unwanted characters.
- location can be split into region and country columns.
- operator can be split further into private, passenger service and military (the operation types will have to be identified first).
- flight number: missing flight numbers are represented by "?".
- route: can be split into source and destination locations.
- aboard can be split further into total_aboard, passenger and crew.
- fatalities can likewise be split further into passenger and crew.

### Convert the date column into Date Format

In [10]:
#using different functions
def get_month(month_string):
    """Return the month number, as a string, for an English month name.

    The lookup is case-insensitive: ``"January"`` and ``"JANUARY"`` both
    give ``"1"``; ``"december"`` gives ``"12"``.

    Raises
    ------
    KeyError
        If ``month_string`` is not a full English month name.
    """
    month_string = month_string.lower()
    return {
        'january': '1',
        'february': '2',
        'march':'3',
        'april':'4',
        'may':'5',
        'june':'6',
        'july':'7',
        'august':'8',
        'september':'9',
        'october':'10',
        'november':'11',
        'december':'12'
    }[month_string]

In [25]:
def convert_date(date_string):
    """Turn a date such as ``"September 17, 1908"`` into ``"9-17-1908"``.

    The input is split on single spaces: the first token is the month name
    (translated through ``get_month``), the second is the day with its
    trailing comma cut off, and the third is the year. The three pieces are
    joined with dashes as month-day-year.
    """
    tokens = date_string.split(" ")

    month_str = get_month(tokens[0])
    # Keep only what precedes the comma, e.g. "17," -> "17".
    day_val = tokens[1].split(",")[0]
    year_val = tokens[2]

    return "-".join([str(month_str), str(day_val), str(year_val)])


In [31]:
# NOTE(review): `test_df` is not defined in any visible cell - presumably a
# copy of `df` made in a removed cell; confirm and define it before this cell.
# Adds a 'date_new' column holding each date rewritten by convert_date.
test_df['date_new'] = test_df['date'].apply(convert_date)

In [43]:
#https://discuss.analyticsvidhya.com/t/how-to-convert-string-to-date-in-pandas-dataframe-using-python/17674
s=pd.Series(["Jan.3,2017", "feb.4,2016", "mar.2,2017", "apr.3,2015" , "apr.3,2016"])
sf=pd.DataFrame(s, columns =["date_col"])
print (pd.to_datetime(sf.date_col, format="%b.%d,%Y"))

0   2017-01-03
1   2016-02-04
2   2017-03-02
3   2015-04-03
4   2016-04-03
Name: date_col, dtype: datetime64[ns]


In [48]:
s = test_df['date']
#sf = pd.DataFrame(s, columns = ['date_col'])
#test_df["new_date1"] = 
#print(pd.to_datetime(s,format = "%B %d, %Y"))

0      1908-09-17
1      1909-09-07
2      1912-07-12
3      1913-08-06
4      1913-09-09
5      1913-10-17
6      1915-03-05
7      1915-09-03
8      1916-07-28
9      1916-09-24
10     1916-10-01
11     1916-11-21
12     1916-11-28
13     1917-03-04
14     1917-03-30
15     1917-05-14
16     1917-06-14
17     1917-06-17
18     1917-08-21
19     1917-10-20
20     1918-04-07
21     1918-05-10
22     1918-08-11
23     1918-12-16
24     1919-05-25
25     1919-07-19
26     1919-08-02
27     1919-10-02
28     1919-10-14
29     1919-10-20
          ...    
5212   2017-05-05
5213   2017-05-15
5214   2017-05-27
5215   2017-06-07
5216   2017-06-28
5217   2017-07-10
5218   2017-10-14
5219   2017-11-15
5220   2017-12-13
5221   2017-12-31
5222   2018-01-29
5223   2018-02-10
5224   2018-02-11
5225   2018-02-18
5226   2018-03-06
5227   2018-03-11
5228   2018-03-12
5229   2018-03-17
5230   2018-04-11
5231   2018-04-17
5232   2018-05-02
5233   2018-05-18
5234   2018-06-05
5235   2018-07-10
5236   201

In [3]:
# use this date conversion **********************************
s = test_df['date']
test_df['new_date'] = pd.to_datetime(s,format = "%B %d, %Y")

In [4]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5242 entries, 0 to 5241
Data columns (total 14 columns):
date             5242 non-null object
time             5242 non-null object
location         5242 non-null object
operator         5242 non-null object
flight_number    5242 non-null object
route            5242 non-null object
aircraft_type    5242 non-null object
registration     5242 non-null object
cn_ln            5242 non-null object
aboard           5242 non-null object
fatalities       5242 non-null object
ground           5242 non-null object
summary          5242 non-null object
new_date         5242 non-null datetime64[ns]
dtypes: datetime64[ns](1), object(13)
memory usage: 573.4+ KB


## Do this last
### The time data uses "?" for missing values.
### Convert these to NaN

In [24]:
# count the number of observations with "?"
test_df.query('time == "?"').shape[0]

1917

In [25]:
test_df['time'] = test_df['time'].replace('?', np.nan)

In [26]:
# check the dataset
test_df.query('time == "?"').shape[0]

0

### time data also contains the character 'c' in 322 observations.

In [5]:
#pattern = re.compile('^\d{2}\:\d{2}$') 
def check_pattern(time_val):
    """Return True when ``time_val`` is a string of the form ``DD:DD``.

    Only the shape is checked (two digits, a colon, two digits); the value is
    not range-checked, so ``"99:99"`` also passes. Anything that is not a
    string (for example NaN for a missing time) gives False.
    """
    if not isinstance(time_val, str):
        return False
    # Raw string: the backslashes belong to the regex, not to Python escapes.
    pattern = re.compile(r'^\d{2}:\d{2}$')
    return bool(pattern.match(time_val))

In [6]:
# lets check if the time data has the pattern 00:00
pattern_check = df['time'].apply(check_pattern)

In [7]:
pattern_check[pattern_check == False].count()

2436

In [9]:
#test_df.query('"c" in time').shape[0]
df[df['time'].str.contains("c")].shape[0]

322

In [10]:
df[df['time'].str.contains("c: ")].shape[0]

2

In [11]:
df[df['time'].str.contains("Z")].shape[0]

47

In [12]:
df[df['time'] == "?"].shape[0]

1917

The character "c" appears at the start of some time values, which might be due to a data-entry error. Either way, let's remove it from these observations.

In [110]:
#ignore this
ignore_list = ["c:","c: ","c","c ","z","Z"," "]
time_data = "c: 2:00"
for tag in ignore_list:
    if tag == 'z' or tag == 'Z':
        time_data = time_data.replace(tag, '')
    else:
        time_data = time_data.replace(tag, '0')
print(time_data)

002:00


In [171]:
#ignore this
time_data = "2:00"
if len(time_data) < 5:
    add_zero = 5 - len(time_data)
    time_data = time_data.zfill(5)
    print(time_data)
    print(add_zero)

#my_string = "2.00"
#print(my_string.zfill(5))

02:00
1


In [13]:
# function to replace "?", remove "c" and to add ":" for all the missing : in time data

def remove_c(time_data):
    """Normalise a raw time string towards the ``HH:MM`` shape.

    Steps, in order:
      1. surrounding whitespace is stripped;
      2. the missing-value marker ``"?"`` becomes ``"00:00"``;
      3. if there is no colon, one is inserted after the first two characters;
      4. values shorter than five characters are left-padded with zeros;
      5. ``"c:"``, ``"c"`` and spaces are each replaced by ``"0"``, while
         ``"z"`` / ``"Z"`` are deleted;
      6. only the last five characters are returned.
    """
    # (tag, replacement) pairs, applied strictly in this order.
    substitutions = (
        ("c:", '0'),
        ("c: ", '0'),
        ("c", '0'),
        ("c ", '0'),
        ("z", ''),
        ("Z", ''),
        (" ", '0'),
    )

    cleaned = time_data.strip()

    if cleaned == "?":
        cleaned = "00:00"

    if ":" not in cleaned:
        cleaned = cleaned[:2] + ':' + cleaned[2:]

    # zfill leaves strings of five or more characters untouched.
    cleaned = cleaned.zfill(5)

    for tag, replacement in substitutions:
        cleaned = cleaned.replace(tag, replacement)

    return cleaned[-5:]

In [14]:
df['time'] = df['time'].apply(remove_c)

In [15]:
test = df['time'].apply(check_pattern)

In [16]:
test[test == False].count()

12

In [17]:
test_index = test[test == False].index

In [18]:
df['time'].iloc[test_index]

398     10;00
549     13;00
552     4:;10
644     12;30
841     6:;15
905     3:;00
974     2:;00
1782    2:"20
2029    8:.40
4706    6:;30
4791    6:;30
5039    0:900
Name: time, dtype: object

In [19]:
def remove_extra_characters(time_data):
    """Repair stray punctuation around the colon of a time string.

    The sequences ``:;``, ``:.``, ``:"`` and a lone ``;`` are each replaced by
    a single ``:`` (in that order); the result is then left-padded with zeros
    to at least five characters.
    """
    repaired = (
        time_data
        .replace(":;", ':')
        .replace(":.", ':')
        .replace(":\"", ':')
        .replace(";", ':')
    )
    # zfill leaves strings of five or more characters untouched.
    return repaired.zfill(5)

In [20]:
df['time'] = df['time'].apply(remove_extra_characters)

In [21]:
t2 = df['time'].apply(check_pattern)

In [22]:
t2_index = t2[t2 == False].index

In [23]:
df['time'].iloc[t2_index]

5039    0:900
Name: time, dtype: object

Now we can clean the value at index 5039. Since a time cannot have 90 minutes, the value appears to have been entered incorrectly; it should most likely be 09:00.

In [24]:
# Assign through a single indexer. `df.iloc[5039]['time'] = ...` first builds
# a temporary row object and writes to that, so the change may never reach df.
df.iloc[5039, df.columns.get_loc('time')] = '09:00'

In [25]:
df.iloc[5039]['time']

'09:00'

In [7]:
# ignore this
# Raw strings: the backslashes belong to the regex, not to Python escapes.
pattern = re.compile(r'^\d{2}:\d{2}$')
def check_pattern(time_val):
    """Return True when ``time_val`` is a string of the form ``DD:DD``.

    Only the shape is checked (two digits, a colon, two digits); the value is
    not range-checked. Anything that is not a string gives False.
    """
    if not isinstance(time_val, str):
        return False
    # Compiled locally so that later re-assignments of the notebook-level
    # name `pattern` cannot change what this function checks.
    pattern = re.compile(r'^\d{2}:\d{2}$')
    return bool(pattern.match(time_val))

### location can be split into region and country columns.

In [330]:
df.sample(5)

Unnamed: 0,date,time,location,operator,flight_number,route,aircraft_type,registration,cn_ln,aboard,fatalities,ground,summary
2459,"May 25, 1971",16:26,"Santiago, Chile",\tAerolineas Flecha Austral - ALFA,?,?,\tCurtiss C-46,\tCC-CAZ,?,4 (passengers:2 crew:2),4 (passengers:2 crew:2),0,The cargo plane lost an engine while taking of...
3512,"January 30, 1987",00:00,"Off Monrovia, Liberia",Air Reconnaissance Unit of Liberia,?,?,Cessna U-27A Caravan I,ARU-021,208-00044,18 (passengers:? crew:?),16 (passengers:? crew:?),0,?
5135,"March 08, 2014",02:41,South Indian Ocean,Malaysia Airlines,370,Kuala Lumpur - Beijing,Boeing 777-200,9M-MRO,28420/404,239 (passengers:227 crew:12),239 (passengers:227 crew:12),0,Radar contact was lost with the aircraft 2 hou...
2687,"September 07, 1974",00:00,"Tandjung-karang, Indonesia",Garuda Indonesia Airlines,?,?,Fokker F-27 Friendship 600,PK-GFJ,10422,39 (passengers:36 crew:3),36 (passengers:33 crew:3),0,Crashed into a building while landing during p...
5118,"November 09, 2013",10:30,"Pujungan, Malinau district, Borneo",Military - Indonesian Army,?,Tarakan - Malinau,Mil- Mi-17B-5,?,?,21 (passengers:13 crew:8),13 (passengers:7 crew:6),0,"After losing power, the helicopter crashed int..."


### First remove the escape characters from the location column, then split it into region and country

In [101]:
df[df['location'] == "?"].shape[0]

5

In [5]:
def remove_escape_characters(loc_string):
    """Return ``loc_string`` with all newline, tab and carriage-return characters deleted."""
    return (
        loc_string
        .replace("\n", '')
        .replace("\t", '')
        .replace("\r", '')
    )

In [91]:
df['location'] = df['location'].apply(remove_escape_characters)

In [421]:
#ignore this
t_Str = "August 09, 2013"
pattern = re.compile('^[a-zA-Z]{1,20}\s\d{2}\,\s\d{4}$')
result = pattern.match(t_Str)
print(result)

<_sre.SRE_Match object; span=(0, 15), match='August 09, 2013'>


In [332]:
#ignore this
pattern = re.compile('^[a-zA-Z\, \.\-]{1,50}, [a-zA-Z ]{1,50}$')
result = pattern.match("Mt, Taylor, near Grants")
print(result)

<_sre.SRE_Match object; span=(0, 23), match='Mt, Taylor, near Grants'>


In [419]:
#ignore this
t10 = "Mt, Taylor, near Grants, test2,tyriri12"
n = t10.count(",")
str1 = ""
for i in range(n):
    str1 = str1 + t10.split(",")[i].strip()+ " "
str2 = t10.split(",")[n].strip()
print(str1)
print(str2)

Mt Taylor near Grants test2 
tyriri12


In [97]:
#mystr.count("!") == 2:

def split_region_country(location):
    """Split a location string into a region and a country.

    The text after the last comma is the country; everything before it is the
    region, with the comma-separated pieces joined by single spaces. All
    pieces are stripped of surrounding whitespace, so the same country gets
    the same label however many commas the location has.

    Special cases:
      * ``"?"`` (missing location) gives region ``"-"`` and country ``"-"``;
      * a location without a comma is used as the region, with country ``"-"``.

    Returns
    -------
    pandas.Series
        Two elements: position 0 is the region, position 1 the country.
    """
    location = location.strip()

    if location == "?":
        return pd.Series(["-", "-"])

    parts = [part.strip() for part in location.split(",")]

    if len(parts) == 1:
        return pd.Series([parts[0], "-"])

    # Empty pieces (e.g. from a doubled comma) are skipped so the region
    # never contains double or trailing spaces.
    region = " ".join(part for part in parts[:-1] if part)
    country = parts[-1]
    return pd.Series([region, country])

# not able to return tuples : https://stackoverflow.com/questions/23690284/pandas-apply-function-that-returns-multiple-values-to-rows-in-pandas-dataframe

In [98]:
# Split every location once. split_region_country returns a two-element
# Series, so apply() yields a two-column frame: 0 = region, 1 = country.
region_country = df['location'].apply(split_region_country)
df['region'], df['country'] = region_country[0], region_country[1]

In [99]:
df.head()

Unnamed: 0,date,time,location,operator,flight_number,route,aircraft_type,registration,cn_ln,aboard,fatalities,ground,summary,region,country
0,"September 17, 1908",17:18,"Fort Myer, Virginia",Military - U.S. Army,?,Demonstration,Wright Flyer III,?,1,2 (passengers:1 crew:1),1 (passengers:1 crew:0),0,"During a demonstration flight, a U.S. Army fly...",Fort Myer,Virginia
1,"September 07, 1909",00:00,"Juvisy-sur-Orge, France",?,?,Air show,Wright Byplane,SC1,?,1 (passengers:0 crew:1),1 (passengers:0 crew:0),0,Eugene Lefebvre was the first pilot to ever be...,Juvisy-sur-Orge,France
2,"July 12, 1912",06:30,"Atlantic City, New Jersey",Military - U.S. Navy,?,Test flight,Dirigible,?,?,5 (passengers:0 crew:5),5 (passengers:0 crew:5),0,First U.S. dirigible Akron exploded just offsh...,Atlantic City,New Jersey
3,"August 06, 1913",00:00,"Victoria, British Columbia, Canada",Private,?,?,Curtiss seaplane,?,?,1 (passengers:0 crew:1),1 (passengers:0 crew:1),0,The first fatal airplane accident in Canada oc...,Victoria British Columbia,Canada
4,"September 09, 1913",18:30,Over the North Sea,Military - German Navy,?,?,Zeppelin L-1 (airship),?,?,20 (passengers:? crew:?),14 (passengers:? crew:?),0,The airship flew into a thunderstorm and encou...,Over the North Sea,-


In [100]:
ch = df.query('region == "?"')
ch

Unnamed: 0,date,time,location,operator,flight_number,route,aircraft_type,registration,cn_ln,aboard,fatalities,ground,summary,region,country


In [383]:
#ignore this
def check_pattern_location(loc_string):
    """Return True when ``loc_string`` looks like ``"<region>, <country>"``.

    The region part is 1-50 characters drawn from letters, digits, space and
    ``/ ~ ' , . -``; it must be followed by a comma and a space, then a
    country part of 1-50 letters, digits or spaces running to the end of the
    string.
    """
    # Raw string: the backslash belongs to the regex, not to a Python escape.
    pattern = re.compile(r"^[a-zA-Z0-9/~', .\-]{1,50}, [a-zA-Z0-9 ]{1,50}$")
    result = pattern.match(loc_string)
    return bool(result)

In [405]:
df[df['location'] == "?"].shape[0]

5

In [384]:
t5 = df['location'].apply(check_pattern_location)

In [385]:
t5_index = t5[t5 == False].index

In [403]:
df.iloc[t5_index]['location'].sample(5)

683                         China
2798                 Havana. Cuba
1137    Mt. Bukit, Besar,Thailand
590                  Sea of Japan
4              Over the North Sea
Name: location, dtype: object

- operator contains some escape characters.
- flight number: missing flight numbers are represented by "?".
- route: can be split into source and destination locations.
- aboard can be split further into total_aboard, passenger and crew.
- fatalities can likewise be split further into passenger and crew.

### Operator contains some escape characters.

In [86]:
df.operator.sample(5)

1027    Tech - Aeronautical Explotiation
3339           Military - U.S. Air Force
3645                       Burma Airways
3710                     Federal Express
2914                                TABA
Name: operator, dtype: object

In [89]:
df.sample(5)

Unnamed: 0,date,time,location,operator,flight_number,route,aircraft_type,registration,cn_ln,aboard,fatalities,ground,summary
1629,"October 15, 1958",00:00,"Villa Montes, Bolivia",TAM (Bolivia),?,Fortin Campero - Tarija,Douglas C-47A,TAM-03,13839,20 (passengers:17 crew:3),20 (passengers:17 crew:3),0,Crashed into mountain while en route.
1591,"April 02, 1958",00:00,"Quito, Ecuador",Transportes Aereos Orientales,?,Quito - Esmeraldas,Junkers JU-52/3m,HC-SND,J5109,14 (passengers:12 crew:2),3 (passengers:3 crew:0),0,Crashed after experiencing engine trouble.
2143,"August 03, 1966",00:00,"Benito, Bolivia",Lloyd Aéreo Boliviano,?,Riberalta - Cochabamba,Curtiss C-46D,CP-730,33457,3 (passengers:0 crew:3),3 (passengers:0 crew:3),0,The cargo aircraft was not being flown at the...
1612,"August 11, 1958",00:00,"Near Belem, Brazil",\tLoide Aéreo Nacional,?,Manaus - Belem,Douglas DC-4,PP-LEQ,10544,11 (passengers:5 crew:6),10 (passengers:4 crew:6),0,Crashed on approach 10 minutes from the airpor...
3673,"June 27, 1989",00:00,"Kaohsiung, Taiwan",Formosa Airlines,?,Kaohsiung - Chi-mei,Cessna 404 Titan Courier II,B-12206,404-0418,13 (passengers:? crew:?),12 (passengers:? crew:?),0,Crashed into houses on takeoff and exploded.


In [160]:
df['operator'].sample(5)

4823            Safe Air Complany
31      Aircraft Travel Transport
5152         Ukraine Air Alliance
4512                    Agco Corp
1997          Commercial Air Taxi
Name: operator, dtype: object

In [6]:
# remove escape characters from operator
df['operator'] = df['operator'].apply(remove_escape_characters)

### create a new column to specify operator type: Military or Commercial

In [122]:
# function to label an operator as a military or a commercial type
def check_type(x):
    """Return "Military" if ``x`` contains "military" (any letter case), else "Commercial"."""
    if "military" in x.lower():
        return "Military"
    return "Commercial"

In [170]:
# Label each operator "Military" when its name contains "military" (any case),
# otherwise "Commercial". map() returns a one-shot iterator: once it has been
# consumed it yields nothing more, so this cell must be re-run before reuse.
type_check = map(lambda x: "Military" if "military" in x.lower() else "Commercial", df['operator'])

In [171]:
# Derive the label straight from the operator column: "Military" when the
# name contains "military" (any case), otherwise "Commercial". Mapping the
# column directly keeps the result on df's own index and lets this cell be
# re-run on its own without depending on a one-shot iterator from another cell.
df['operator_type'] = df['operator'].map(lambda x: "Military" if "military" in x.lower() else "Commercial")

In [185]:
df[['operator','operator_type']].sample(5)

Unnamed: 0,operator,operator_type
4792,Military - Russian Army,Military
5007,Sky Dive New Zealand,Commercial
854,China National Aviation Corporation,Commercial
3995,TAME,Commercial
2813,Philippine Air Lines,Commercial


### flight number: missing flight numbers are represented by "?"

In [3]:
df['flight_number'].value_counts()

?           4043
-             63
1             11
4              7
301            6
6              6
201            6
21             6
101            6
901            5
701            5
601            5
200            5
3              5
706            5
202            5
7              5
542            4
191            4
11             4
703            4
114            4
205            4
261            4
902            4
610            4
105            4
8              4
214            4
10             4
            ... 
5050           1
4844-C         1
H926           1
1994           1
493            1
1363           1
763/1907       1
280D           1
805            1
441            1
1-10           1
183            1
59             1
123            1
201/8          1
228            1
661            1
A-513          1
1-6-6A         1
3456           1
434            1
E-46           1
615            1
511            1
1802           1
716 / -        1
035            1
523           

In [20]:
# Swap the "?" placeholder for "-" so every missing flight number shares one marker.
df['flight_number'] = df['flight_number'].mask(df['flight_number'] == "?", "-")

In [21]:
df['flight_number'].value_counts()

-             4106
1               11
4                7
6                6
101              6
21               6
201              6
301              6
701              5
3                5
7                5
601              5
200              5
202              5
706              5
901              5
542              4
105              4
11               4
703              4
114              4
205              4
214              4
10               4
8                4
610              4
112              4
304              4
191              4
261              4
              ... 
2174             1
731              1
4815             1
7425             1
18               1
199              1
3132             1
3597             1
163              1
181              1
557              1
518              1
 SU-065          1
277              1
4509             1
636              1
390              1
42               1
385              1
93               1
148              1
6427        

In [22]:
df.head()

Unnamed: 0,date,time,location,operator,flight_number,route,aircraft_type,registration,cn_ln,aboard,fatalities,ground,summary
0,"September 17, 1908",17:18,"Fort Myer, Virginia",Military - U.S. Army,-,Demonstration,Wright Flyer III,?,1,2 (passengers:1 crew:1),1 (passengers:1 crew:0),0,"During a demonstration flight, a U.S. Army fly..."
1,"September 07, 1909",?,"Juvisy-sur-Orge, France",?,-,Air show,Wright Byplane,SC1,?,1 (passengers:0 crew:1),1 (passengers:0 crew:0),0,Eugene Lefebvre was the first pilot to ever be...
2,"July 12, 1912",06:30,"Atlantic City, New Jersey",Military - U.S. Navy,-,Test flight,Dirigible,?,?,5 (passengers:0 crew:5),5 (passengers:0 crew:5),0,First U.S. dirigible Akron exploded just offsh...
3,"August 06, 1913",?,"Victoria, British Columbia, Canada",Private,-,?,Curtiss seaplane,?,?,1 (passengers:0 crew:1),1 (passengers:0 crew:1),0,The first fatal airplane accident in Canada oc...
4,"September 09, 1913",c 18:30,Over the North Sea,Military - German Navy,-,?,Zeppelin L-1 (airship),?,?,20 (passengers:? crew:?),14 (passengers:? crew:?),0,The airship flew into a thunderstorm and encou...
