## Plane Crash Data Exploration and Analysis

#### Budhajit Roy Chanamthabam

In [1]:
# import necessary libraries
#https://www.kaggle.com/budhajit/plane-crash-information-dataset

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import requests
import json
from bs4 import BeautifulSoup
import warnings
import re
import datetime
warnings.filterwarnings('ignore')

In [2]:
def set_data(year, dates):
    date = []
    time = []
    location = []
    operator = []
    flight_number = []
    route = []
    aircraft_type = []
    registration = []
    cn_ln = []
    aboard = []
    fatalities = []
    ground = []
    summary = []
    final_data = {}

    for i in range(len(dates)):
        url1 = "http://www.planecrashinfo.com/"+str(year)+"/"+str(year)+"-"+str(i+1)+".htm"
        #print(url)
        r1 = requests.get(url1)
        htm_doc = r1.text  

        soup1 = BeautifulSoup(htm_doc)
        td_tags = soup1.find_all('td')

        test = []
        for items in td_tags:
            test.append(items.text)

        # add the data details to their corresponding lists
        date.append(test[3])
        time.append(test[5])
        location.append(test[7])
        operator.append(test[9])
        flight_number.append(test[11])
        route.append(test[13])
        aircraft_type.append(test[15])
        registration.append(test[17])
        cn_ln.append(test[19])
        aboard.append(test[21])
        fatalities.append(test[23])
        ground.append(test[25])
        summary.append(test[27])
        
    final_data['date'] = date
    final_data['time'] = time
    final_data['location'] = location
    final_data['operator'] = operator
    final_data['flight_number'] = flight_number
    final_data['route'] = route
    final_data['aircraft_type'] = aircraft_type
    final_data['registration'] = registration
    final_data['cn_ln'] = cn_ln
    final_data['aboard'] = aboard
    final_data['fatalities'] = fatalities
    final_data['ground'] = ground
    final_data['summary'] = summary
    
    return final_data

In [4]:
# setting up the data for each year and creating the dataframe

for year in range(1920,2020):    
    url = "http://www.planecrashinfo.com/"+str(year)+"/"+str(year)+".htm"
    r = requests.get(url)
    html_doc = r.text
    soup = BeautifulSoup(html_doc)
    a_tags = soup.find_all('a')
    dates = []
    for link in a_tags:
        #print(link.text) # this returns the text
        dates.append(link.text)
        #print(link.get('href')) # this returns the link
    dates = dates[:-1]
    year_df = set_data(year,dates)
    if year!= 1920:
        temp_df = pd.DataFrame(year_df)
        final_df = pd.concat([final_df,temp_df])
    else:
        final_df = pd.DataFrame(year_df)
    

In [5]:
final_df.shape

(5242, 13)

In [7]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5242 entries, 0 to 0
Data columns (total 13 columns):
date             5242 non-null object
time             5242 non-null object
location         5242 non-null object
operator         5242 non-null object
flight_number    5242 non-null object
route            5242 non-null object
aircraft_type    5242 non-null object
registration     5242 non-null object
cn_ln            5242 non-null object
aboard           5242 non-null object
fatalities       5242 non-null object
ground           5242 non-null object
summary          5242 non-null object
dtypes: object(13)
memory usage: 573.3+ KB


In [8]:
final_df.head()

Unnamed: 0,date,time,location,operator,flight_number,route,aircraft_type,registration,cn_ln,aboard,fatalities,ground,summary
0,"September 17, 1908",17:18,"Fort Myer, Virginia",Military - U.S. Army,?,Demonstration,Wright Flyer III,?,1,2 (passengers:1 crew:1),1 (passengers:1 crew:0),0,"During a demonstration flight, a U.S. Army fly..."
1,"September 07, 1909",?,"Juvisy-sur-Orge, France",?,?,Air show,Wright Byplane,SC1,?,1 (passengers:0 crew:1),1 (passengers:0 crew:0),0,Eugene Lefebvre was the first pilot to ever be...
2,"July 12, 1912",06:30,"Atlantic City, New Jersey",Military - U.S. Navy,?,Test flight,Dirigible,?,?,5 (passengers:0 crew:5),5 (passengers:0 crew:5),0,First U.S. dirigible Akron exploded just offsh...
3,"August 06, 1913",?,"Victoria, British Columbia, Canada",Private,?,?,Curtiss seaplane,?,?,1 (passengers:0 crew:1),1 (passengers:0 crew:1),0,The first fatal airplane accident in Canada oc...
4,"September 09, 1913",c 18:30,Over the North Sea,Military - German Navy,?,?,Zeppelin L-1 (airship),?,?,20 (passengers:? crew:?),14 (passengers:? crew:?),0,The airship flew into a thunderstorm and encou...


In [9]:
final_df.tail()

Unnamed: 0,date,time,location,operator,flight_number,route,aircraft_type,registration,cn_ln,aboard,fatalities,ground,summary
15,"September 28, 2018",1010,"Chuuk, Micronesia",Air Niugini,?,Pohnpei - Chuuk,Boeing 737-8BK,P2-PXE,33024/1688,47 (passengers:35 crew:12),1 (passengers:1 crew:0),0,The aircraft was approaching for a landing at ...
16,"October 29, 2018",631,"Off Jakarta, Indonesia",Lion Air,610,Jakarta - Pangkal Pinang,Boeing 737-MAX 8,PK-LQP,43000/7058,189 (passengers:181 crew:8),189 (passengers:181 crew:8),0,"The airliner crashed into the Jakarta Sea, 13 ..."
17,"November 06, 2018",253,"Georgetown, Guyana",Fly Jamaica Airways,?,Georgetown - Toronto,Boeing 757-N23,N524AT,30233/895,128 (passengers:120 crew:8),1 (passengers:1 crew:0),0,"After taking off and reaching FL200, the crew ..."
18,"November 18, 2018",2300,"Near Mandan, North Dakota",Metro Area Ambulance Services,?,Bismark - Sloulin Field,Cessna 441 Conquest II,N441CX,441-0305,3 (passengers:2 crew:1),3 (passengers:2 crew:1),0,The air ambulance en route to pick up a patien...
0,"January 14, 2019",830,"Karaj, Iran",Saha Air,?,Bishkek - Payam,Boeing 707-3J9C,EP-CPP,21128/917,16 (passengers:13 crew:3),15 (passengers:13 crew:2),0,The cargo plane was operated by the Iranian Ai...


In [11]:
final_df = final_df.reset_index(drop=True)
final_df.to_csv("plane_crash_data.csv", index = False)

In [12]:
final_df.columns

Index(['date', 'time', 'location', 'operator', 'flight_number', 'route',
       'aircraft_type', 'registration', 'cn_ln', 'aboard', 'fatalities',
       'ground', 'summary'],
      dtype='object')

## Read data from CSV

In [2]:
test_df = pd.read_csv("plane_crash_data.csv")
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5242 entries, 0 to 5241
Data columns (total 13 columns):
date             5242 non-null object
time             5242 non-null object
location         5242 non-null object
operator         5242 non-null object
flight_number    5242 non-null object
route            5242 non-null object
aircraft_type    5242 non-null object
registration     5242 non-null object
cn_ln            5242 non-null object
aboard           5242 non-null object
fatalities       5242 non-null object
ground           5242 non-null object
summary          5242 non-null object
dtypes: object(13)
memory usage: 532.5+ KB


In [3]:
test_df.describe()

Unnamed: 0,date,time,location,operator,flight_number,route,aircraft_type,registration,cn_ln,aboard,fatalities,ground,summary
count,5242,5242,5242,5242,5242,5242,5242,5242,5242,5242,5242,5242,5242
unique,4742,1287,4327,2633,808,3529,2557,4888,3781,982,871,52,4876
top,"August 31, 1988",?,"Moscow, Russia",Aeroflot,?,?,Douglas DC-3,?,?,2 (passengers:0 crew:2),1 (passengers:0 crew:1),0,?
freq,4,1917,16,217,4043,1338,310,321,1092,224,272,4964,210


In [7]:
test_df.tail()

Unnamed: 0,date,time,location,operator,flight_number,route,aircraft_type,registration,cn_ln,aboard,fatalities,ground,summary
5237,"September 28, 2018",1010,"Chuuk, Micronesia",Air Niugini,?,Pohnpei - Chuuk,Boeing 737-8BK,P2-PXE,33024/1688,47 (passengers:35 crew:12),1 (passengers:1 crew:0),0,The aircraft was approaching for a landing at ...
5238,"October 29, 2018",631,"Off Jakarta, Indonesia",Lion Air,610,Jakarta - Pangkal Pinang,Boeing 737-MAX 8,PK-LQP,43000/7058,189 (passengers:181 crew:8),189 (passengers:181 crew:8),0,"The airliner crashed into the Jakarta Sea, 13 ..."
5239,"November 06, 2018",253,"Georgetown, Guyana",Fly Jamaica Airways,?,Georgetown - Toronto,Boeing 757-N23,N524AT,30233/895,128 (passengers:120 crew:8),1 (passengers:1 crew:0),0,"After taking off and reaching FL200, the crew ..."
5240,"November 18, 2018",2300,"Near Mandan, North Dakota",Metro Area Ambulance Services,?,Bismark - Sloulin Field,Cessna 441 Conquest II,N441CX,441-0305,3 (passengers:2 crew:1),3 (passengers:2 crew:1),0,The air ambulance en route to pick up a patien...
5241,"January 14, 2019",830,"Karaj, Iran",Saha Air,?,Bishkek - Payam,Boeing 707-3J9C,EP-CPP,21128/917,16 (passengers:13 crew:3),15 (passengers:13 crew:2),0,The cargo plane was operated by the Iranian Ai...


- convert date in to date format
- related to time data:
    - time data also contains the character 'c' in 322 observations.
    - time data has missing semi-colons.
    - time data has "?" for missing data
- location can be split into region and country columns.
- Operator can again be split into private, passenger service and military. (will have to find the operations first ).
- flight number: there are missing flight number informations represented by "?"
- route: can be split into : source and destination locations
- aboard can be further split into : total_aboard, passenger, crew
- fatalities can be again further split into passenger and crew.
- 

### Convert data into Date Format

In [10]:
#using different functions
def get_month(month_string):
    month_string = month_string.lower()
    return {
        'january': "1',
        'february': '2',
        'march':'3',
        'april':'4',
        'may':'5',
        'june':'6',
        'july':'7',
        'august':'8',
        'september':'9',
        'october':'10',
        'november':'11',
        'december':'12'
    }[month_string]

In [25]:
def convert_date(date_string):
    month_val = date_string.split(" ")[0]
    month_str = get_month(month_val)
    
    date_val = date_string.split(" ")[1].split(",")[0]
    year_val = date_string.split(" ")[2]
    
    final_date = str(month_str)+"-"+str(date_val)+"-"+str(year_val)
    
    return final_date


In [31]:
test_df['date_new'] = test_df['date'].apply(convert_date)

In [43]:
#https://discuss.analyticsvidhya.com/t/how-to-convert-string-to-date-in-pandas-dataframe-using-python/17674
s=pd.Series(["Jan.3,2017", "feb.4,2016", "mar.2,2017", "apr.3,2015" , "apr.3,2016"])
sf=pd.DataFrame(s, columns =["date_col"])
print (pd.to_datetime(sf.date_col, format="%b.%d,%Y"))

0   2017-01-03
1   2016-02-04
2   2017-03-02
3   2015-04-03
4   2016-04-03
Name: date_col, dtype: datetime64[ns]


In [48]:
s = test_df['date']
#sf = pd.DataFrame(s, columns = ['date_col'])
#test_df["new_date1"] = 
#print(pd.to_datetime(s,format = "%B %d, %Y"))

0      1908-09-17
1      1909-09-07
2      1912-07-12
3      1913-08-06
4      1913-09-09
5      1913-10-17
6      1915-03-05
7      1915-09-03
8      1916-07-28
9      1916-09-24
10     1916-10-01
11     1916-11-21
12     1916-11-28
13     1917-03-04
14     1917-03-30
15     1917-05-14
16     1917-06-14
17     1917-06-17
18     1917-08-21
19     1917-10-20
20     1918-04-07
21     1918-05-10
22     1918-08-11
23     1918-12-16
24     1919-05-25
25     1919-07-19
26     1919-08-02
27     1919-10-02
28     1919-10-14
29     1919-10-20
          ...    
5212   2017-05-05
5213   2017-05-15
5214   2017-05-27
5215   2017-06-07
5216   2017-06-28
5217   2017-07-10
5218   2017-10-14
5219   2017-11-15
5220   2017-12-13
5221   2017-12-31
5222   2018-01-29
5223   2018-02-10
5224   2018-02-11
5225   2018-02-18
5226   2018-03-06
5227   2018-03-11
5228   2018-03-12
5229   2018-03-17
5230   2018-04-11
5231   2018-04-17
5232   2018-05-02
5233   2018-05-18
5234   2018-06-05
5235   2018-07-10
5236   201

In [3]:
# use this date conversion **********************************
s = test_df['date']
test_df['new_date'] = pd.to_datetime(s,format = "%B %d, %Y")

In [4]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5242 entries, 0 to 5241
Data columns (total 14 columns):
date             5242 non-null object
time             5242 non-null object
location         5242 non-null object
operator         5242 non-null object
flight_number    5242 non-null object
route            5242 non-null object
aircraft_type    5242 non-null object
registration     5242 non-null object
cn_ln            5242 non-null object
aboard           5242 non-null object
fatalities       5242 non-null object
ground           5242 non-null object
summary          5242 non-null object
new_date         5242 non-null datetime64[ns]
dtypes: datetime64[ns](1), object(13)
memory usage: 573.4+ KB


## Do this in the last
### time data has "?" for missing data. 
### Convert it into nan

In [24]:
# count the number of observations with "?"
test_df.query('time == "?"').shape[0]

1917

In [25]:
test_df['time'] = test_df['time'].replace('?', np.nan)

In [26]:
# check the dataset
test_df.query('time == "?"').shape[0]

0

### time data also contains the character 'c' in 322 observations.

In [4]:
df = test_df.copy()

In [4]:
#test_df.query('"c" in time').shape[0]
test_df[test_df['time'].str.contains("c")].shape[0]

322

In [12]:
test_df[test_df['time'].str.contains("c")].sample(5)

Unnamed: 0,date,time,location,operator,flight_number,route,aircraft_type,registration,cn_ln,aboard,fatalities,ground,summary,new_date
1151,"March 23, 1951",c 23:50,Over the North Atlantic,Military - U.S. Air Force,?,Loring AFB - Lakeheath AFB,Douglas C-124A Globemaster,49-0244,43173,53 (passengers:48 crew:5),53 (passengers:48 crew:5),0,The aircraft may have suffered an catastrophic...,1951-03-23
476,"February 02, 1938",c 20:40,"Off San Clemente Island, California",Military - U.S. Navy / Military - U.S. Navy,?,?,Consolidated PBY-2 / Consolidated PBY-2,04062 / 0463,?,14 (passengers:0 crew:14),11 (passengers:0 crew:11),0,The two aircraft crashed in mid-air in heavy r...,1938-02-02
1038,"August 02, 1949",c 11:30,"Jaquirana, Brazil",Varig,?,Sao Paulo - Porto Alegre,Curtiss C-46D-10-CU,PP-VBI,33100,36 (passengers:30 crew:6),5 (passengers:4 crew:1),0,"Twenty minutes before arriving a Porto Alegre,...",1949-08-02
499,"July 24, 1938",c 13:00,Near Bogota Colombia,Military - Colombian Army,?,?,Curtiss-Wright Hawk,?,?,1 (passengers:0 crew:1),1 (passengers:0 crew:1),52,One aircraft from a squadron performing aeroba...,1938-07-24
237,"November 18, 1930",c: 2:00,"Techachapi Mountains, California",Pacific Air Transport,?,"Burbank, CA - Oakland, CA",Boeing 40,NC5340,?,3 (passengers:0 crew:3),3 (passengers:0 crew:3),?,Crashed into a mountainside at an altitude of ...,1930-11-18


Since it is observed that the character "c" is present in the start of the time data, might be due to data input error. Anyways lets remove this from the observations.

In [19]:
# function to replace "?", remove "c" and to add ":" for all the missing : in time data

def remove_c(time_data):
    time_data = time_data.strip()
    
    if time_data == "?":
        time_data = "00:00"
    
    if "c:" in time_data: #time_data.str.contains("c:"):
        time_data = time_data.replace("c:","")

    if "c " in time_data: #time_data.str.contains("c:"):
        time_data = time_data.replace("c ","")
        
    if "c" in time_data: #time_data.str.contains("c:"):
        time_data = time_data.replace("c","")
        
    if ":" not in time_data:
        new_str = time_data[:2] + ':' + time_data[2:]
    else:
        new_str = time_data       
        
    return new_str

In [20]:
test = df['time'].apply(remove_c)
print(test)

0       17:18
1       00:00
2       06:30
3       00:00
4       18:30
5       10:30
6       01:00
7       15:20
8       00:00
9       01:00
10      23:45
11      00:00
12      23:45
13      00:00
14      00:00
15      05:15
16      08:45
17      00:00
18      07:00
19      07:45
20      21:30
21      00:00
22      10:00
23      00:00
24      00:00
25      00:00
26      00:00
27      00:00
28      00:00
29      00:00
        ...  
5212    06:53
5213    15:30
5214    14:02
5215    13:35
5216    10:15
5217    16:00
5218    08:30
5219    08:00
5220    18:15
5221    12:16
5222    00:00
5223    17:31
5224    14:31
5225    09:26
5226    14:51
5227    19:08
5228    14:15
5229    11:20
5230    08:00
5231    10:04
5232    11:30
5233    12:08
5234    17:00
5235    07:44
5236    16:55
5237    10:10
5238    06:31
5239    02:53
5240    23:00
5241    08:30
Name: time, Length: 5242, dtype: object


In [21]:
not_match = []
t1 = test.apply(check_pattern)
t1

0       True
1       True
2       True
3       True
4       True
5       True
6       True
7       True
8       True
9       True
10      True
11      True
12      True
13      True
14      True
15      True
16      True
17      True
18      True
19      True
20      True
21      True
22      True
23      True
24      True
25      True
26      True
27      True
28      True
29      True
        ... 
5212    True
5213    True
5214    True
5215    True
5216    True
5217    True
5218    True
5219    True
5220    True
5221    True
5222    True
5223    True
5224    True
5225    True
5226    True
5227    True
5228    True
5229    True
5230    True
5231    True
5232    True
5233    True
5234    True
5235    True
5236    True
5237    True
5238    True
5239    True
5240    True
5241    True
Name: time, Length: 5242, dtype: bool

In [22]:
t1[t1 == False]

156     False
237     False
356     False
398     False
442     False
546     False
549     False
552     False
644     False
759     False
841     False
905     False
974     False
1033    False
1109    False
1166    False
1211    False
1240    False
1243    False
1325    False
1359    False
1407    False
1440    False
1475    False
1481    False
1511    False
1650    False
1659    False
1666    False
1679    False
        ...  
1925    False
1927    False
1941    False
1962    False
2024    False
2029    False
2086    False
2088    False
2172    False
2227    False
2366    False
2688    False
2815    False
3118    False
3432    False
3954    False
4371    False
4436    False
4508    False
4518    False
4528    False
4529    False
4692    False
4701    False
4706    False
4791    False
4833    False
4958    False
5039    False
5146    False
Name: time, Length: 72, dtype: bool

In [34]:
df.iloc[4692:4693,:]

Unnamed: 0,date,time,location,operator,flight_number,route,aircraft_type,registration,cn_ln,aboard,fatalities,ground,summary
4692,"November 27, 2004",11:20Z,"Near Bagram, Afghanstan",Presidental Airways Inc. - Air Taxi,-,"Bagram - Farah, Afghanstan",CASA 212-CC,N960BW,231,6 (passengers:3 crew:3),6 (passengers:3 crew:3),0,Crashed into a mountain 80 miles west of Bagra...


In [18]:
df['new_time'] = df['time'].apply(remove_c)
df.head(3)

Unnamed: 0,date,time,location,operator,flight_number,route,aircraft_type,registration,cn_ln,aboard,fatalities,ground,summary,new_time
0,"September 17, 1908",17:18,"Fort Myer, Virginia",Military - U.S. Army,?,Demonstration,Wright Flyer III,?,1,2 (passengers:1 crew:1),1 (passengers:1 crew:0),0,"During a demonstration flight, a U.S. Army fly...",17:18
1,"September 07, 1909",?,"Juvisy-sur-Orge, France",?,?,Air show,Wright Byplane,SC1,?,1 (passengers:0 crew:1),1 (passengers:0 crew:0),0,Eugene Lefebvre was the first pilot to ever be...,00:00
2,"July 12, 1912",06:30,"Atlantic City, New Jersey",Military - U.S. Navy,?,Test flight,Dirigible,?,?,5 (passengers:0 crew:5),5 (passengers:0 crew:5),0,First U.S. dirigible Akron exploded just offsh...,06:30


In [30]:
df['new_date_time'] = df['date'].str.strip() + " "+ df['new_time'].str.strip()
df.head()

Unnamed: 0,date,time,location,operator,flight_number,route,aircraft_type,registration,cn_ln,aboard,fatalities,ground,summary,new_time,new_date_time
0,"September 17, 1908",17:18,"Fort Myer, Virginia",Military - U.S. Army,?,Demonstration,Wright Flyer III,?,1,2 (passengers:1 crew:1),1 (passengers:1 crew:0),0,"During a demonstration flight, a U.S. Army fly...",17:18,"September 17, 1908 17:18"
1,"September 07, 1909",?,"Juvisy-sur-Orge, France",?,?,Air show,Wright Byplane,SC1,?,1 (passengers:0 crew:1),1 (passengers:0 crew:0),0,Eugene Lefebvre was the first pilot to ever be...,00:00,"September 07, 1909 00:00"
2,"July 12, 1912",06:30,"Atlantic City, New Jersey",Military - U.S. Navy,?,Test flight,Dirigible,?,?,5 (passengers:0 crew:5),5 (passengers:0 crew:5),0,First U.S. dirigible Akron exploded just offsh...,06:30,"July 12, 1912 06:30"
3,"August 06, 1913",?,"Victoria, British Columbia, Canada",Private,?,?,Curtiss seaplane,?,?,1 (passengers:0 crew:1),1 (passengers:0 crew:1),0,The first fatal airplane accident in Canada oc...,00:00,"August 06, 1913 00:00"
4,"September 09, 1913",c 18:30,Over the North Sea,Military - German Navy,?,?,Zeppelin L-1 (airship),?,?,20 (passengers:? crew:?),14 (passengers:? crew:?),0,The airship flew into a thunderstorm and encou...,18:30,"September 09, 1913 18:30"


In [33]:
import datetime
test_1 = datetime.datetime.strptime(df['new_date_time'].str, '%B %d, %Y, %H:%M')
test_1

TypeError: strptime() argument 1 must be str, not StringMethods

In [40]:
import re
pattern = re.compile('^\d{2}\:\d{2}$')
result = pattern.match(":00:33")
print(bool(result))

False


In [7]:
pattern = re.compile('^\d{2}\:\d{2}$') 
def check_pattern(time_val):
    pattern = re.compile('^\d{2}\:\d{2}$') 
    result = pattern.match(time_val)
    return bool(result)

In [8]:
not_match = []
t1 = test.apply(check_pattern)
t1

0        True
1        True
2        True
3        True
4       False
5       False
6       False
7       False
8        True
9       False
10      False
11       True
12      False
13       True
14       True
15      False
16      False
17       True
18      False
19      False
20      False
21       True
22      False
23       True
24       True
25       True
26       True
27       True
28       True
29       True
        ...  
5212     True
5213     True
5214     True
5215     True
5216     True
5217     True
5218     True
5219     True
5220     True
5221     True
5222     True
5223     True
5224     True
5225     True
5226     True
5227     True
5228     True
5229     True
5230     True
5231     True
5232     True
5233     True
5234     True
5235     True
5236     True
5237     True
5238     True
5239     True
5240     True
5241     True
Name: time, Length: 5242, dtype: bool

In [10]:
t1[t1 == False]

4       False
5       False
6       False
7       False
9       False
10      False
12      False
15      False
16      False
18      False
19      False
20      False
22      False
59      False
69      False
79      False
82      False
85      False
93      False
100     False
115     False
126     False
155     False
156     False
167     False
212     False
226     False
237     False
244     False
247     False
        ...  
4517    False
4518    False
4520    False
4522    False
4527    False
4528    False
4529    False
4535    False
4540    False
4557    False
4563    False
4580    False
4601    False
4619    False
4654    False
4692    False
4701    False
4706    False
4708    False
4791    False
4833    False
4851    False
4868    False
4883    False
4896    False
4942    False
4956    False
4958    False
5039    False
5146    False
Name: time, Length: 381, dtype: bool

In [18]:
df.iloc[3:4,:]

Unnamed: 0,date,time,location,operator,flight_number,route,aircraft_type,registration,cn_ln,aboard,fatalities,ground,summary
3,"August 06, 1913",?,"Victoria, British Columbia, Canada",Private,?,?,Curtiss seaplane,?,?,1 (passengers:0 crew:1),1 (passengers:0 crew:1),0,The first fatal airplane accident in Canada oc...


In [37]:
df.query('new_date_time == "April 18, 1930 :17:00"')

Unnamed: 0,date,time,location,operator,flight_number,route,aircraft_type,registration,cn_ln,aboard,fatalities,ground,summary,new_time,new_date_time
220,"April 18, 1930",c:17:00,"Jersey City, New Jersey",Canadian Colonial Airways,?,"Albany, NY - Jersey City, NJ",Fairchild 71,NC9148,634,4 (passengers:3 crew:1),4 (passengers:3 crew:1),0,Crashed into power lines 150 feet above the gr...,:17:00,"April 18, 1930 :17:00"


In [36]:
#'April 18, 1930 :17:00' does not match format '%B %d, %Y %H:%M' (match)
s = df['new_date_time']
test_1 = pd.to_datetime(s,format = "%B %d, %Y %H:%M")
test_1

ValueError: time data 'April 18, 1930 :17:00' does not match format '%B %d, %Y %H:%M' (match)