## ------------------------ D A T A --- E X P L O R A T I O N ------------------------

### We start by importing the necessary libraries

In [71]:
import json
import os
import re

import chart_studio.plotly as py
import cufflinks as cf
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import pycountry
import requests
import seaborn as sns

%matplotlib inline

from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from datetime import datetime
from random import randint

init_notebook_mode(connected=True)
cf.go_offline()

In [2]:
# We tweak the number of rows that pandas dataframes can display
pd.options.display.max_rows = 999

In [3]:
# We import the original dataset into a pandas dataframe
df = pd.read_csv('datasets/AccidentesAviones.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,fecha,HORA declarada,Ruta,OperadOR,flight_no,route,ac_type,registration,cn_ln,all_aboard,PASAJEROS A BORDO,crew_aboard,cantidad de fallecidos,passenger_fatalities,crew_fatalities,ground,summary
0,0,"September 17, 1908",1718,"Fort Myer, Virginia",Military - U.S. Army,?,Demonstration,Wright Flyer III,?,1,2,1,1,1,1,0,0,"During a demonstration flight, a U.S. Army fly..."
1,1,"September 07, 1909",?,"Juvisy-sur-Orge, France",?,?,Air show,Wright Byplane,SC1,?,1,0,1,1,0,0,0,Eugene Lefebvre was the first pilot to ever be...
2,2,"July 12, 1912",0630,"Atlantic City, New Jersey",Military - U.S. Navy,?,Test flight,Dirigible,?,?,5,0,5,5,0,5,0,First U.S. dirigible Akron exploded just offsh...
3,3,"August 06, 1913",?,"Victoria, British Columbia, Canada",Private,?,?,Curtiss seaplane,?,?,1,0,1,1,0,1,0,The first fatal airplane accident in Canada oc...
4,4,"September 09, 1913",1830,Over the North Sea,Military - German Navy,?,?,Zeppelin L-1 (airship),?,?,20,?,?,14,?,?,0,The airship flew into a thunderstorm and encou...


In [4]:
df.shape, df.columns

((5008, 18),
 Index(['Unnamed: 0', 'fecha', 'HORA declarada', 'Ruta', 'OperadOR',
        'flight_no', 'route', 'ac_type', 'registration', 'cn_ln', 'all_aboard',
        'PASAJEROS A BORDO', 'crew_aboard', 'cantidad de fallecidos',
        'passenger_fatalities', 'crew_fatalities', 'ground', 'summary'],
       dtype='object'))

In [5]:
# We check whether the 'Unnamed: 0' column has any valuable information (spoiler: it doesn't)
df.drop('Unnamed: 0', axis=1).duplicated().value_counts()

False    5008
dtype: int64

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5008 entries, 0 to 5007
Data columns (total 18 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   Unnamed: 0              5008 non-null   int64 
 1   fecha                   5008 non-null   object
 2   HORA declarada          5008 non-null   object
 3   Ruta                    5008 non-null   object
 4   OperadOR                5008 non-null   object
 5   flight_no               5008 non-null   object
 6   route                   5008 non-null   object
 7   ac_type                 5008 non-null   object
 8   registration            5008 non-null   object
 9   cn_ln                   5008 non-null   object
 10  all_aboard              5008 non-null   object
 11  PASAJEROS A BORDO       5008 non-null   object
 12  crew_aboard             5008 non-null   object
 13  cantidad de fallecidos  5008 non-null   object
 14  passenger_fatalities    5008 non-null   object
 15  crew

In [7]:
# Here we can get an idea of the distribution of values in each column, the output is too large to execute in the notebook
'''
for col in df:
    print(f'----------{col}----------\n{df[col].value_counts()}\n\n')
'''

"\nfor col in df:\n    print(f'----------{col}----------\n{df[col].value_counts()}\n\n')\n"

Now we will delete the 'Unnamed: 0' column, as it only contains a unique identifier for each row and is not needed. We will also rename the other columns to make it clearer what information each of them contains.

Both of these changes will be performed on a copy of the original dataset, so we preserve the original data and can access it whenever we need to.

In [8]:
df2 = df.copy()

In [9]:
df.columns

Index(['Unnamed: 0', 'fecha', 'HORA declarada', 'Ruta', 'OperadOR',
       'flight_no', 'route', 'ac_type', 'registration', 'cn_ln', 'all_aboard',
       'PASAJEROS A BORDO', 'crew_aboard', 'cantidad de fallecidos',
       'passenger_fatalities', 'crew_fatalities', 'ground', 'summary'],
      dtype='object')

In [10]:
# Old header -> new snake_case English header. Columns that are not listed
# keep their current name.
new_col_names = dict([
    ('fecha', 'date'),
    ('HORA declarada', 'time'),
    ('Ruta', 'accident_location'),
    ('OperadOR', 'operator'),
    ('PASAJEROS A BORDO', 'passengers_aboard'),
    ('crew_aboard', 'crew_aboard'),
    ('cantidad de fallecidos', 'total_fatalities'),
    ('ground', 'ground_fatalities'),
])

df2.rename(new_col_names, axis='columns', inplace=True)
df2.columns

Index(['Unnamed: 0', 'date', 'time', 'accident_location', 'operator',
       'flight_no', 'route', 'ac_type', 'registration', 'cn_ln', 'all_aboard',
       'passengers_aboard', 'crew_aboard', 'total_fatalities',
       'passenger_fatalities', 'crew_fatalities', 'ground_fatalities',
       'summary'],
      dtype='object')

In [11]:
df2.drop('Unnamed: 0', axis=1, inplace=True)
df2.columns

Index(['date', 'time', 'accident_location', 'operator', 'flight_no', 'route',
       'ac_type', 'registration', 'cn_ln', 'all_aboard', 'passengers_aboard',
       'crew_aboard', 'total_fatalities', 'passenger_fatalities',
       'crew_fatalities', 'ground_fatalities', 'summary'],
      dtype='object')

## Per column data transformations

In [12]:
# Missing values are stored as the literal string '?', so pandas reports no
# nulls; we count that placeholder per column to see the real missing values.
for col_name in df2:
    n_placeholders = sum(1 for cell in df2[col_name] if cell == '?')
    has_placeholder = n_placeholders > 0
    print(f"{col_name} contains '?': {has_placeholder} ({n_placeholders})")

date contains '?': False (0)
time contains '?': True (1504)
accident_location contains '?': True (5)
operator contains '?': True (10)
flight_no contains '?': True (3682)
route contains '?': True (762)
ac_type contains '?': True (13)
registration contains '?': True (272)
cn_ln contains '?': True (667)
all_aboard contains '?': True (17)
passengers_aboard contains '?': True (221)
crew_aboard contains '?': True (219)
total_fatalities contains '?': True (8)
passenger_fatalities contains '?': True (235)
crew_fatalities contains '?': True (235)
ground_fatalities contains '?': True (44)
summary contains '?': True (59)


### • Date and time columns

We will start by transforming the strings in the `date` column into datetime objects, and then create a new column that holds only the year.

In [13]:
format_data = '%B %d, %Y'  # e.g. 'September 17, 1908'


def str_2_dt(val: str) -> datetime:
    """Parse a '<month name> <day>, <year>' string into a datetime object."""
    parsed = datetime.strptime(val, format_data)
    return parsed


def dt_year(val: datetime) -> int:
    """Return the year component of a datetime-like value."""
    year = val.year
    return year

In [14]:
df2['date'] = df2['date'].apply(str_2_dt)
df2['year'] = df2['date'].apply(dt_year)
df2.year.value_counts().head(10)

1946    88
1989    83
1947    82
1948    78
1962    78
1972    77
1945    75
1951    75
1994    74
1970    73
Name: year, dtype: int64

In [15]:
# Here we check the first and last year for which the dataset contains records
df2.year.min(), df2.year.max()

(1908, 2021)

In [16]:
# We check, for instance, how many 2021 accidents are registered
df2.loc[df2.year == 2021].shape

(7, 18)

In [17]:
# Now we check whether every object in the 'time' column is a string
time_dtypes = set()
for x in df2.time:
    time_dtypes.add(type(x))
print(time_dtypes)

{<class 'str'>}


In [18]:
# Here we display all distinct time values in order to quickly check for noticeably bad formats
'''
for x in df2.time.unique():
    print(x)
'''

'\nfor x in df2.time.unique():\n    print(x)\n'

In [19]:
# Tally the first character of every time string (and how often it occurs)
# to reveal unexpected prefixes
time_1st_char = {}
for time_str in df2.time:
    first = time_str[0]
    time_1st_char[first] = time_1st_char.get(first, 0) + 1
print(time_1st_char)

{'1': 1884, '?': 1504, '0': 1078, '2': 503, 'c': 38, '9': 1}


In [20]:
for x in df2.time:
    if x[0] == '9':
        print(x)


900


In [21]:
for x in df2.time:
    if x[0] == 'c':
        print(x)

c 18:00
c 02:30
c 12:00
c 13:15
c 11:00
c 9:15
c 10:00
c 17:00
c:17:00
c 15:45
c: 2:00
c 16:00
c 08:00
c:09:00
c 13:00
c 20:40
c 23:00
c 09:00
c 10:00
c 20:35
c 20:40
c 15:00
c 11:15
c 01:00
c 02:00
c 03:30
c 20:15
c 10:30
c 20:00
c 13:40
c 11:30
c 02:50
c 11:45
c 20:15
c 06:30
c 02:00
c 12:00
c 07:00


In [22]:
# We also noticed that some records end with a Z. The Z stands for the zero timezone, i.e. an offset of 0 from Coordinated Universal Time (UTC)
for x in df2.time:
    if x[-1] == 'Z':
        print(x)

02:09Z
03:50Z
17:34Z
01:00Z
0500Z


In [23]:
# Now we check for possibility of replacing missing values with '00:00' by determining whether this string is already registered in the column
'0000' in df2.time.unique(), '00:00' in df2.time.unique(), '0:00' in df2.time.unique()

(True, False, True)

In [24]:
# We count how many times '0000' and '0:00' appear in the column
midnight_count = sum(1 for value in df2.time if value == '0000')
midnight_count2 = sum(1 for value in df2.time if value == '0:00')
print(midnight_count)
print(midnight_count2)

1
1


In [25]:
# We also found this little guy that can not be easily interpreted as a time (1:75? Obviously not, but 17:05? 17:50??)
df.loc[df['HORA declarada'] == '175']

Unnamed: 0.1,Unnamed: 0,fecha,HORA declarada,Ruta,OperadOR,flight_no,route,ac_type,registration,cn_ln,all_aboard,PASAJEROS A BORDO,crew_aboard,cantidad de fallecidos,passenger_fatalities,crew_fatalities,ground,summary
3794,3794,"October 04, 1992",175,"Amsterdam, Netherlands",El Al,1862,Amsterdam - Tel Aviv,Boeing B-747-258F,4X-AXG,21737/362,4,1,3,4,1,3,39,Shortly after taking off from Schiphol Airport...


In [26]:
# Now we create a function that deals with all of the format problems mentioned above
# UPDATE: The further I got into the dataset exploration, the less convinced I got that this column was going to be useful in the data analysis approach that I intended to take.
def time_formatter(series):
    new_series = []
    for i in range(len(series)):
        time_ok = series[i].replace(';',':')
        if series[i][:2] == 'c ':
            time_ok = time_ok.replace('c ', '')
        elif series[i][:3] == 'c: ':
            time_ok = time_ok.replace('c: ', '')
        elif series[i][:2] == 'c:':
            time_ok = time_ok.replace('c:', '')
        elif series[i][-1] == 'Z':
            time_ok = time_ok.replace('Z', '')
        elif series[i][0] == '?':
            time_ok = '00:00'
        if len(time_ok) < 4:
            if int(time_ok[-2:]) < 59:
                #print(f'MENOR: {time_ok}')
                time_ok = time_ok.zfill(4)
            else:
                #print(f'MAYOR: {time_ok}')
                time_ok = '00:00'
        if (len(time_ok) == 4):
            cnt = 0
            for x in range(4):
                if time_ok[x] == ':':
                    cnt += 1
            if cnt > 0:
                spl = time_ok.split(':')
                if len(spl[0]) == 1:
                    spl[0] = spl[0].zfill(2)
                else:
                    spl[1] = spl[1].zfill(2)
                time_ok = f'{spl[0]}:{spl[1]}'
        if len(time_ok) < 5:
            spl = [time_ok[0:2], time_ok[2:]]
            time_ok = f'{spl[0]}:{spl[1]}'

        #time_ok = pd.to_datetime(time_ok, format='%H:%M')
        new_series.append(time_ok)
    return pd.Series(new_series)

In [27]:
# We do the transformation
df2['time'] = time_formatter(df2.time)

In [28]:
# We check for problems
for x in df2.time:
    for y in x:
        if y == ';':
            print(x)

In [29]:
time_lens = set()
for x in df2.time:
    time_lens.add(len(x))
print(time_lens)

{5}


In [30]:
# Sanity check: every formatted time must contain exactly one ':' separator.
# The offending value itself is printed so it can be located in the column.
for time_str in df2.time:
    if time_str.count(':') != 1:
        print(f'PROBLEM FOUND: {time_str}')

In [31]:
# Parse the whole column with a single vectorized call and keep only the
# time-of-day component (datetime.time objects).
df2.time = pd.to_datetime(df2.time, format="%H:%M").dt.time

In [32]:
# Now we create a quick df to graph the number of accidents per year.
# The per-year counts are computed once, outside the loop; years in the
# covered range without any recorded accident get a count of 0.
year_counts = df2.year.value_counts()
accidents_by_year = {
    year: year_counts.get(year, 0)
    for year in range(df2.year.min(), df2.year.max() + 1)
}
df_acc_yr = pd.DataFrame({'year': list(accidents_by_year.keys()),
                          'accidents': list(accidents_by_year.values())})


In [33]:
# Line chart of the number of accidents per year, built from df_acc_yr.
# Alternative one-liners that were tried (kept for reference):
# df_acc_yr.iplot(x='year', y='accidents')
# px.line(df_acc_yr, x='year', y='accidents', labels={'year':'Year', 'accidents':'Number of Accidents'}, title='Accidents per Year', )
fig = go.Figure()
# Variant restricted to the last 50 years of the range:
#fig.add_trace(go.Scatter(x= df_acc_yr.year[-50:], y= df_acc_yr.accidents[-50:], name='Accidents per year'))
fig.add_trace(go.Scatter(x= df_acc_yr.year, y= df_acc_yr.accidents, name='Accidents per year'))
# Titles plus axis styling: both axes draw their axis line in the same teal
# colour; only the x axis shows outside tick marks.
fig.update_layout(title='Accidents per Year',
                  xaxis_title='Year',
                  yaxis_title='Number of Accidents',
                  xaxis=dict(
                    showline=True,
                    showgrid=True,
                    #showticklabels=False,
                    linecolor= '#38A3A5',
                    linewidth=2,
                    ticks='outside',
                    tickcolor='#38A3A5'
                  ),
                  yaxis=dict(
                    showline=True,
                    zeroline=False,
                    showgrid=True,
                    #showticklabels=False,
                    linecolor= '#38A3A5',
                    linewidth=2,
                  ),
                  # Automatic sizing and automatic margin expansion are both switched off.
                  autosize=False,
                  margin=dict(
                    autoexpand=False,
                  ),
                  )

In [34]:
# df2.loc[df2.year == 1945]

### • Flight information columns (operator, flight_no, route, all_aboard, passengers_aboard, crew_aboard)

In [35]:
# Collect every operator whose name contains the standalone word 'iran' or
# 'iranian' (case-insensitive, split on whitespace) and report how many there are
iran_keywords = ('iran', 'iranian')
iran_airlines = [
    operator_name
    for operator_name in df2.operator
    if any(keyword in operator_name.lower().split() for keyword in iran_keywords)
]
print(len(iran_airlines))

25


In [36]:
df2.operator.unique().shape

(2268,)

In [37]:
'?' in df2.operator.unique()

True

In [38]:
# Disabled experiment: look up a random sample of 20 operators in the
# aviation-reference-data API. The API key is read from the RAPIDAPI_KEY
# environment variable so that no secret is stored in the notebook.
"""
url = "https://aviation-reference-data.p.rapidapi.com/airline/search"

operator_dict = {}

for x in pd.Series(df2.operator.unique()).sample(20):
    if x != '?':
        querystring = {"name":x}
        headers = {
            "X-RapidAPI-Key": os.environ["RAPIDAPI_KEY"],
            "X-RapidAPI-Host": "aviation-reference-data.p.rapidapi.com"
        }
        response = requests.request("GET", url, headers=headers, params=querystring)
        print(f'-----------------------------\n{x}')
        print(response.text)
        operator_dict[x] = response.text
"""

'\nurl = "https://aviation-reference-data.p.rapidapi.com/airline/search"\n\noperator_dict = {}\n\nfor x in pd.Series(df2.operator.unique()).sample(20):\n    if x != \'?\':\n        querystring = {"name":x}\n        headers = {\n            "X-RapidAPI-Key": "ad8fe13e2fmsheba48590580e9bbp15cb61jsn64c6c2c5036e",\n            "X-RapidAPI-Host": "aviation-reference-data.p.rapidapi.com"\n        }\n        response = requests.request("GET", url, headers=headers, params=querystring)\n        print(f\'-----------------------------\n{x}\')\n        print(response.text)\n        operator_dict[x] = response.text\n'

In [39]:
# Disabled experiment: single airline lookup against the same API. The API key
# is read from the RAPIDAPI_KEY environment variable instead of being embedded.
'''
url = "https://aviation-reference-data.p.rapidapi.com/airline/search"

querystring = {"name":"Cabo Verde Airlines"}

headers = {
	"X-RapidAPI-Key": os.environ["RAPIDAPI_KEY"],
	"X-RapidAPI-Host": "aviation-reference-data.p.rapidapi.com"
}

response = requests.request("GET", url, headers=headers, params=querystring)

print(response.text)
'''

'\nurl = "https://aviation-reference-data.p.rapidapi.com/airline/search"\n\nquerystring = {"name":"Cabo Verde Airlines"}\n\nheaders = {\n\t"X-RapidAPI-Key": "ad8fe13e2fmsheba48590580e9bbp15cb61jsn64c6c2c5036e",\n\t"X-RapidAPI-Host": "aviation-reference-data.p.rapidapi.com"\n}\n\nresponse = requests.request("GET", url, headers=headers, params=querystring)\n\nprint(response.text)\n'

In [40]:
df2.loc[df2.all_aboard == '?'].shape

(17, 18)

### • Aircraft information columns (ac_type, registration, cn_ln)

### • Accident information columns (accident_location, total_fatalities, passenger_fatalities, crew_fatalities, ground_fatalities, summary)

In [41]:
# df_t.loc[df2.passenger_fatalities != (df2.crew_fatalities + df2.passenger_fatalities +)]
df2.loc[df2.passenger_fatalities == df2.crew_fatalities].passenger_fatalities.value_counts()

?     231
1      78
0      74
2      63
3      23
4      11
5       4
7       3
8       3
9       3
6       2
21      1
Name: passenger_fatalities, dtype: int64

In [42]:
df2.loc[df2.passenger_fatalities == '?'].ground_fatalities.value_counts()

0     212
?      16
53      1
37      1
36      1
5       1
1       1
4       1
3       1
Name: ground_fatalities, dtype: int64

In [43]:
# df2.loc[(df2.passenger_fatalities == '?') & (df2.crew_fatalities == '?')].shape
df2.loc[(df2.passenger_fatalities == '?') & (df2.crew_fatalities == '?') & (df2.total_fatalities == '?')].shape

(8, 18)

In [44]:
from geopy.geocoders import Nominatim, ArcGIS
from geopy.extra.rate_limiter import RateLimiter

geolocator = Nominatim(user_agent='acidminded')
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)
nom = ArcGIS()

In [45]:
loc_coors = {}
problematic_locations = []

In [46]:
'''
for x in df2.accident_location:
    coor = nom.geocode(query=x)
    try:
        loc_coors[x] = {'lat':coor.latitude, 'lon':coor.longitude}
        #print(f'\nQuery: {x}\nResult: {coor}\n • Lat: {coor.latitude}\n • Lon: {coor.longitude}\n--------------------------------')
    except AttributeError:
        problematic_locations.append(x)
        #print(f'Problem found: {x}')
    

print(problematic_locations)

# OUTPUT:

# ['Rossaugpt, Czechoslovakia', '?', '?', '?', '?', 'Batataevka, USSR', 'Near Hourghada, UAR', 'Near Adler, USSR', '?', 'Near Enisseysk, USSR', 'Southern Belarus, USSR', 'Near Kutayissi, USSR', 'Talourow Island, USSR', 'Nagoro-Karabak, USSR']
'''

"\nfor x in df2.accident_location:\n    coor = nom.geocode(query=x)\n    try:\n        loc_coors[x] = {'lat':coor.latitude, 'lon':coor.longitude}\n        #print(f'\nQuery: {x}\nResult: {coor}\n • Lat: {coor.latitude}\n • Lon: {coor.longitude}\n--------------------------------')\n    except AttributeError:\n        problematic_locations.append(x)\n        #print(f'Problem found: {x}')\n    \n\nprint(problematic_locations)\n\n# OUTPUT:\n\n# ['Rossaugpt, Czechoslovakia', '?', '?', '?', '?', 'Batataevka, USSR', 'Near Hourghada, UAR', 'Near Adler, USSR', '?', 'Near Enisseysk, USSR', 'Southern Belarus, USSR', 'Near Kutayissi, USSR', 'Talourow Island, USSR', 'Nagoro-Karabak, USSR']\n"

In [114]:
loc_dict = {}
no_coor_locations = []
no_cc_coors = []

In [None]:
# Geocode every distinct accident location with ArcGIS (`nom`), then
# reverse-geocode the coordinates with Nominatim (`geolocator`) to obtain the
# country code.
#  - Each distinct location is queried once, so `no_coor_locations` and
#    `no_cc_coors` hold one entry per distinct location.
#  - Reverse lookups are rate limited to one request per second.
reverse_geocode = RateLimiter(geolocator.reverse, min_delay_seconds=1)

for location in df2.accident_location.unique():
    coor = nom.geocode(query=location)
    if coor is None:
        # Without coordinates there is nothing to reverse-geocode either.
        no_coor_locations.append(location)
        continue
    loc_dict[location] = {'lat': coor.latitude, 'lon': coor.longitude}

    locat = reverse_geocode(f'{coor.latitude}, {coor.longitude}')
    try:
        loc_dict[location]['country_code_a2'] = locat.raw['address']['country_code']
    except (AttributeError, KeyError):
        # AttributeError: the reverse lookup returned None.
        # KeyError: the result carries no address / country code.
        no_cc_coors.append([location, coor.latitude, coor.longitude])


In [47]:
updated_locations = {'Rossaugpt, Czechoslovakia': ['Rozvadov, Czech Republic'],
                     'Batataevka, USSR': ['Batayevka, Russia'],
                     'Near Hourghada, UAR': ['Hurghada, Egypt'],
                     'Near Adler, USSR': ['Adler, Russia'],
                     'Near Enisseysk, USSR': ['Yeniseysk , Russia'],
                     'Southern Belarus, USSR': ['Pinsk, Belarus'],
                     'Near Kutayissi, USSR': ['Kutaisi, Georgia'],
                     'Talourow Island, USSR': ['Taymyr Island, Russia'],
                     'Nagoro-Karabak, USSR': ['Nagorno-Karabakh,  Azerbaijan']
                     }

In [48]:
# Here we get the coordinates for the problematic locations using their updated names
'''
problematic_locations2 = []
for x in updated_locations:
    new_q = updated_locations[x][0]
    coor = nom.geocode(query=new_q)
    try:
        loc_coors[x] = {'lat':coor.latitude, 'lon':coor.longitude}
        #print(f'\nQuery: {x}\nResult: {coor}\n • Lat: {coor.latitude}\n • Lon: {coor.longitude}\n--------------------------------')
    except AttributeError:
        problematic_locations2.append(x)
        print(f'Problem found: {x}')
'''

"\nproblematic_locations2 = []\nfor x in updated_locations:\n    new_q = updated_locations[x][0]\n    coor = nom.geocode(query=new_q)\n    try:\n        loc_coors[x] = {'lat':coor.latitude, 'lon':coor.longitude}\n        #print(f'\nQuery: {x}\nResult: {coor}\n • Lat: {coor.latitude}\n • Lon: {coor.longitude}\n--------------------------------')\n    except AttributeError:\n        problematic_locations2.append(x)\n        print(f'Problem found: {x}')\n"

In [49]:
# loc_coors['Kaneko, Japan']

In [50]:
# loc_coors['Near Adler, USSR']

In [51]:
# loc_coors['?'] = {'lat':0, 'lon':0}

In [52]:
# Here we save the queried data into a json file
'''
with open(r'.\coordinates.json', 'w') as coor_file:
    json.dump(loc_coors, coor_file)
'''

"\nwith open(r'.\\coordinates.json', 'w') as coor_file:\n    json.dump(loc_coors, coor_file)\n"

In [53]:
with open('coordinates.json') as json_file:
    loc_coors_r = json.load(json_file)

In [54]:
loc_coors_r['?']

{'lat': 0, 'lon': 0}

In [55]:
def get_lat(val):
    """Return the 'lat' entry stored in `loc_coors_r` for the location name `val`."""
    coords = loc_coors_r[val]
    return coords["lat"]


def get_lon(val):
    """Return the 'lon' entry stored in `loc_coors_r` for the location name `val`."""
    coords = loc_coors_r[val]
    return coords["lon"]

In [56]:
df2["lat"] = df2.accident_location.apply(get_lat)
df2["lon"] = df2.accident_location.apply(get_lon)
df2.head()

Unnamed: 0,date,time,accident_location,operator,flight_no,route,ac_type,registration,cn_ln,all_aboard,passengers_aboard,crew_aboard,total_fatalities,passenger_fatalities,crew_fatalities,ground_fatalities,summary,year,lat,lon
0,1908-09-17,17:18:00,"Fort Myer, Virginia",Military - U.S. Army,?,Demonstration,Wright Flyer III,?,1,2,1,1,1,1,0,0,"During a demonstration flight, a U.S. Army fly...",1908,38.88243,-77.08075
1,1909-09-07,00:00:00,"Juvisy-sur-Orge, France",?,?,Air show,Wright Byplane,SC1,?,1,0,1,1,0,0,0,Eugene Lefebvre was the first pilot to ever be...,1909,48.69029,2.37385
2,1912-07-12,06:30:00,"Atlantic City, New Jersey",Military - U.S. Navy,?,Test flight,Dirigible,?,?,5,0,5,5,0,5,0,First U.S. dirigible Akron exploded just offsh...,1912,39.36287,-74.42637
3,1913-08-06,00:00:00,"Victoria, British Columbia, Canada",Private,?,?,Curtiss seaplane,?,?,1,0,1,1,0,1,0,The first fatal airplane accident in Canada oc...,1913,48.428409,-123.365642
4,1913-09-09,18:30:00,Over the North Sea,Military - German Navy,?,?,Zeppelin L-1 (airship),?,?,20,?,?,14,?,?,0,The airship flew into a thunderstorm and encou...,1913,40.93082,-72.417


In [65]:
# Spot check: reverse-geocode the coordinates of 10 randomly chosen accidents
# and print the country / country code that Nominatim reports for them.
countries = {}
#for x in range(len(df2)):
for _ in range(10):
    # randint includes both bounds, so the upper bound is the last valid position.
    row = df2.iloc[randint(0, len(df2) - 1)]
    laln = f'{row.lat}, {row.lon}'
    print(f'• {row.accident_location} ({laln})')
    locat = geolocator.reverse(laln)
    try:
        address = locat.raw['address']
        print(f"\tCountry: {address['country']}\n\tCountry code: {address['country_code']}")
    except AttributeError:
        # The reverse lookup returned None.
        print('ATTRIBUTE ERROR')
    except KeyError:
        # The result has no address / country information.
        print('NO COUNTRY INFORMATION')
    #try:
        #countries[laln] = [loc.raw['country'], loc.raw['country_code']]
    #except:
        #countries[laln] = ['No country', 'No country code']
    

#print(problematic_locations)

# OUTPUT:

# ['Rossaugpt, Czechoslovakia', '?', '?', '?', '?', 'Batataevka, USSR', 'Near Hourghada, UAR', 'Near Adler, USSR', '?', 'Near Enisseysk, USSR', 'Southern Belarus, USSR', 'Near Kutayissi, USSR', 'Talourow Island, USSR', 'Nagoro-Karabak, USSR']


• London, England (51.507408360000056, -0.12769869299995662)
	Country: United Kingdom
	Country code: gb
• Akshi, Russia (47.32621000000006, 142.79263000000003)
	Country: Россия
	Country code: ru
• Maravatio, Mexico (19.891850000000034, -100.44277999999997)
	Country: México
	Country code: mx
• Francistown, Botswana (-21.169569999999965, 27.51194000000004)
	Country: Botswana
	Country code: bw
• Bastipur, Nepal (27.325140000000033, 85.80316000000005)
	Country: नेपाल
	Country code: np
• Near Bilma, Niger (20.39391896500007, 13.151341031000072)
	Country: Niger
	Country code: ne
• Sukhumi, Georgia, USSR (43.00711000000007, 41.009160000000065)
	Country: საქართველო
	Country code: ge
• Athens, Greece (37.97614000000004, 23.73640000000006)
	Country: Ελλάς
	Country code: gr
• Near Marrakech, Morocco (31.633830000000046, -8.002219999999966)
	Country: Maroc / ⵍⵎⵖⵔⵉⴱ / المغرب
	Country code: ma
• Chungking, China (29.560260000000028, 106.55771000000004)
	Country: 中国
	Country code: cn


We add some extra datasets from the World Bank containing information about passengers carried in air transport (https://data.worldbank.org/indicator/IS.AIR.PSGR?end=2020&most_recent_value_desc=true&start=1970&view=chart&year=2020)

In [101]:
def countrycode_a2(val):
    """Convert an alpha-3 country code into its lowercase alpha-2 code.

    Returns the string 'ERROR' when the lookup raises AttributeError
    (presumably because pycountry has no country for `val` - confirm against
    the installed pycountry version).
    """
    try:
        return pycountry.countries.get(alpha_3=val).alpha_2.lower()
    except AttributeError:
        return 'ERROR'

In [102]:
# Passengers per year dataframe
# The path uses forward slashes so it works on every OS and contains no
# backslash escape sequences; skiprows=4 skips the first four lines of the file.
ppy_df = pd.read_csv('datasets/API_IS.AIR.PSGR_DS2_en_csv_v2_4700545.csv', skiprows=4)
ppy_df['country_code_a2'] = ppy_df['Country Code'].apply(countrycode_a2)
ppy_df.shape

(266, 68)

In [108]:
# Country information dataframe
# The path uses forward slashes so it works on every OS and contains no
# backslash escape sequences.
country_info = pd.read_csv('datasets/Metadata_Country_API_IS.AIR.PSGR_DS2_en_csv_v2_4700545.csv')
country_info['country_code_a2'] = country_info['Country Code'].apply(countrycode_a2)
country_info.shape

(265, 7)

In [105]:
ppy_df.loc[ppy_df.country_code_a2 == 'cn']

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2014,2015,2016,2017,2018,2019,2020,2021,Unnamed: 66,country_code_a2
40,China,CHN,"Air transport, passengers carried",IS.AIR.PSGR,,,,,,,...,390878784.0,436183969.0,487960477.0,551234509.0,611439830.0,659629070.0,417255845.0,,,cn


In [110]:
country_info.loc[country_info.country_code_a2 == 'cn']

Unnamed: 0,Country Code,Region,IncomeGroup,SpecialNotes,TableName,Unnamed: 5,country_code_a2
40,CHN,East Asia & Pacific,Upper middle income,On 1 July 1997 China resumed its exercise of s...,China,,cn


In [None]:
# Test request to the Dark Sky API for a fixed coordinate pair and date.
# The API key is read from the RAPIDAPI_KEY environment variable so that no
# secret is stored in the notebook; the timeout keeps the cell from hanging.
url = "https://dark-sky.p.rapidapi.com/37.774929,-122.419418,2019-02-20"

headers = {
    "X-RapidAPI-Key": os.environ["RAPIDAPI_KEY"],
    "X-RapidAPI-Host": "dark-sky.p.rapidapi.com"
}

response = requests.request("GET", url, headers=headers, timeout=30)

print(response.text)

In [45]:
# Same Dark Sky test request for a date in 1919.
# The API key is read from the RAPIDAPI_KEY environment variable so that no
# secret is stored in the notebook; the timeout keeps the cell from hanging.
url = "https://dark-sky.p.rapidapi.com/37.774929,-122.419418,1919-02-20"

headers = {
    "X-RapidAPI-Key": os.environ["RAPIDAPI_KEY"],
    "X-RapidAPI-Host": "dark-sky.p.rapidapi.com"
}

response = requests.request("GET", url, headers=headers, timeout=30)

print(response.text)

{"message":"You are not subscribed to this API."}


### Connect to SQL

In [47]:
from sqlalchemy import create_engine as ce

In [61]:
# The connection URL embeds the database password, so it is read from the
# PLANE_ACCIDENTS_DB_URL environment variable instead of being written here.
# Expected form: mysql://<user>:<password>@<host>:<port>/<database>
mysql_engine = ce(os.environ["PLANE_ACCIDENTS_DB_URL"])

In [62]:
df2.to_sql("plane_accidents", mysql_engine)

5008