In [12]:
import pandas as pd
import numpy as np
from geopy.geocoders import Nominatim as nom
from geopy.extra.rate_limiter import RateLimiter

Wrangling and cleaning flight passenger data from Eurostat

In [44]:
# Filters and column layout shared by the cleaning step (parse_dataset).

# Lower-case country suffixes; presumably the values passed to
# parse_dataset(countries) -- TODO confirm against the calling cells.
countries = {'_bg', '_de', '_el', '_es', '_fr', '_hu', '_pl', '_pt'}

# Country markers; a route is kept when its 'airports' string matches
# any of them (joined into one regex alternation).
keep_countries = ['_BG_', '_DE_', '_EL_', '_ES_', '_FR_', '_HU_', '_PL_', '_PT_']
keep_co = '|'.join(keep_countries)

# Airport codes listed as islands; routes matching any of them are dropped.
islands = ['EL_LGIR', 'EL_LGRP', 'ES_LEPA', 'ES_GCLP', 'ES_GCTS', 'EL_LGKO', 'EL_LGKR', 'ES_GCFV']
keep_is = '|'.join(islands)

# Departure airports to keep, as a regex alternation.
dep_airports = ['PT_LPPT', 'ES_LEMD', 'ES_LEBL', 'FR_LFPG', 'DE_EDDB', 'PL_EPWA', 'HU_LHBP', 'BG_LBSF', 'EL_LGAV']
keep_air = '|'.join(dep_airports)

# Column order of the cleaned output table.
neworder = [
    'airports',
    'dep_airp', 'dep_lat', 'dep_lon',
    'arr_airp', 'arr_lat', 'arr_lon',
    '2022', '2022Q1', '2022Q2', '2022Q3', '2022Q4',
]


In [14]:
# Nominatim client, created with the user agent string below.
geolocator = nom(
    user_agent='caesaravgvstvs',
)
# Rate-limited wrapper around the client's geocode method: successive calls
# are spaced at least one second apart.
geocode = RateLimiter(
    geolocator.geocode,
    min_delay_seconds=1,
)

In [45]:
def _geocode_airports(codes):
    """Geocode every distinct airport code exactly once.

    Parameters
    ----------
    codes : iterable of str
        Airport codes; duplicates are allowed and are looked up only once.

    Returns
    -------
    tuple of (dict, dict)
        ``(latitudes, longitudes)``, each mapping code -> float. A code the
        geocoder returns no result for maps to NaN in both dicts.
    """
    latitudes, longitudes = {}, {}
    for code in pd.unique(pd.Series(list(codes), dtype=object)):
        # NOTE(review): the raw code is sent as a free-text query, so the
        # geocoder may resolve it to an unrelated place -- spot-check results.
        location = geocode(code)
        if location is None:
            latitudes[code] = np.nan
            longitudes[code] = np.nan
        else:
            latitudes[code] = location.latitude
            longitudes[code] = location.longitude
    return latitudes, longitudes


def parse_dataset(countries):
    """Clean one Eurostat airport-pair file and attach airport coordinates.

    Reads ``../data_raw/eurostat/airport/avia_par{countries}.tsv``, keeps the
    ``PAS_BRD`` rows (excluding the ``FLIGHT`` and ``SEAT`` units) whose route
    matches ``keep_co``, does not match ``keep_is`` and departs from an airport
    matching ``keep_air``, drops rows without a ``2022`` value, geocodes the
    departure and arrival airports, and writes the table to
    ``../data/flight_data/flights{countries}.csv``.

    Parameters
    ----------
    countries : str
        Suffix inserted into the input and output file names, e.g. ``'_de'``.

    Returns
    -------
    pandas.DataFrame
        The cleaned table with its columns in ``neworder`` order. Coordinates
        are NaN for airports the geocoder could not resolve.
    """
    # make filepath
    path = f'../data_raw/eurostat/airport/avia_par{countries}.tsv'
    # read data using the filepath; fields are split on tabs and commas
    dftemp = pd.read_csv(path, sep='\t|,', engine='python')

    # header cells may carry surrounding whitespace
    dftemp.columns = [c.strip() for c in dftemp.columns]
    dftemp = dftemp.rename(columns={'airp_pr\\time': 'airports'})
    dftemp = dftemp[['unit', 'tra_meas', 'airports', '2022', '2022Q1', '2022Q2', '2022Q3', '2022Q4']]

    # row filters, applied one after another
    dftemp = dftemp[~dftemp['unit'].isin(['FLIGHT', 'SEAT'])]
    dftemp = dftemp[dftemp['tra_meas'].isin(['PAS_BRD'])]
    dftemp = dftemp[dftemp['airports'].str.contains(keep_co)]
    dftemp = dftemp[~dftemp['airports'].str.contains(keep_is)]

    dftemp = dftemp.drop(columns=['unit', 'tra_meas'])
    # ': ' cells are treated as missing values
    dftemp = dftemp.replace(': ', np.nan)
    dftemp = dftemp.dropna(subset=['2022'])

    # the route string is '<departure code>_<arrival code>', 7 characters each
    dftemp['dep_airp'] = dftemp['airports'].str.slice(0, 7)
    dftemp['arr_airp'] = dftemp['airports'].str.slice(-7,)
    # copy so the column assignments below act on an independent frame
    dftemp = dftemp[dftemp['dep_airp'].str.contains(keep_air)].copy()

    # adding coordinates to each airport, using the geopy api; each distinct
    # code is looked up once and every row gets the coordinates of its OWN
    # departure and arrival airport
    all_codes = pd.concat([dftemp['dep_airp'], dftemp['arr_airp']])
    latitudes, longitudes = _geocode_airports(all_codes)
    for side in ('dep', 'arr'):
        dftemp[f'{side}_lat'] = dftemp[f'{side}_airp'].map(latitudes)
        dftemp[f'{side}_lon'] = dftemp[f'{side}_airp'].map(longitudes)

    dftemp = dftemp.reindex(columns=neworder)

    dftemp.to_csv(f'../data/flight_data/flights{countries}.csv', index=False)
    return dftemp

In [46]:
# Run the cleaning step for the '_de' file. The call writes the output CSV and,
# being the last expression in the cell, displays the returned DataFrame.
parse_dataset('_de')

Unnamed: 0,airports,dep_airp,dep_lat,dep_lon,arr_airp,arr_lat,arr_lon,2022,2022Q1,2022Q2,2022Q3,2022Q4
6150,DE_EDDB_BG_LBSF,DE_EDDB,52.366528,13.488063,BG_LBSF,42.695565,23.414208,91336,9550,26709,31683,23394
6151,DE_EDDB_BG_LBWN,DE_EDDB,52.366528,13.488063,BG_LBWN,43.233936,27.823342,77387,8302,19142,33576,16367
6160,DE_EDDB_DE_EDDF,DE_EDDB,52.366528,13.488063,DE_EDDF,50.022943,8.524937,1018072,164485,286818,283327,283442
6162,DE_EDDB_DE_EDDK,DE_EDDB,52.366528,13.488063,DE_EDDK,50.867782,7.138961,252737,47810,85126,59757,60044
6163,DE_EDDB_DE_EDDL,DE_EDDB,52.366528,13.488063,DE_EDDL,51.28723,6.762063,220130,30535,62761,60740,66094
6164,DE_EDDB_DE_EDDM,DE_EDDB,52.366528,13.488063,DE_EDDM,48.353962,11.778592,752199,104810,225730,204156,217503
6167,DE_EDDB_DE_EDDS,DE_EDDB,52.366528,13.488063,DE_EDDS,48.688422,9.205393,274763,40457,80169,77242,76895
6179,DE_EDDB_EL_LGAV,DE_EDDB,52.366528,13.488063,EL_LGAV,37.937253,23.954622,225334,50202,71218,56373,47541
6186,DE_EDDB_EL_LGTS,DE_EDDB,52.366528,13.488063,EL_LGTS,36.689274,3.361113,182940,22461,59094,64067,37318
6193,DE_EDDB_ES_LEBL,DE_EDDB,52.366528,13.488063,ES_LEBL,41.296944,2.079047,445870,75659,115309,124990,129912


In [43]:
# Spot-check: look up a single airport code with the rate-limited geocoder
# and display the result.
location = geocode('ES_GCFV')
location

Location(Aeropuerto de Fuerteventura, FV-2, Puerto del Rosario, Las Palmas, Canarias, 35310, España, (28.447124199999998, -13.866147581598085, 0.0))