In [44]:
# import dependencies
import os, inspect
import pandas as pd

# Resolve the project directories relative to the notebook's working directory.
# The frame-based lookup (inspect.getabsfile on the current frame) only yields
# the working directory when the kernel labels cells with a pseudo-filename;
# kernels that compile cells under a real temp-file path would make it point at
# that temp directory instead. The working directory is the stable anchor.
curr_dir = os.path.abspath(os.getcwd())
# data lives under <parent of notebook dir>/airboard/data
root_dir = os.path.dirname(curr_dir)

In [45]:
# load the raw carrier lookup table (columns: Code, Description)
fname = os.path.join(root_dir, "airboard", "data", "raw", "L_UNIQUE_CARRIERS.csv_")
carriers_df = pd.read_csv(fname)
print(os.linesep, " columns:", os.linesep, 40 * "-")
print(repr(carriers_df.columns))

# clean-up pipeline:
#   1. keep only the last row recorded for each carrier description
#   2. drop rows missing either the description or the code
#   3. stringify the row labels and switch to the project's column names
#   4. order by carrier name and index by carrier code
carriers_df = (
    carriers_df
    .drop_duplicates("Description", keep="last")
    .dropna(how="any", axis=0, subset=["Description", "Code"])
    .rename(index=str,
            columns={"Code": "CARRIER_CODE",
                     "Description": "CARRIER_NAME"})
    .sort_values(by="CARRIER_NAME")
    .set_index("CARRIER_CODE")
)

# explore the cleaned data frame: one titled section per summary
for title, summary in (
    (" columns:", repr(carriers_df.columns)),
    (" dtypes:", carriers_df.dtypes),
    (" nulls:", carriers_df.isnull().any()),
    (" shape:", carriers_df.shape),
):
    print(os.linesep, title, os.linesep, 40 * "-")
    print(summary)

# persist the cleaned table as CSV and as JSON keyed by carrier code
fname = os.path.join(root_dir,
                     "airboard",
                     "data",
                     "ext",
                     "616228237_CARRIER_MASTER_CORD_CLEAN_V0")
carriers_df.to_csv(f"{fname}.csv")
carriers_df.to_json(f"{fname}.json", orient="index")
carriers_df.head()


  columns: 
 ----------------------------------------
Index(['Code', 'Description'], dtype='object')

  columns: 
 ----------------------------------------
Index(['CARRIER_NAME'], dtype='object')

  dtypes: 
 ----------------------------------------
CARRIER_NAME    object
dtype: object

  nulls: 
 ----------------------------------------
CARRIER_NAME    False
dtype: bool

  shape: 
 ----------------------------------------
(1648, 1)


Unnamed: 0_level_0,CARRIER_NAME
CARRIER_CODE,Unnamed: 1_level_1
2PQ,21 Air LLC
Q5,40-Mile Air
CIQ,A/S Conair
AAE,AAA Airlines
ACI,AAA-Action Air Carrier Inc.


In [46]:
# read airport data (compression is inferred by pandas from the .zip extension)
fname = os.path.join(root_dir, "airboard", "data", "raw", "616228237_T_MASTER_CORD.zip")
airports_df = pd.read_csv(fname)
print(os.linesep, " columns:" , os.linesep, 40*"-")
print(repr(airports_df.columns))

# trim data frame to the columns used downstream.
# .copy() makes the trimmed frame independent of the raw one, so the column
# assignments below do not trigger SettingWithCopyWarning.
airports_df = airports_df[[
    "AIRPORT",
    "LATITUDE",
    "LONGITUDE",
    "DISPLAY_AIRPORT_NAME",
    "DISPLAY_AIRPORT_CITY_NAME_FULL",
    "AIRPORT_STATE_NAME",
    "AIRPORT_STATE_CODE",
    "AIRPORT_COUNTRY_NAME",]].copy()

# city name is the text before the first comma of the full city name.
# The .str accessor propagates missing values, whereas a plain
# lambda calling x.split would raise AttributeError on NaN.
airports_df["AIRPORT_CITY_NAME"] = \
    airports_df["DISPLAY_AIRPORT_CITY_NAME_FULL"].str.split(",").str[0]
airports_df = airports_df.drop("DISPLAY_AIRPORT_CITY_NAME_FULL", axis=1)

# keep only the last airport recorded per airport code, then drop rows
# missing the code, the state code or the latitude
airports_df = airports_df.drop_duplicates("AIRPORT", keep="last")
airports_df = airports_df.dropna(how="any",
                                 axis=0,
                                 subset=["AIRPORT", "AIRPORT_STATE_CODE", "LATITUDE"])

# stringify the row labels and switch to the project's column names
airports_df = airports_df.rename(index=str,
                                 columns={"AIRPORT": "AIRPORT_CODE",
                                          "AIRPORT_STATE_CODE": "STATE_CODE",
                                          "AIRPORT_STATE_NAME": "STATE_NAME",
                                          "AIRPORT_CITY_NAME": "CITY_NAME",
                                          "AIRPORT_COUNTRY_NAME": "COUNTRY_NAME"})

# replace apostrophes in airport names with a space
# NOTE(review): presumably to avoid quoting problems in a downstream consumer -- confirm
airports_df["DISPLAY_AIRPORT_NAME"] = \
    airports_df["DISPLAY_AIRPORT_NAME"].str.replace("'", " ")

airports_df = airports_df.sort_values(by="DISPLAY_AIRPORT_NAME")
airports_df = airports_df.set_index("AIRPORT_CODE")

# explore data frame
print(os.linesep, " columns:" , os.linesep, 40*"-")
print(repr(airports_df.columns))
print(os.linesep, " dtypes:" , os.linesep, 40*"-")
print(airports_df.dtypes)
print(os.linesep, " nulls:" , os.linesep, 40*"-")
print(airports_df.isnull().any())
print(os.linesep, " shape:" , os.linesep, 40*"-")
print(airports_df.shape)

# persist the cleaned table as CSV and as JSON keyed by airport code
fname = os.path.join(root_dir,
                     "airboard",
                     "data",
                     "ext",
                     "616228237_AIRPORT_MASTER_CORD_CLEAN_V0")
airports_df.to_csv(fname + ".csv")
airports_df.to_json(fname + ".json", orient="index")
airports_df.head()


  columns: 
 ----------------------------------------
Index(['AIRPORT_SEQ_ID', 'AIRPORT_ID', 'AIRPORT', 'DISPLAY_AIRPORT_NAME',
       'DISPLAY_AIRPORT_CITY_NAME_FULL', 'AIRPORT_WAC_SEQ_ID2', 'AIRPORT_WAC',
       'AIRPORT_COUNTRY_NAME', 'AIRPORT_COUNTRY_CODE_ISO',
       'AIRPORT_STATE_NAME', 'AIRPORT_STATE_CODE', 'AIRPORT_STATE_FIPS',
       'CITY_MARKET_SEQ_ID', 'CITY_MARKET_ID', 'DISPLAY_CITY_MARKET_NAME_FULL',
       'CITY_MARKET_WAC_SEQ_ID2', 'CITY_MARKET_WAC', 'LAT_DEGREES',
       'LAT_HEMISPHERE', 'LAT_MINUTES', 'LAT_SECONDS', 'LATITUDE',
       'LON_DEGREES', 'LON_HEMISPHERE', 'LON_MINUTES', 'LON_SECONDS',
       'LONGITUDE', 'UTC_LOCAL_TIME_VARIATION', 'AIRPORT_START_DATE',
       'AIRPORT_THRU_DATE', 'AIRPORT_IS_CLOSED', 'AIRPORT_IS_LATEST',
       'Unnamed: 32'],
      dtype='object')

  columns: 
 ----------------------------------------
Index(['LATITUDE', 'LONGITUDE', 'DISPLAY_AIRPORT_NAME', 'STATE_NAME',
       'STATE_CODE', 'COUNTRY_NAME', 'CITY_NAME'],
      dtype='o

Unnamed: 0_level_0,LATITUDE,LONGITUDE,DISPLAY_AIRPORT_NAME,STATE_NAME,STATE_CODE,COUNTRY_NAME,CITY_NAME
AIRPORT_CODE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
UXM,61.044444,-158.145278,47-Mile Mine Airport,Alaska,AK,United States,47-Mile Mine
APH,38.068611,-77.318889,A P Hill AAF,Virginia,VA,United States,Fort A P Hill
ODW,48.251667,-122.673611,A.J. Eisenberg,Washington,WA,United States,Oak Harbor
YXX,49.019167,-122.38,Abbotsford Airport,British Columbia,BC,Canada,Abbotsford
U36,42.921111,-112.880833,Aberdeen Municipal,Idaho,ID,United States,Aberdeen


In [47]:

# spot-check: display name stored for airport code COE after apostrophe clean-up
airports_df.loc["COE", "DISPLAY_AIRPORT_NAME"]

'Coeur d Alene - Pappy Boyington Field'

In [48]:
# state-level coordinates: the mean latitude/longitude of all airports sharing
# a (STATE_CODE, STATE_NAME) pair, indexed by state code
state_df = (
    airports_df
    .groupby(["STATE_CODE", "STATE_NAME"])[["LATITUDE", "LONGITUDE"]]
    .mean()
    .reset_index()
    .set_index("STATE_CODE")
)

# persist as CSV and as JSON keyed by state code
fname = os.path.join(root_dir, "airboard", "data", "ext", "616228237_STATE_CORD_V0")
state_df.to_csv(f"{fname}.csv")
state_df.to_json(f"{fname}.json", orient="index")
state_df.head()

Unnamed: 0_level_0,STATE_NAME,LATITUDE,LONGITUDE
STATE_CODE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AB,Alberta,54.010936,-113.990804
AK,Alaska,60.803531,-149.81643
AL,Alabama,32.720152,-86.628245
AR,Arkansas,35.282701,-92.476513
AZ,Arizona,34.232213,-111.898056


In [50]:
# city-level coordinates: the mean latitude/longitude of all airports sharing a
# CITY_NAME, sorted and indexed by city name.
# NOTE(review): the grouping key is the city name alone, so airports in
# same-named cities of different states are averaged into one row -- confirm
# this is intended.
city_df = (
    airports_df
    .groupby(["CITY_NAME"])[["LATITUDE", "LONGITUDE"]]
    .mean()
    .reset_index()
    .sort_values(by="CITY_NAME")
    .set_index("CITY_NAME")
)

# persist as CSV and as JSON keyed by city name
fname = os.path.join(root_dir, "airboard", "data", "ext", "616228237_CITY_CORD_V0")
city_df.to_csv(f"{fname}.csv")
city_df.to_json(f"{fname}.json", orient="index")
city_df.head()

Unnamed: 0_level_0,LATITUDE,LONGITUDE
CITY_NAME,Unnamed: 1_level_1,Unnamed: 2_level_1
47-Mile Mine,61.044444,-158.145278
Abbotsford,49.019167,-122.38
Aberdeen,42.611111,-95.823889
Abilene,34.577315,-98.923333
Ada,34.804167,-96.671111


In [4]:
# NOTE(review): `df` is not used anywhere in this cell; kept only in case a
# later cell relies on it -- confirm and remove if not.
df = pd.DataFrame()

# columns kept from every yearly extract, in output order
t100d_keep_cols = ["PASSENGERS",
                   "FREIGHT",
                   "MAIL",
                   "DISTANCE",
                   "AIRLINE_ID",
                   "UNIQUE_CARRIER_ENTITY",
                   "UNIQUE_CARRIER_NAME",
                   "REGION",
                   "ORIGIN_AIRPORT_ID",
                   "DEST_AIRPORT_ID",
                   "YEAR",
                   "QUARTER",
                   "MONTH",
                   "DISTANCE_GROUP",
                   "CLASS"]
# columns cast to 32-bit integers
t100d_int_cols = ["YEAR", "QUARTER", "MONTH", "DISTANCE_GROUP"]

# clean each yearly T100D market extract (2000..2018) and write it back out
for year in range(2000, 2019):
    fname =  os.path.join(root_dir,
                           "airboard",
                           "data",
                           "raw",
                           f"{year}_616181125_T_T100D_MARKET_ALL_CARRIER.zip")
    # read csv and trim it; .copy() detaches the trimmed frame from the raw
    # one so later modifications do not trigger SettingWithCopyWarning
    year_df = pd.read_csv(fname, compression="zip")[t100d_keep_cols].copy()

    # cast data types
    year_df = year_df.astype({col: "int32" for col in t100d_int_cols})

    # drop rows with a null UNIQUE_CARRIER_NAME
    year_df = year_df.dropna(how="any",
                             axis=0,
                             subset=["UNIQUE_CARRIER_NAME"])
    # reset index; without drop=True the previous row labels are kept as an
    # "index" column, which therefore also appears in the written CSV
    year_df = year_df.reset_index()

    fname =  os.path.join(
        root_dir,
        "airboard",
        "data",
        "ext",
        f"{year}_616181125_T_T100D_MARKET_ALL_CARRIER_CLEAN_V1.csv")
    year_df.to_csv(fname)

    # per-year summary: null check and shape
    print(os.linesep,
          16*"-", f" {year} ", 16*"-")
    print(os.linesep, " nulls:" , os.linesep, 40*"-")
    print(year_df.isnull().any())
    print(os.linesep, " shape:" , os.linesep, 40*"-")
    print(year_df.shape)
    


 ----------------  2000  ----------------

  nulls: 
 ----------------------------------------
index                    False
PASSENGERS               False
FREIGHT                  False
MAIL                     False
DISTANCE                 False
AIRLINE_ID               False
UNIQUE_CARRIER_ENTITY    False
UNIQUE_CARRIER_NAME      False
REGION                   False
ORIGIN_AIRPORT_ID        False
DEST_AIRPORT_ID          False
YEAR                     False
QUARTER                  False
MONTH                    False
DISTANCE_GROUP           False
CLASS                    False
dtype: bool

  shape: 
 ----------------------------------------
(143959, 16)

 ----------------  2001  ----------------

  nulls: 
 ----------------------------------------
index                    False
PASSENGERS               False
FREIGHT                  False
MAIL                     False
DISTANCE                 False
AIRLINE_ID               False
UNIQUE_CARRIER_ENTITY    False
UNIQUE_CARRIER_NA


 ----------------  2013  ----------------

  nulls: 
 ----------------------------------------
index                    False
PASSENGERS               False
FREIGHT                  False
MAIL                     False
DISTANCE                 False
AIRLINE_ID               False
UNIQUE_CARRIER_ENTITY    False
UNIQUE_CARRIER_NAME      False
REGION                   False
ORIGIN_AIRPORT_ID        False
DEST_AIRPORT_ID          False
YEAR                     False
QUARTER                  False
MONTH                    False
DISTANCE_GROUP           False
CLASS                    False
dtype: bool

  shape: 
 ----------------------------------------
(251144, 16)

 ----------------  2014  ----------------

  nulls: 
 ----------------------------------------
index                    False
PASSENGERS               False
FREIGHT                  False
MAIL                     False
DISTANCE                 False
AIRLINE_ID               False
UNIQUE_CARRIER_ENTITY    False
UNIQUE_CARRIER_NA

In [5]:
# inspect the columns of the last yearly frame left over from the loop cell
# (includes the "index" column added by reset_index)
year_df.columns


Index(['index', 'PASSENGERS', 'FREIGHT', 'MAIL', 'DISTANCE', 'AIRLINE_ID',
       'UNIQUE_CARRIER_ENTITY', 'UNIQUE_CARRIER_NAME', 'REGION',
       'ORIGIN_AIRPORT_ID', 'DEST_AIRPORT_ID', 'YEAR', 'QUARTER', 'MONTH',
       'DISTANCE_GROUP', 'CLASS'],
      dtype='object')