In [1]:
# import dependencies
import os, inspect
import pandas as pd

# Resolve the directory the notebook is running from and the repository root
# one level above it.
# os.getcwd() is used rather than inspecting the current frame's file name:
# a notebook cell has no real source file, and newer Jupyter kernels compile
# cells from a temporary file, which would make the inspected path point at
# the temp directory instead of the notebook's directory.
curr_dir = os.getcwd()
root_dir = os.path.dirname(curr_dir)

In [2]:
# Load the raw airport master-coordinate table (read_csv infers zip compression
# from the file extension) and show which columns it provides.
fname = os.path.join(root_dir, "airboard", "data", "raw", "616228237_T_MASTER_CORD.zip")
airports_raw_df = pd.read_csv(fname)
print(os.linesep, " columns:" , os.linesep, 40*"-")
print(repr(airports_raw_df.columns))

# Columns carried over from the raw table. DISPLAY_AIRPORT_CITY_NAME_FULL is
# only needed to derive AIRPORT_CITY_NAME and is dropped afterwards.
airport_columns = [
    "AIRPORT_ID",
    "AIRPORT",
    "LATITUDE",
    "LONGITUDE",
    "DISPLAY_AIRPORT_NAME",
    "DISPLAY_AIRPORT_CITY_NAME_FULL",
    "AIRPORT_STATE_NAME",
    "AIRPORT_STATE_CODE",
    "AIRPORT_COUNTRY_NAME",
]

# Build the cleaned frame in one non-mutating chain. This avoids assigning into
# a column slice (SettingWithCopyWarning) and keeps the cell re-runnable.
airports_df = (
    airports_raw_df[airport_columns]
    # City name is the text before the first comma of the full display name.
    # The .str accessor leaves missing values as NaN instead of raising.
    .assign(
        AIRPORT_CITY_NAME=lambda frame: (
            frame["DISPLAY_AIRPORT_CITY_NAME_FULL"].str.split(",").str[0]
        )
    )
    .drop(columns="DISPLAY_AIRPORT_CITY_NAME_FULL")
    # Keep one row per AIRPORT_ID: the last one in file order.
    # NOTE(review): presumably the file is ordered so the last row is the most
    # recent record for an airport -- confirm against the data source.
    .drop_duplicates("AIRPORT_ID", keep="last")
)

# explore data frame
print(os.linesep, " columns:" , os.linesep, 40*"-")
print(repr(airports_df.columns))
print(os.linesep, " dtypes:" , os.linesep, 40*"-")
print(airports_df.dtypes)
print(os.linesep, " nulls:" , os.linesep, 40*"-")
print(airports_df.isnull().any())
print(os.linesep, " shape:" , os.linesep, 40*"-")
print(airports_df.shape)

# Persist the cleaned table. The row index is written as an unnamed first
# column (to_csv default), so readers should pass index_col=0.
fname = os.path.join(root_dir, "airboard", "data", "ext", "616228237_AIRPORT_MASTER_CORD_CLEAN_V0.csv")
os.makedirs(os.path.dirname(fname), exist_ok=True)  # make sure the output dir exists
airports_df.to_csv(fname)
airports_df.head()


  columns: 
 ----------------------------------------
Index(['AIRPORT_SEQ_ID', 'AIRPORT_ID', 'AIRPORT', 'DISPLAY_AIRPORT_NAME',
       'DISPLAY_AIRPORT_CITY_NAME_FULL', 'AIRPORT_WAC_SEQ_ID2', 'AIRPORT_WAC',
       'AIRPORT_COUNTRY_NAME', 'AIRPORT_COUNTRY_CODE_ISO',
       'AIRPORT_STATE_NAME', 'AIRPORT_STATE_CODE', 'AIRPORT_STATE_FIPS',
       'CITY_MARKET_SEQ_ID', 'CITY_MARKET_ID', 'DISPLAY_CITY_MARKET_NAME_FULL',
       'CITY_MARKET_WAC_SEQ_ID2', 'CITY_MARKET_WAC', 'LAT_DEGREES',
       'LAT_HEMISPHERE', 'LAT_MINUTES', 'LAT_SECONDS', 'LATITUDE',
       'LON_DEGREES', 'LON_HEMISPHERE', 'LON_MINUTES', 'LON_SECONDS',
       'LONGITUDE', 'UTC_LOCAL_TIME_VARIATION', 'AIRPORT_START_DATE',
       'AIRPORT_THRU_DATE', 'AIRPORT_IS_CLOSED', 'AIRPORT_IS_LATEST',
       'Unnamed: 32'],
      dtype='object')

  columns: 
 ----------------------------------------
Index(['AIRPORT_ID', 'AIRPORT', 'LATITUDE', 'LONGITUDE',
       'DISPLAY_AIRPORT_NAME', 'AIRPORT_STATE_NAME', 'AIRPORT_STATE_CODE',
   

Unnamed: 0,AIRPORT_ID,AIRPORT,LATITUDE,LONGITUDE,DISPLAY_AIRPORT_NAME,AIRPORT_STATE_NAME,AIRPORT_STATE_CODE,AIRPORT_COUNTRY_NAME,AIRPORT_CITY_NAME
0,10001,01A,58.109444,-152.906667,Afognak Lake Airport,Alaska,AK,United States,Afognak Lake
1,10003,03A,65.548056,-161.071667,Bear Creek Mining Strip,Alaska,AK,United States,Granite Mountain
2,10004,04A,68.083333,-163.166667,Lik Mining Camp,Alaska,AK,United States,Lik
3,10005,05A,67.57,-148.183889,Little Squaw Airport,Alaska,AK,United States,Little Squaw
4,10006,06A,57.745278,-152.882778,Kizhuyak Bay,Alaska,AK,United States,Kizhuyak


In [3]:
# Per-state coordinate table: the plain (unweighted) mean LATITUDE and LONGITUDE
# of all airports sharing an AIRPORT_STATE_CODE, with the state code restored
# as an ordinary column after the aggregation.
state_df = (
    airports_df
    .groupby("AIRPORT_STATE_CODE")[["LATITUDE", "LONGITUDE"]]
    .mean()
    .reset_index()
)

# Write the table next to the other derived files, then preview it.
fname = os.path.join(
    root_dir,
    "airboard",
    "data",
    "ext",
    "616228237_STATE_CORD_V0.csv",
)
state_df.to_csv(fname)
state_df.head()

Unnamed: 0,AIRPORT_STATE_CODE,LATITUDE,LONGITUDE
0,AB,54.010936,-113.990804
1,AK,60.797532,-149.810723
2,AL,32.720152,-86.628245
3,AR,35.282701,-92.476513
4,AZ,34.232104,-111.85832


In [4]:
# NOTE(review): `df` is not used anywhere in this cell; it is kept only in case
# a later cell relies on it.
df = pd.DataFrame()

# Columns kept from every yearly market file, in the order they are written out.
market_cols = [
    "PASSENGERS",
    "FREIGHT",
    "MAIL",
    "DISTANCE",
    "AIRLINE_ID",
    "UNIQUE_CARRIER_ENTITY",
    "UNIQUE_CARRIER_NAME",
    "REGION",
    "ORIGIN_AIRPORT_ID",
    "DEST_AIRPORT_ID",
    "YEAR",
    "QUARTER",
    "MONTH",
    "DISTANCE_GROUP",
    "CLASS",
]
# Columns cast to 32-bit integers.
int_cols = ["YEAR", "QUARTER", "MONTH", "DISTANCE_GROUP"]

# Clean one raw file per year (2000 through 2018 inclusive) and write the
# result to the "ext" data directory.
for year in range(2000, 2019):
    fname = os.path.join(
        root_dir,
        "airboard",
        "data",
        "raw",
        f"{year}_616181125_T_T100D_MARKET_ALL_CARRIER.zip",
    )

    year_df = (
        # Parse only the needed columns; usecols returns them in file order,
        # so re-select with the list to get the intended column order.
        pd.read_csv(fname, compression="zip", usecols=market_cols)[market_cols]
        # cast data types
        .astype({col: "int32" for col in int_cols})
        # drop rows with a null UNIQUE_CARRIER_NAME
        .dropna(how="any", axis=0, subset=["UNIQUE_CARRIER_NAME"])
        # Renumber rows. The previous row labels are kept as an "index" column
        # because the written CSV schema has always contained it.
        # NOTE(review): reset_index(drop=True) would avoid this extra column if
        # no downstream reader depends on it.
        .reset_index()
    )

    fname = os.path.join(
        root_dir,
        "airboard",
        "data",
        "ext",
        f"{year}_616181125_T_T100D_MARKET_ALL_CARRIER_CLEAN_V1.csv",
    )
    os.makedirs(os.path.dirname(fname), exist_ok=True)  # make sure the output dir exists
    year_df.to_csv(fname)

    # Per-year sanity report: null check and resulting shape.
    print(os.linesep,
          16*"-", f" {year} ", 16*"-")
    print(os.linesep, " nulls:" , os.linesep, 40*"-")
    print(year_df.isnull().any())
    print(os.linesep, " shape:" , os.linesep, 40*"-")
    print(year_df.shape)
    


 ----------------  2000  ----------------

  nulls: 
 ----------------------------------------
index                    False
PASSENGERS               False
FREIGHT                  False
MAIL                     False
DISTANCE                 False
AIRLINE_ID               False
UNIQUE_CARRIER_ENTITY    False
UNIQUE_CARRIER_NAME      False
REGION                   False
ORIGIN_AIRPORT_ID        False
DEST_AIRPORT_ID          False
YEAR                     False
QUARTER                  False
MONTH                    False
DISTANCE_GROUP           False
CLASS                    False
dtype: bool

  shape: 
 ----------------------------------------
(143959, 16)

 ----------------  2001  ----------------

  nulls: 
 ----------------------------------------
index                    False
PASSENGERS               False
FREIGHT                  False
MAIL                     False
DISTANCE                 False
AIRLINE_ID               False
UNIQUE_CARRIER_ENTITY    False
UNIQUE_CARRIER_NA


 ----------------  2013  ----------------

  nulls: 
 ----------------------------------------
index                    False
PASSENGERS               False
FREIGHT                  False
MAIL                     False
DISTANCE                 False
AIRLINE_ID               False
UNIQUE_CARRIER_ENTITY    False
UNIQUE_CARRIER_NAME      False
REGION                   False
ORIGIN_AIRPORT_ID        False
DEST_AIRPORT_ID          False
YEAR                     False
QUARTER                  False
MONTH                    False
DISTANCE_GROUP           False
CLASS                    False
dtype: bool

  shape: 
 ----------------------------------------
(251144, 16)

 ----------------  2014  ----------------

  nulls: 
 ----------------------------------------
index                    False
PASSENGERS               False
FREIGHT                  False
MAIL                     False
DISTANCE                 False
AIRLINE_ID               False
UNIQUE_CARRIER_ENTITY    False
UNIQUE_CARRIER_NA

In [5]:
# Show the columns of `year_df` as left by the yearly cleaning loop, i.e. the
# frame for the last year processed.
year_df.columns


Index(['index', 'PASSENGERS', 'FREIGHT', 'MAIL', 'DISTANCE', 'AIRLINE_ID',
       'UNIQUE_CARRIER_ENTITY', 'UNIQUE_CARRIER_NAME', 'REGION',
       'ORIGIN_AIRPORT_ID', 'DEST_AIRPORT_ID', 'YEAR', 'QUARTER', 'MONTH',
       'DISTANCE_GROUP', 'CLASS'],
      dtype='object')