# IATA Extraction
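This notebook extracts the mapping from airport ID to three-letter IATA code from the 2011 Q1 `Coupon.csv` table and saves it as `airport_codes.csv`. (City and state are not extracted here; only the ID-to-IATA mapping is written.)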

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import signac
from tqdm import tqdm

# Raise the column display limit so wide DataFrames are shown in full
pd.set_option('display.max_columns', 50)

# Handle to the signac project that the later cells query for jobs
project = signac.get_project()

In [2]:
# Column name -> dtype for the coupon table, passed to `pd.read_csv(dtype=...)`.
# Narrow integer/float types are used to keep the memory footprint small.
dtypes = dict(
    # Itinerary / market identifiers and coupon bookkeeping
    ItinID=np.int64,
    MktID=np.int64,
    SeqNum=np.int8,
    Coupons=np.int8,
    Year=np.int16,
    # Origin airport
    OriginAirportID=np.int16,
    OriginAirportSeqID=np.int32,
    OriginCityMarketID=np.int32,
    Quarter=np.int8,
    Origin=str,
    OriginCountry=str,
    OriginStateFips=np.int8,
    OriginState=str,
    OriginStateName=str,
    OriginWac=np.int8,
    # Destination airport
    DestAirportID=np.int16,
    DestAirportSeqID=np.int32,
    DestCityMarketID=np.int32,
    Dest=str,
    DestCountry=str,
    DestStateFips=np.int8,
    DestState=str,
    DestStateName=str,
    DestWac=np.int8,
    # Coupon attributes
    Break=str,
    CouponType=str,
    TkCarrier=str,
    OpCarrier=str,
    RPCarrier=str,
    Passengers=np.float32,
    FareClass=str,
    Distance=np.float32,
    DistanceGroup=np.int8,
    ItinGeoType=np.int8,
    CouponGeoType=np.int8,
)

In [3]:
# Pick the signac job for 2011 Q1. `next(iter(...))` is used instead of the
# deprecated `JobsCursor.next()` method; the `None` default lets us fail with
# a descriptive error rather than a bare StopIteration when nothing matches.
job = next(iter(project.find_jobs({"year": 2011, "quarter": 1})), None)
if job is None:
    raise LookupError("No signac job found for year=2011, quarter=1")

# Only the airport ID and IATA code columns are needed for the mapping.
cols = ['OriginAirportID', 'Origin', 'DestAirportID', 'Dest']

# NOTE: this rebinds `dtypes` to the four-column subset. Re-running this cell
# is safe (the subset still contains every key in `cols`), but the full schema
# is only restored by re-running the cell that defines it.
dtypes = {c: dtypes[c] for c in cols}

# Stream the CSV in fixed-size chunks instead of loading it all at once.
# `usecols` only selects columns; pandas ignores the order of its elements.
chunksize = 100000
reader = pd.read_csv(job.fn('Coupon.csv'), usecols=cols, dtype=dtypes, chunksize=chunksize)

In [4]:
# Build the airport ID -> IATA code mapping from every chunk of the coupon file.
# The first code seen for an ID wins; later occurrences are ignored.
airport_codes = {}

# Progress-bar length: number of chunks, derived from the shape stored in the
# job document (presumably shape[0] is the row count of Coupon.csv -- confirm).
n_chunks = int(np.ceil(job.doc['Coupon']['shape'][0] / chunksize))

for chunk in tqdm(reader, total=n_chunks):
    # Select columns by name rather than relying on the CSV's column order
    # (read_csv's `usecols` does not control the order of the result).
    # Row-major ravel interleaves the pairs as origin_0, dest_0, origin_1, ...
    # which is the same visiting order as a row-by-row loop.
    ids = chunk[['OriginAirportID', 'DestAirportID']].to_numpy().ravel()
    codes = chunk[['Origin', 'Dest']].to_numpy().ravel()

    # Keep only the first occurrence of each ID within this chunk so the
    # Python-level loop runs over unique airports instead of every row.
    first_seen = ~pd.Index(ids).duplicated(keep='first')

    # `.tolist()` yields plain Python ints/strs as dictionary keys/values.
    for airport_id, iata in zip(ids[first_seen].tolist(), codes[first_seen].tolist()):
        # setdefault preserves codes recorded from earlier chunks.
        airport_codes.setdefault(airport_id, iata)

100%|██████████| 86/86 [00:52<00:00,  1.71it/s]


In [5]:
# Turn the ID -> IATA dictionary into a one-column table, ordered by airport ID
# and with the index labelled 'ID'.
airport_df = (
    pd.DataFrame.from_dict(airport_codes, orient='index')
    .sort_index()
    .rename_axis('ID')
)
airport_df.columns = ['IATA']

# Store the mapping alongside the job's other files.
airport_df.to_csv(job.fn('airport_codes.csv'))