In [3]:
import pandas as pd
import aws_keys
import boto3

In [4]:
# load access keys
access_key, secret_key = aws_keys.aws_keys()

In [5]:
# create client object
s3 = boto3.client('s3',
                aws_access_key_id = access_key,
                aws_secret_access_key = secret_key,
                region_name='us-east-2')

In [20]:
# list buckets
response = s3.list_buckets()
response['Buckets']

[{'Name': 'aws-glue-assets-276295123250-us-east-2',
  'CreationDate': datetime.datetime(2023, 6, 11, 13, 18, 17, tzinfo=tzutc())},
 {'Name': 'flights-data-processed',
  'CreationDate': datetime.datetime(2023, 6, 11, 0, 26, 54, tzinfo=tzutc())},
 {'Name': 'flights-data-raw',
  'CreationDate': datetime.datetime(2023, 6, 7, 2, 11, 37, tzinfo=tzutc())}]

In [29]:
import io 

# read raw data in a polars data frame
obj = s3.get_object(Bucket='flights-data-raw', Key='Airline_Delay_Cause.csv')
flights = pd.read_csv(io.BytesIO(obj['Body'].read()))
flights.shape

(1541, 21)

In [30]:
# preview of raw data
flights.head(2)

Unnamed: 0,year,month,carrier,carrier_name,airport,airport_name,arr_flights,arr_del15,carrier_ct,weather_ct,...,security_ct,late_aircraft_ct,arr_cancelled,arr_diverted,arr_delay,carrier_delay,weather_delay,nas_delay,security_delay,late_aircraft_delay
0,2023,3,9E,Endeavor Air Inc.,ABY,"Albany, GA: Southwest Georgia Regional",89.0,8.0,4.46,1.0,...,0.0,0.93,1.0,1.0,412.0,262.0,38.0,53.0,0.0,59.0
1,2023,3,9E,Endeavor Air Inc.,AEX,"Alexandria, LA: Alexandria International",62.0,8.0,3.95,0.37,...,0.0,2.4,0.0,0.0,357.0,188.0,7.0,44.0,0.0,118.0


In [31]:
# drop unwanted columns 
flights = flights.drop(['carrier_ct','weather_ct','nas_ct','security_ct','late_aircraft_ct'], axis=1)

In [32]:

flights['city_state']=flights['airport_name'].apply(lambda x: x.split(': ')[0])
flights['airport_name']=flights['airport_name'].apply(lambda x: x.split(': ')[1])
flights['city']=flights['city_state'].apply(lambda x: x.split(', ')[0])
flights['state']=flights['city_state'].apply(lambda x: x.split(', ')[1])
flights = flights.drop('city_state', axis=1)




In [33]:
# preview of data
flights.head(2)

Unnamed: 0,year,month,carrier,carrier_name,airport,airport_name,arr_flights,arr_del15,arr_cancelled,arr_diverted,arr_delay,carrier_delay,weather_delay,nas_delay,security_delay,late_aircraft_delay,city,state
0,2023,3,9E,Endeavor Air Inc.,ABY,Southwest Georgia Regional,89.0,8.0,1.0,1.0,412.0,262.0,38.0,53.0,0.0,59.0,Albany,GA
1,2023,3,9E,Endeavor Air Inc.,AEX,Alexandria International,62.0,8.0,0.0,0.0,357.0,188.0,7.0,44.0,0.0,118.0,Alexandria,LA


### Create Carrier Table

In [34]:
# create a carrier table

carrier = flights[['carrier', 'carrier_name']].drop_duplicates(['carrier', 'carrier_name'], keep='first').sort_values('carrier')
# create an id column 
id_ = [i for i in range(1, len(carrier)+1)]
carrier['id'] = id_
carrier = carrier[['id', 'carrier', 'carrier_name']]
carrier.head(2)

Unnamed: 0,id,carrier,carrier_name
0,1,9E,Endeavor Air Inc.
95,2,AA,American Airlines Inc.


In [35]:

# replace the carrier and carrier_name columns with a foreign key
flights = flights.merge(carrier, on=['carrier', 'carrier_name']).drop(['carrier', 'carrier_name'], axis=1)
flights = flights.rename(columns={'id': 'carrier_id'})
flights.head(2)
flights.shape

(1541, 17)

### Create Airport Table

In [37]:
airport = flights[['airport', 'airport_name', 'city', 'state']].drop_duplicates('airport', keep='first').sort_values('airport')

id_ = [i for i in range(1, len(airport)+1)]
airport['id'] = id_
airport = airport[['id', 'airport', 'airport_name', 'city', 'state']]
airport.head(2)

Unnamed: 0,id,airport,airport_name,city,state
591,1,ABE,Lehigh Valley International,Allentown/Bethlehem/Easton,PA
736,2,ABI,Abilene Regional,Abilene,TX


In [39]:
airport.head(3)

Unnamed: 0,id,airport,airport_name,city,state
591,1,ABE,Lehigh Valley International,Allentown/Bethlehem/Easton,PA
736,2,ABI,Abilene Regional,Abilene,TX
95,3,ABQ,Albuquerque International Sunport,Albuquerque,NM


In [40]:
flights = flights.merge(airport, on=['airport', 'airport_name', 'city', 'state']).drop(['airport', 'airport_name', 'city', 'state'], axis=1)
flights = flights.rename(columns={'id': 'airport_id'})

arr_flights = Number of Flights
arr_del15 = Number of delayed flights (15 min after schedule)
arr_diverted = number of diverte lights
num_cancelled = number of cancelled flights
arr_delay = total delay in minutes
carrier_delay = carrier delay in minutes
weather_delay = weather delay in minutes
nas_delay = national air system delay in minutes
security_delay = security delay in minutes
late_aircraft_delay = late aircraft delay in minutes


In [41]:
flights.head(5)

Unnamed: 0,year,month,arr_flights,arr_del15,arr_cancelled,arr_diverted,arr_delay,carrier_delay,weather_delay,nas_delay,security_delay,late_aircraft_delay,carrier_id,id
0,2023,3,89.0,8.0,1.0,1.0,412.0,262.0,38.0,53.0,0.0,59.0,1,5
1,2023,3,62.0,8.0,0.0,0.0,357.0,188.0,7.0,44.0,0.0,118.0,1,11
2,2023,3,93.0,12.0,4.0,0.0,752.0,364.0,267.0,0.0,0.0,121.0,12,11
3,2023,3,11.0,2.0,0.0,0.0,60.0,24.0,0.0,0.0,0.0,36.0,1,12
4,2023,3,58.0,16.0,0.0,0.0,1016.0,154.0,187.0,42.0,0.0,633.0,9,12


In [42]:
# reorder columns
flights = flights[['year','month','carrier_id','airport_id','arr_flights','arr_del15','arr_cancelled','arr_diverted','arr_delay','carrier_delay','weather_delay','nas_delay','security_delay','late_aircraft_delay']]

KeyError: "['airport_id'] not in index"

In [307]:
flights.head()

year,month,carrier_id,airport_id,arr_flights,arr_del15,arr_cancelled,arr_diverted,arr_delay,carrier_delay,weather_delay,nas_delay,security_delay,late_aircraft_delay
i64,i64,i64,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
2023,3,1,5,89.0,8.0,1.0,1.0,412.0,262.0,38.0,53.0,0.0,59.0
2023,3,1,11,62.0,8.0,0.0,0.0,357.0,188.0,7.0,44.0,0.0,118.0
2023,3,1,12,11.0,2.0,0.0,0.0,60.0,24.0,0.0,0.0,0.0,36.0
2023,3,1,13,201.0,27.0,7.0,1.0,1336.0,742.0,13.0,220.0,0.0,361.0
2023,3,1,18,1598.0,222.0,8.0,6.0,18248.0,7265.0,774.0,3458.0,0.0,6751.0


In [308]:
carrier.head()

id,carrier,carrier_name
i64,str,str
1,"""9E""","""Endeavor Air I…"
2,"""AA""","""American Airli…"
3,"""AS""","""Alaska Airline…"
4,"""B6""","""JetBlue Airway…"
5,"""DL""","""Delta Air Line…"


In [309]:
airport.head()

id,airport,airport_name,city,state
i64,str,str,str,str
1,"""ABE""","""Lehigh Valley …","""Allentown/Beth…","""PA"""
2,"""ABI""","""Abilene Region…","""Abilene""","""TX"""
3,"""ABQ""","""Albuquerque In…","""Albuquerque""","""NM"""
4,"""ABR""","""Aberdeen Regio…","""Aberdeen""","""SD"""
5,"""ABY""","""Southwest Geor…","""Albany""","""GA"""


### Export Tables to S3

In [310]:
# uploade files to bucket
csv_buffer = io.StringIO()
flights_pd = pd.DataFrame(flights.to_numpy(), columns=flights.columns)
flights_pd.to_csv(csv_buffer, index=False)
s3.put_object(Bucket='flights-data-processed', 
              Body=csv_buffer.getvalue(), 
              Key='flights.csv')


# uploade files to bucket
csv_buffer = io.StringIO()
carrier_pd = pd.DataFrame(carrier.to_numpy(), columns=carrier.columns)
carrier_pd.to_csv(csv_buffer, index=False)
s3.put_object(Bucket='flights-data-processed', 
              Body=csv_buffer.getvalue(), 
              Key='carriers.csv')


# uploade files to bucket
csv_buffer = io.StringIO()
airport_pd = pd.DataFrame(airport.to_numpy(), columns=airport.columns)
airport_pd.to_csv(csv_buffer, index=False)
s3.put_object(Bucket='flights-data-processed', 
              Body=csv_buffer.getvalue(), 
              Key='airports.csv')

{'ResponseMetadata': {'RequestId': '31PBPH8QKGJ5AME0',
  'HostId': 'LN63Ze9oIKICrCOEv3IhuGZC5zHnAK1Au428HsCxMMkg8yoSLuoKYUZK+Kf5Lgu5cUWrk6x3A3A=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'LN63Ze9oIKICrCOEv3IhuGZC5zHnAK1Au428HsCxMMkg8yoSLuoKYUZK+Kf5Lgu5cUWrk6x3A3A=',
   'x-amz-request-id': '31PBPH8QKGJ5AME0',
   'date': 'Sun, 11 Jun 2023 02:30:43 GMT',
   'x-amz-server-side-encryption': 'AES256',
   'etag': '"2d155295dda89966c4ec3c1b7660ab42"',
   'server': 'AmazonS3',
   'content-length': '0'},
  'RetryAttempts': 0},
 'ETag': '"2d155295dda89966c4ec3c1b7660ab42"',
 'ServerSideEncryption': 'AES256'}