### Imports

In [1]:
import requests
import pandas as pd
from sqlalchemy import create_engine
import psycopg2
import warnings

### DB Setup

Create a Postgresql DB named FlightDB with pwd:password or modify create_engine config below.  Data should load in under 5 mins.

### Initializations

In [3]:
# warnings
warnings.filterwarnings('ignore')

# postgresql db
engine = create_engine(f'postgresql://postgres:password@localhost/FlightDB')

# hours mins to mins
def hours_to_mins(timetaken):
    t2 = timetaken.split('h')
    smin = t2[1].replace('m','').strip()
    shr = t2[0].strip()
    
    if shr == '': shr = '0'
    if smin == '': smin = '0'

    mins = int((float(shr) * 60) + float(smin))
    return mins

In [3]:
# read in flight data
unionAllDF = pd.read_csv('Resources/unionAllDF.csv')
unionAllDF.columns=['flightdate','airline','ch_code','flightnumber','departuretime','locationfrom','timetaken','stop',
                    'arrivaltime','locationto','price','seatclass']

unionAllDF['timetaken'] = unionAllDF['timetaken'].apply(hours_to_mins)

### Extract Transfer Load

#### Table: airline

In [4]:
# create airline DF
airlineDF = unionAllDF[['airline', 'ch_code']].copy()
airlineDF.drop_duplicates(inplace=True)
airlineDF.sort_values(by=['airline'], inplace=True)
airlineDF.insert(0, 'airlineid', range(1, 1 + len(airlineDF.index)))
airlineDF.columns=['airlineid','airline','designator']

# write DF to SQL
airlineDF.to_sql(name='airline', con=engine, if_exists='replace', index=False)

# build dict for lookup
airline_dict = dict(zip(airlineDF.airline, airlineDF.airlineid))
airline_dict

{'Air India': 1,
 'AirAsia': 2,
 'GO FIRST': 3,
 'Indigo': 4,
 'SpiceJet': 5,
 'StarAir': 6,
 'Trujet': 7,
 'Vistara': 8}

In [5]:
airlineDF.head(3)

Unnamed: 0,airlineid,airline,designator
16,1,Air India,AI
2,2,AirAsia,I5
8,3,GO FIRST,G8


#### Table: location

In [6]:
# create location DF
locationDF = unionAllDF[['locationfrom']].copy()
locationDF.columns=['locationname']

locationtoDF = unionAllDF[['locationto']].copy()
locationtoDF.columns=['locationname']

locationDF = pd.concat([locationDF, locationtoDF])
locationDF.drop_duplicates(inplace=True)
locationDF.sort_values(by=['locationname'], inplace=True)
locationDF.insert(0, 'locationid', range(1, 1 + len(locationDF.index)))

# write DF to SQL
locationDF.to_sql(name='location', con=engine, if_exists='replace', index=False)

# build dict for lookup
location_dict = dict(zip(locationDF.locationname, locationDF.locationid))
location_dict

{'Bangalore': 1,
 'Chennai': 2,
 'Delhi': 3,
 'Hyderabad': 4,
 'Kolkata': 5,
 'Mumbai': 6}

In [7]:
locationDF.head(3)

Unnamed: 0,locationid,locationname
84072,1,Bangalore
180601,2,Chennai
0,3,Delhi


#### Table: seatclass

In [8]:
# create seatclass DF
seatclassDF = unionAllDF[['seatclass']].copy()
seatclassDF.drop_duplicates(inplace=True)
seatclassDF.sort_values(by=['seatclass'], inplace=True)
seatclassDF.insert(0, 'seatclassid', range(1, 1 + len(seatclassDF.index)))

# write DF to SQL
seatclassDF.to_sql(name='seatclass', con=engine, if_exists='replace', index=False)

# build dict for lookup
seatclass_dict = dict(zip(seatclassDF.seatclass, seatclassDF.seatclassid))
seatclass_dict

{'business': 1, 'economy': 2}

In [9]:
seatclassDF.head(3)

Unnamed: 0,seatclassid,seatclass
206772,1,business
0,2,economy


#### Table: stop

In [10]:
# create airline DF
stopDF = unionAllDF[['stop']].copy()
stopDF.drop_duplicates(inplace=True)
stopDF.sort_values(by=['stop'], inplace=True)
stopDF.insert(0, 'stopid', range(1, 1 + len(stopDF.index)))

# write DF to SQL
stopDF.to_sql(name='stop', con=engine, if_exists='replace', index=False)

# build dict for lookup
stop_dict = dict(zip(stopDF.stop, stopDF.stopid))
stop_dict

{'1-stop': 1, '2-stops': 2, 'non-stop': 3}

In [11]:
stopDF.head(3)

Unnamed: 0,stopid,stop
18,1,1-stop
175,2,2-stops
0,3,non-stop


#### Tables: flight and flightclass

In [12]:
unionAllDF.head(2)

Unnamed: 0,flightdate,airline,ch_code,flightnumber,departuretime,locationfrom,timetaken,stop,arrivaltime,locationto,price,seatclass
0,11/2/2022,SpiceJet,SG,8709,18:55,Delhi,130,non-stop,21:05,Mumbai,73.34096,economy
1,11/2/2022,SpiceJet,SG,8157,6:20,Delhi,140,non-stop,8:40,Mumbai,73.34096,economy


In [13]:
# build dictionaries of unique records for flight and flight_class

flight_tup = ()
flight_dict = {}

flightclass_tup = ()
flightclass_dict = {}

# check flight data
for index, row in unionAllDF.iterrows():
    
    flight_tup = (
        airline_dict.get(row.airline), 
        row.flightdate, 
        row.flightnumber, 
        stop_dict.get(row.stop), 
        row.departuretime, 
        location_dict.get(row.locationfrom), 
        location_dict.get(row.locationto), 
        row.timetaken, 
        row.arrivaltime)
    
    lookupflightkey = flight_dict.get(flight_tup)
    
    if lookupflightkey is None:
        flightid = index
        flight_dict[flight_tup] = flightid
    else:
        flightid = lookupflightkey
        
    flightclass_tup = (
        flightid, 
        seatclass_dict.get(row.seatclass), 
        row.price)        

    lookupflightclasskey = flightclass_dict.get(flightclass_tup)
        
    if lookupflightclasskey is None:
        lookupflightclasskey = index
        flightclass_dict[flightclass_tup] = lookupflightclasskey    


In [14]:
cnt = 0 
fastflightDict = {}

for k, v in flight_dict.items():
    a, b, c, d, e, f, g, h, i = k
    fastflightDict[cnt]= [v, a, b, c, d, e, f, g, h, i]
    cnt = cnt + 1
    
flightDF = pd.DataFrame.from_dict(fastflightDict, "index")
flightDF.columns=['flightid','airlineid','flightdate','flightnumber','stopid','departuretime','locationfromid',
                  'locationtoid','timetaken','arrivaltime']   

In [15]:
# write DF to SQL
flightDF.to_sql(name='flight', con=engine, if_exists='replace', index=False)
flightDF.tail(3)

Unnamed: 0,flightid,airlineid,flightdate,flightnumber,stopid,departuretime,locationfromid,locationtoid,timetaken,arrivaltime
219095,300213,8,30-03-2022,834,1,17:25,2,4,1585,19:50
219096,300214,8,30-03-2022,828,1,7:00,2,4,600,17:00
219097,300255,8,31-03-2022,826,1,12:30,2,4,625,22:55


In [16]:
cnt = 0 
fastflightclassDict = {}

for k, v in flightclass_dict.items():
    a, b, c = k
    fastflightclassDict[cnt]= [v, a, b, c]
    cnt = cnt + 1
    
flightclassDF = pd.DataFrame.from_dict(fastflightclassDict, "index")
flightclassDF.columns=['flightclassid','flightid','seatclassid','price']    

In [17]:
# write DF to SQL
flightclassDF.to_sql(name='flight_class', con=engine, if_exists='replace', index=False)
flightclassDF.tail(3)

Unnamed: 0,flightclassid,flightid,seatclassid,price
300256,300256,206767,1,974.49968
300257,300257,206765,1,1005.1272
300258,300258,206766,1,1005.1272


In [7]:
# read in fuel data
fuelDF = pd.read_csv('Resources/DailyPetroleumPricesMod.csv')
fuelDF.columns=['fueldate','fuelprice']

fuelDF.head()

Unnamed: 0,fueldate,fuelprice
0,1990-04-02,0.55
1,1990-04-03,0.555
2,1990-04-04,0.56
3,1990-04-05,0.54
4,1990-04-06,0.536


In [6]:
# write DF to SQL
fuelDF.to_sql(name='fuel', con=engine, if_exists='replace', index=False)
fuelDF.tail(3)

Unnamed: 0,fueldate,fuelprice
8197,"Nov 10, 2022",3.096
8198,"Nov 14, 2022",3.141
8199,,


In [8]:
# read in flight with fuel data
flightfuelDF = pd.read_csv('Resources/FlightsWithFuel.csv')
flightfuelDF.head()

Unnamed: 0,flightid,airline,flightdate,stop,departuretime,fromlocation,tolocation,timetaken,arrivaltime,fuelprice
0,0,SpiceJet,2022-02-11,non-stop,18:55,Delhi,Mumbai,130,21:05,2.8
1,1,SpiceJet,2022-02-11,non-stop,6:20,Delhi,Mumbai,140,8:40,2.8
2,2,AirAsia,2022-02-11,non-stop,4:25,Delhi,Mumbai,130,6:35,2.8
3,3,Vistara,2022-02-11,non-stop,10:20,Delhi,Mumbai,135,12:35,2.8
4,4,Vistara,2022-02-11,non-stop,8:50,Delhi,Mumbai,140,11:10,2.8
