### Feature engineering

In [1]:
# The modules we will need

# We import the SQLite python module for the database handling
import sqlite3


# NumPy might be useful in some cases with data manipulating 
import numpy as np


# pandas will be useful for storing and processing query results
import pandas as pd

# an option for displaying all the columns of our main dataframe
pd.options.display.max_columns = 30


# With matplotlib we can do the plotting
import matplotlib.pyplot as plt
%matplotlib inline


from matplotlib import style
style.use((['ggplot',  'fast']))


# alternatively, we can also use seaborn
import seaborn as sns


# while the OS module will help us manage the filepaths
import os


In [2]:
# The database filepath.

# The current working directoary
cwd = os.getcwd()

# Note that we have created a symbolic link, placed in the input folder of phase_4, pointing to the database
database_name = 'link_to_db'
filepath_to_database = cwd + '/' + 'input' + '/' + database_name


In [3]:

def my_queries( filepath, query ):
    '''
    input: an sqlite query - a string, and the filepath to the database - a string
    output: a dataframe - the result of the query
    '''
    
    # connect to the database
    try:
        conn = sqlite3.connect(filepath)
    except:
        print('cannot connect to database')
    cursor = conn.cursor()

    
    # run the query and get the results into a dataframe
    try:
        query_df = pd.read_sql_query(query, conn, index_col=None, coerce_float=False, parse_dates=None, chunksize=None)
    except:
        print('could not generate dataframe - maybe error in sql query?')
    
    # don't forget to close the connection 
    conn.close()

    
    # return the result
    return query_df
    

### Features

In [4]:
## temporary views on the database

In [5]:
#### OUTBOUND FLIGHTS


## prepare the database
conn = sqlite3.connect(filepath_to_database)
cursor = conn.cursor()


##  create view: number of daily outbound flights from each airport
query = '''
CREATE VIEW IF NOT EXISTS daily_outbound_flights_per_airport AS
SELECT
	Origin
	, FlightMonth
	, FlightDay
	, (COUNT(*)) AS daily_outbound_flights
FROM
	flights
GROUP BY
	Origin, FlightMonth, FlightDay;
'''


## write changes in the database
cursor.execute(query)
conn.commit()


## create view: maximum number of daily outbound flights for each airport over a year
query = '''
CREATE VIEW IF NOT EXISTS max_outbound_flights_per_airport AS
SELECT 
    Origin
    , FlightMonth
    , FlightDay
    , MAX(daily_outbound_flights) AS max_daily_outbound_flights
FROM 
    ( 
    SELECT
        Origin
        , FlightMonth
        , FlightDay
        , (COUNT(*)) as daily_outbound_flights
    FROM
        flights
    GROUP BY
        Origin, FlightMonth, FlightDay 
    )
        
GROUP BY Origin;
'''


## write changes in the database
cursor.execute(query)
conn.commit()


## close database connection
conn.close()


In [6]:
#### INBOUND FLIGHTS


## prepare the database
conn = sqlite3.connect(filepath_to_database)
cursor = conn.cursor()


##  create view: number of daily inbound flights to each airport
query = '''
CREATE VIEW IF NOT EXISTS daily_inbound_flights_per_airport AS
SELECT
	Dest
	, FlightMonth
	, FlightDay
	, (COUNT(*)) AS daily_inbound_flights
FROM
	flights
GROUP BY
	Dest, FlightMonth, FlightDay;
'''


## write changes in the database
cursor.execute(query)
conn.commit()


## create view: maximum number of daily inbound flights to each airport over a year
query = '''
CREATE VIEW IF NOT EXISTS max_inbound_flights_per_airport AS
SELECT 
    Dest
    , FlightMonth
    , FlightDay
    , MAX(daily_inbound_flights) AS max_daily_inbound_flights
FROM 
    ( 
    SELECT
        Dest
        , FlightMonth
        , FlightDay
        , (COUNT(*)) as daily_inbound_flights
    FROM
        flights
    GROUP BY
        Dest, FlightMonth, FlightDay 
    )
        
GROUP BY Dest;
'''


## write changes in the database
cursor.execute(query)
conn.commit()


## close database connection
conn.close()


In [7]:
## daily total, and yearly max flights for each airport and each day of the year
## columns in the result:
## airport - FlightMonth - FlightDay - total_daily_flights - max_daily_flights - utilisation_percent


query = '''
SELECT
	daily_inbound_flights_per_airport.Dest AS airport

	, daily_inbound_flights_per_airport.FlightMonth

	, daily_inbound_flights_per_airport.FlightDay

	, daily_inbound_flights_per_airport.daily_inbound_flights + daily_outbound_flights_per_airport.daily_outbound_flights AS total_daily_flights
	
	, max_inbound_flights_per_airport.max_daily_inbound_flights + max_outbound_flights_per_airport.max_daily_outbound_flights AS max_daily_flights
	
    , 1.0 * (daily_inbound_flights_per_airport.daily_inbound_flights + daily_outbound_flights_per_airport.daily_outbound_flights) / \
    (max_inbound_flights_per_airport.max_daily_inbound_flights + max_outbound_flights_per_airport.max_daily_outbound_flights) AS utilisation_percent
	
	
FROM
	daily_inbound_flights_per_airport
    
	JOIN daily_outbound_flights_per_airport
		ON daily_inbound_flights_per_airport.Dest == daily_outbound_flights_per_airport. Origin
			AND
			daily_inbound_flights_per_airport.FlightMonth == daily_outbound_flights_per_airport. FlightMonth
			AND
			daily_inbound_flights_per_airport.FlightDay == daily_outbound_flights_per_airport. FlightDay
            
	JOIN max_outbound_flights_per_airport 
		ON daily_inbound_flights_per_airport.Dest == max_outbound_flights_per_airport.Origin
        
	JOIN max_inbound_flights_per_airport
		ON daily_inbound_flights_per_airport.Dest == max_inbound_flights_per_airport.Dest
    

'''

airport_utilisation_df = my_queries(filepath_to_database, query)

In [8]:
airport_utilisation_df.head()

Unnamed: 0,airport,FlightMonth,FlightDay,total_daily_flights,max_daily_flights,utilisation_percent
0,ABE,1,1,24,34,0.705882
1,ABE,1,2,32,34,0.941176
2,ABE,1,3,32,34,0.941176
3,ABE,1,4,32,34,0.941176
4,ABE,1,5,18,34,0.529412


In [38]:
## average amount of time necessary for a flight of a given carrier for a given Origin-Destination pair

query = '''
select 
    UniqueCarrier
    , Origin
    , Dest
    , avg(CRSElapsedTime)
    , avg(ActualElapsedTime) AS avg_flight_time
    , count(CRSElapsedTime)
    , Cancelled
    , Diverted
    
from 
    flights
group by 
    UniqueCarrier, Origin, Dest
order by 
    count(CRSElapsedTime)
'''

elapsed_time_df = my_queries(filepath_to_database, query)

In [10]:
elapsed_time_df.head()

Unnamed: 0,UniqueCarrier,Origin,Dest,avg(CRSElapsedTime),avg(ActualElapsedTime),count(CRSElapsedTime),Cancelled,Diverted
0,9E,ATL,BGR,,,0,0,1
1,9E,ATL,BTR,,,0,0,1
2,9E,ATL,CMH,,,0,0,1
3,9E,ATL,LGA,,,0,0,1
4,9E,ATL,MSP,,,0,0,1


In [11]:
## All the tailnumbers - planes

query = '''
select distinct TailNum
from flights

'''

planes_df = my_queries(filepath_to_database, query)

In [12]:
planes_df

Unnamed: 0,TailNum
0,
1,80009E
2,80019E
3,80059E
4,80129E
5,80139E
6,80199E
7,80209E
8,80219E
9,80239E


In [81]:


## putting it all together. This is a bit long to read so please be patient...

## the objective here is to represent flights, as a list of plane trips, and connect the data from one trip to another
## iterate over the plane list and create the following dataframe ...
for plane in planes_df['TailNum'][1:-1]:
    query = '''SELECT
        Origin
        , Dest
        , TailNum
        , UniqueCarrier
        , FlightMonth
        , FlightDay
        , LateAircraftDelay
        
        , (strftime('%s',CRSArrTime )- strftime('%s',LAG ( CRSArrTime, 1, 0 ) OVER ( 
            ORDER BY TailNum, FlightMonth, FlightDay, DepTime )))/60 AS time_btwn_Arrivals
        
        , LAG ( ArrDelay, 1, 0 ) OVER ( 
            ORDER BY TailNum, FlightMonth, FlightDay, DepTime ) AS previous_ArrDelay
        
        , CASE WHEN (Origin == LAG ( Dest, 1, 0 ) OVER ( 
            ORDER BY TailNum, FlightMonth, FlightDay, DepTime ) ) THEN 1 ELSE 0 END AS suitable

    FROM
        flights
    WHERE 
        TailNum == '{0}'
    ORDER BY 
        TailNum, FlightMonth, FlightDay, DepTime
    LIMIT
        -1
    OFFSET
        1
        

    '''.format(plane)

    ## bring the query result into the dataframe
    flight_df = (my_queries(filepath_to_database, query))
  

    
    ## ... then join in the airport utilisation ...
    flight_df = pd.merge(flight_df, 
                         airport_utilisation_df.rename(columns={'airport': 'Origin'})[['Origin', 'FlightMonth', 'FlightDay', 'utilisation_percent']
                            ], how='left', 
                         on=['Origin', 'FlightMonth', 'FlightDay'])
    
    ## ... and finally the usual time for a flight between airports
    flight_df = pd.merge(flight_df, elapsed_time_df[['UniqueCarrier', 'Origin', 'Dest' ,'avg_flight_time']], 
                        how='left', on=['UniqueCarrier', 'Origin', 'Dest' ] )
    
    ## we then filter out these flights where the Origin of the plane is not the same as the previous Destination
    flight_df = flight_df[flight_df['suitable'] == 1]
    
    
    ## now we need to manipulate the columns a litlle bit. 
    ## we want to transform the avg_flight_time in percentage over the time_btwn_Arrivals
    flight_df['avg_flight_time_percent'] = (flight_df['avg_flight_time'] / flight_df['time_btwn_Arrivals'])

    
    
    ## in a similar fashion we want to transform the previous_ArrDelay in percentage over the time_btwn_Arrivals
    flight_df['previous_ArrDelay_percent'] = (flight_df['previous_ArrDelay'] / flight_df['time_btwn_Arrivals'])

    
    ## let us choose the columns we need...
    flight_df = flight_df[['LateAircraftDelay', 'previous_ArrDelay_percent', 'avg_flight_time_percent', 'utilisation_percent' ]]
    
    ## ... and cast the columns in float16, which is adequate for the values we have here, and uses less memory
    flight_df = flight_df.astype('float16')
    
    ## once we have all the data for a particular plane, we create a final dataframe, and append data for all planes
    ## we choose ony the relevant columns
    if not os.path.isfile('model1_data.csv'):
        flight_df.to_csv('output/model1_data.csv', mode='w')
    else:  
        flight_df.to_csv('output/model1_data.csv', mode='a', header=False)
        


In [8]:
# Let us take a look at what we have built

final_df = pd.read_csv('output/model1_data.csv', dtype='float16', index_col='Unnamed: 0')

  mask |= (ar1 == a)


In [9]:
final_df.head()

Unnamed: 0,LateAircraftDelay,previous_ArrDelay_percent,avg_flight_time_percent,utilisation_percent
0.0,,0.038757,0.976562,1.0
1.0,,-0.043701,0.587402,0.965332
2.0,0.0,0.153809,-4.292969,1.0
3.0,96.0,1.021484,0.679688,0.82373
4.0,63.0,0.806641,1.081055,1.0
