## Connect to ICW:

In [None]:
import cadspy

In [None]:
# Open the ICW connection that every query cell below goes through.
# NOTE(review): the user ID is hardcoded - anyone else running this notebook has to edit it;
# consider reading it from an environment variable instead.
icw = cadspy.DatabaseConnection(system='ICW', user='u204570')

Below are some packages to get you started. You don't have to use them but you may find them useful!

In [None]:
import numpy as np
import pandas as pd
import datetime as dt


In [None]:
# display all rows and cols when using 'dataframe'.head() or 'dataframe'.tail()
# (None removes the display limit, so displaying a whole DataFrame renders every row - prefer .head()/.tail())
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

<br>

### Data

#### S19 Lounge Eligibility data

In [None]:
query = """

sel * from LDB_SBOX_OR.HACKATHON_OPS_LOUNGE_ELIGIBILITY

"""

df_lounge_eligibility = icw.queryToDataframe(query)

In [None]:
df_lounge_eligibility.head(2)

In [None]:
df_lounge_eligibility.shape

In [None]:
# A look at one discharge station on one particular date.
# Written so that it selects the same rows whether it is run before or after the
# pre-processing cells further down:
#   - the station code is stripped before comparing (raw values are space-padded, e.g. 'GCM   ')
#   - the date column is compared as a timestamp (it is converted with pd.to_datetime later on)

mask = (
    (df_lounge_eligibility['DISCHARGE_STN_CD'].str.strip() == 'GCM')
    & (pd.to_datetime(df_lounge_eligibility['GMT_UPLIFT_DT']) == pd.Timestamp(2019, 9, 12))
)

df_lounge_eligibility[mask]

#### S19 Flight info

In [None]:
query = """

select * from LDB_SBOX_OR.HACKATHON_OPS_FLIGHT_INFO

"""

df_flight_info = icw.queryToDataframe(query)

In [None]:
df_flight_info.head(2)

#### Station Code Decode

In [None]:
query = """

select * from LDB_SBOX_OR.HACKATHON_OPS_COUNTRY_DECODE

"""

df_country = icw.queryToDataframe(query)

In [None]:
df_country.head(2)

#### Aircraft Type

In [None]:
query = """

select * from LDB_SBOX_OR.HACKATHON_OPS_AC_TYPE

"""

df_acft_typ = icw.queryToDataframe(query)

In [None]:
df_acft_typ.head(2)

In [None]:
df_acft_typ.shape

<br>

### Pre-processing

*Hint:* It is always worth checking the format of each of the columns in your dataframes before trying to do any work with them. To do so, you can make use of the `headers_and_first_row` function below.


In [None]:
def headers_and_first_row(df):
    '''
    Map every column name of a DataFrame to the value held in its first row.

    Useful to eyeball the type/format of each column (e.g. to spot string
    values that carry padding spaces) before working with the data.

    Parameters
    ----------
    df : pandas.DataFrame
        DataFrame to inspect.

    Returns
    -------
    dict
        {column name: value in the first row}, in column order. When the
        DataFrame has no rows, every column maps to None.
    '''
    # Nothing to sample from: still report the headers.
    if len(df) == 0:
        return {col: None for col in df.columns}

    # .iloc[0] selects the first row by position. df[col][0] would look up the
    # index *label* 0 instead, which fails on a filtered or re-indexed DataFrame.
    return {col: df[col].iloc[0] for col in df.columns}

In [None]:
# applying headers_and_first_row to df_lounge_eligibility
format_df = headers_and_first_row(df_lounge_eligibility)

In [None]:
# Note that some columns have blank spaces!
format_df

<br>
<div class="alert alert-" style = "border-radius:10px;border-width:3px;border-color:lightblue;font-family:Verdana,sans-serif;font-size:16px;">

<font size="3">**Exercise 1:** Pre-process **all** the tables above (df_lounge_eligibility, df_flight_info, df_country, df_acft_typ).

</font>

</ol>
</div>

In [None]:
# helper function

def dataframe_str_formatter(df):
    '''Strips leading and trailing whitespace from the string columns of a DataFrame.

    A column is treated as a string column when all of its non-null values are
    strings. Columns of any other type (numbers, dates, mixed values) are left
    untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        DataFrame to clean. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same DataFrame object, returned for convenience.
    '''
    for col in df.columns:
        # Judge the whole column (nulls ignored) rather than only its first row:
        # a null in the first row must not cause the column to be skipped, and a
        # column mixing strings with other values must not be passed to .str
        # (that would turn the non-string values into NaN).
        if pd.api.types.infer_dtype(df[col], skipna=True) == 'string':
            df[col] = df[col].str.strip()
    return df


In [None]:
# Example: strip method works well to remove blank spaces
df_lounge_eligibility['OPERATING_AIRLINE_CD'] = df_lounge_eligibility['OPERATING_AIRLINE_CD'].str.strip()

In [None]:
# applying headers_and_first_row to df_lounge_eligibility
format_df = headers_and_first_row(df_lounge_eligibility)

# print the dictionary format_df. Note how the OPERATING_AIRLINE_CD column now appear without white spaces
format_df

In [None]:
# pre-processing code

df_lounge_eligibility = dataframe_str_formatter(df_lounge_eligibility)

In [None]:
# applying headers_and_first_row to df_lounge_eligibility
format_df = headers_and_first_row(df_lounge_eligibility)

# print the dictionary format_df. Note how the string columns now appear without padding white spaces
format_df

In [None]:
df_lounge_eligibility['GMT_UPLIFT_DT'] = pd.to_datetime(df_lounge_eligibility['GMT_UPLIFT_DT'])

<br><br>

### df_flight_info pre-processing

In [None]:
format_df = headers_and_first_row(df_flight_info)

format_df

In [None]:
# pre-processing code

# strip the padding spaces from the string columns
df_flight_info = dataframe_str_formatter(df_flight_info)

# Need to convert to same date type for merge
# NOTE(review): the column name suggests a timestamp while the format is date-only - confirm the raw values parse with '%Y-%m-%d'
df_flight_info['GMT_PLND_DEP_TS'] = pd.to_datetime(df_flight_info['GMT_PLND_DEP_TS'], format = '%Y-%m-%d')
# date-only copy of the planned departure (time of day dropped); used as the join key against GMT_UPLIFT_DT
df_flight_info['GMT_PLND_DEP'] = pd.to_datetime(df_flight_info['GMT_PLND_DEP_TS'].dt.date)

# inspect the first row again to check the result
format_df = headers_and_first_row(df_flight_info)

format_df

### df_country pre-processing

In [None]:
# show current format

format_df_country = headers_and_first_row(df_country)

format_df_country

In [None]:
# pre-processing code

df_country = dataframe_str_formatter(df_country)

format_df_country = headers_and_first_row(df_country)

format_df_country

### df_acft_typ pre-processing

In [None]:
# show current format

format_df_aircraft = headers_and_first_row(df_acft_typ)

format_df_aircraft

In [None]:
# pre-processing code

# strip the padding spaces from the string columns;
# the result is bound to df_acft_typ, the name every other cell uses for this table
df_acft_typ = dataframe_str_formatter(df_acft_typ)

format_df_aircraft = headers_and_first_row(df_acft_typ)

format_df_aircraft

<br>
<div class="alert alert-" style = "border-radius:10px;border-width:3px;border-color:lightblue;font-family:Verdana,sans-serif;font-size:16px;">

<font size="3">**Exercise 2:** Join the tables below
   
    - df_flight_info
    - df_country
    - df_acft_typ
    
to the table df_lounge_eligibility to generate a final dataset.

</font>

</ol>
</div>

In [None]:
# Example

# joining df_lounge_eligibility and df_flight_info
# (left join: every row of df_lounge_eligibility is kept, flight info is attached where the keys match)

df_lounge_elig_flight_info = pd.merge(df_lounge_eligibility,# left table
                                     df_flight_info, # right table
                                     left_on = ['GMT_UPLIFT_DT','OPERATING_FLT_NO','UPLIFT_STN_CD','DISCHARGE_STN_CD'], # left_on: the columns of the left table to join on
                                     right_on = ['GMT_PLND_DEP','OPG_FLT_NO','ACT_DEP_STN_CD','ACT_ARR_STN_CD'], # right_on: the matching columns of the right table, in the same order
                                     how = "left" # how: type of join, e.g. left, right, inner, etc.
                                     )

In [None]:
print('Old Shape: {}'.format(df_lounge_eligibility.shape))
print('New Shape: {}'.format(df_lounge_elig_flight_info.shape))

### Check that the join has worked correctly by looking at some rows and counting nulls where the join may not have worked

In [None]:
df_lounge_elig_flight_info.head()

In [None]:
# your code here!

df_lounge_country_flight = pd.merge(df_lounge_elig_flight_info,
                                   df_country,
                                   on = 'ROUTE',
                                   how = 'left'
                                   )

In [None]:
print('Old Shape: {}'.format(df_lounge_elig_flight_info.shape))
print('New Shape: {}'.format(df_lounge_country_flight.shape))

In [None]:
# Join the aircraft-type table onto the lounge / flight / country dataset.
df_final = pd.merge(df_lounge_country_flight,
                   df_acft_typ,
                   on = ['IATA_AC_TYP_CD','ACT_AC_TYP_CD'],
                   how = 'left'
                   )

# The worked examples further down read the fully joined dataset under the name
# df_lounge_elig_flight_info_country_acft_typ. Bind that name to the same DataFrame
# so those cells run on a fresh kernel (Restart & Run All).
df_lounge_elig_flight_info_country_acft_typ = df_final

In [None]:
print('Old Shape: {}'.format(df_lounge_country_flight.shape))
print('New Shape: {}'.format(df_final.shape))

In [None]:
df_final.head()

<br><br>
<div class="alert alert-" style = "border-radius:10px;border-width:3px;border-color:lightblue;font-family:Verdana,sans-serif;font-size:16px;">

<font size="3">Exercise 3: Based on Summer 2019 data, provide a lookup table of Lounge eligibility assumptions that can be applied to a future schedule. To do so, answer each of the following questions in the Markdown cell provided below. 

- What level of granularity do you use?
- What metric do you use to come up with Lounge eligibility profiles?

    
Note 1: **Provide evidence for your assumptions.** This can be in the form of tables, graphs, correlation matrix, etc.
    
Note 2: Make use of the examples below to give structure to your answer. Feel free to attend the Hackathon Clinics if you have any questions. 
</font>


    
</ol>
</div>

**Reasoning (Example 1)**:


Assume:
- Data has been preprocessed.
- Data has been joined, and a final dataset has been created. This dataset is the result of joining the 4 tables.

The final dataset has been called `df_lounge_elig_flight_info_country_acft_typ`.

<u>What level of granularity do you use?</u>
- I have decided to split all flights in the network based on their aircraft type. I will therefore have a lookup table with two categories: Narrowbody and Widebody. 

<u>What metric do you use to come up with Lounge eligibility profiles?</u>
- I have summed up all the passengers by Aircraft Type, by Tier. Then I have divided them by the total number of passengers by Aircraft Type. For example:
    - For NB aircraft, and for Tier 1 passengers: In S19 we had 41,728 pax eligible for Tier 1 out of 7,222,830 pax flying on Narrowbody aircraft. This represents 0.6% of the customers and I assume that this will be the share of customers eligible for this specific Lounge in a future schedule.   

In [None]:
# your code here!

In [None]:
# grouping keys: aircraft category and lounge tier
list_groupby = ['WB_NB_CAT','Lounge_eligibility_tier']

# total pax for every (WB_NB_CAT, Lounge_eligibility_tier) combination
df_groupby_wb_nb = (
    df_lounge_elig_flight_info_country_acft_typ
    .groupby(list_groupby)['pax']
    .sum()
    .reset_index(name='pax_count')
)

In [None]:
# a look at the data
df_groupby_wb_nb

In [None]:
# total number of pax per aircraft category (all tiers together)

# grouping key: aircraft category only
list_groupby = ['WB_NB_CAT']

# one row per WB_NB_CAT with its overall pax total
df_groupby_wb_nb_ttl = (
    df_lounge_elig_flight_info_country_acft_typ
    .groupby(list_groupby)['pax']
    .sum()
    .reset_index(name='pax_count_ttl')
)

In [None]:
# a look at the data
df_groupby_wb_nb_ttl

In [None]:
# attach the per-category totals (df_groupby_wb_nb_ttl) to the per-tier counts (df_groupby_wb_nb)
# the left join adds the pax_count_ttl column, the denominator for the percentage of passengers eligible by Tier

df_groupby_wb_nb = df_groupby_wb_nb.merge(
    df_groupby_wb_nb_ttl,
    how='left',
    on=['WB_NB_CAT'],
)

In [None]:
# a look at the data
df_groupby_wb_nb

In [None]:
# percentage of pax eligible for each Tier within its aircraft category,
# rendered as text with one decimal place and a trailing % sign

df_groupby_wb_nb['pax_eligible%'] = (
    df_groupby_wb_nb['pax_count']
    .div(df_groupby_wb_nb['pax_count_ttl'])
    .mul(100)
    .map('{:,.1f}%'.format)
)

In [None]:
# dropping pax_count, pax_count_ttl columns - not needed anymore
# (rebinding the name instead of inplace=True keeps the step chainable, as in the other cells)
df_groupby_wb_nb = df_groupby_wb_nb.drop(columns=['pax_count','pax_count_ttl'])

In [None]:
# a look at the data
df_groupby_wb_nb

In [None]:
# the 'Not eligible' rows are not part of the lookup table - filter them out
mask = df_groupby_wb_nb['Lounge_eligibility_tier'].eq('Not eligible')

df_groupby_wb_nb = df_groupby_wb_nb.loc[~mask].copy()

<br>

Please save your final lookup table below in the form of a pandas dataframe. It must contain the categories you have come up with as rows, and the Tier 1, Tier 2, and Tier 3 percentage of customers as columns.

In [None]:
# final lookup table: one row per WB_NB_CAT, one column per lounge tier
df_groupby_wb_nb = (
    df_groupby_wb_nb
    .set_index(['WB_NB_CAT','Lounge_eligibility_tier'])
    .unstack(level='Lounge_eligibility_tier')
)

In [None]:
# final table
df_groupby_wb_nb

<br>

**Feedback:** This analysis provides a lookup table in the format needed to be input into a future schedule. Nevertheless, this analysis is too high level and you haven't provided any evidence for your assumptions. To further enhance your answer use insights from the data and provide evidence for your assumptions. Please find some ideas below: 

- Using the same categories (WB, NB), plot the data over time to better understand the peaks for the different lounges. 
- Is there a way to split Widebody into more categories? Do the Haul, Region, Time of Day, or Country play a role in the number of passengers that are eligible in Tier 1, Tier 2 and Tier 3? Etc...

<br><br>

**Reasoning (Example 2)**:



<u>What level of granularity do you use?</u>
- I have decided to split all flights in the network based on their flight number. I will therefore have a lookup table with a lot of categories as each flight number is a category. 

<u>What metric do you use to come up with Lounge eligibility profiles?</u>
- I have summed up all the passengers by flight number, by Tier. Then I have divided them by the total number of passengers per flight number.  

In [None]:
# your code here!

In [None]:
# grouping keys: flight number, discharge station and lounge tier
list_groupby = ['OPERATING_FLT_NO','DISCHARGE_STN_CD','Lounge_eligibility_tier']

# total pax for every (OPERATING_FLT_NO, DISCHARGE_STN_CD, Lounge_eligibility_tier) combination
df_groupby_flt_no = (
    df_lounge_elig_flight_info_country_acft_typ
    .groupby(list_groupby)['pax']
    .sum()
    .reset_index(name='pax_count')
)

# preview the first rows
df_groupby_flt_no.head(4)

In [None]:
# total number of pax per flight number and discharge station (all tiers together)

# grouping keys: flight number and discharge station
list_groupby = ['OPERATING_FLT_NO','DISCHARGE_STN_CD']

# one row per (OPERATING_FLT_NO, DISCHARGE_STN_CD) with its overall pax total
df_groupby_flt_no_ttl = (
    df_lounge_elig_flight_info_country_acft_typ
    .groupby(list_groupby)['pax']
    .sum()
    .reset_index(name='pax_count_ttl')
)

In [None]:
# a look at the data
df_groupby_flt_no_ttl.head(2)

In [None]:
# attach the per-flight totals (df_groupby_flt_no_ttl) to the per-tier counts (df_groupby_flt_no)
# the left join adds the pax_count_ttl column, the denominator for the percentage of passengers eligible by Tier

df_groupby_flt_no = df_groupby_flt_no.merge(
    df_groupby_flt_no_ttl,
    how='left',
    on=['OPERATING_FLT_NO','DISCHARGE_STN_CD'],
)

In [None]:
# a look at the data
df_groupby_flt_no.head(4)

In [None]:
# percentage of pax eligible for each Tier within its flight number / discharge station,
# rendered as text with one decimal place and a trailing % sign

df_groupby_flt_no['pax_eligible%'] = (
    df_groupby_flt_no['pax_count']
    .div(df_groupby_flt_no['pax_count_ttl'])
    .mul(100)
    .map('{:,.1f}%'.format)
)

# dropping pax_count, pax_count_ttl columns - not needed anymore
# (rebinding the name instead of inplace=True keeps the step chainable, as in the other cells)
df_groupby_flt_no = df_groupby_flt_no.drop(columns=['pax_count','pax_count_ttl'])

In [None]:
# a look at the data
df_groupby_flt_no.head(4)

In [None]:
# the 'Not eligible' rows are not part of the lookup table - filter them out
mask = df_groupby_flt_no['Lounge_eligibility_tier'].eq('Not eligible')

df_groupby_flt_no = df_groupby_flt_no.loc[~mask].copy()

<br>

Please save your final lookup table below in the form of a pandas dataframe. It must contain the categories you have come up with as rows, and the Tier 1, Tier 2, and Tier 3 percentage of customers as columns.

In [None]:
# final lookup table: one row per (OPERATING_FLT_NO, DISCHARGE_STN_CD), one column per lounge tier
df_groupby_flt_no = (
    df_groupby_flt_no
    .set_index(['OPERATING_FLT_NO','DISCHARGE_STN_CD','Lounge_eligibility_tier'])
    .unstack(level='Lounge_eligibility_tier')
)

In [None]:
# final table
df_groupby_flt_no.head(4)

<br>

**Feedback:** This analysis goes to a very granular level, so we might end up with missing values if we apply these lounge eligibility profiles to a future schedule. Here are some ideas to further enhance your answer: 

- What would happen if we fly to a new destination in the future? How do we ensure we have a lounge eligibility profile for this new route?
- As you can see in the example above: Pax eligible for Tier 1 for the BKK flight is significantly different from the rest. What's the most used aircraft type for this route? And why does it differ that much from the rest? Is it because of the route characteristics instead?
- For SH routes, we might change the time of departure for a specific flight number from one year to the next. Explore the possibility of using a classification that takes that into account, like using Time of Day instead of flight number. 

<br><br><br>

In [None]:
#--------------------------------
#
# Your turn!!!
#
#--------------------------------

<u>What level of granularity do you use?</u>

This project wants to understand what characteristics of a flight help us get a better picture of the lounge eligibility profiles.
Things I want to consider:
- Time of flight (Morning, Afternoon, Evening) or maybe even by hour
- Destination as this determines the passenger profile, different countries/regions have more premium passengers potentially
- Short Haul/Medium Haul/Long Haul 


<u>General Thoughts:</u>

Destination:
- Routes are too granular as if we have new routes in the future we wouldn't be able to estimate
- Countries could also be granular if we develop new routes to countries
- Region may be suited as we currently fly to all regions
- We need to think about how to account for certain countries within a region having higher premium loads than others

<u>What metric do you use to come up with Lounge eligibility profiles?</u>
- ... (your answer here)

## Data Exploration

<br>

Please save your final lookup table below in the form of a pandas dataframe. It must contain the categories you have come up with as rows, and the Tier 1, Tier 2, and Tier 3 percentage of customers as columns.