## Connect to ICW:

In [1]:
import cadspy
import numpy as np
import pandas as pd
import datetime as dt

# Display all rows and columns whenever a DataFrame is rendered (e.g. via .head() or .tail())
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

# Open the ICW connection; the password is prompted for interactively when this runs
icw = cadspy.DatabaseConnection(user='u204570', system='ICW')

Enter Password:  ·········


<br>

### Data

#### S19 Lounge Eligibility data

In [2]:
# All hackathon source tables sit in the same sandbox schema and are pulled in full,
# so a single helper replaces four copy-pasted query blocks.
SANDBOX_SCHEMA = 'LDB_SBOX_OR'


def load_sandbox_table(table_name):
    '''
    Fetch every row of `SANDBOX_SCHEMA.table_name` through the notebook-level
    `icw` connection and return whatever `icw.queryToDataframe` yields.
    '''
    query = 'select * from {}.{}'.format(SANDBOX_SCHEMA, table_name)
    return icw.queryToDataframe(query)


# lounge eligibility data
df_lounge_eligibility = load_sandbox_table('HACKATHON_OPS_LOUNGE_ELIGIBILITY')

# flight info data
df_flight_info = load_sandbox_table('HACKATHON_OPS_FLIGHT_INFO')

# country decode data
df_country = load_sandbox_table('HACKATHON_OPS_COUNTRY_DECODE')

# AC_type
df_acft_typ = load_sandbox_table('HACKATHON_OPS_AC_TYPE')

<br>

### Pre-processing

*Hint:* It is always worth checking the format of each of the columns in your dataframes before trying to do any work with them. To do so, you can make use of the `headers_and_first_row` function below.


In [3]:
def headers_and_first_row(df):
    '''
    Return a dict mapping each column name of `df` to the value in its first row,
    as a quick way to inspect data types and formatting.

    The first row is taken by position (`.iloc[0]`), not by the index label 0, so
    the function also works on frames whose index does not contain 0 (e.g. after
    filtering or sorting). For a frame with no rows, every column maps to None.
    '''
    if len(df) == 0:
        return {col: None for col in df.columns}

    # Look each column up separately so every value keeps its own type
    # (taking the whole row at once would coerce the values to a common dtype).
    return {col: df[col].iloc[0] for col in df.columns}

# helper function

def dataframe_str_formatter(df):
    '''
    Strip leading and trailing whitespace from every string value in `df`.

    Columns are chosen by dtype rather than by probing the first row, so a text
    column whose first value is missing is still cleaned, and the frame's index
    does not need to contain the label 0. In object columns only actual `str`
    values are touched; anything else (dates, None, numbers) is left as it is.

    The frame is modified in place and also returned, so both
    `df = dataframe_str_formatter(df)` and a bare call work.
    '''
    for col in df.columns:
        column = df[col]
        if pd.api.types.is_object_dtype(column):
            # Object columns may mix types: strip strings, pass everything else through
            df[col] = column.map(lambda value: value.strip() if isinstance(value, str) else value)
        elif pd.api.types.is_string_dtype(column):
            # Dedicated string dtype: the vectorised accessor handles missing values itself
            df[col] = column.str.strip()
    return df


In [4]:
# Inspect the first row of df_lounge_eligibility to check the type and formatting of each column
format_df = headers_and_first_row(df_lounge_eligibility)

# Note that some string columns are padded with trailing spaces (e.g. 'BA    ')!
format_df

{'OPERATING_AIRLINE_CD': 'BA    ',
 'OPERATING_FLT_NO': 57,
 'GMT_UPLIFT_DT': datetime.date(2019, 6, 18),
 'UPLIFT_STN_CD': 'LHR   ',
 'DISCHARGE_STN_CD': 'JNB   ',
 'BOOKED_CABIN_CD': 'J ',
 'TRAVEL_CABIN_CD': 'J ',
 'BA_PAX_TIER': '',
 'ONEWORLD_TIER': 'EMER',
 'Lounge_eligibility_tier': 'Tier 2',
 'pax': 1}

In [5]:
# pre-processing code: clean the whitespace padding out of the string columns

df_lounge_eligibility = dataframe_str_formatter(df_lounge_eligibility)

In [6]:
# pre-processing code

# The string columns were already stripped in the previous cell, so only the date
# needs converting here. GMT_UPLIFT_DT arrives as datetime.date objects; turn it into
# a pandas datetime column so it can be joined against the flight-info departure date.
df_lounge_eligibility['GMT_UPLIFT_DT'] = pd.to_datetime(df_lounge_eligibility['GMT_UPLIFT_DT'])


<br><br>

### df_flight_info pre-processing

In [7]:
# Inspect the first row of df_flight_info before any cleaning
format_df = headers_and_first_row(df_flight_info)

# Left as the cell's last expression so the dict is rendered
format_df

{'OPG_ALN_CD': 'BA    ',
 'OPG_FLT_NO': 548,
 'GMT_PLND_DEP_TS': Timestamp('2019-06-03 07:30:00'),
 'GMT_ACT_DEP_TS': Timestamp('2019-06-03 07:27:00'),
 'ACT_DEP_STN_CD': 'LHR   ',
 'ACT_DEP_TML_CD': '5 ',
 'PLND_ARR_STN_CD': 'FCO   ',
 'ACT_ARR_STN_CD': 'FCO   ',
 'IATA_AC_TYP_CD': '321   ',
 'ACT_AC_TYP_CD': 'V6  ',
 'ROUTE': 'LHRFCO      ',
 'COUNTRY_CD': 'IT    ',
 'COUNTRY_NM': 'Italy',
 'CORP_GEOG_CTRY_GRP_NM': 'MEDITERRANEAN',
 'CORP_GEOG_CONTINENT_NM': 'EUROPE EXC UK'}

In [8]:
# pre-processing code

df_flight_info = dataframe_str_formatter(df_flight_info)

# Need to convert to same date type for merge.
# No `format` is passed: the values carry a time of day as well as a date, so the
# date-only pattern '%Y-%m-%d' would not describe them if they ever arrived as strings.
df_flight_info['GMT_PLND_DEP_TS'] = pd.to_datetime(df_flight_info['GMT_PLND_DEP_TS'])

# Planned departure date with the time set to midnight, kept as a datetime column
df_flight_info['GMT_PLND_DEP'] = df_flight_info['GMT_PLND_DEP_TS'].dt.normalize()

format_df = headers_and_first_row(df_flight_info)

format_df

{'OPG_ALN_CD': 'BA',
 'OPG_FLT_NO': 548,
 'GMT_PLND_DEP_TS': Timestamp('2019-06-03 07:30:00'),
 'GMT_ACT_DEP_TS': Timestamp('2019-06-03 07:27:00'),
 'ACT_DEP_STN_CD': 'LHR',
 'ACT_DEP_TML_CD': '5',
 'PLND_ARR_STN_CD': 'FCO',
 'ACT_ARR_STN_CD': 'FCO',
 'IATA_AC_TYP_CD': '321',
 'ACT_AC_TYP_CD': 'V6',
 'ROUTE': 'LHRFCO',
 'COUNTRY_CD': 'IT',
 'COUNTRY_NM': 'Italy',
 'CORP_GEOG_CTRY_GRP_NM': 'MEDITERRANEAN',
 'CORP_GEOG_CONTINENT_NM': 'EUROPE EXC UK',
 'GMT_PLND_DEP': Timestamp('2019-06-03 00:00:00')}

### df_country pre-processing

In [9]:
# show current format
# (display() is needed here: only the last expression of a cell is rendered automatically)
format_df_country = headers_and_first_row(df_country)

display(format_df_country)

# pre-processing code

df_country = dataframe_str_formatter(df_country)

# show the format again after stripping whitespace
format_df_country = headers_and_first_row(df_country)

format_df_country

{'ROUTE': 'LHRINV',
 'COUNTRY_CD': 'GB',
 'COUNTRY_NM': 'United Kingdom and Northern Ireland',
 'CORP_GEOG_CTRY_GRP_NM': 'UK',
 'CORP_GEOG_CONTINENT_NM': 'UK'}

### df_acft_type pre-processing

In [10]:
# show current format
# (display() is needed here: only the last expression of a cell is rendered automatically)
format_df_aircraft = headers_and_first_row(df_acft_typ)

display(format_df_aircraft)

# pre-processing code
# Assign back to df_acft_typ, the name every other cell uses, rather than to a
# second, differently spelled variable.
df_acft_typ = dataframe_str_formatter(df_acft_typ)

# show the format again after stripping whitespace
format_df_aircraft = headers_and_first_row(df_acft_typ)

format_df_aircraft

{'IATA_AC_TYP_CD': '320',
 'ACT_AC_TYP_CD': 'A3',
 'WB_NB_CAT': 'NB',
 'FIRST_SEATS_QTY': 0,
 'CLUB_SEATS_QTY': 24,
 'PREM_ECONOMY_SEATS_QTY': 0,
 'ECONOMY_SEATS_QTY': 132}

In [11]:
# Example: attach the flight details to every lounge-eligibility row.
# The two key lists pair up by position: date, flight number, departure station, arrival station.
df_lounge_elig_flight_info = df_lounge_eligibility.merge(
    df_flight_info,  # right table
    how = "left",  # keep every eligibility row, whether or not a flight matches
    left_on = ['GMT_UPLIFT_DT','OPERATING_FLT_NO','UPLIFT_STN_CD','DISCHARGE_STN_CD'],  # keys in the left table
    right_on = ['GMT_PLND_DEP','OPG_FLT_NO','ACT_DEP_STN_CD','ACT_ARR_STN_CD'],  # matching keys in the right table
)

# Compare shapes before and after: a changed row count would mean rows matched more than one flight
print('Old Shape: {}'.format(df_lounge_eligibility.shape))
print('New Shape: {}'.format(df_lounge_elig_flight_info.shape))

Old Shape: (973878, 11)
New Shape: (973878, 27)


### Check that the join has worked correctly by looking at some rows and counting nulls where the join may not have matched

In [12]:
# Add the country decode for each route.
# Columns present in both frames come out with pandas' default _x / _y suffixes.
df_lounge_country_flight = df_lounge_elig_flight_info.merge(
    df_country,
    how = 'left',
    on = 'ROUTE',
)

# Compare shapes before and after the join
print('Old Shape: {}'.format(df_lounge_elig_flight_info.shape))
print('New Shape: {}'.format(df_lounge_country_flight.shape))

Old Shape: (973878, 27)
New Shape: (973878, 31)


In [13]:
# Add the aircraft-type attributes, matched on both aircraft type codes
df_final = df_lounge_country_flight.merge(
    df_acft_typ,
    how = 'left',
    on = ['IATA_AC_TYP_CD','ACT_AC_TYP_CD'],
)

# Compare shapes before and after the join
print('Old Shape: {}'.format(df_lounge_country_flight.shape))
print('New Shape: {}'.format(df_final.shape))

Old Shape: (973878, 31)
New Shape: (973878, 36)


<br><br><br>

In [39]:
#--------------------------------
#
# Your turn!!!
#
#--------------------------------

<u>What level of granularity do you use?</u>

This project wants to understand what characteristics of a flight help us get a better picture of the lounge eligibility profiles.
Things I want to consider:
- Time of flight (Morning, Afternoon, Evening) or maybe even by hour
- Destination as this determines the passenger profile, different countries/regions have more premium passengers potentially
- Short Haul/Medium Haul/Long Haul 
- Month of Flight


<u>General Thoughts:</u>

Destination:
- Routes are too granular: if we add new routes in the future, we would not be able to estimate their profiles
- Countries could also be too granular if we launch routes to new countries
- Region may be better suited, as we currently fly to all regions
- We still need to consider how certain countries within a region may have higher premium loads than others

<u>What metric do you use to come up with Lounge eligibility profiles?</u>
- ... (your answer here)

## Data Exploration

In [14]:
# Preview the first rows of the fully merged frame
df_final.head()

Unnamed: 0,OPERATING_AIRLINE_CD,OPERATING_FLT_NO,GMT_UPLIFT_DT,UPLIFT_STN_CD,DISCHARGE_STN_CD,BOOKED_CABIN_CD,TRAVEL_CABIN_CD,BA_PAX_TIER,ONEWORLD_TIER,Lounge_eligibility_tier,pax,OPG_ALN_CD,OPG_FLT_NO,GMT_PLND_DEP_TS,GMT_ACT_DEP_TS,ACT_DEP_STN_CD,ACT_DEP_TML_CD,PLND_ARR_STN_CD,ACT_ARR_STN_CD,IATA_AC_TYP_CD,ACT_AC_TYP_CD,ROUTE,COUNTRY_CD_x,COUNTRY_NM_x,CORP_GEOG_CTRY_GRP_NM_x,CORP_GEOG_CONTINENT_NM_x,GMT_PLND_DEP,COUNTRY_CD_y,COUNTRY_NM_y,CORP_GEOG_CTRY_GRP_NM_y,CORP_GEOG_CONTINENT_NM_y,WB_NB_CAT,FIRST_SEATS_QTY,CLUB_SEATS_QTY,PREM_ECONOMY_SEATS_QTY,ECONOMY_SEATS_QTY
0,BA,57,2019-06-18,LHR,JNB,J,J,,EMER,Tier 2,1,BA,57.0,2019-06-18 20:25:00,2019-06-18 20:22:00,LHR,5,JNB,JNB,744,S4,LHRJNB,ZA,South Africa,SOUTHERN AFRICA,AFRICA,2019-06-18,ZA,South Africa,SOUTHERN AFRICA,AFRICA,WB,14.0,86.0,30.0,145.0
1,BA,247,2019-06-29,LHR,GRU,J,J,Gold,EMER,Tier 2,1,BA,247.0,2019-06-29 21:25:00,2019-06-29 21:39:00,LHR,5,GRU,GRU,77W,G7,LHRGRU,BR,Brazil,SOUTH AMERICA,SOUTH AMERICA INC CARIBBEAN,2019-06-29,BR,Brazil,SOUTH AMERICA,SOUTH AMERICA INC CARIBBEAN,WB,14.0,56.0,44.0,183.0
2,BA,766,2019-06-13,LHR,OSL,C,C,Silver,SAPP,Tier 3,7,BA,766.0,2019-06-13 11:55:00,2019-06-13 12:23:00,LHR,5,OSL,OSL,321,V6,LHROSL,NO,Norway,SCANDINAVIA,EUROPE EXC UK,2019-06-13,NO,Norway,SCANDINAVIA,EUROPE EXC UK,NB,0.0,38.0,0.0,160.0
3,BA,556,2019-05-21,LHR,FCO,M,M,,SAPP,Tier 3,1,BA,556.0,2019-05-21 14:45:00,2019-05-21 14:55:00,LHR,5,FCO,FCO,321,V6,LHRFCO,IT,Italy,MEDITERRANEAN,EUROPE EXC UK,2019-05-21,IT,Italy,MEDITERRANEAN,EUROPE EXC UK,NB,0.0,38.0,0.0,160.0
4,BA,27,2019-09-19,LHR,HKG,F,F,Premier,EMER,Tier 1,2,BA,27.0,2019-09-19 20:45:00,2019-09-19 20:50:00,LHR,5,HKG,HKG,77W,G7,LHRHKG,HK,Hong Kong - SAR of China,FAR EAST HUBS,ASIA FAR EAST,2019-09-19,HK,Hong Kong - SAR of China,FAR EAST HUBS,ASIA FAR EAST,WB,14.0,56.0,44.0,183.0


In [49]:
def tier_proportions(features, df=None):
    '''
    Build the share of passengers in each lounge-eligibility tier for every
    combination of the given feature columns.

    Parameters
    ----------
    features : str or sequence of str
        Column name(s) to break the passengers down by. The argument itself is
        never modified.
    df : pandas.DataFrame, optional
        Frame holding the feature columns plus 'Lounge_eligibility_tier' and
        'pax'. Defaults to the notebook-level `df_final`.

    Returns
    -------
    pandas.DataFrame
        One row per feature combination and one column per eligible tier. Values
        are percentage strings such as '12.3%'. 'Not eligible' passengers count
        towards the denominator but get no column of their own. A tier that never
        occurs for a combination shows up as NaN.

    Raises
    ------
    ValueError
        If no feature column is supplied.
    '''

    tier_col = 'Lounge_eligibility_tier'

    # Accept a single column name as well as any sequence of names
    if isinstance(features, str):
        features = [features]
    feature_cols = list(features)
    if not feature_cols:
        raise ValueError('features must contain at least one column name')

    # Fall back to the merged notebook frame when no frame is passed explicitly
    if df is None:
        df = df_final

    # Total passengers per feature combination and tier
    pax_by_tier = df.groupby(feature_cols + [tier_col]).agg({'pax':'sum'})

    # Total passengers per feature combination across all tiers (the denominator)
    pax_by_tier['total_pax_features'] = pax_by_tier.groupby(feature_cols)['pax'].transform('sum')

    # Proportion of passengers in each tier, formatted as a percentage string
    pax_by_tier['proportion'] = (pax_by_tier['pax'] / pax_by_tier['total_pax_features']) * 100
    pax_by_tier['proportion'] = pax_by_tier['proportion'].map('{:,.1f}%'.format)

    # Back to flat columns so the tier can be filtered and pivoted on
    pax_by_tier = pax_by_tier.reset_index()

    # Drop the non-eligible rows only after they have contributed to the totals
    eligible = pax_by_tier[pax_by_tier[tier_col] != 'Not eligible']

    # Reshape to the layout required by the Ops Team: features as rows, tiers as columns
    return eligible.pivot(index = feature_cols, columns = tier_col, values = 'proportion')


<br>

Please save your final lookup table below in the form of a pandas DataFrame. It must contain the categories you have come up with as rows, and the Tier 1, Tier 2, and Tier 3 percentages of customers as columns.

In [52]:
features = ['CORP_GEOG_CTRY_GRP_NM_x']
df_test = tier_proportions(features)
df_test.head()

Lounge_eligibility_tier,Tier 1,Tier 2,Tier 3
CORP_GEOG_CTRY_GRP_NM_x,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
BENELUX,0.9%,8.9%,19.8%
CANADA,2.9%,4.0%,19.4%
CARIBBEAN,0.5%,5.7%,25.5%
CENTRAL AMERICA,5.1%,4.6%,20.3%
CHINA,5.9%,4.7%,20.2%
