<H2>Travel Benefit Contract Data Generation</H2>

In [1]:
import pandas as pd
import sqlite3
import re
import hashlib

<b>Read membership data from Reltio export</b>

In [2]:
# Load the non-resigned membership export, reading every column as a string.
df_non_resigned = pd.read_csv(
    "datasets/prod_non_resigned_membership_20181217.csv",
    dtype=str,
)
# Report how many membership rows were loaded.
print(df_non_resigned.shape[0])

399232


<b>Read Community Benefit File</b>

<b>Read benefit tier queries</b>

In [3]:
# Tier -> SQL query lookup table, read as strings.
query_df = pd.read_csv("datasets/travel_benefit_tier_query.csv", dtype=str)
# Normalise the tier label in one pass: upper-case it, then trim surrounding whitespace.
query_df['benefit_tier'] = query_df['benefit_tier'].str.upper().str.strip()

In [4]:
# Display the column labels of the tier/query lookup table.
query_df.columns

Index(['benefit_tier', 'query'], dtype='object')

<b>Loading datasets in SQLite</b>

In [5]:
%%time
# Connection to the SQLite database file; the membership data is loaded into it
# and the benefit tier queries are executed through it further down.
conn = sqlite3.connect("travelBenefitContract.db")

Wall time: 17 ms


<b>Benefit contract generation</b>

In [6]:
# One dict per (membership, travel benefit) pair produced by the tier queries.
benefit_contracts = []

# Publish the membership export to SQLite as table "df" so the stored tier
# queries can select from it. Re-running the cell replaces the table.
df_non_resigned.to_sql("df", conn, if_exists="replace")

# Query-result column -> key used in each benefit contract record.
contract_columns = {
    'ID': 'membership_uri',
    'Attributes.EC_Membership_Number': 'membership_number',
    'Attributes.CCNative.ENTITY_CODE': 'entity_code',
    'Attributes.EC_MP_Detail.MBRSHIP_TYPE_ID': 'membership_type_id',
    'Attributes.EC_MP_Detail.MBRSHIP_CATEGORY_CODE': 'membership_category_code',
}

travelBenefits = query_df['benefit_tier'].dropna().unique()

for benefit_tier_key in travelBenefits:
    # Label written to the output: trimmed, double spaces collapsed, upper-cased.
    travelBenefit = benefit_tier_key.strip().replace('  ', ' ').upper()
    print("Generating benefit contract for : " + travelBenefit)

    # Look the query up with the tier exactly as it is stored in query_df.
    # Matching on the collapsed label would find no row (and .iloc[0] would
    # raise IndexError) for any tier that contains consecutive spaces.
    query_str = query_df.loc[query_df['benefit_tier'] == benefit_tier_key, 'query'].iloc[0]

    # The stored queries project only the membership id; widen the projection
    # so the other membership attributes are available for the record.
    # NOTE(review): this only takes effect when the stored query contains the
    # exact text 'select df.ID' — confirm every query uses that casing.
    query_all_columns = query_str.replace('select df.ID', 'select df.*')
    rs = pd.read_sql_query(query_all_columns, conn)
    if len(rs.index) == 0:
        continue

    # Project and rename the needed columns once per tier instead of building
    # each record row by row with iterrows().
    tier_records = (
        rs[list(contract_columns)]
        .rename(columns=contract_columns)
        .assign(travel_benefit=travelBenefit)
        .to_dict('records')
    )
    benefit_contracts.extend(tier_records)


Generating benefit contract for : CLUBBENEFITS
Generating benefit contract for : PALMER CLUB
Generating benefit contract for : PACIFIC LINKS
Generating benefit contract for : BRONZE TRAVEL
Generating benefit contract for : SILVER TRAVEL
Generating benefit contract for : ASSOCIATE ALUMNI
Generating benefit contract for : ASSOCIATE PLUS
Generating benefit contract for : ASSOCIATE GOLD
Generating benefit contract for : SIGNATURE GOLD DINING
Generating benefit contract for : SIGNATURE GOLD GOLF
Generating benefit contract for : SIGNATURE GOLD UNLIMITED
Generating benefit contract for : CLUBCORP TRAVEL
Generating benefit contract for : ALUMNI TRAVEL
Generating benefit contract for : EWGA
Generating benefit contract for : NCS
Generating benefit contract for : NCS CUSTOM


<b>Save Benefit Contract Data</b>

In [7]:
# Turn the collected records into a frame with a fixed column order.
benefit_contract_df = pd.DataFrame(benefit_contracts)[[
    'membership_uri',
    'membership_number',
    'entity_code',
    'membership_type_id',
    'membership_category_code',
    'travel_benefit',
]]
# Persist the initial contracts without the positional index.
benefit_contract_df.to_csv("output/travel_benefit_contracts_initial.csv", index=False)
# Number of contract rows written.
print(benefit_contract_df.shape[0])

390178


In [8]:
def generateIdBasedOnHashBenefitContract(df):
    """Build one benefit contract id per row of ``df``.

    Each id is ``generateIdBasedOnHash`` applied to
    ``travel_benefit + "|" + "|" + membership_number + "|" + entity_code``.
    The doubled separator after the tier is kept exactly as it was so the
    generated ids do not change.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``travel_benefit``, ``membership_number``
        and ``entity_code``, all holding strings.

    Returns
    -------
    list of str
        The ids, in row order.

    Raises
    ------
    TypeError
        If one of the three fields is not a string (for example a missing
        value), because the key is built by string concatenation.
    """
    # Walk the three columns in lockstep instead of df.iterrows(), which
    # materialises a Series for every row just to read three values.
    return [
        generateIdBasedOnHash(travel_benefit + "|" + "|" + membership_number + "|" + entity_code)
        for travel_benefit, membership_number, entity_code in zip(
            df['travel_benefit'], df['membership_number'], df['entity_code']
        )
    ]


def generateIdBasedOnHashBenefitPackage(df):
    """Build one benefit package id per row of ``df``.

    The id depends only on the row's ``travel_benefit`` value, so rows that
    share a travel benefit share a package id.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide a ``travel_benefit`` column holding strings.

    Returns
    -------
    list of str
        The ids, in row order.
    """
    # Only one column is read, so iterate that column directly rather than
    # paying for df.iterrows() on every row.
    return [generateIdBasedOnHash(travel_benefit) for travel_benefit in df['travel_benefit']]

# Matches any run of whitespace; each run is replaced by one underscore before hashing.
pattern = re.compile(r'\s+')
def generateIdBasedOnHash(text):
    """Return the MD5 hex digest of a normalised form of ``text``.

    Normalisation: trim surrounding whitespace, lower-case, then replace
    every run of whitespace with a single underscore. The result is
    UTF-8 encoded before hashing.
    """
    canonical = pattern.sub('_', text.strip().lower())
    digest = hashlib.md5(canonical.encode('utf-8'))
    return digest.hexdigest()

<b>Generating Benefit Contracts With All Required Values</b>

In [9]:
# Reload the initial contracts from disk with every column read as a string.
benefit_contract_initial_df = pd.read_csv("output/travel_benefit_contracts_initial.csv", dtype=str)

# Deterministic ids: the package id depends on the travel benefit only, the
# contract id on travel benefit + membership number + entity code.
benefit_contract_initial_df['benefit_package_id'] = generateIdBasedOnHashBenefitPackage(benefit_contract_initial_df)
benefit_contract_initial_df['benefit_contract_id'] = generateIdBasedOnHashBenefitContract(benefit_contract_initial_df)

# Human-readable labels, built with vectorised string concatenation instead of
# a row-wise apply(axis=1). The id helpers above already raise on a
# non-string travel_benefit / membership_number / entity_code, so every value
# concatenated here is a string and the output matches the row-wise version.
benefit_contract_initial_df['name'] = (
    benefit_contract_initial_df['travel_benefit']
    + "("
    + benefit_contract_initial_df['membership_number']
    + ")"
)
benefit_contract_initial_df['description'] = (
    benefit_contract_initial_df['travel_benefit']
    + "("
    + benefit_contract_initial_df['membership_number']
    + "-"
    + benefit_contract_initial_df['entity_code']
    + ")"
)

# Constant attributes shared by every travel benefit contract.
benefit_contract_initial_df['type'] = "Travel"
benefit_contract_initial_df['active'] = "Y"

benefit_contract_initial_df.to_csv("output/travel_benefit_contracts_initial_allValues.csv", index=False)

<b>Close database connection</b>

In [10]:
# Release the SQLite connection opened earlier in the notebook.
conn.close()

<b>Removing Duplicate Ids</b>

In [11]:
# Check whether every generated benefit contract id occurs exactly once.
benefit_contract_id_unique = benefit_contract_initial_df['benefit_contract_id'].is_unique
print("Benefit Contract Id Uniqueness : " + str(benefit_contract_id_unique))

if not benefit_contract_id_unique:
    # True for every row whose id occurs more than once (all occurrences,
    # not only the repeats).
    condition = benefit_contract_initial_df['benefit_contract_id'].duplicated(keep=False)

    # Distinct duplicated ids, sorted, as an array.
    dulicateBenefitContractIds = (
        benefit_contract_initial_df.loc[condition, 'benefit_contract_id']
        .drop_duplicates()
        .sort_values()
        .values
    )
    print(dulicateBenefitContractIds)

    # Split the contracts into rows with a one-off id and rows sharing an id.
    df_unique_values = benefit_contract_initial_df[~condition]
    df_duplicate_values = benefit_contract_initial_df[condition]

    # Row counts: all contracts, uniquely identified, and duplicated.
    print(len(benefit_contract_initial_df))
    print(len(df_unique_values))
    print(len(df_duplicate_values))

    df_unique_values.to_csv("output/travel_benefit_contracts_allValues_unique.csv", index=False)
    df_duplicate_values.to_csv("output/travel_benefit_contracts_allValues_duplicate.csv", index=False)
          

Benefit Contract Id Uniqueness : False
['02a7fc096f9b8a1a1bc8629550ed8e81' '215542f72f3fe798a82134c052bd5b6e'
 '4d50cabf6e23365b2ece0aa3cc331e9e' '914e60fbdf871f766a17fb75be9c5005'
 'e454cb01f46e017fb8a1759dbe26f358']
390178
390168
10
