## PROSPECT BASE

To train our model, we will use prospects who applied for a given credit card on the RBC public site. Note that it does not matter whether they were approved; what matters is that they applied. 

##### Timing 
We want to time how long these programs take to run. We are interested both in real time and CPU time. 

In [None]:
import time

# Capture both clocks in one step at the top of the run:
# time.time() for wall-clock time, time.process_time() for CPU time of this process.
start_time, start_cpu_time = time.time(), time.process_time()

#### Set Up

In [None]:
import os
import numpy as np
import calendar

import pyspark
from pyspark.sql import SparkSession

from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType, StringType, FloatType
from pyspark.sql.functions import collect_list, regexp_replace, lower
from pyspark.sql import functions as F
from pyspark.sql import Window
from pyspark.sql.functions import year, month, dayofmonth, to_date, trim, concat, col, lit
from functools import reduce

import datetime 
from datetime import datetime as dt
from dateutil.relativedelta import *
import pandas as pd

#### Load in Applicant Data

Define the look-back period as the 12 months prior to the current date. (!) Exception: GA4 was only implemented in July 2024. (!)

__We also want to apply the following filters to these applicants:__ 
- Personal, not business client; approved product is a personal banking account (PROD_APPRVD = P)
- Not an existing client (CLNT_TENURE_SEG_CD != EXISTING) 

##### Function to Pull Months

In [None]:
def month_end_list(num_months, today=None):
    """Return the month-end dates of the ``num_months`` complete months before the current month.

    Args:
        num_months: Number of complete months to look back. Zero or a
            negative value yields an empty list.
        today: Optional ``datetime.date`` used as the reference date.
            Defaults to ``datetime.date.today()``. Passing it explicitly
            makes the result reproducible.

    Returns:
        List of ``'YYYY-MM-DD'`` strings, oldest first. The month that
        contains ``today`` is never included.
    """
    #Define the reference date based on today's date unless one was supplied
    if today is None:
        today = datetime.date.today()

    #NOTE: a floor at 2024-08-01 (GA4 rollout in July 2024 plus the 30-day
    #look-back before an application) was sketched here originally but was
    #left disabled, so the list is not clamped.

    #Number the months consecutively so that stepping back across a year
    #boundary is plain integer arithmetic
    current_index = today.year * 12 + (today.month - 1)

    #Create list of month ends
    me_list = []
    for index in range(current_index - num_months, current_index):
        year, month_zero_based = divmod(index, 12)
        month_number = month_zero_based + 1
        #monthrange returns (weekday of the 1st, number of days in the month)
        last_day = calendar.monthrange(year, month_number)[1]
        me_list.append(datetime.date(year, month_number, last_day).strftime('%Y-%m-%d'))

    return me_list

In [None]:
print(month_end_list(11))

##### Function to Check Existing File Paths

In [None]:
def check_folder(path):
    """List the names of the immediate sub-directories of ``path`` on the Hadoop file system.

    Args:
        path: Directory to inspect, as a path string for the file system
            configured on the global ``spark`` session.

    Returns:
        List of sub-directory names (names only, not full paths). Plain
        files are ignored. An empty list is returned when ``path`` does not
        exist, so a first run with nothing saved yet does not fail.
    """
    # Access Hadoop FileSystem through the JVM gateway of the global `spark` session
    hadoop_conf = spark._jsc.hadoopConfiguration()
    hadoop = spark._jvm.org.apache.hadoop.fs
    fs = hadoop.FileSystem.get(hadoop_conf)
    target = hadoop.Path(path)

    # listStatus raises for a missing directory, so check first
    if not fs.exists(target):
        return []

    # List subfolders
    return [entry.getPath().getName() for entry in fs.listStatus(target) if entry.isDirectory()]

##### Function to Import Applicant Data for Each Month in Range

In [None]:
subfolders = check_folder("/anaurosevic/cdn0_cards_affinity/appl_fact/")
print(subfolders)

In [None]:
# Month ends in the look-back window that have no saved sub-folder yet.
# Sorted so the months come out oldest first ('YYYY-MM-DD' strings sort
# chronologically); a bare set difference has no defined order.
# NOTE(review): this checks 10 months while the combine step further down reads
# month_end_list(11) - confirm the oldest month is always already on disk.
months_to_download = sorted(set(month_end_list(10)) - set(subfolders))
print(months_to_download)

In [None]:
#Import for each month end date
def import_appl_fact(me_list):
    """Extract the filtered applicant rows for each month end and save one parquet folder per month.

    For every entry of ``me_list`` the applicant source is filtered to that
    ``ME_DT``, restricted to personal products (``PROD_APPRVD = 'P'``) and to
    applicants that are not existing clients, reduced to the distinct columns
    used downstream, and written in overwrite mode to
    ``/anaurosevic/cdn0_cards_affinity/appl_fact/<month end>``.

    Args:
        me_list: Iterable of month-end values (e.g. ``'2025-06-30'``).

    Returns:
        The DataFrame built for the last month processed, or ``None`` when
        ``me_list`` is empty (an empty list used to raise
        ``UnboundLocalError`` at the ``return``).
    """
    #Set folder name once - it does not change between months
    folder_path = "/anaurosevic/cdn0_cards_affinity/appl_fact/"

    #Stays None when there is nothing to import
    appl_fact = None

    #Loop over list of month ends
    for month_end in me_list:
        print(month_end)

        #Set file name
        file_path = folder_path + str(month_end)

        #Create date filter string
        date_filter_string = "ME_DT == '" + str(month_end) + "'"

        appl_fact = (
            spark.read.option("basePath", "...DAILY_APPLICANT...")
            .load("...DAILY_APPLICANT.../")
            .filter(date_filter_string)
            #Leading 8 characters of the trimmed applicant number; this is the
            #key later joined to the e-commerce ep_reference_id
            .withColumn("ep_reference_id", F.substring(F.trim(F.col("APPLCNT_NUM")), 0, 8))
            #Personal not business client
            .filter("trim(PROD_APPRVD)= 'P'")
            #Not an existing client at the time of application
            .filter("trim(CLNT_TENURE_SEG_CD) != 'EXISTING'")
            .select("CLNT_NO", "APPLCNT_NUM", "ep_reference_id", "PSTCD", "APP_RCV_DT")
            .distinct()
        )

        #Single output file per month; an existing folder is replaced
        appl_fact.coalesce(1).write.mode("overwrite").parquet(file_path)

    return appl_fact

In [None]:
# NOTE(review): the production call on months_to_download is commented out below;
# as written, only the hard-coded test month is imported (and overwritten) on each run.
#import_appl_fact(months_to_download)
import_appl_fact(['2025-06-30']) #Just for testing

##### Combine Files 

In [None]:
#Import
base_path = "/anaurosevic/cdn0_cards_affinity/appl_fact/"

# Compute the month list once instead of on every loop pass, and stack the
# monthly extracts without creating a global named `month`, which would
# overwrite the `month` function imported from pyspark.sql.functions.
appl_month_ends = month_end_list(11)
appl_12m = reduce(
    lambda stacked, monthly: stacked.union(monthly),
    (spark.read.load(base_path + str(month_end)) for month_end in appl_month_ends),
)

In [None]:
print(appl_12m.count())
print(appl_12m.distinct().count()) #Yay

#### Load in E-commerce Data
This tells us who clicked submit in the credit card application flow via the public site. We only want to keep customers who applied through the public site (i.e., exclude cases where they went through an advisor or some other channel).

##### Function to Import E-Commerce Data for Each Month in Range

In [None]:
subfolders = check_folder("/anaurosevic/cdn0_cards_affinity/ecommerce/") #These are the files we have already downloaded :) 
print(subfolders)

In [None]:
# Month ends in the look-back window that have no saved e-commerce sub-folder yet.
# Sorted so the months come out oldest first ('YYYY-MM-DD' strings sort
# chronologically); a bare set difference has no defined order.
# NOTE(review): this checks 10 months while the combine step further down reads
# month_end_list(11) - confirm the oldest month is always already on disk.
months_to_download = sorted(set(month_end_list(10)) - set(subfolders))
print(months_to_download)

In [None]:
def import_ecommerce_data(me_list):
    """Extract finished credit-card application events for each month and save one parquet folder per month.

    For every month end in ``me_list`` the matching ``YEAR=<yyyy>/Month=<mm>/``
    folder of the e-commerce source is read and filtered to credit-card
    application events at step ``step_finished`` that have a reference id and
    a null ``ep_traffic_type``. The distinct rows are written in overwrite
    mode to ``/anaurosevic/cdn0_cards_affinity/ecommerce/<month end>``.

    Args:
        me_list: Iterable of month-end strings in ``'YYYY-MM-DD'`` format.

    Returns:
        The DataFrame built for the last month processed, or ``None`` when
        ``me_list`` is empty (an empty list used to raise ``IndexError``).
    """
    #Folder names do not change between months
    save_folder_path = "/anaurosevic/cdn0_cards_affinity/ecommerce/"
    data_folder_path = "...GA4_ECOMMERCE..."

    #Stays None when there is nothing to import
    ecommerce = None

    #Loop over list of month ends
    for month_end in me_list:
        print(month_end)

        #Set file name for saving
        save_file_path = save_folder_path + str(month_end)

        #Location of files
        date = dt.strptime(month_end, '%Y-%m-%d').date()
        date_filter_string = "YEAR=" + str(date.strftime('%Y')) + "/Month=" + str(date.strftime('%m')) + "/"
        data_file_path = data_folder_path + date_filter_string

        #Import data
        ecommerce = (
            spark.read.load(data_file_path)
            .filter("ep_lob = 'credit cards' and ep_content_group = 'credit cards : cardapp'")
            #Made it to the final submission step
            .filter("ep_step_name = 'step_finished'")
            #Not missing application ID
            .filter("ep_reference_id is not null")
            #Make sure we don't have internal traffic - e.g., branch computer
            .filter("ep_traffic_type is null")
            #Format CC choice
            .withColumn("product_code", F.substring("it_item_id", 3, 3))
            .select(
                "user_pseudo_id",
                F.trim(F.col("ep_reference_id")).alias("ep_reference_id"),
                "event_date",
                "user_session_id",
                "product_code",
            )
            .distinct()
        )

        #Single output file per month; an existing folder is replaced
        ecommerce.coalesce(1).write.mode("overwrite").parquet(save_file_path)

    return ecommerce

In [None]:
# NOTE(review): the production call on months_to_download is commented out below;
# as written, only the hard-coded test month is imported (and overwritten) on each run.
#import_ecommerce_data(months_to_download)
import_ecommerce_data(['2025-06-30']) #Just for testing

##### Combine Files 

In [None]:
#Import
base_path = "/anaurosevic/cdn0_cards_affinity/ecommerce/"

# Compute the month list once instead of on every loop pass, and stack the
# monthly extracts without creating a global named `month`, which would
# overwrite the `month` function imported from pyspark.sql.functions.
ecommerce_month_ends = month_end_list(11)
ecommerce_12m = reduce(
    lambda stacked, monthly: stacked.union(monthly),
    (spark.read.load(base_path + str(month_end)) for month_end in ecommerce_month_ends),
)

In [None]:
print(ecommerce_12m.count())
print(ecommerce_12m.distinct().count())
#Great, no duplicates to worry about :) 

#### Join Tables to Identify Prospect Base
Join the two tables so that we can ensure that customers who made it to the CC submission point (via e-commerce) are not existing clients. 

In [None]:
#Join applicant table to e-commerce table via ep_reference_id
#Inner join: only e-commerce submissions with a matching applicant row survive
#(the saved applicant extracts were already filtered to non-existing clients)
same_application = ecommerce_12m.ep_reference_id == appl_12m.ep_reference_id
submitted_by_receipt_date = ecommerce_12m.event_date <= appl_12m.APP_RCV_DT

ga_to_appl = ecommerce_12m.join(
    appl_12m,
    [same_application & submitted_by_receipt_date],
).drop(ecommerce_12m.ep_reference_id)  #keep only the applicant-side copy of the key

In [None]:
ga_to_appl.printSchema()

In [None]:
ga_to_appl.show(5, False)

In [None]:
ga_to_appl.count()

In [None]:
#Grab subset of columns of interest ~
#CLNT_NO and PSTCD are renamed on the way through; the result is persisted
#because several counts and filters are run on it afterwards
new_clients = (
    ga_to_appl
    .select(
        "user_pseudo_id",
        "user_session_id",
        "ep_reference_id",
        F.col("CLNT_NO").alias('clnt_no'),
        F.col("PSTCD").alias("postal_code"),
        "product_code",
        "event_date",
    )
    .persist()
)

In [None]:
new_clients.show(5,False)

In [None]:
new_clients.count()

#### Fix Inconsistencies
We would expect each row to be unique: one user, one session, one product, one date. We also shouldn't see cases where there are multiple rows per user_pseudo_id. Therefore, we are removing strange cases like the following: 
- __(A) Missing user_pseudo_id or user_session_id:__ We won't be able to link these customers to their session details and it is unclear why this would ever be missing
- __(B) Multiple applications or cards per session (unique at the session level):__ Possible reasons: (1) scammers re-submitting applications for the same card multiple times; (2) households where different people apply within the same session on the same device; (3) a branch or public computer on which multiple people submit applications. All three cases are problematic. We will only keep prospects who have one application for a single CC in the session.
- __(C) Multiple applications or sessions for the same device (unique at the user level):__ Multiple sessions for the same device could be if they get declined or multiple households. We will make the assumption that we should only consider the first application for each device.

##### (A) Missing user_pseudo_id or user_session_id

In [None]:
print(new_clients.filter("user_pseudo_id is null").count())
print(new_clients.filter("user_session_id == '_'").count())
#There are 1,807 people who don't have a user pseudo id nor a session id - let's drop these, not sure why they are happening 

In [None]:
# Drop rows missing either identifier: a null user_pseudo_id, or the '_'
# user_session_id value that the preceding cell counts as a missing session id.
# The SQL comparison also removes rows whose user_session_id is null.
df = new_clients.filter("user_pseudo_id is not null").filter("user_session_id != '_'")

In [None]:
print(df.count())

##### (B) Multiple applications or cards per session (unique at the session level)

In [None]:
#Keep only people who had one application within the session
#Count rows per (device, session) pair, then keep the session ids that occur exactly once
rows_per_session = df.groupBy("user_pseudo_id", "user_session_id").count()
one_application = rows_per_session.filter(F.col("count") == 1).select("user_session_id")
one_application.show(5, False)

In [None]:
#Merge
# Join by column name: `one_application` is derived from `df`, so comparing
# df.user_session_id with one_application.user_session_id points at the same
# underlying column on both sides and depends on Spark's self-join handling.
# A name-based join is unambiguous and yields a single user_session_id
# column, so no drop is needed afterwards.
df_one_app = df.join(one_application, on="user_session_id", how="inner").select(
    "user_pseudo_id","user_session_id","clnt_no","event_date","postal_code","product_code").distinct()
df_one_app.count()
#Now it should be unique at the session level - within each session, the customer only applied once! 

In [None]:
df_one_app.show(5,False)

In [None]:
#Grab session timestamp!
#user_session_id is split on "_" and the second piece is converted from unix time
session_epoch = F.split(F.col("user_session_id"), "_").getItem(1)
df_one_app = df_one_app.withColumn("session_timestamp", F.from_unixtime(session_epoch))
df_one_app.show(5, False)

##### (C) Multiple applications or sessions for the same device (unique at the user level)

In [None]:
#In these cases, we'll take the first credit card application as the event date and CC for that client
#Rank each device's sessions by session_timestamp and keep only the earliest row
earliest_session_first = Window.partitionBy("user_pseudo_id").orderBy("session_timestamp")
df_one_app_first = (
    df_one_app
    .withColumn("rank", F.row_number().over(earliest_session_first))
    .filter(F.col("rank") == 1)
    .drop("rank")
)

##### Add province/territory breakdown 
Province is important especially for certain cards that are based on region: moi - QC, Westjet - BC.

In [None]:
#Let's also grab the province - province level details are important 
#Source: https://www12.statcan.gc.ca/census-recensement/2021/ref/dict/tab/index-eng.cfm?ID=T1_9
#First character of the postal code -> province/territory label, checked in this order
#NOTE(review): the match is case-sensitive and postal_code is not trimmed - confirm
#the source values are upper-case with no leading blanks
province_rules = [
    (('A',), 'NL'),
    (('B',), 'NS'),
    (('C',), 'PE'),
    (('E',), 'NB'),
    (('G', 'H', 'J'), 'QC'),
    (('K', 'L', 'M', 'N', 'P'), 'ON'),
    (('R',), 'MB'),
    (('S',), 'SK'),
    (('T',), 'AB'),
    (('V',), 'BC'),
    (('X',), 'NTNU'),
    (('Y',), 'YT'),
]

first_letter = F.substring("postal_code", 1, 1)

#Chain one WHEN branch per rule; anything unmatched (including null) becomes "unk"
(lead_letters, lead_label), *remaining_rules = province_rules
province_expr = F.when(first_letter.isin(*lead_letters), lead_label)
for rule_letters, rule_label in remaining_rules:
    province_expr = province_expr.when(first_letter.isin(*rule_letters), rule_label)

df_one_app_first = df_one_app_first.withColumn("province", province_expr.otherwise("unk"))

#### Remove Devices with Client Login in Last 30 Days

In [None]:
#Keep the earliest event_date row per device, add the day before it as `date_lag`,
#and rename event_date to card_sale_date
#NOTE(review): the original comment says the one-day lag is needed "due to above reason",
#but no reason is stated in this notebook - confirm and document it
first_event_per_device = Window.partitionBy("user_pseudo_id").orderBy("event_date")
new_cards = (
    df_one_app_first
    .withColumn("rank", F.row_number().over(first_event_per_device))
    .filter(F.col("rank") == 1)
    .drop("rank")
    .withColumn("date_lag", F.date_add(F.col("event_date"), -1))
    .withColumnRenamed("event_date", "card_sale_date")
)

##### Function to Import Visitor Data for Each Month in Range
Important: Make sure we have one additional month prior!

In [None]:
print(month_end_list(12))

In [None]:
subfolders = check_folder("/anaurosevic/cdn0_cards_affinity/visitor/")
print(subfolders)

In [None]:
# Month ends in the look-back window that have no saved visitor sub-folder yet.
# Sorted so the months come out oldest first ('YYYY-MM-DD' strings sort
# chronologically); a bare set difference has no defined order.
# NOTE(review): this checks 11 months while the combine step further down reads
# month_end_list(12) - confirm the oldest month is always already on disk.
months_to_download = sorted(set(month_end_list(11)) - set(subfolders))
print(months_to_download)

In [None]:
def import_visitors_data(me_list):
    """Extract the distinct (device, visit date) pairs for each month and save one parquet folder per month.

    For every month end in ``me_list`` everything under the matching
    ``YEAR=<yyyy>/Month=<mm>/`` folder of the visitor source is read, a
    ``visitor_table_date`` is built from the ``YEAR``, ``Month`` and ``Day``
    columns, and the distinct ``user_pseudo_id`` / ``visitor_table_date``
    rows are written in overwrite mode to
    ``/anaurosevic/cdn0_cards_affinity/visitor/<month end>``.

    Args:
        me_list: Iterable of month-end strings in ``'YYYY-MM-DD'`` format.

    Returns:
        The DataFrame built for the last month processed, or ``None`` when
        ``me_list`` is empty (an empty list used to raise
        ``UnboundLocalError`` at the ``return``).
    """
    #Folder names do not change between months
    save_folder_path = "/anaurosevic/cdn0_cards_affinity/visitor/"
    data_folder_path = "...GA4_VISITOR..."

    #Stays None when there is nothing to import
    visitors = None

    #Loop over list of month ends
    for month_end in me_list:
        print(month_end)

        #Set file name for saving
        save_file_path = save_folder_path + str(month_end)

        #Location of files
        date = dt.strptime(month_end, '%Y-%m-%d').date()
        date_filter_string = "YEAR=" + str(date.strftime('%Y')) + "/Month=" + str(date.strftime('%m'))
        data_file_path = data_folder_path + date_filter_string + "/*"

        #Visit date assembled as "<YEAR>-<Month>-<Day>" and parsed with the yyyy-MM-dd pattern
        visit_date = F.to_date(
            F.concat(F.col("YEAR"), F.lit("-"), F.col("Month"), F.lit("-"), F.col("Day")),
            "yyyy-MM-dd",
        )

        #Import data
        #basePath is set to the source root while a month sub-folder is loaded
        visitors = (
            spark.read.option("basePath", data_folder_path)
            .load(data_file_path)
            .withColumn("visitor_table_date", visit_date)
            .select("user_pseudo_id", "visitor_table_date")
            .distinct()
        )

        #Single output file per month; an existing folder is replaced
        visitors.coalesce(1).write.mode("overwrite").parquet(save_file_path)

    return visitors

In [None]:
# NOTE(review): the production call on months_to_download is commented out below;
# as written, only the hard-coded test month is imported (and overwritten) on each run.
#import_visitors_data(months_to_download)
import_visitors_data(['2025-06-30']) #Just to test 

##### Combine Files 

In [None]:
#Import
base_path = "/anaurosevic/cdn0_cards_affinity/visitor/"

# Compute the month list once instead of on every loop pass, and stack the
# monthly extracts without creating a global named `month`, which would
# overwrite the `month` function imported from pyspark.sql.functions.
visitor_month_ends = month_end_list(12)
visitors_12m = reduce(
    lambda stacked, monthly: stacked.union(monthly),
    (spark.read.load(base_path + str(month_end)) for month_end in visitor_month_ends),
)

#### Join Tables to Exclude Devices with Previous Login

In [None]:
#Anti-join: keep only the cards whose device has NO visitor row dated exactly `date_lag`
#NOTE(review): the section heading speaks of logins in the last 30 days, but this
#condition only matches the single day before the application - confirm which is intended
same_device = new_cards.user_pseudo_id == visitors_12m.user_pseudo_id
visited_on_lag_day = visitors_12m.visitor_table_date == new_cards.date_lag

new_cards = (
    new_cards
    .join(visitors_12m, on=[same_device & visited_on_lag_day], how='left_anti')
    .drop(new_cards.date_lag)
    .drop(visitors_12m.visitor_table_date)
)

In [None]:
new_cards.count()

In [None]:
new_cards.show(5, False)

#### CCs of Interest

In [None]:
cb = new_cards

In [None]:
cb.count()

In [None]:
credit_cards = ["CLO","IAV","ION","MC4","IOP","GCP","MC1","AVP","MC2","MCP","MV1","PLT","GUS","BAP"]

In [None]:
cb.show(3,False)

In [None]:
cb_ccs_of_interest = cb.filter(F.col('product_code').isin(credit_cards))

In [None]:
cb_ccs_of_interest.groupBy('product_code').count().orderBy(F.desc('count')).show(50,False)

##### Final checks

In [None]:
cb_ccs_of_interest.count()

In [None]:
cb_ccs_of_interest.agg(F.min(F.col('card_sale_date')).alias('min_date'), F.max(F.col('card_sale_date')).alias('max_date')).show()

In [None]:
cb_ccs_of_interest.show(10) #Yay!! :) 

#### Save File 

In [None]:
cb_ccs_of_interest.coalesce(1).write.mode("overwrite").parquet("/anaurosevic/cdn0_cards_affinity/prospect_base/")

--- END PROGRAM ---

In [None]:
#Timing summary
#Read both clocks together, then subtract the values captured at the top of the notebook
end_time, end_cpu_time = time.time(), time.process_time()

real_time_elapsed = end_time - start_time
cpu_time_elapsed = end_cpu_time - start_cpu_time

print(f"Real time: {real_time_elapsed:.2f} seconds")
print(f"CPU time: {cpu_time_elapsed:.2f} seconds")