## PREVIOUS SESSIONS

We want to know whether prospects visited our public site before applying and, if so, whether they had already explored or considered certain credit cards and accounts.
*For the initial training of the model, we pull session details for the session during which the prospect first applied for a credit card (CC). Once the model is live, these details will be pulled in real time for each new prospect.*

##### Timing 
We want to time how long these programs take to run. We are interested in both real (wall-clock) time and CPU time.

In [None]:
import time 

# Snapshot the wall clock and the process CPU clock before any work starts;
# the timing summary at the end of the notebook subtracts these values.
start_time, start_cpu_time = time.time(), time.process_time()

#### Set Up

In [None]:
import pyspark.sql.functions as F
from pyspark.sql import HiveContext
from pyspark.ml.feature import StringIndexer

import os
import numpy as np
import calendar

import pyspark
from pyspark.sql import SparkSession

from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType, StringType, FloatType
from pyspark.sql.functions import collect_list, regexp_replace, lower
from pyspark.sql import functions as F
from pyspark.sql import Window
from pyspark.sql.functions import year, month, dayofmonth, to_date, trim, concat, col, lit
from functools import reduce

import datetime 
from datetime import datetime as dt
from dateutil.relativedelta import *
import pandas as pd

#### Function to Create Dates

In [None]:
def month_end_list(num_months, today=None):
    """Return the month-end dates of the ``num_months`` months before this one.

    The current (incomplete) month is never included. Dates are returned
    oldest first, formatted as ``'YYYY-MM-DD'`` strings.

    Parameters
    ----------
    num_months : int
        How many completed months to go back. Zero or a negative value
        yields an empty list.
    today : datetime.date, optional
        Reference date that defines the "current" month. Defaults to
        ``datetime.date.today()``; pass an explicit date for reproducible
        runs or back-fills.

    Returns
    -------
    list of str
        One month-end date per month, in chronological order.
    """
    if today is None:
        today = datetime.date.today()

    # Number the months linearly (year * 12 + zero-based month) so that
    # stepping back across a year boundary is plain integer arithmetic.
    current_month_index = today.year * 12 + (today.month - 1)

    me_list = []
    for months_back in range(num_months, 0, -1):
        yr, zero_based_month = divmod(current_month_index - months_back, 12)
        mon = zero_based_month + 1
        # monthrange returns (weekday of the 1st, number of days in the month)
        last_day = calendar.monthrange(yr, mon)[1]
        me_list.append(datetime.date(yr, mon, last_day).isoformat())
    return me_list

In [None]:
# Preview the month-end dates for the 12 months before the current month
month_end_list(12)

#### Relevant Events

In [None]:
# event_name values that are kept (together with a matching line of business)
# when filtering the e-commerce events
events_of_interest = [
    'experience_impression',
    'view_promotion',
    'first_visit',
    'click',
    'generate_tool',
    'begin_tool',
    'select_promotion',
    'add_to_cart',
    'remove_from_cart',
    'view_item',
    'select_item',
    'begin_checkout',
    'purchase',
]

# ep_lob values that are kept when filtering the e-commerce events
lobs_of_interest = [
    'credit cards',
    'accounts',
    'students',
]

# SQL LIKE pattern; events whose lower-cased URL matches are kept regardless
# of event name or line of business
urls_of_interest = '%newcomers%'

# URL patterns used to tag credit-card pages with a product code.
# Despite the shared ``_regex`` suffix there are two kinds of pattern here:
#   * values wrapped in ``%...%`` are SQL LIKE patterns (applied with .like())
#   * values wrapped in ``(...)`` are regular expressions (applied with .rlike())
# Regular expressions are raw strings so that backslashes reach the regex
# engine untouched instead of being parsed as (invalid) Python escapes.
bap_regex = r"(british.*airways|/ba/)"
# NOTE(review): LIKE is case-sensitive and this pattern is upper-case -
# confirm that ep_clean_url actually contains an upper-case "MCP".
mcp_regex = "%MCP%"
iav_regex = r"(infinite|iav|cartes/avion|fridayfriendpass)"
ion_regex = r"(ion\-|/ion/)"
mc4_regex = r"(westjet.*world.*elite|wj|mc4)"
mc1_regex = r"(cash.*back.*mastercard|mc1)"
gcp_regex = "%avion%platinum%"
gus_regex = "%us%dollar%visa%gold%"
mv1_regex = "%moi%"
avp_regex = "%privilege%"
plt_regex = "%visa%platinum%"
iop_regex = "%iop-%"
mc2_regex = r"(westjet.*mastercard|mc2)"
clo_regex = r"(classic.*low.*rate|clo|low\-interest)"


# SQL LIKE patterns used to tag account pages
d2d_regex = '%day-to-day-banking%'
adv_regex = '%advantage-banking%'
snlb_regex = '%signature-no-limit-banking%'

##### Load Prospect Base 

In [None]:
# Prospect base; later cells read user_pseudo_id, user_session_id, clnt_no
# and card_sale_date from it
prospect_base_path = "/anaurosevic/cdn0_cards_affinity/prospect_base/"
pb = spark.read.load(prospect_base_path)

##### Load Sessions

In [None]:
# Build both bounds from a single month_end_list call so they always come
# from the same snapshot of "today"
month_ends = month_end_list(12)

# NOTE(review): the upper bound is exclusive, so sessions dated on the most
# recent month-end itself are filtered out - confirm this is intended.
filter_string = f"session_date>='{month_ends[0]}' and session_date<'{month_ends[-1]}'"
print(filter_string)

In [None]:
# Session start expressed in seconds, which is what from_unixtime takes
# (presumably the raw value is in microseconds - TODO confirm)
session_start_seconds = F.col("user_session_start_timestamp") / 1e6

# Sessions inside the date window, one distinct row per
# (user, session, date, timestamp) combination
session = (
    spark.read.option("basePath", "...GA4_SESSION...")
    .load("...GA4_SESSION...")
    .filter(filter_string)
    .filter("ep_traffic_type is null")  #remove branch computers
    .withColumn("sess_timestamp", F.from_unixtime(session_start_seconds))
    .withColumn("sess_date", F.to_date("sess_timestamp"))
    .select("user_pseudo_id", "user_session_id", "sess_date", "sess_timestamp")
    .distinct()
)

In [None]:
#Keep a single row per session ID: the one with the earliest sess_timestamp
earliest_first_within_session = Window.partitionBy("user_session_id").orderBy("sess_timestamp")
session_filtered = (
    session
    .withColumn("rank", F.row_number().over(earliest_first_within_session))
    .filter("rank=1")
    .drop("rank")
)

In [None]:
#Keep only sessions from the 30 days leading up to (but not including) the
#card sale date. The inner join drops prospects with no earlier session.
same_visitor = pb.user_pseudo_id == session_filtered.user_pseudo_id
within_30d_before_sale = (
    (F.col('sess_date') >= F.date_sub(F.col('card_sale_date'), 30))
    & (F.col('sess_date') < F.col('card_sale_date'))
)

# NOTE(review): the session-side user_session_id is dropped before the select,
# so the user_session_id kept here presumably comes from pb - confirm that
# this is the session ID the e-commerce join is meant to use.
sessions_30d = (
    pb.join(session_filtered, same_visitor, how='inner')
    .drop(session_filtered.user_session_id)
    .drop(session_filtered.user_pseudo_id)
    .filter(within_30d_before_sale)
    .select('user_pseudo_id', 'user_session_id', 'clnt_no', 'sess_date')
    .persist()  #Persist
)

In [None]:
#sessions_30d.count() 
#37,379 rows at last update (May 28, 2025) 

##### Load E-commerce Data

In [None]:
#What files do we already have? Don't duplicate effort :D 
path = "/anaurosevic/cdn0_cards_affinity/previous_sessions/events/"   # Replace with your folder path

# Access Hadoop FileSystem
hadoop_fs = spark._jsc.hadoopConfiguration()
fs = spark._jvm.org.apache.hadoop.fs.FileSystem.get(hadoop_fs)

# Start from "nothing downloaded yet" so that later cells can always use
# `subfolders`, even when the listing below fails (for example because the
# folder does not exist yet) and so that a stale value from an earlier run
# of this cell is never reused.
subfolders = []

# List subfolders
try:
    files = fs.listStatus(spark._jvm.org.apache.hadoop.fs.Path(path))
    subfolders = [status.getPath().getName() for status in files if status.isDirectory()]

    if not subfolders:
        print("No subfolders found in the directory.")
    else:
        print("Subfolders found:")
        for subfolder in subfolders:
            print(subfolder)
except Exception as e:
    # Best effort: report the problem and carry on with an empty list
    print(f"Error accessing subfolders: {e}")

In [None]:
# Folder names found under `path`; each one is a month that was already downloaded
subfolders #These are the files we have already downloaded :) 

In [None]:
def _tag_item_ids(events):
    """Overwrite ``it_item_id`` with a product tag inferred from the page URL.

    Rows that match none of the rules keep their existing ``it_item_id``.
    """
    url = F.col("ep_clean_url")
    card_page = (F.col("event_name") == "first_visit") | F.col("ep_content_group").like('credit cards%')
    account_page = F.col("ep_content_group").like("%accounts%")

    #Note: Order is important here to prevent regex shenanigans
    #(the first rule that matches wins)
    rules = [
        (card_page & url.like(avp_regex), "i_AVP"),
        (card_page & url.rlike(iav_regex), "i_IAV"),
        (card_page & url.rlike(mc4_regex), "i_MC4"),
        (card_page & url.rlike(mc2_regex), "i_MC2"),
        (card_page & url.rlike(bap_regex), "i_BAP"),
        (card_page & url.like(mcp_regex), "i_MCP"),
        (card_page & url.rlike(ion_regex), "i_ION"),
        (card_page & url.rlike(mc1_regex), "i_MC1"),
        (card_page & url.like(gcp_regex), "i_GCP"),
        (card_page & url.like(gus_regex), "i_GUS"),
        (card_page & url.like(mv1_regex), "i_MV1"),
        (card_page & url.like(plt_regex), "i_PLT"),
        (card_page & url.like(iop_regex), "i_IOP"),
        # NOTE(review): every other tag mirrors its pattern name, but this one
        # is "CLP" for clo_regex - confirm it is not a typo for "CLO".
        (card_page & url.rlike(clo_regex), "i_CLP"),
        (account_page & url.like(d2d_regex), "i_022"),
        (account_page & url.like(adv_regex), "i_099"),
        (account_page & url.like(snlb_regex), "i_004"),
    ]

    (first_condition, first_tag), remaining = rules[0], rules[1:]
    tagged = F.when(first_condition, first_tag)
    for condition, tag in remaining:
        tagged = tagged.when(condition, tag)

    return events.withColumn("it_item_id", tagged.otherwise(F.col("it_item_id")))


def _funnel_label(prefix, event_condition):
    """Return ``<prefix><lob>_<ITEM>`` for rows matching ``event_condition``.

    Whitespace in ``ep_lob`` becomes ``_`` and ITEM is the upper-cased part of
    ``it_item_id`` after its first underscore. Rows whose ``it_item_id`` ends
    in two or more zeros, and rows that do not match, get null.
    """
    not_zero_suffixed = ~F.col("it_item_id").rlike(".*00+$")
    label = F.concat(
        F.lit(prefix),
        F.regexp_replace("ep_lob", r"\s", "_"),
        F.lit("_"),
        F.upper(F.split("it_item_id", "_").getItem(1)),
    )
    return F.when(event_condition & not_zero_suffixed, label)


def _session_flags(indicators, label_column):
    """Pivot ``label_column`` into one 0/1 column per label, per session."""
    return indicators.groupBy("user_session_id").pivot(
        label_column).agg(F.lit(1)).fillna(0).drop("null")


def import_ecommerce_data(me_list):
    """Build and save session-level e-commerce indicators, one month at a time.

    For every month-end date in ``me_list`` the matching month of e-commerce
    events is read, restricted to the sessions in ``sessions_30d``, turned
    into per-session 0/1 indicators (select / view / checkout per product,
    plus newcomer and student page views) and written as a single parquet
    file to a folder named after the month-end date.

    Parameters
    ----------
    me_list : list of str
        Month-end dates formatted as ``'YYYY-MM-DD'``.

    Returns
    -------
    DataFrame or None
        The raw e-commerce DataFrame read for the last month processed, or
        ``None`` when ``me_list`` is empty (nothing left to download).
    """
    #Set folder for saving
    save_folder_path = "/anaurosevic/cdn0_cards_affinity/previous_sessions/events/"

    #Location of e-commerce files
    data_folder_path = "...GA4_ECOMMERCE..."

    # Stays None when there is nothing to process, so the final return works
    ecommerce = None

    #Loop over list of month ends
    for month_end in me_list:
        print(month_end)

        save_file_path = save_folder_path + str(month_end)

        date = dt.strptime(month_end, '%Y-%m-%d').date()
        date_filter_string = "YEAR=" + date.strftime('%Y') + "/Month=" + date.strftime('%m') + "/"
        data_file_path = data_folder_path + date_filter_string

        #Import ecommerce data
        ecommerce = spark.read.option("basePath", data_folder_path).load(data_file_path)

        #Subset to only clients and sessions of interest
        ecommerce_subset = sessions_30d.join(
            ecommerce, on='user_session_id', how='inner').drop(ecommerce.user_session_id).persist()

        indicators = None
        try:
            #Grab LOBs of interest
            previous_ecommerce = ecommerce_subset.filter(
                (F.col("ep_lob").isin(lobs_of_interest) & F.col("event_name").isin(events_of_interest))
                | F.lower(F.col("ep_clean_url")).like(urls_of_interest))

            #Flag events of interest
            previous_ecommerce = _tag_item_ids(previous_ecommerce)

            indicators = (
                previous_ecommerce
                .withColumn("view", _funnel_label(
                    "view_",
                    F.col("event_name").like('view%') | F.col("ep_content_group").like('credit cards%')))
                .withColumn("select", _funnel_label("select_", F.col("event_name").like('select%')))
                .withColumn("checkout", _funnel_label("checkout_", F.col("event_name").like('begin_checkout')))
                # Lower-case the URL so this flag agrees with the row filter above
                .withColumn("newcomer_view", F.when(
                    F.lower(F.col("ep_clean_url")).like(urls_of_interest), 1).otherwise(0))
                .withColumn("student_view", F.when(
                    (F.col("ep_lob") == 'students')
                    | F.col("ep_clean_url").like("%student%")
                    | F.col("ep_content_group").like("%student%"), 1).otherwise(0))
                # This column is not part of the saved output below
                .withColumn("purchase", _funnel_label("purchase_", F.col("event_name").like('purchase')))
                .persist()
            )

            #(A) Select event
            previous_ecommerce_selections = _session_flags(indicators, "select")

            #(B) View event
            previous_ecommerce_views = _session_flags(indicators, "view")

            #(C) Checkout event
            previous_ecommerce_checkouts = _session_flags(indicators, "checkout")

            #(D) Demo event
            previous_ecommerce_demos = indicators.groupBy("user_session_id").agg(
                F.max("newcomer_view").alias("newcomer_view"), F.max("student_view").alias('student_view'))

            #Final
            previous_ecommerce_session_lvl = previous_ecommerce_selections.join(
                previous_ecommerce_views, ['user_session_id'], 'full_outer').join(
                previous_ecommerce_checkouts, ['user_session_id'], 'full_outer').join(
                previous_ecommerce_demos, ['user_session_id'], 'full_outer').fillna(0)

            previous_ecommerce_session_lvl.coalesce(1).write.mode("overwrite").parquet(save_file_path)
        finally:
            # Release this month's cached data so it does not pile up across
            # iterations of the loop
            if indicators is not None:
                indicators.unpersist()
            ecommerce_subset.unpersist()

    return ecommerce

In [None]:
# Month-end dates for the 12 months before the current month (candidates to download)
month_end_list(12)

In [None]:
# Months that are not on disk yet. Filtering the list (instead of taking a
# set difference) keeps the months in chronological order, so they are
# processed oldest first rather than in arbitrary order.
already_downloaded = set(subfolders)
months_to_download = [month_end for month_end in month_end_list(12) if month_end not in already_downloaded]
print(months_to_download)

In [None]:
# Build and save the session-level indicators for every month in months_to_download
import_ecommerce_data(months_to_download)

--- END PROGRAM --- 

In [None]:
#Timing summary: read both clocks again and report the difference from the
#values captured at the top of the notebook
end_time, end_cpu_time = time.time(), time.process_time()

real_time_elapsed, cpu_time_elapsed = (
    end_time - start_time,
    end_cpu_time - start_cpu_time,
)

print(f"Real time: {real_time_elapsed:.2f} seconds")
print(f"CPU time: {cpu_time_elapsed:.2f} seconds")