In [1]:
import os
import shutil
import argparse
from datetime import datetime
import numpy as np
import pandas as pd
from collections import Counter

import pyspark
from pyspark.sql import SparkSession

from utils.data_preprocessing_bronze_table import *
from utils.data_preprocessing_silver_table import *
from utils.data_preprocessing_gold_table import *
from utils.helper import *


current_directory = os.path.dirname(os.path.abspath(os.path.join(os.getcwd(), "cs611-assignment-1")))
csv_dir = os.path.join(current_directory, "data")
print(current_directory)

# Other parts
data_mart_dir = os.path.join(current_directory, "datamart")
bronze_dir = os.path.join(data_mart_dir, "bronze")
silver_dir = os.path.join(data_mart_dir, "silver")
gold_dir = os.path.join(data_mart_dir, "gold")



c:\Users\Admin\Desktop\SMU\CS611\cs611-assignment-1


In [2]:
# Run this to refresh the dataset
# Refresh current directory
if os.path.exists(data_mart_dir):
    shutil.rmtree(data_mart_dir)

os.mkdir(data_mart_dir)
os.mkdir(bronze_dir)
os.mkdir(silver_dir)
os.mkdir(gold_dir)

In [3]:
def data_prep_bronze(start_date, end_date, spark : SparkSession):
    print('\n\n---starting Bronze Table job---\n\n')

    # Get all the datetimes 
    dates_str_list = generate_first_of_month_dates(start_date, end_date)

    # We can build the bronze table
    # Get csvs
    csv_files = os.listdir(csv_dir)

    for csv_file in csv_files:
        csv_full_dir = os.path.join(csv_dir, csv_file)
        for date_str in dates_str_list:
            print("Preparing bronze table {}".format(csv_file))
            prepare_bronze_table_daily(csv_full_dir, bronze_dir, spark, date_str)

In [4]:
def data_prep_silver(start_date, end_date, spark : SparkSession):
    print('\n\n---starting Silver table job---\n\n')

    # Get all the datetimes 
    dates_str_list = generate_first_of_month_dates(start_date, end_date)

    # We can build the silver table
    for date_str in dates_str_list:
        # Build the silver table for each csv
        expected_lms_loan_daily_file_name = "bronze_lms_loan_daily_" + date_str + ".csv"
        expected_loan_full_dir = os.path.join(bronze_dir, expected_lms_loan_daily_file_name)

        process_silver_table_loan_daily(expected_loan_full_dir,
                                        silver_dir,
                                        date_str,
                                        spark)
        
        expected_feature_financials_file_name = "bronze_features_financial_" + date_str + ".csv"
        expected_financial_full_dir = os.path.join(bronze_dir, expected_feature_financials_file_name)

        process_silver_table_feature_financials(expected_financial_full_dir,
                                                silver_dir,
                                                date_str,
                                                spark)
        
        expected_feature_attributes_file_name = "bronze_features_attribute_" + date_str + ".csv"
        expected_feature_attributes_full_dir = os.path.join(bronze_dir, expected_feature_attributes_file_name)

        process_silver_table_features_attributes(expected_feature_attributes_full_dir,
                                                 silver_dir,
                                                 date_str,
                                                 spark)
        
        expected_feature_clickstream_file_name = "bronze_feature_clickstream_" + date_str + ".csv"
        expected_feature_clickstream_full_dir = os.path.join(bronze_dir, expected_feature_clickstream_file_name)

        process_silver_table_features_clickstream(expected_feature_clickstream_full_dir,
                                                  silver_dir,
                                                  date_str,
                                                  spark)

In [5]:
spark = pyspark.sql.SparkSession.builder \
        .config("spark.driver.memory", "4g") \
        .config("spark.executor.memory", "4g") \
        .appName("dev") \
        .master("local[*]") \
        .getOrCreate()
    
# Set log level to ERROR to hide warnings
spark.sparkContext.setLogLevel("ERROR")

In [6]:
data_prep_bronze("2023-01-01", "2024-12-01", spark)



---starting Bronze Table job---


Preparing bronze table features_attributes.csv
Row Count for Date 2023-01-01 00:00:00 : 530
Bronze features_attribute Daily Date 2023-01-01 00:00:00 saved to : c:\Users\Admin\Desktop\SMU\CS611\cs611-assignment-1\datamart\bronze\bronze_features_attribute_2023-01-01.csv
Preparing bronze table features_attributes.csv
Row Count for Date 2023-02-01 00:00:00 : 501
Bronze features_attribute Daily Date 2023-02-01 00:00:00 saved to : c:\Users\Admin\Desktop\SMU\CS611\cs611-assignment-1\datamart\bronze\bronze_features_attribute_2023-02-01.csv
Preparing bronze table features_attributes.csv
Row Count for Date 2023-03-01 00:00:00 : 506
Bronze features_attribute Daily Date 2023-03-01 00:00:00 saved to : c:\Users\Admin\Desktop\SMU\CS611\cs611-assignment-1\datamart\bronze\bronze_features_attribute_2023-03-01.csv
Preparing bronze table features_attributes.csv
Row Count for Date 2023-04-01 00:00:00 : 510
Bronze features_attribute Daily Date 2023-04-01 00:00:00 saved to

In [7]:
# Now we do the same for silver table
data_prep_silver("2023-01-01", "2024-12-01", spark)



---starting Silver table job---


Loaded c:\Users\Admin\Desktop\SMU\CS611\cs611-assignment-1\datamart\bronze\bronze_lms_loan_daily_2023-01-01.csv, row count 530
Saving File c:\Users\Admin\Desktop\SMU\CS611\cs611-assignment-1\datamart\silver\silver_lms_loan_daily_2023-01-01.csv row count 530
Loaded c:\Users\Admin\Desktop\SMU\CS611\cs611-assignment-1\datamart\bronze\bronze_features_financial_2023-01-01.csv, row count 530
Saving file : c:\Users\Admin\Desktop\SMU\CS611\cs611-assignment-1\datamart\silver\silver_feature_financials_2023-01-01.csv row count 404
Loaded c:\Users\Admin\Desktop\SMU\CS611\cs611-assignment-1\datamart\bronze\bronze_features_attribute_2023-01-01.csv, row count Customer_ID      530
Name             530
Age              530
SSN              530
Occupation       530
snapshot_date    530
dtype: int64
Saving file : c:\Users\Admin\Desktop\SMU\CS611\cs611-assignment-1\datamart\silver\silver_feature_attributes_2023-01-01.csv row count 530
Loaded c:\Users\Admin\Desktop\SMU\C

In [None]:
# Fuse the data to by year, the perform correlation analysis
import os
import pandas as pd
from datetime import datetime

import numpy as np

import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.types import StringType, IntegerType, FloatType, DateType

def process_labels_gold_table(snapshot_date_str, silver_loan_daily_directory, gold_label_store_directory, spark : SparkSession, dpd, mob):
    # prepare arguments
    snapshot_date = datetime.strptime(snapshot_date_str, "%Y-%m-%d")
    
    # connect to bronze table
    partition_name = "silver_lms_loan_daily_" + snapshot_date_str + '.csv'
    filepath = os.path.join(silver_loan_daily_directory, partition_name)
    df = spark.read.csv(filepath, header=True, inferSchema=True)
    print('loaded from:', filepath, 'row count:', df.count())

    # get customer at mob
    df = df.filter(col("mob") == mob)

    # get label
    df = df.withColumn("label", F.when(col("dpd") >= dpd, 1).otherwise(0).cast(IntegerType()))
    df = df.withColumn("label_def", F.lit(str(dpd)+'dpd_'+str(mob)+'mob').cast(StringType()))

    # select columns to save
    df = df.select("loan_id", "Customer_ID", "label", "label_def", "snapshot_date")

    # save gold table - IRL connect to database to write
    partition_name = "gold_label_store_" + snapshot_date_str.replace('-','_') + '.csv'
    filepath = os.path.join(gold_label_store_directory, partition_name)
    
    df.toPandas().to_csv(filepath, index=False)
    print('saved to:', filepath)
    
    return df

def process_features_gold_table(snapshot_date_str, silver_dir, gold_feature_store_directory, spark : SparkSession):
    snapshot_date = datetime.strptime(snapshot_date_str, "%Y-%m-%d")

    # We need to access each of the stores from the specific dates
    silver_features_financials = os.path.join(silver_dir, "silver_feature_financials_" + snapshot_date_str + '.csv')
    silver_features_attributes = os.path.join(silver_dir, "silver_feature_attributes_" + snapshot_date_str + '.csv')
    silver_features_clickstream = os.path.join(silver_dir, "silver_feature_clickstream_" + snapshot_date_str + '.csv')

    ff_df = spark.read.csv(silver_features_financials, header=True, inferSchema=True)
    fa_df = spark.read.csv(silver_features_attributes, header=True, inferSchema=True)
    fc_df = spark.read.csv(silver_features_clickstream, header=True, inferSchema=True)
    print('loaded from:', silver_features_financials, 'row count:', ff_df.count())
    print('loaded from:', silver_features_attributes, 'row count:', fa_df.count())
    print('loaded from:', silver_features_clickstream, 'row count:', fc_df.count())

    # Merge the 3 datasets by date to correspond to the label store
    # Feature clickstream is the cleanest dataset, followed by attributes then finally the financials 
    df_joined_1 = fc_df.join(fa_df, on="Customer_ID", how="inner")
    final_df = df_joined_1.join(ff_df, on="Customer_ID", how="inner")

    # Save the final file
    partition_name = "gold_feature_store_" + snapshot_date_str + '.csv'
    full_partition_path = os.path.join(gold_feature_store_directory, partition_name)
    final_df.toPandas().to_csv(full_partition_path, index=False)
    print(f"saved to : {full_partition_path}, row count : {final_df.count()}")

    return final_df

In [17]:
# Now we can prepare the gold table as well
def data_prep_gold(start_date, end_date, spark : SparkSession):
    print('\n\n---starting Gold table job---\n\n')

    dates_str_list = generate_first_of_month_dates(start_date, end_date)

    # We can build the silver table
    for date_str in dates_str_list:
        # Prepare the gold labels
        label_df = process_labels_gold_table(date_str, silver_dir, gold_dir, spark, dpd = 30, mob = 6)

        # Prepare the gold features
        features_df = process_features_gold_table(date_str, silver_dir, gold_dir, spark)

    return label_df, features_df


In [18]:
sample_label, sample_features = data_prep_gold("2023-01-01", "2024-12-01", spark)



---starting Gold table job---


loaded from: c:\Users\Admin\Desktop\SMU\CS611\cs611-assignment-1\datamart\silver\silver_lms_loan_daily_2023-01-01.csv row count: 530
Row(loan_id='CUS_0x1037_2023_01_01', Customer_ID='CUS_0x1037', loan_start_date=datetime.date(2023, 1, 1), tenure=10, installment_num=0, loan_amt=10000.0, due_amt=0.0, paid_amt=0.0, overdue_amt=0.0, balance=10000.0, snapshot_date=datetime.date(2023, 1, 1), mob=0, installments_missed=0, first_missed_date=None, dpd=0)
saved to: c:\Users\Admin\Desktop\SMU\CS611\cs611-assignment-1\datamart\gold\gold_label_store_2023_01_01.csv
loaded from: c:\Users\Admin\Desktop\SMU\CS611\cs611-assignment-1\datamart\silver\silver_feature_financials_2023-01-01.csv row count: 404
loaded from: c:\Users\Admin\Desktop\SMU\CS611\cs611-assignment-1\datamart\silver\silver_feature_attributes_2023-01-01.csv row count: 530
loaded from: c:\Users\Admin\Desktop\SMU\CS611\cs611-assignment-1\datamart\silver\silver_feature_clickstream_2023-01-01.csv row count: 

In [19]:
sample_label.head()

Row(loan_id='CUS_0x10dd_2024_06_01', Customer_ID='CUS_0x10dd', label=0, label_def='30dpd_6mob', snapshot_date=datetime.date(2024, 12, 1))

In [22]:
print(sample_features.head())

None
