In [1]:
import os
import shutil
import argparse
from datetime import datetime
import numpy as np
import pandas as pd
from collections import Counter

import pyspark
from pyspark.sql import SparkSession

from utils.data_preprocessing_bronze_table import *
from utils.data_preprocessing_silver_table import *
from utils.data_preprocessing_gold_table import *
from utils.helper import *


# Project root: the absolute form of the directory the notebook is started from.
current_directory = os.path.abspath(os.getcwd())
print(current_directory)

# Raw source CSVs live under <root>/data
csv_dir = os.path.join(current_directory, "data")

# Datamart root and one sub-directory per medallion layer
data_mart_dir = os.path.join(current_directory, "datamart")
bronze_dir, silver_dir, gold_dir = (
    os.path.join(data_mart_dir, layer) for layer in ("bronze", "silver", "gold")
)



c:\Users\Admin\Desktop\SMU\CS611\cs611-assignment-1


In [2]:
# Run this cell to rebuild the datamart from scratch.
# Any existing datamart tree is deleted first, then the root and the three
# layer directories are recreated empty.
if os.path.exists(data_mart_dir):
    shutil.rmtree(data_mart_dir)

# The root must be created before its children, so order matters here.
for layer_dir in (data_mart_dir, bronze_dir, silver_dir, gold_dir):
    os.mkdir(layer_dir)

In [3]:
def data_prep_bronze(start_date, end_date, spark : SparkSession):
    """Build the bronze partitions for every source CSV and every snapshot date.

    For each CSV file found in ``csv_dir`` and each date returned by
    ``generate_first_of_month_dates(start_date, end_date)``, calls
    ``prepare_bronze_table_daily`` with the CSV path, ``bronze_dir``, the
    Spark session and the date.

    Args:
        start_date: Start of the date range, forwarded to
            ``generate_first_of_month_dates``.
        end_date: End of the date range, forwarded to
            ``generate_first_of_month_dates``.
        spark: Active Spark session handed to ``prepare_bronze_table_daily``.
    """
    print('\n\n---starting Bronze Table job---\n\n')

    # Snapshot dates to materialise
    dates_str_list = generate_first_of_month_dates(start_date, end_date)

    # os.listdir returns every entry in arbitrary order, including
    # sub-directories and hidden items (e.g. the .ipynb_checkpoints folder
    # Jupyter may create). Keep only real CSV files and sort them so the job
    # runs in a deterministic order.
    csv_files = sorted(
        entry
        for entry in os.listdir(csv_dir)
        if entry.lower().endswith(".csv")
        and os.path.isfile(os.path.join(csv_dir, entry))
    )

    for csv_file in csv_files:
        csv_full_dir = os.path.join(csv_dir, csv_file)
        for date_str in dates_str_list:
            print("Preparing bronze table {}".format(csv_file))
            prepare_bronze_table_daily(csv_full_dir, bronze_dir, spark, date_str)

In [4]:
def data_prep_silver(start_date, end_date, spark : SparkSession):
    """Build the silver partitions for every snapshot date.

    For each date returned by ``generate_first_of_month_dates`` the four
    ``process_silver_table_*`` helpers are called in turn (loan daily,
    financials, attributes, clickstream), each receiving the path of the
    matching bronze CSV, ``silver_dir``, the date string and the Spark session.
    """
    print('\n\n---starting Silver table job---\n\n')

    # Snapshot dates to materialise
    snapshot_dates = generate_first_of_month_dates(start_date, end_date)

    # (bronze file-name prefix, silver processor) pairs, in execution order.
    # Note the bronze prefixes are not uniformly named; they must match the
    # file names written by the bronze job exactly.
    silver_jobs = (
        ("bronze_lms_loan_daily_", process_silver_table_loan_daily),
        ("bronze_features_financial_", process_silver_table_feature_financials),
        ("bronze_features_attribute_", process_silver_table_features_attributes),
        ("bronze_feature_clickstream_", process_silver_table_features_clickstream),
    )

    for snapshot in snapshot_dates:
        for bronze_prefix, build_silver in silver_jobs:
            bronze_path = os.path.join(bronze_dir, bronze_prefix + snapshot + ".csv")
            build_silver(bronze_path, silver_dir, snapshot, spark)

In [5]:
# Local development Spark session: use every local core, with 4g configured
# for both the driver and the executor. getOrCreate() reuses a session that
# is already running in this kernel.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("dev")
    .config("spark.driver.memory", "4g")
    .config("spark.executor.memory", "4g")
    .getOrCreate()
)

# Only show ERROR-level Spark logs so warnings do not flood the cell output
spark.sparkContext.setLogLevel("ERROR")

In [6]:
# Build the bronze partitions for the 2023-01-01 .. 2024-12-01 date range
data_prep_bronze("2023-01-01", "2024-12-01", spark)



---starting Bronze Table job---


Preparing bronze table features_attributes.csv
Row Count for Date 2023-01-01 00:00:00 : 530
Bronze features_attribute Daily Date 2023-01-01 00:00:00 saved to : c:\Users\Admin\Desktop\SMU\CS611\cs611-assignment-1\datamart\bronze\bronze_features_attribute_2023-01-01.csv
Preparing bronze table features_attributes.csv
Row Count for Date 2023-02-01 00:00:00 : 501
Bronze features_attribute Daily Date 2023-02-01 00:00:00 saved to : c:\Users\Admin\Desktop\SMU\CS611\cs611-assignment-1\datamart\bronze\bronze_features_attribute_2023-02-01.csv
Preparing bronze table features_attributes.csv
Row Count for Date 2023-03-01 00:00:00 : 506
Bronze features_attribute Daily Date 2023-03-01 00:00:00 saved to : c:\Users\Admin\Desktop\SMU\CS611\cs611-assignment-1\datamart\bronze\bronze_features_attribute_2023-03-01.csv
Preparing bronze table features_attributes.csv
Row Count for Date 2023-04-01 00:00:00 : 510
Bronze features_attribute Daily Date 2023-04-01 00:00:00 saved to

In [7]:
# Build the silver partitions for the same date range; this reads the bronze
# CSVs from bronze_dir, so the bronze cell above must have been run first.
data_prep_silver("2023-01-01", "2024-12-01", spark)



---starting Silver table job---


Loaded c:\Users\Admin\Desktop\SMU\CS611\cs611-assignment-1\datamart\bronze\bronze_lms_loan_daily_2023-01-01.csv, row count 530
Saving File c:\Users\Admin\Desktop\SMU\CS611\cs611-assignment-1\datamart\silver\silver_lms_loan_daily_2023-01-01.csv row count 530
Loaded c:\Users\Admin\Desktop\SMU\CS611\cs611-assignment-1\datamart\bronze\bronze_features_financial_2023-01-01.csv, row count 530
Saving file : c:\Users\Admin\Desktop\SMU\CS611\cs611-assignment-1\datamart\silver\silver_feature_financials_2023-01-01.csv row count 404
Loaded c:\Users\Admin\Desktop\SMU\CS611\cs611-assignment-1\datamart\bronze\bronze_features_attribute_2023-01-01.csv, row count Customer_ID      530
Name             530
Age              530
SSN              530
Occupation       530
snapshot_date    530
dtype: int64
Saving file : c:\Users\Admin\Desktop\SMU\CS611\cs611-assignment-1\datamart\silver\silver_feature_attributes_2023-01-01.csv row count 513
Loaded c:\Users\Admin\Desktop\SMU\C

In [None]:
# Fuse the data by year, then perform correlation analysis
import os
import pandas as pd
from datetime import datetime

import numpy as np

import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, to_date, greatest
from pyspark.sql.types import StringType, IntegerType, FloatType, DateType

def process_labels_gold_table(snapshot_date_str, silver_loan_daily_directory, gold_label_store_directory, spark : SparkSession, dpd, mob):
    """Build the label rows for one snapshot date from the silver loan table.

    Reads ``silver_lms_loan_daily_<snapshot_date_str>.csv`` from
    ``silver_loan_daily_directory``, keeps the rows whose ``mob`` column
    equals ``mob`` and labels each one 1 when its ``dpd`` column is at least
    ``dpd`` (0 otherwise).

    Args:
        snapshot_date_str: Snapshot date used in the silver file name.
        silver_loan_daily_directory: Directory holding the silver loan CSVs.
        gold_label_store_directory: Not used by this function; kept so
            existing callers keep working. Nothing is written to disk here.
        spark: Active Spark session used to read the CSV.
        dpd: Threshold compared against the ``dpd`` column.
        mob: Value the ``mob`` column must equal for a row to be kept.

    Returns:
        Spark DataFrame with columns ``loan_id``, ``Customer_ID``, ``label``,
        ``label_def`` and ``snapshot_date``.
    """
    # Load the silver loan partition for this snapshot date
    partition_name = "silver_lms_loan_daily_" + snapshot_date_str + '.csv'
    filepath = os.path.join(silver_loan_daily_directory, partition_name)
    df = spark.read.csv(filepath, header=True, inferSchema=True)
    print('loaded from:', filepath, 'row count:', df.count())

    # Keep only the rows observed at the requested mob
    df = df.filter(col("mob") == mob)

    # Binary label plus a text tag recording the definition used, e.g. "60dpd_7mob"
    df = df.withColumn("label", F.when(col("dpd") >= dpd, 1).otherwise(0).cast(IntegerType()))
    df = df.withColumn("label_def", F.lit(str(dpd)+'dpd_'+str(mob)+'mob').cast(StringType()))

    # Only the identifying columns and the label are returned
    return df.select("loan_id", "Customer_ID", "label", "label_def", "snapshot_date")

def process_features_gold_table(snapshot_date_str, silver_dir, gold_feature_store_directory, spark : SparkSession):
    """Join the three silver feature tables for one snapshot date.

    Reads the financials, attributes and clickstream silver CSVs for
    ``snapshot_date_str`` from ``silver_dir``, inner-joins them on
    ``Customer_ID`` and sets ``snapshot_date`` to the latest of the three
    per-source snapshot dates.

    Args:
        snapshot_date_str: Snapshot date used in the silver file names.
        silver_dir: Directory holding the silver feature CSVs.
        gold_feature_store_directory: Not used by this function; kept so
            existing callers keep working. Nothing is written to disk here.
        spark: Active Spark session used to read the CSVs.

    Returns:
        Spark DataFrame with one row per joined ``Customer_ID`` match.
    """
    # Silver partitions for this snapshot date, one per feature source
    silver_features_financials = os.path.join(silver_dir, "silver_feature_financials_" + snapshot_date_str + '.csv')
    silver_features_attributes = os.path.join(silver_dir, "silver_feature_attributes_" + snapshot_date_str + '.csv')
    silver_features_clickstream = os.path.join(silver_dir, "silver_feature_clickstream_" + snapshot_date_str + '.csv')

    ff_df = spark.read.csv(silver_features_financials, header=True, inferSchema=True)
    fa_df = spark.read.csv(silver_features_attributes, header=True, inferSchema=True)
    fc_df = spark.read.csv(silver_features_clickstream, header=True, inferSchema=True)
    print('loaded from:', silver_features_financials, 'row count:', ff_df.count())
    print('loaded from:', silver_features_attributes, 'row count:', fa_df.count())
    print('loaded from:', silver_features_clickstream, 'row count:', fc_df.count())

    # Every source has its own snapshot_date column. Give each a distinct name
    # (cast to a date) so all three survive the join and can be reconciled.
    ff_df = ff_df.withColumnRenamed("snapshot_date", "snapshot_date_1")
    ff_df = ff_df.withColumn("snapshot_date_1", to_date("snapshot_date_1"))

    fa_df = fa_df.withColumnRenamed("snapshot_date", "snapshot_date_2")
    fa_df = fa_df.withColumn("snapshot_date_2", to_date("snapshot_date_2"))

    fc_df = fc_df.withColumnRenamed("snapshot_date", "snapshot_date_3")
    fc_df = fc_df.withColumn("snapshot_date_3", to_date("snapshot_date_3"))

    # Merge the 3 datasets so the features line up with the label store.
    # Author's note: clickstream is the cleanest dataset, followed by
    # attributes, then financials.
    # NOTE(review): inner joins keep only customers present in all three
    # sources, so a month where the sources do not overlap yields zero rows.
    df_joined_1 = fc_df.join(fa_df, on="Customer_ID", how="inner")
    final_df = df_joined_1.join(ff_df, on="Customer_ID", how="inner")

    # Single snapshot_date = the latest of the three per-source dates
    final_df = final_df.withColumn(
        "snapshot_date",
        greatest("snapshot_date_1", "snapshot_date_2", "snapshot_date_3")
    )
    final_df = final_df.drop("snapshot_date_1", "snapshot_date_2", "snapshot_date_3")

    print("Final Row Count : ", final_df.count())

    return final_df

In [None]:
# Now we can prepare the gold table as well
from tqdm import tqdm

def data_prep_gold(start_date, end_date, spark : SparkSession, dpd = 60, mob = 7):
    """Build and save the gold label and feature stores for a date range.

    For every date returned by ``generate_first_of_month_dates`` the label
    and feature frames are built with ``process_labels_gold_table`` and
    ``process_features_gold_table`` and appended (``unionByName``) to running
    totals. Both totals are then written as CSV files into ``gold_dir``.

    Args:
        start_date: Start of the date range; also used in the output file names.
        end_date: End of the date range; also used in the output file names.
        spark: Active Spark session.
        dpd: Label threshold forwarded to ``process_labels_gold_table``.
        mob: Month-on-book value forwarded to ``process_labels_gold_table``.

    Returns:
        Tuple ``(label_df, features_df)`` of the unioned Spark DataFrames.

    Raises:
        ValueError: If the date range produced no snapshot dates.
    """
    print('\n\n---starting Gold table job---\n\n')

    dates_str_list = generate_first_of_month_dates(start_date, end_date)

    label_df = None
    features_df = None

    # Build the gold tables one snapshot date at a time
    for date_str in tqdm(dates_str_list):
        # Compare against None explicitly rather than relying on the
        # truthiness of a DataFrame object.
        cur_label_df = process_labels_gold_table(date_str, silver_dir, gold_dir, spark, dpd = dpd, mob = mob)
        label_df = cur_label_df if label_df is None else label_df.unionByName(cur_label_df)

        cur_feature_df = process_features_gold_table(date_str, silver_dir, gold_dir, spark)
        features_df = cur_feature_df if features_df is None else features_df.unionByName(cur_feature_df)

    if label_df is None or features_df is None:
        raise ValueError(
            "No snapshot dates generated between {} and {}; nothing to save".format(start_date, end_date)
        )

    # Save the data; the run date is part of the file name
    current_date = datetime.now().strftime("%Y-%m-%d")

    label_name = f"snapdate_{current_date}_" + "gold_label_store_" + start_date + "to" + end_date + ".csv"
    label_filepath = os.path.join(gold_dir, label_name)
    # Collect once and reuse the pandas frame for the row count; calling
    # count() afterwards would execute the whole union plan a second time.
    label_pdf = label_df.toPandas()
    label_pdf.to_csv(label_filepath, index=False)
    print('labels saved to : ', label_filepath, " row count : ", len(label_pdf))

    feature_name = f"snapdate_{current_date}_" + "gold_feature_store_" + start_date + "to" + end_date + ".csv"
    feature_filepath = os.path.join(gold_dir, feature_name)
    features_pdf = features_df.toPandas()
    features_pdf.to_csv(feature_filepath, index=False)
    print(f"saved to : {feature_filepath}, row count : {len(features_pdf)}")

    return label_df, features_df


In [18]:
# Build and save the gold label and feature stores for the full date range;
# this reads the silver CSVs, so the silver cell must have been run first.
sample_label, sample_features = data_prep_gold("2023-01-01", "2024-12-01", spark)



---starting Gold table job---




  0%|          | 0/24 [00:00<?, ?it/s]

loaded from: c:\Users\Admin\Desktop\SMU\CS611\cs611-assignment-1\datamart\silver\silver_lms_loan_daily_2023-01-01.csv row count: 530
loaded from: c:\Users\Admin\Desktop\SMU\CS611\cs611-assignment-1\datamart\silver\silver_feature_financials_2023-01-01.csv row count: 404
loaded from: c:\Users\Admin\Desktop\SMU\CS611\cs611-assignment-1\datamart\silver\silver_feature_attributes_2023-01-01.csv row count: 513
loaded from: c:\Users\Admin\Desktop\SMU\CS611\cs611-assignment-1\datamart\silver\silver_feature_clickstream_2023-01-01.csv row count: 8974


  4%|▍         | 1/24 [00:01<00:33,  1.47s/it]

Final Row Count :  391
loaded from: c:\Users\Admin\Desktop\SMU\CS611\cs611-assignment-1\datamart\silver\silver_lms_loan_daily_2023-02-01.csv row count: 1031
loaded from: c:\Users\Admin\Desktop\SMU\CS611\cs611-assignment-1\datamart\silver\silver_feature_financials_2023-02-01.csv row count: 390
loaded from: c:\Users\Admin\Desktop\SMU\CS611\cs611-assignment-1\datamart\silver\silver_feature_attributes_2023-02-01.csv row count: 490
loaded from: c:\Users\Admin\Desktop\SMU\CS611\cs611-assignment-1\datamart\silver\silver_feature_clickstream_2023-02-01.csv row count: 8974


  8%|▊         | 2/24 [00:02<00:30,  1.37s/it]

Final Row Count :  380
loaded from: c:\Users\Admin\Desktop\SMU\CS611\cs611-assignment-1\datamart\silver\silver_lms_loan_daily_2023-03-01.csv row count: 1537
loaded from: c:\Users\Admin\Desktop\SMU\CS611\cs611-assignment-1\datamart\silver\silver_feature_financials_2023-03-01.csv row count: 400
loaded from: c:\Users\Admin\Desktop\SMU\CS611\cs611-assignment-1\datamart\silver\silver_feature_attributes_2023-03-01.csv row count: 489
loaded from: c:\Users\Admin\Desktop\SMU\CS611\cs611-assignment-1\datamart\silver\silver_feature_clickstream_2023-03-01.csv row count: 8974


 12%|█▎        | 3/24 [00:03<00:26,  1.24s/it]

Final Row Count :  388
loaded from: c:\Users\Admin\Desktop\SMU\CS611\cs611-assignment-1\datamart\silver\silver_lms_loan_daily_2023-04-01.csv row count: 2047
loaded from: c:\Users\Admin\Desktop\SMU\CS611\cs611-assignment-1\datamart\silver\silver_feature_financials_2023-04-01.csv row count: 407
loaded from: c:\Users\Admin\Desktop\SMU\CS611\cs611-assignment-1\datamart\silver\silver_feature_attributes_2023-04-01.csv row count: 494
loaded from: c:\Users\Admin\Desktop\SMU\CS611\cs611-assignment-1\datamart\silver\silver_feature_clickstream_2023-04-01.csv row count: 8974


 17%|█▋        | 4/24 [00:05<00:24,  1.24s/it]

Final Row Count :  392
loaded from: c:\Users\Admin\Desktop\SMU\CS611\cs611-assignment-1\datamart\silver\silver_lms_loan_daily_2023-05-01.csv row count: 2568
loaded from: c:\Users\Admin\Desktop\SMU\CS611\cs611-assignment-1\datamart\silver\silver_feature_financials_2023-05-01.csv row count: 417
loaded from: c:\Users\Admin\Desktop\SMU\CS611\cs611-assignment-1\datamart\silver\silver_feature_attributes_2023-05-01.csv row count: 507
loaded from: c:\Users\Admin\Desktop\SMU\CS611\cs611-assignment-1\datamart\silver\silver_feature_clickstream_2023-05-01.csv row count: 8974


 21%|██        | 5/24 [00:06<00:22,  1.19s/it]

Final Row Count :  407
loaded from: c:\Users\Admin\Desktop\SMU\CS611\cs611-assignment-1\datamart\silver\silver_lms_loan_daily_2023-06-01.csv row count: 3085
loaded from: c:\Users\Admin\Desktop\SMU\CS611\cs611-assignment-1\datamart\silver\silver_feature_financials_2023-06-01.csv row count: 413
loaded from: c:\Users\Admin\Desktop\SMU\CS611\cs611-assignment-1\datamart\silver\silver_feature_attributes_2023-06-01.csv row count: 501
loaded from: c:\Users\Admin\Desktop\SMU\CS611\cs611-assignment-1\datamart\silver\silver_feature_clickstream_2023-06-01.csv row count: 8974


 25%|██▌       | 6/24 [00:07<00:21,  1.21s/it]

Final Row Count :  399
loaded from: c:\Users\Admin\Desktop\SMU\CS611\cs611-assignment-1\datamart\silver\silver_lms_loan_daily_2023-07-01.csv row count: 3556
loaded from: c:\Users\Admin\Desktop\SMU\CS611\cs611-assignment-1\datamart\silver\silver_feature_financials_2023-07-01.csv row count: 379
loaded from: c:\Users\Admin\Desktop\SMU\CS611\cs611-assignment-1\datamart\silver\silver_feature_attributes_2023-07-01.csv row count: 462
loaded from: c:\Users\Admin\Desktop\SMU\CS611\cs611-assignment-1\datamart\silver\silver_feature_clickstream_2023-07-01.csv row count: 8974


 29%|██▉       | 7/24 [00:08<00:22,  1.31s/it]

Final Row Count :  373
loaded from: c:\Users\Admin\Desktop\SMU\CS611\cs611-assignment-1\datamart\silver\silver_lms_loan_daily_2023-08-01.csv row count: 4037
loaded from: c:\Users\Admin\Desktop\SMU\CS611\cs611-assignment-1\datamart\silver\silver_feature_financials_2023-08-01.csv row count: 398
loaded from: c:\Users\Admin\Desktop\SMU\CS611\cs611-assignment-1\datamart\silver\silver_feature_attributes_2023-08-01.csv row count: 467
loaded from: c:\Users\Admin\Desktop\SMU\CS611\cs611-assignment-1\datamart\silver\silver_feature_clickstream_2023-08-01.csv row count: 8974


 33%|███▎      | 8/24 [00:10<00:21,  1.31s/it]

Final Row Count :  385
loaded from: c:\Users\Admin\Desktop\SMU\CS611\cs611-assignment-1\datamart\silver\silver_lms_loan_daily_2023-09-01.csv row count: 4491
loaded from: c:\Users\Admin\Desktop\SMU\CS611\cs611-assignment-1\datamart\silver\silver_feature_financials_2023-09-01.csv row count: 363
loaded from: c:\Users\Admin\Desktop\SMU\CS611\cs611-assignment-1\datamart\silver\silver_feature_attributes_2023-09-01.csv row count: 443
loaded from: c:\Users\Admin\Desktop\SMU\CS611\cs611-assignment-1\datamart\silver\silver_feature_clickstream_2023-09-01.csv row count: 8974


 38%|███▊      | 9/24 [00:11<00:19,  1.32s/it]

Final Row Count :  355
loaded from: c:\Users\Admin\Desktop\SMU\CS611\cs611-assignment-1\datamart\silver\silver_lms_loan_daily_2023-10-01.csv row count: 4978
loaded from: c:\Users\Admin\Desktop\SMU\CS611\cs611-assignment-1\datamart\silver\silver_feature_financials_2023-10-01.csv row count: 385
loaded from: c:\Users\Admin\Desktop\SMU\CS611\cs611-assignment-1\datamart\silver\silver_feature_attributes_2023-10-01.csv row count: 475
loaded from: c:\Users\Admin\Desktop\SMU\CS611\cs611-assignment-1\datamart\silver\silver_feature_clickstream_2023-10-01.csv row count: 8974


 42%|████▏     | 10/24 [00:12<00:18,  1.31s/it]

Final Row Count :  376
loaded from: c:\Users\Admin\Desktop\SMU\CS611\cs611-assignment-1\datamart\silver\silver_lms_loan_daily_2023-11-01.csv row count: 5469
loaded from: c:\Users\Admin\Desktop\SMU\CS611\cs611-assignment-1\datamart\silver\silver_feature_financials_2023-11-01.csv row count: 393
loaded from: c:\Users\Admin\Desktop\SMU\CS611\cs611-assignment-1\datamart\silver\silver_feature_attributes_2023-11-01.csv row count: 480
loaded from: c:\Users\Admin\Desktop\SMU\CS611\cs611-assignment-1\datamart\silver\silver_feature_clickstream_2023-11-01.csv row count: 8974


 46%|████▌     | 11/24 [00:14<00:16,  1.30s/it]

Final Row Count :  384
loaded from: c:\Users\Admin\Desktop\SMU\CS611\cs611-assignment-1\datamart\silver\silver_lms_loan_daily_2023-12-01.csv row count: 5428
loaded from: c:\Users\Admin\Desktop\SMU\CS611\cs611-assignment-1\datamart\silver\silver_feature_financials_2023-12-01.csv row count: 394
loaded from: c:\Users\Admin\Desktop\SMU\CS611\cs611-assignment-1\datamart\silver\silver_feature_attributes_2023-12-01.csv row count: 477
loaded from: c:\Users\Admin\Desktop\SMU\CS611\cs611-assignment-1\datamart\silver\silver_feature_clickstream_2023-12-01.csv row count: 8974


 50%|█████     | 12/24 [00:15<00:15,  1.27s/it]

Final Row Count :  385
loaded from: c:\Users\Admin\Desktop\SMU\CS611\cs611-assignment-1\datamart\silver\silver_lms_loan_daily_2024-01-01.csv row count: 5412
loaded from: c:\Users\Admin\Desktop\SMU\CS611\cs611-assignment-1\datamart\silver\silver_feature_financials_2024-01-01.csv row count: 394
loaded from: c:\Users\Admin\Desktop\SMU\CS611\cs611-assignment-1\datamart\silver\silver_feature_attributes_2024-01-01.csv row count: 476
loaded from: c:\Users\Admin\Desktop\SMU\CS611\cs611-assignment-1\datamart\silver\silver_feature_clickstream_2024-01-01.csv row count: 8974


 54%|█████▍    | 13/24 [00:16<00:14,  1.29s/it]

Final Row Count :  388
loaded from: c:\Users\Admin\Desktop\SMU\CS611\cs611-assignment-1\datamart\silver\silver_lms_loan_daily_2024-02-01.csv row count: 5424
loaded from: c:\Users\Admin\Desktop\SMU\CS611\cs611-assignment-1\datamart\silver\silver_feature_financials_2024-02-01.csv row count: 417
loaded from: c:\Users\Admin\Desktop\SMU\CS611\cs611-assignment-1\datamart\silver\silver_feature_attributes_2024-02-01.csv row count: 507
loaded from: c:\Users\Admin\Desktop\SMU\CS611\cs611-assignment-1\datamart\silver\silver_feature_clickstream_2024-02-01.csv row count: 8974


 58%|█████▊    | 14/24 [00:18<00:13,  1.34s/it]

Final Row Count :  409
loaded from: c:\Users\Admin\Desktop\SMU\CS611\cs611-assignment-1\datamart\silver\silver_lms_loan_daily_2024-03-01.csv row count: 5425
loaded from: c:\Users\Admin\Desktop\SMU\CS611\cs611-assignment-1\datamart\silver\silver_feature_financials_2024-03-01.csv row count: 416
loaded from: c:\Users\Admin\Desktop\SMU\CS611\cs611-assignment-1\datamart\silver\silver_feature_attributes_2024-03-01.csv row count: 504
loaded from: c:\Users\Admin\Desktop\SMU\CS611\cs611-assignment-1\datamart\silver\silver_feature_clickstream_2024-03-01.csv row count: 8974


 62%|██████▎   | 15/24 [00:19<00:11,  1.32s/it]

Final Row Count :  412
loaded from: c:\Users\Admin\Desktop\SMU\CS611\cs611-assignment-1\datamart\silver\silver_lms_loan_daily_2024-04-01.csv row count: 5417
loaded from: c:\Users\Admin\Desktop\SMU\CS611\cs611-assignment-1\datamart\silver\silver_feature_financials_2024-04-01.csv row count: 417
loaded from: c:\Users\Admin\Desktop\SMU\CS611\cs611-assignment-1\datamart\silver\silver_feature_attributes_2024-04-01.csv row count: 500
loaded from: c:\Users\Admin\Desktop\SMU\CS611\cs611-assignment-1\datamart\silver\silver_feature_clickstream_2024-04-01.csv row count: 8974


 67%|██████▋   | 16/24 [00:20<00:10,  1.33s/it]

Final Row Count :  405
loaded from: c:\Users\Admin\Desktop\SMU\CS611\cs611-assignment-1\datamart\silver\silver_lms_loan_daily_2024-05-01.csv row count: 5391
loaded from: c:\Users\Admin\Desktop\SMU\CS611\cs611-assignment-1\datamart\silver\silver_feature_financials_2024-05-01.csv row count: 395
loaded from: c:\Users\Admin\Desktop\SMU\CS611\cs611-assignment-1\datamart\silver\silver_feature_attributes_2024-05-01.csv row count: 478
loaded from: c:\Users\Admin\Desktop\SMU\CS611\cs611-assignment-1\datamart\silver\silver_feature_clickstream_2024-05-01.csv row count: 8974


 71%|███████   | 17/24 [00:22<00:10,  1.53s/it]

Final Row Count :  384
loaded from: c:\Users\Admin\Desktop\SMU\CS611\cs611-assignment-1\datamart\silver\silver_lms_loan_daily_2024-06-01.csv row count: 5418
loaded from: c:\Users\Admin\Desktop\SMU\CS611\cs611-assignment-1\datamart\silver\silver_feature_financials_2024-06-01.csv row count: 407
loaded from: c:\Users\Admin\Desktop\SMU\CS611\cs611-assignment-1\datamart\silver\silver_feature_attributes_2024-06-01.csv row count: 481
loaded from: c:\Users\Admin\Desktop\SMU\CS611\cs611-assignment-1\datamart\silver\silver_feature_clickstream_2024-06-01.csv row count: 8974


 75%|███████▌  | 18/24 [00:24<00:09,  1.57s/it]

Final Row Count :  393
loaded from: c:\Users\Admin\Desktop\SMU\CS611\cs611-assignment-1\datamart\silver\silver_lms_loan_daily_2024-07-01.csv row count: 5442
loaded from: c:\Users\Admin\Desktop\SMU\CS611\cs611-assignment-1\datamart\silver\silver_feature_financials_2024-07-01.csv row count: 411
loaded from: c:\Users\Admin\Desktop\SMU\CS611\cs611-assignment-1\datamart\silver\silver_feature_attributes_2024-07-01.csv row count: 495
loaded from: c:\Users\Admin\Desktop\SMU\CS611\cs611-assignment-1\datamart\silver\silver_feature_clickstream_2024-07-01.csv row count: 8974


 79%|███████▉  | 19/24 [00:25<00:07,  1.50s/it]

Final Row Count :  0
loaded from: c:\Users\Admin\Desktop\SMU\CS611\cs611-assignment-1\datamart\silver\silver_lms_loan_daily_2024-08-01.csv row count: 5531
loaded from: c:\Users\Admin\Desktop\SMU\CS611\cs611-assignment-1\datamart\silver\silver_feature_financials_2024-08-01.csv row count: 425
loaded from: c:\Users\Admin\Desktop\SMU\CS611\cs611-assignment-1\datamart\silver\silver_feature_attributes_2024-08-01.csv row count: 531
loaded from: c:\Users\Admin\Desktop\SMU\CS611\cs611-assignment-1\datamart\silver\silver_feature_clickstream_2024-08-01.csv row count: 8974


 83%|████████▎ | 20/24 [00:27<00:06,  1.51s/it]

Final Row Count :  0
loaded from: c:\Users\Admin\Desktop\SMU\CS611\cs611-assignment-1\datamart\silver\silver_lms_loan_daily_2024-09-01.csv row count: 5537
loaded from: c:\Users\Admin\Desktop\SMU\CS611\cs611-assignment-1\datamart\silver\silver_feature_financials_2024-09-01.csv row count: 398
loaded from: c:\Users\Admin\Desktop\SMU\CS611\cs611-assignment-1\datamart\silver\silver_feature_attributes_2024-09-01.csv row count: 482
loaded from: c:\Users\Admin\Desktop\SMU\CS611\cs611-assignment-1\datamart\silver\silver_feature_clickstream_2024-09-01.csv row count: 8974


 88%|████████▊ | 21/24 [00:28<00:04,  1.47s/it]

Final Row Count :  0
loaded from: c:\Users\Admin\Desktop\SMU\CS611\cs611-assignment-1\datamart\silver\silver_lms_loan_daily_2024-10-01.csv row count: 5502
loaded from: c:\Users\Admin\Desktop\SMU\CS611\cs611-assignment-1\datamart\silver\silver_feature_financials_2024-10-01.csv row count: 379
loaded from: c:\Users\Admin\Desktop\SMU\CS611\cs611-assignment-1\datamart\silver\silver_feature_attributes_2024-10-01.csv row count: 445
loaded from: c:\Users\Admin\Desktop\SMU\CS611\cs611-assignment-1\datamart\silver\silver_feature_clickstream_2024-10-01.csv row count: 8974


 92%|█████████▏| 22/24 [00:30<00:02,  1.48s/it]

Final Row Count :  0
loaded from: c:\Users\Admin\Desktop\SMU\CS611\cs611-assignment-1\datamart\silver\silver_lms_loan_daily_2024-11-01.csv row count: 5501
loaded from: c:\Users\Admin\Desktop\SMU\CS611\cs611-assignment-1\datamart\silver\silver_feature_financials_2024-11-01.csv row count: 392
loaded from: c:\Users\Admin\Desktop\SMU\CS611\cs611-assignment-1\datamart\silver\silver_feature_attributes_2024-11-01.csv row count: 473
loaded from: c:\Users\Admin\Desktop\SMU\CS611\cs611-assignment-1\datamart\silver\silver_feature_clickstream_2024-11-01.csv row count: 8974


 96%|█████████▌| 23/24 [00:31<00:01,  1.46s/it]

Final Row Count :  0
loaded from: c:\Users\Admin\Desktop\SMU\CS611\cs611-assignment-1\datamart\silver\silver_lms_loan_daily_2024-12-01.csv row count: 5531
loaded from: c:\Users\Admin\Desktop\SMU\CS611\cs611-assignment-1\datamart\silver\silver_feature_financials_2024-12-01.csv row count: 417
loaded from: c:\Users\Admin\Desktop\SMU\CS611\cs611-assignment-1\datamart\silver\silver_feature_attributes_2024-12-01.csv row count: 500
loaded from: c:\Users\Admin\Desktop\SMU\CS611\cs611-assignment-1\datamart\silver\silver_feature_clickstream_2024-12-01.csv row count: 8974


100%|██████████| 24/24 [00:32<00:00,  1.37s/it]

Final Row Count :  0





labels saved to :  c:\Users\Admin\Desktop\SMU\CS611\cs611-assignment-1\datamart\gold\snapdate_2025-05-20_gold_label_store2023-01-01to2024-12-01.csv  row count :  8476
saved to : c:\Users\Admin\Desktop\SMU\CS611\cs611-assignment-1\datamart\gold\snapdate_2025-05-20_gold_feature_store2023-01-01to2024-12-01.csv, row count : 7006


In [19]:
# Peek at the first label row.
# NOTE(review): the recorded output below shows label_def '30dpd_6mob', but
# data_prep_gold in this notebook builds labels with dpd=60, mob=7, so the
# saved output looks stale. Re-run from a fresh kernel to confirm.
sample_label.head()

Row(loan_id='CUS_0x10dd_2024_06_01', Customer_ID='CUS_0x10dd', label=0, label_def='30dpd_6mob', snapshot_date=datetime.date(2024, 12, 1))

In [22]:
# Peek at the first feature row.
# NOTE(review): the recorded output is None although the gold run above
# reported 7006 feature rows; presumably this output is stale or came from a
# different kernel state. Re-run from a fresh kernel to confirm.
print(sample_features.head())

None
