In [1]:
import os
import shutil
import argparse
from datetime import datetime
import numpy as np
import pandas as pd
from collections import Counter

import pyspark
from pyspark.sql import SparkSession

from utils.data_preprocessing_bronze_table import *
from utils.data_preprocessing_silver_table import *
from utils.helper import *


# All paths are resolved from the kernel's working directory, so the notebook
# must be started from the project folder for "data" and "datamart" to line up.
current_directory = os.path.abspath(os.getcwd())
print(current_directory)

# Folder holding the raw input CSV files.
csv_dir = os.path.join(current_directory, "data")

# Datamart root and its bronze / silver / gold layer folders.
data_mart_dir = os.path.join(current_directory, "datamart")
bronze_dir, silver_dir, gold_dir = (
    os.path.join(data_mart_dir, layer_name)
    for layer_name in ("bronze", "silver", "gold")
)



c:\Users\Admin\Desktop\SMU\CS611\cs611-assignment-1


In [2]:
# Run this cell to rebuild the datamart from scratch.
# Any existing datamart folder (and everything inside it) is deleted first,
# then the root and its three layer folders are recreated empty.
if os.path.exists(data_mart_dir):
    shutil.rmtree(data_mart_dir)

# The root must be created before the layer folders nested inside it.
for fresh_dir in (data_mart_dir, bronze_dir, silver_dir, gold_dir):
    os.mkdir(fresh_dir)

In [3]:
def data_prep_bronze(start_date, end_date, spark : SparkSession):
    """Run the bronze-table job for every source CSV and snapshot date.

    For each CSV file found in ``csv_dir`` and each date string returned by
    ``generate_first_of_month_dates(start_date, end_date)``, this calls
    ``prepare_bronze_table_daily`` with the CSV path, ``bronze_dir``, the
    Spark session and the date string.

    Args:
        start_date: Start of the date range, forwarded unchanged to
            ``generate_first_of_month_dates`` (the notebook passes
            ``"YYYY-MM-DD"`` strings).
        end_date: End of the date range, forwarded the same way.
        spark: Spark session handed to ``prepare_bronze_table_daily``.
    """
    print('\n\n---starting Bronze Table job---\n\n')

    # Snapshot dates to process for every source file.
    dates_str_list = generate_first_of_month_dates(start_date, end_date)

    # os.listdir returns *every* entry in arbitrary order. Keep only regular
    # ".csv" files so stray entries (hidden files, checkpoint folders, ...)
    # are never handed to the bronze writer, and sort so the run order is the
    # same on every machine.
    csv_files = sorted(
        entry for entry in os.listdir(csv_dir)
        if entry.lower().endswith(".csv")
        and os.path.isfile(os.path.join(csv_dir, entry))
    )

    for csv_file in csv_files:
        csv_full_dir = os.path.join(csv_dir, csv_file)
        for date_str in dates_str_list:
            print("Preparing bronze table {}".format(csv_file))
            prepare_bronze_table_daily(csv_full_dir, bronze_dir, spark, date_str)

In [4]:
def data_prep_silver(start_date, end_date, spark : SparkSession):
    """Run the silver-table job for every snapshot date.

    For each date string returned by
    ``generate_first_of_month_dates(start_date, end_date)``, the four silver
    processors are called in a fixed order (loan daily, financials,
    attributes, clickstream). Each one receives the path of its expected
    bronze CSV inside ``bronze_dir``, plus ``silver_dir``, the date string
    and the Spark session.

    Args:
        start_date: Start of the date range, forwarded unchanged to
            ``generate_first_of_month_dates``.
        end_date: End of the date range, forwarded the same way.
        spark: Spark session handed to every silver processor.
    """
    print('\n\n---starting Silver table job---\n\n')

    snapshot_dates = generate_first_of_month_dates(start_date, end_date)

    # (bronze file-name prefix, silver processor) pairs, listed in run order.
    # The prefixes must match the bronze file names on disk exactly; note that
    # they are not uniform ("features_" vs "feature_").
    silver_jobs = (
        ("bronze_lms_loan_daily_", process_silver_table_loan_daily),
        ("bronze_features_financial_", process_silver_table_feature_financials),
        ("bronze_features_attribute_", process_silver_table_features_attributes),
        ("bronze_feature_clickstream_", process_silver_table_features_clickstream),
    )

    for snapshot in snapshot_dates:
        for bronze_prefix, build_silver in silver_jobs:
            bronze_csv_path = os.path.join(bronze_dir, bronze_prefix + snapshot + ".csv")
            build_silver(bronze_csv_path, silver_dir, snapshot, spark)

In [5]:
# Create (or reuse) the notebook's Spark session: local master using all
# available cores, with 4g configured for both driver and executor memory.
spark = (
    SparkSession.builder
    .appName("dev")
    .master("local[*]")
    .config("spark.driver.memory", "4g")
    .config("spark.executor.memory", "4g")
    .getOrCreate()
)

# Only show ERROR-level Spark logs so warnings do not flood the cell output.
spark.sparkContext.setLogLevel("ERROR")

In [6]:
# Build the bronze tables for the 2023-01-01 .. 2024-12-01 date range.
data_prep_bronze(start_date="2023-01-01", end_date="2024-12-01", spark=spark)



---starting Bronze Table job---


Preparing bronze table features_attributes.csv
Row Count for Date 2023-01-01 00:00:00 : 530
Bronze features_attribute Daily Date 2023-01-01 00:00:00 saved to : c:\Users\Admin\Desktop\SMU\CS611\cs611-assignment-1\datamart\bronze\bronze_features_attribute_2023-01-01.csv
Preparing bronze table features_attributes.csv
Row Count for Date 2023-02-01 00:00:00 : 501
Bronze features_attribute Daily Date 2023-02-01 00:00:00 saved to : c:\Users\Admin\Desktop\SMU\CS611\cs611-assignment-1\datamart\bronze\bronze_features_attribute_2023-02-01.csv
Preparing bronze table features_attributes.csv
Row Count for Date 2023-03-01 00:00:00 : 506
Bronze features_attribute Daily Date 2023-03-01 00:00:00 saved to : c:\Users\Admin\Desktop\SMU\CS611\cs611-assignment-1\datamart\bronze\bronze_features_attribute_2023-03-01.csv
Preparing bronze table features_attributes.csv
Row Count for Date 2023-04-01 00:00:00 : 510
Bronze features_attribute Daily Date 2023-04-01 00:00:00 saved to

In [7]:
# Build the silver tables over the same date range as the bronze job.
data_prep_silver(start_date="2023-01-01", end_date="2024-12-01", spark=spark)



---starting Silver table job---


Loaded c:\Users\Admin\Desktop\SMU\CS611\cs611-assignment-1\datamart\bronze\bronze_lms_loan_daily_2023-01-01.csv, row count 530
Saving File c:\Users\Admin\Desktop\SMU\CS611\cs611-assignment-1\datamart\silver\silver_lms_loan_daily_2023-01-01.csv row count 530
Loaded c:\Users\Admin\Desktop\SMU\CS611\cs611-assignment-1\datamart\bronze\bronze_features_financial_2023-01-01.csv, row count 530
Saving file : c:\Users\Admin\Desktop\SMU\CS611\cs611-assignment-1\datamart\silver\silver_feature_financals_2023-01-01.csv row count 404
Loaded c:\Users\Admin\Desktop\SMU\CS611\cs611-assignment-1\datamart\bronze\bronze_features_attribute_2023-01-01.csv, row count Customer_ID      530
Name             530
Age              530
SSN              530
Occupation       530
snapshot_date    530
dtype: int64
Saving file : c:\Users\Admin\Desktop\SMU\CS611\cs611-assignment-1\datamart\silver\silver_feature_attributes_2023-01-01.csv row count 530
Loaded c:\Users\Admin\Desktop\SMU\CS

In [None]:
# sample_dir = os.path.join(current_directory, "datamart", "bronze", "bronze_features_financial_2023-01-01.csv")

# sample_df = process_silver_table_feature_financials(sample_dir,
#                                         silver_dir,
#                                         "2023-01-01",
#                                         spark)
# sample_df

Loaded c:\Users\Admin\Desktop\SMU\CS611\cs611-assignment-1\datamart\bronze\bronze_features_financial_2023-01-01.csv, row count 530


Unnamed: 0,Customer_ID,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Num_of_Loan,Type_of_Loan,Delay_from_due_date,Num_of_Delayed_Payment,...,snapshot_date,Auto Loan,Student Loan,Credit-Builder Loan,Payday Loan,Personal Loan,Home Equity Loan,Mortgage Loan,Debt Consolidation Loan,Not Specified
0,CUS_0x1037,15989.085,1086.423750,5,4,2.0,4,"Credit-Builder Loan, Auto Loan, Auto Loan, and...",13,15,...,2023-01-01,2,0,1,0,0,0,1,0,0
1,CUS_0x1069,58637.340,4799.445000,4,6,10.0,3,"Personal Loan, Auto Loan, and Not Specified",9,17,...,2023-01-01,1,0,0,0,1,0,0,0,1
3,CUS_0x1184,19867.475,1396.622917,3,5,11.0,3,"Student Loan, Mortgage Loan, and Payday Loan",10,9,...,2023-01-01,0,1,0,1,0,0,1,0,0
4,CUS_0x1297,57738.060,4881.505000,9,8,30.0,9,"Payday Loan, Personal Loan, Payday Loan, Perso...",61,24,...,2023-01-01,0,1,1,3,2,1,1,0,0
5,CUS_0x12fb,26342.910,1949.242500,6,7,7.0,1,Credit-Builder Loan,23,14,...,2023-01-01,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
524,CUS_0xe84,18521.710,1613.056385,10,7,19.0,9,"Payday Loan, Home Equity Loan, Home Equity Loa...",24,22,...,2023-01-01,1,0,1,3,0,2,1,1,0
526,CUS_0xea6,42772.490,3664.374167,4,4,2.0,4,"Personal Loan, Debt Consolidation Loan, Person...",13,10,...,2023-01-01,0,0,0,0,2,0,0,2,0
527,CUS_0xed3,36035.320,2910.943333,9,10,22.0,4,"Credit-Builder Loan, Auto Loan, Home Equity Lo...",33,17,...,2023-01-01,1,0,1,0,0,1,0,1,0
528,CUS_0xed8,15847.060,1560.588333,4,4,8.0,6,"Auto Loan, Payday Loan, Debt Consolidation Loa...",26,11,...,2023-01-01,1,2,0,1,0,1,0,1,0


In [None]:
# NOTE(review): this repeats the earlier silver run with identical arguments;
# presumably kept for re-running after edits to the silver helpers. Confirm it
# is still needed.
data_prep_silver(start_date="2023-01-01", end_date="2024-12-01", spark=spark)