In [1]:
import os
import shutil
import argparse
from datetime import datetime

import pyspark
from pyspark.sql import SparkSession

from utils.data_preprocessing_bronze_table import *
from utils.data_preprocessing_silver_table import *
from utils.helper import *


# Project root: the parent of "<cwd>/cs611-assignment-1", i.e. the absolute
# form of the directory the kernel is currently running in.
current_directory = os.path.dirname(
    os.path.abspath(os.path.join(os.getcwd(), "cs611-assignment-1"))
)
csv_dir = os.path.join(current_directory, "data")
print(current_directory)

# Datamart layout: a "datamart" folder holding one sub-folder per
# bronze / silver / gold layer.
data_mart_dir = os.path.join(current_directory, "datamart")
bronze_dir, silver_dir, gold_dir = (
    os.path.join(data_mart_dir, layer) for layer in ("bronze", "silver", "gold")
)

# Start every run from an empty datamart: drop whatever a previous run left
# behind, then recreate the directory tree (parent first).
if os.path.exists(data_mart_dir):
    shutil.rmtree(data_mart_dir)

for _fresh_dir in (data_mart_dir, bronze_dir, silver_dir, gold_dir):
    os.mkdir(_fresh_dir)

/home/kieren/SMU_MITB/CS611/cs611-assignment-1


In [2]:
def data_prep_bronze(start_date, end_date, spark : SparkSession):
    """Run the bronze-table job for every source CSV and every snapshot date.

    For each ``.csv`` file found directly inside ``csv_dir`` and each date
    string returned by ``generate_first_of_month_dates(start_date, end_date)``,
    ``prepare_bronze_table_daily`` is called with the CSV path, ``bronze_dir``,
    the Spark session and the date string.

    Args:
        start_date: Start of the range, forwarded to
            ``generate_first_of_month_dates``.
        end_date: End of the range, forwarded to
            ``generate_first_of_month_dates``.
        spark: Spark session handed through to ``prepare_bronze_table_daily``.
    """
    print('\n\n---starting Bronze Table job---\n\n')

    # Get all the datetimes
    dates_str_list = generate_first_of_month_dates(start_date, end_date)

    # os.listdir returns every entry in arbitrary order, including
    # sub-directories and stray files such as .ipynb_checkpoints. Keep only
    # regular .csv files, sorted so repeated runs process them in the same order.
    csv_files = sorted(
        entry
        for entry in os.listdir(csv_dir)
        if entry.endswith(".csv") and os.path.isfile(os.path.join(csv_dir, entry))
    )

    for csv_file in csv_files:
        csv_full_dir = os.path.join(csv_dir, csv_file)
        for date_str in dates_str_list:
            print("Preparing bronze table {}".format(csv_file))
            prepare_bronze_table_daily(csv_full_dir, bronze_dir, spark, date_str)

In [7]:
def data_prep_silver(start_date, end_date, spark : SparkSession):
    """Run the silver-table job over the bronze files of each snapshot date.

    For every date string returned by
    ``generate_first_of_month_dates(start_date, end_date)`` the loan-daily
    bronze file is processed first, then the financial-features bronze file.
    Both processors receive the bronze file path, ``silver_dir``, the date
    string and the Spark session.

    Args:
        start_date: Start of the range, forwarded to
            ``generate_first_of_month_dates``.
        end_date: End of the range, forwarded to
            ``generate_first_of_month_dates``.
        spark: Spark session handed through to the silver processors.
    """
    print('\n\n---starting Silver table job---\n\n')

    def bronze_path(prefix, snapshot):
        # Bronze files are addressed as "<prefix><date>.csv" inside bronze_dir.
        return os.path.join(bronze_dir, prefix + snapshot + ".csv")

    for snapshot in generate_first_of_month_dates(start_date, end_date):
        loan_source = bronze_path("bronze_lms_loan_daily_", snapshot)
        process_silver_table_loan_daily(loan_source, silver_dir, snapshot, spark)

        financial_source = bronze_path("bronze_features_financial_", snapshot)
        process_silver_table_feature_financials(financial_source, silver_dir, snapshot, spark)

In [8]:
# Create the Spark session for this notebook, or reuse one that is already
# running: app name "dev", local master using all available cores.
spark = (
    pyspark.sql.SparkSession.builder
    .appName("dev")
    .master("local[*]")
    .getOrCreate()
)

# Set log level to ERROR to hide warnings
spark.sparkContext.setLogLevel("ERROR")

In [5]:
# Run the bronze job with start date 2023-01-01 and end date 2024-12-01,
# using the Spark session created above.
data_prep_bronze("2023-01-01", "2024-12-01", spark)



---starting Bronze Table job---


Preparing bronze table features_financials.csv
Row Count for Date 2023-01-01 00:00:00 : 530
Bronze features_financial Daily Date 2023-01-01 00:00:00 saved to : /home/kieren/SMU_MITB/CS611/cs611-assignment-1/datamart/bronze/bronze_features_financial_2023-01-01.csv
Preparing bronze table features_financials.csv
Row Count for Date 2023-02-01 00:00:00 : 501
Bronze features_financial Daily Date 2023-02-01 00:00:00 saved to : /home/kieren/SMU_MITB/CS611/cs611-assignment-1/datamart/bronze/bronze_features_financial_2023-02-01.csv
Preparing bronze table features_financials.csv
Row Count for Date 2023-03-01 00:00:00 : 506
Bronze features_financial Daily Date 2023-03-01 00:00:00 saved to : /home/kieren/SMU_MITB/CS611/cs611-assignment-1/datamart/bronze/bronze_features_financial_2023-03-01.csv
Preparing bronze table features_financials.csv
Row Count for Date 2023-04-01 00:00:00 : 510
Bronze features_financial Daily Date 2023-04-01 00:00:00 saved to : /home/kieren

In [9]:
# Run the silver job over the same start/end dates as the bronze job.
# NOTE(review): the recorded run of this cell fails on the first
# financial-features file: parse_float in utils/data_preprocessing_silver_table.py
# passes a float to re.search (TypeError). Presumably a null/NaN or already
# numeric value reaches the UDF. TODO confirm and guard non-string input there.
data_prep_silver("2023-01-01", "2024-12-01", spark)



---starting Silver table job---


Loaded /home/kieren/SMU_MITB/CS611/cs611-assignment-1/datamart/bronze/bronze_lms_loan_daily_2023-01-01.csv, row count 530
saved to: /home/kieren/SMU_MITB/CS611/cs611-assignment-1/datamart/silver/silver_lms_loan_daily_2023-01-01.csv
Loaded /home/kieren/SMU_MITB/CS611/cs611-assignment-1/datamart/bronze/bronze_features_financial_2023-01-01.csv, row count 530


25/05/15 14:42:54 ERROR PythonUDFRunner: Python worker exited unexpectedly (crashed)
org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/home/kieren/pytorch_env/lib/python3.12/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 1225, in main
    eval_type = read_int(infile)
                ^^^^^^^^^^^^^^^^
  File "/home/kieren/pytorch_env/lib/python3.12/site-packages/pyspark/python/lib/pyspark.zip/pyspark/serializers.py", line 596, in read_int
    raise EOFError
EOFError

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:572)
	at org.apache.spark.sql.execution.python.BasePythonUDFRunner$$anon$1.read(PythonUDFRunner.scala:94)
	at org.apache.spark.sql.execution.python.BasePythonUDFRunner$$anon$1.read(PythonUDFRunner.scala:75)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:525)
	at org.apache.spark.InterruptibleIterator.hasNext(Interrupt

PythonException: 
  An exception was thrown from the Python worker. Please see the stack trace below.
Traceback (most recent call last):
  File "/home/kieren/SMU_MITB/CS611/cs611-assignment-1/utils/data_preprocessing_silver_table.py", line 33, in parse_float
    match = re.search(r"[-+]?\d*\.\d+|\d+", s)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.12/re/__init__.py", line 177, in search
    return _compile(pattern, flags).search(string)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
TypeError: expected string or bytes-like object, got 'float'
