In [5]:
from pyspark.sql import SparkSession, Window, Row, Column, functions as fn
import psutil
from typing import List, Dict, Union, Tuple
import pyspark

# Part 1: Loading the dataset

Simply load the dataset from the parquet format given in the google drive above
- Load the dataset.
- Preview first 20 rows.
- How many partitions is this dataframe split into?
- Change partitions to be equal to the number of your logical cores

In [6]:
# Initialize Spark session
spark = SparkSession.builder.appName("M3").getOrCreate()
#.config("spark.jars", "/opt/spark/jars/postgresql-42.7.3.jar").master("local")
# spark.conf.set("spark.sql.execution.arrow.enabled", 'true')
# spark.config("spark.memory.offHeap.enabled","true") 
# spark.config("spark.memory.offHeap.size","10g")
# spark.conf.set("spark.executor.memory", "2g")
# spark context to interact with the driver
sc = spark.sparkContext

In [7]:
import os
os.getcwd()

'/home/jovyan'

In [8]:
dataset_path = './work/fintech_data_29_52_1008.parquet'
fintech_df = spark.read.parquet(dataset_path)


In [9]:
fintech_df.show(20)

+--------------------+--------------------+----------+--------------+----------+----------------+-------------------+--------+----------+-----------+-----------+-------+-----------+-----------+-----+-------------+----------+--------+-----+-----------------+----------+----------+------------------+--------------------+
|         Customer Id|           Emp Title|Emp Length|Home Ownership|Annual Inc|Annual Inc Joint|Verification Status|Zip Code|Addr State|Avg Cur Bal|Tot Cur Bal|Loan Id|Loan Status|Loan Amount|State|Funded Amount|      Term|Int Rate|Grade|       Issue Date|Pymnt Plan|      Type|           Purpose|         Description|
+--------------------+--------------------+----------+--------------+----------+----------------+-------------------+--------+----------+-----------+-----------+-------+-----------+-----------+-----+-------------+----------+--------+-----+-----------------+----------+----------+------------------+--------------------+
|YidceDkzalx4YmRce...|     president/own

In [10]:
print(fintech_df.rdd.getNumPartitions())

1


In [11]:
logical_cores = psutil.cpu_count(logical=True)  # Logical cores
physical_cores = psutil.cpu_count(logical=False)  # Physical cores

print(f"Logical cores: {logical_cores}")
print(f"Physical cores: {physical_cores}")

Logical cores: 24
Physical cores: 12


In [12]:
repartitioned_fintech_df = fintech_df.repartition(logical_cores)

In [13]:
repartitioned_fintech_df.rdd.getNumPartitions()

24

# Part 2: Cleaning


- Rename all columns (replacing a space with an underscore, and making it lowercase)
- Detect missing    
  -  Create a function that takes in the df and returns any data structrue of your choice(df/dict,list,tuple,etc) which has the name of the column and percentage of missing entries from the whole dataset.
  - Tip : storing the missing info as dict where the key is the column name and value is the percentage would be the easiest.
  - Prinout the missing info
- Handle missing
  - For numerical features replace with 0.
  - For categorical/strings replace with mode
- Check missing
  - Afterwards, check that there are no missing values

In [14]:
repartitioned_fintech_df.columns

['Customer Id',
 'Emp Title',
 'Emp Length',
 'Home Ownership',
 'Annual Inc',
 'Annual Inc Joint',
 'Verification Status',
 'Zip Code',
 'Addr State',
 'Avg Cur Bal',
 'Tot Cur Bal',
 'Loan Id',
 'Loan Status',
 'Loan Amount',
 'State',
 'Funded Amount',
 'Term',
 'Int Rate',
 'Grade',
 'Issue Date',
 'Pymnt Plan',
 'Type',
 'Purpose',
 'Description']

In [15]:
renamed_fintech_df = repartitioned_fintech_df.toDF(*[col.replace(" ", "_").lower() for col in repartitioned_fintech_df.columns])



In [16]:
renamed_fintech_df.columns

['customer_id',
 'emp_title',
 'emp_length',
 'home_ownership',
 'annual_inc',
 'annual_inc_joint',
 'verification_status',
 'zip_code',
 'addr_state',
 'avg_cur_bal',
 'tot_cur_bal',
 'loan_id',
 'loan_status',
 'loan_amount',
 'state',
 'funded_amount',
 'term',
 'int_rate',
 'grade',
 'issue_date',
 'pymnt_plan',
 'type',
 'purpose',
 'description']

In [17]:
renamed_fintech_df.show(20)

+--------------------+--------------------+----------+--------------+----------+----------------+-------------------+--------+----------+-----------+-----------+-------+---------------+-----------+-----+-------------+----------+--------+-----+-----------------+----------+----------+------------------+--------------------+
|         customer_id|           emp_title|emp_length|home_ownership|annual_inc|annual_inc_joint|verification_status|zip_code|addr_state|avg_cur_bal|tot_cur_bal|loan_id|    loan_status|loan_amount|state|funded_amount|      term|int_rate|grade|       issue_date|pymnt_plan|      type|           purpose|         description|
+--------------------+--------------------+----------+--------------+----------+----------------+-------------------+--------+----------+-----------+-----------+-------+---------------+-----------+-----+-------------+----------+--------+-----+-----------------+----------+----------+------------------+--------------------+
|YicgXHgxOVx4ODU+X...| barte

In [18]:
def detect_missing(df: pyspark.sql.dataframe.DataFrame) -> Dict[str, float]:
    """
    Detect missing values in a PySpark DataFrame and calculate the percentage of missing entries.
    
    Args:
    df (DataFrame): Input PySpark DataFrame.
    
    Returns:
    dict: Dictionary where keys are column names and values are percentages of missing values.
    """
    total_rows = df.count()
    missing_info = {}
    
    for column in df.columns:
        missing_count = df.filter(fn.col(column).isNull()).count()
        missing_percentage = (missing_count / total_rows) * 100
        missing_info[column] = missing_percentage
    
    return missing_info

In [19]:
missing_dict = detect_missing(renamed_fintech_df)

In [20]:
missing_dict

{'customer_id': 0.0,
 'emp_title': 8.612652608213097,
 'emp_length': 6.777654458009619,
 'home_ownership': 0.0,
 'annual_inc': 0.0,
 'annual_inc_joint': 93.11875693673696,
 'verification_status': 0.0,
 'zip_code': 0.0,
 'addr_state': 0.0,
 'avg_cur_bal': 0.0,
 'tot_cur_bal': 0.0,
 'loan_id': 0.0,
 'loan_status': 0.0,
 'loan_amount': 0.0,
 'state': 0.0,
 'funded_amount': 0.0,
 'term': 0.0,
 'int_rate': 4.384017758046615,
 'grade': 0.0,
 'issue_date': 0.0,
 'pymnt_plan': 0.0,
 'type': 0.0,
 'purpose': 0.0,
 'description': 0.8139104698483166}

In [21]:
renamed_fintech_df.printSchema()

root
 |-- customer_id: string (nullable = true)
 |-- emp_title: string (nullable = true)
 |-- emp_length: string (nullable = true)
 |-- home_ownership: string (nullable = true)
 |-- annual_inc: double (nullable = true)
 |-- annual_inc_joint: double (nullable = true)
 |-- verification_status: string (nullable = true)
 |-- zip_code: string (nullable = true)
 |-- addr_state: string (nullable = true)
 |-- avg_cur_bal: double (nullable = true)
 |-- tot_cur_bal: double (nullable = true)
 |-- loan_id: long (nullable = true)
 |-- loan_status: string (nullable = true)
 |-- loan_amount: double (nullable = true)
 |-- state: string (nullable = true)
 |-- funded_amount: double (nullable = true)
 |-- term: string (nullable = true)
 |-- int_rate: double (nullable = true)
 |-- grade: long (nullable = true)
 |-- issue_date: string (nullable = true)
 |-- pymnt_plan: boolean (nullable = true)
 |-- type: string (nullable = true)
 |-- purpose: string (nullable = true)
 |-- description: string (nullable = t

In [22]:
numerical_types = (pyspark.sql.types.DoubleType, pyspark.sql.types.FloatType, pyspark.sql.types.IntegerType, pyspark.sql.types.LongType)
numerical_columns = [field.name for field in renamed_fintech_df.schema.fields if isinstance(field.dataType, numerical_types)]
numerical_columns

['annual_inc',
 'annual_inc_joint',
 'avg_cur_bal',
 'tot_cur_bal',
 'loan_id',
 'loan_amount',
 'funded_amount',
 'int_rate',
 'grade']

In [23]:
from pyspark.sql import functions as F

def handle_missing_numerical(df: pyspark.sql.dataframe.DataFrame) -> pyspark.sql.dataframe.DataFrame:
    """
    Replace missing values in numerical columns with 0.
    
    Args:
    df (DataFrame): Input PySpark DataFrame.
    
    Returns:
    DataFrame: DataFrame with missing values in numerical columns replaced by 0.
    """
    missing_dict = detect_missing(df)
    numerical_types = (pyspark.sql.types.DoubleType, pyspark.sql.types.FloatType, pyspark.sql.types.IntegerType, pyspark.sql.types.LongType)
    numerical_columns = [field.name for field in df.schema.fields if isinstance(field.dataType, numerical_types)]
    for column in numerical_columns:
        if missing_dict[column] > 0:
            df = df.fillna({column: 0})
    return df


In [24]:
numerically_imputed_df = handle_missing_numerical(renamed_fintech_df)

In [25]:
missing_dict_updated = detect_missing(numerically_imputed_df)
missing_dict_updated

{'customer_id': 0.0,
 'emp_title': 8.612652608213097,
 'emp_length': 6.777654458009619,
 'home_ownership': 0.0,
 'annual_inc': 0.0,
 'annual_inc_joint': 0.0,
 'verification_status': 0.0,
 'zip_code': 0.0,
 'addr_state': 0.0,
 'avg_cur_bal': 0.0,
 'tot_cur_bal': 0.0,
 'loan_id': 0.0,
 'loan_status': 0.0,
 'loan_amount': 0.0,
 'state': 0.0,
 'funded_amount': 0.0,
 'term': 0.0,
 'int_rate': 0.0,
 'grade': 0.0,
 'issue_date': 0.0,
 'pymnt_plan': 0.0,
 'type': 0.0,
 'purpose': 0.0,
 'description': 0.8139104698483166}

In [26]:
def handle_missing_categorical(df: pyspark.sql.dataframe.DataFrame) -> pyspark.sql.dataframe.DataFrame:
    """
    Replace missing values in categorical/string columns with the mode (most frequent value).
    
    Args:
    df (DataFrame): Input PySpark DataFrame.
    
    Returns:
    DataFrame: DataFrame with missing values in categorical columns replaced by mode.
    """
    missing_dict = detect_missing(df)
    categorical_columns = [field.name for field in df.schema.fields if isinstance(field.dataType, pyspark.sql.types.StringType)]
    for column in categorical_columns:
        if missing_dict[column] > 0:
            mode_col = df.filter(fn.col(column).isNotNull()).groupBy(column).count().orderBy(fn.col('count').desc()).limit(1)
            mode_col.show()
            mode_value = mode_col.select(column).collect()[0][0]
            print(f"Mode value for {column}: {mode_value}")
            df = df.fillna(value= mode_value, subset=[column])
            print('-'* 50)
    return df


In [27]:
categorically_imputed_df = handle_missing_categorical(numerically_imputed_df)

+---------+-----+
|emp_title|count|
+---------+-----+
|  Teacher|  468|
+---------+-----+

Mode value for emp_title: Teacher
--------------------------------------------------
+----------+-----+
|emp_length|count|
+----------+-----+
| 10+ years| 8855|
+----------+-----+

Mode value for emp_length: 10+ years
--------------------------------------------------
+------------------+-----+
|       description|count|
+------------------+-----+
|Debt consolidation|14421|
+------------------+-----+

Mode value for description: Debt consolidation
--------------------------------------------------


In [28]:
missing_dict_updated = detect_missing(categorically_imputed_df)
missing_dict_updated

{'customer_id': 0.0,
 'emp_title': 0.0,
 'emp_length': 0.0,
 'home_ownership': 0.0,
 'annual_inc': 0.0,
 'annual_inc_joint': 0.0,
 'verification_status': 0.0,
 'zip_code': 0.0,
 'addr_state': 0.0,
 'avg_cur_bal': 0.0,
 'tot_cur_bal': 0.0,
 'loan_id': 0.0,
 'loan_status': 0.0,
 'loan_amount': 0.0,
 'state': 0.0,
 'funded_amount': 0.0,
 'term': 0.0,
 'int_rate': 0.0,
 'grade': 0.0,
 'issue_date': 0.0,
 'pymnt_plan': 0.0,
 'type': 0.0,
 'purpose': 0.0,
 'description': 0.0}

no missing values are remaining

# Part 3: Encoding

Encode only the following categorical values
- Emp Length: Change to numerical
- Home Ownership: One Hot Encoding
- Verification Status: One Hot Encoding
- State: Label Encoding
- Type: One Hot Encoding
- Purpose: Label Encoding
- For the grade, only descretize it to be letter grade, not need to label encode it further

DO NOT Encode the employment title of description or any other column that is not
mentioned above

In [29]:
GLOBAL_LOOKUP_TABLE = spark.createDataFrame([], schema="original_column STRING, original_value STRING, encoded_column STRING")

## OHE

In [30]:
categorically_imputed_df.select('home_ownership').distinct().collect()[0][0]

'OWN'

In [31]:
categorically_imputed_df.select("home_ownership").distinct().show()

+--------------+
|home_ownership|
+--------------+
|           OWN|
|          RENT|
|      MORTGAGE|
|           ANY|
+--------------+



In [32]:
def one_hot_encode(df: pyspark.sql.dataframe.DataFrame, columns: List[str]) -> pyspark.sql.dataframe.DataFrame:
    """
    Apply one-hot encoding to the specified categorical columns in the DataFrame.
    
    Args:
    df (DataFrame): Input PySpark DataFrame.
    columns (list): List of column names to one-hot encode.
    
    Returns:
    DataFrame: DataFrame with one-hot encoded columns.
    """
    global GLOBAL_LOOKUP_TABLE
    lookup_data = []
    for column in columns:
        unique_values = df.select(column).distinct().collect()
        
        for row in unique_values:
            curr_value = row[0]
            encoded_col_name = f"{column}_{curr_value}"
            df = df.withColumn(encoded_col_name, fn.when(fn.col(column) == curr_value, 1).otherwise(0))
            df = df.withColumn(encoded_col_name, df[encoded_col_name].cast("int"))
            lookup_data.append(Row(original_column=column, original_value=curr_value, encoded_column=encoded_col_name))
            
    lookup_table = spark.createDataFrame(lookup_data)
    print(lookup_data)
    GLOBAL_LOOKUP_TABLE = GLOBAL_LOOKUP_TABLE.union(lookup_table)
    df = df.drop(*columns)
    return df


In [33]:
data = [("Alice", 28), ("Bob", 35), ("Charlie", 40)]

# Create DataFrame
ff = spark.createDataFrame(data, ["Name", "Age"])

ff.show()

+-------+---+
|   Name|Age|
+-------+---+
|  Alice| 28|
|    Bob| 35|
|Charlie| 40|
+-------+---+



In [34]:
ohe_columns = ["home_ownership", "verification_status", "type"]
ohe_fintech_df = one_hot_encode(categorically_imputed_df, ohe_columns)

[Row(original_column='home_ownership', original_value='OWN', encoded_column='home_ownership_OWN'), Row(original_column='home_ownership', original_value='RENT', encoded_column='home_ownership_RENT'), Row(original_column='home_ownership', original_value='MORTGAGE', encoded_column='home_ownership_MORTGAGE'), Row(original_column='home_ownership', original_value='ANY', encoded_column='home_ownership_ANY'), Row(original_column='verification_status', original_value='Verified', encoded_column='verification_status_Verified'), Row(original_column='verification_status', original_value='Source Verified', encoded_column='verification_status_Source Verified'), Row(original_column='verification_status', original_value='Not Verified', encoded_column='verification_status_Not Verified'), Row(original_column='type', original_value='Joint App', encoded_column='type_Joint App'), Row(original_column='type', original_value='Individual', encoded_column='type_Individual'), Row(original_column='type', original_

In [35]:
ohe_fintech_df.columns, len(ohe_fintech_df.columns)

(['customer_id',
  'emp_title',
  'emp_length',
  'annual_inc',
  'annual_inc_joint',
  'zip_code',
  'addr_state',
  'avg_cur_bal',
  'tot_cur_bal',
  'loan_id',
  'loan_status',
  'loan_amount',
  'state',
  'funded_amount',
  'term',
  'int_rate',
  'grade',
  'issue_date',
  'pymnt_plan',
  'purpose',
  'description',
  'home_ownership_OWN',
  'home_ownership_RENT',
  'home_ownership_MORTGAGE',
  'home_ownership_ANY',
  'verification_status_Verified',
  'verification_status_Source Verified',
  'verification_status_Not Verified',
  'type_Joint App',
  'type_INDIVIDUAL',
  'type_DIRECT_PAY',
  'type_JOINT'],
 32)

In [36]:
ohe_fintech_df.select(['home_ownership_MORTGAGE']).distinct().show()

+-----------------------+
|home_ownership_MORTGAGE|
+-----------------------+
|                      1|
|                      0|
+-----------------------+



## Label Encode

In [37]:
ohe_fintech_df.select("state").distinct().sort("state").collect()

[Row(state='AK'),
 Row(state='AL'),
 Row(state='AR'),
 Row(state='AZ'),
 Row(state='CA'),
 Row(state='CO'),
 Row(state='CT'),
 Row(state='DC'),
 Row(state='DE'),
 Row(state='FL'),
 Row(state='GA'),
 Row(state='HI'),
 Row(state='ID'),
 Row(state='IL'),
 Row(state='IN'),
 Row(state='KS'),
 Row(state='KY'),
 Row(state='LA'),
 Row(state='MA'),
 Row(state='MD'),
 Row(state='ME'),
 Row(state='MI'),
 Row(state='MN'),
 Row(state='MO'),
 Row(state='MS'),
 Row(state='MT'),
 Row(state='NC'),
 Row(state='ND'),
 Row(state='NE'),
 Row(state='NH'),
 Row(state='NJ'),
 Row(state='NM'),
 Row(state='NV'),
 Row(state='NY'),
 Row(state='OH'),
 Row(state='OK'),
 Row(state='OR'),
 Row(state='PA'),
 Row(state='RI'),
 Row(state='SC'),
 Row(state='SD'),
 Row(state='TN'),
 Row(state='TX'),
 Row(state='UT'),
 Row(state='VA'),
 Row(state='VT'),
 Row(state='WA'),
 Row(state='WI'),
 Row(state='WV'),
 Row(state='WY')]

In [38]:
def label_encode(df: pyspark.sql.dataframe.DataFrame, columns: List[str]) -> pyspark.sql.dataframe.DataFrame:
    """
    Apply label encoding to the specified categorical columns in the DataFrame.
    
    Args:
    df (DataFrame): Input PySpark DataFrame.
    columns (list): List of column names to label encode.
    
    Returns:
    DataFrame: DataFrame with label-encoded columns.
    """
    global GLOBAL_LOOKUP_TABLE
    lookup_data = []
    for column in columns:
        state_values = df.select(column).distinct().sort(column).collect()
        for i, row in enumerate(state_values):
            value = row[0]
            df = df.withColumn(column, fn.when(df[column] == value, i).otherwise(df[column]))
            lookup_data.append(Row(original_column=column, original_value=value, encoded_column=i))
        df = df.withColumn(column, df[column].cast("int"))
    print(lookup_data)
    lookup_table = spark.createDataFrame(lookup_data)
    GLOBAL_LOOKUP_TABLE = GLOBAL_LOOKUP_TABLE.union(lookup_table)
    return df


In [39]:
label_columns = ["state", "purpose"]
label_fintech_df = label_encode(ohe_fintech_df, label_columns)

[Row(original_column='state', original_value='AK', encoded_column=0), Row(original_column='state', original_value='AL', encoded_column=1), Row(original_column='state', original_value='AR', encoded_column=2), Row(original_column='state', original_value='AZ', encoded_column=3), Row(original_column='state', original_value='CA', encoded_column=4), Row(original_column='state', original_value='CO', encoded_column=5), Row(original_column='state', original_value='CT', encoded_column=6), Row(original_column='state', original_value='DC', encoded_column=7), Row(original_column='state', original_value='DE', encoded_column=8), Row(original_column='state', original_value='FL', encoded_column=9), Row(original_column='state', original_value='GA', encoded_column=10), Row(original_column='state', original_value='HI', encoded_column=11), Row(original_column='state', original_value='ID', encoded_column=12), Row(original_column='state', original_value='IL', encoded_column=13), Row(original_column='state', 

In [40]:
len(label_fintech_df.columns)

32

In [41]:
label_fintech_df.select("state").distinct().sort('state').show(53)

+-----+
|state|
+-----+
|    0|
|    1|
|    2|
|    3|
|    4|
|    5|
|    6|
|    7|
|    8|
|    9|
|   10|
|   11|
|   12|
|   13|
|   14|
|   15|
|   16|
|   17|
|   18|
|   19|
|   20|
|   21|
|   22|
|   23|
|   24|
|   25|
|   26|
|   27|
|   28|
|   29|
|   30|
|   31|
|   32|
|   33|
|   34|
|   35|
|   36|
|   37|
|   38|
|   39|
|   40|
|   41|
|   42|
|   43|
|   44|
|   45|
|   46|
|   47|
|   48|
|   49|
+-----+



## Letter Grade

- A (1-5)
- B (6-10)
- C (11-15)
- D (16-20)
- E (21-25)
- F (26-30)
- G (31-35)

In [42]:

def create_letter_grade(df: pyspark.sql.dataframe.DataFrame, grade_column: str="grade") -> pyspark.sql.dataframe.DataFrame:
    """
    Add a new column `letter_grade` based on the numerical range of `grade`.

    Args:
    df (DataFrame): Input PySpark DataFrame.
    grade_column (str): Name of the column containing numerical grades.

    Returns:
    DataFrame: DataFrame with an additional `letter_grade` column.
    """
    global GLOBAL_LOOKUP_TABLE
    letters = ["A", "B", "C", "D", "E", "F", "G"]
    lookup_data = []
    for i, letter in enumerate(letters):
        for j in range((i * 5)+1, ((i + 1) * 5)+1):
            lookup_data.append(Row(original_column=grade_column, original_value=str(j), encoded_value=letter))
    lookup_table = spark.createDataFrame(lookup_data)
    print(lookup_data)
    GLOBAL_LOOKUP_TABLE = GLOBAL_LOOKUP_TABLE.union(lookup_table)

    
    grade_mapping = [
        (fn.col(grade_column).between(1, 5), "A"),
        (fn.col(grade_column).between(6, 10), "B"),
        (fn.col(grade_column).between(11, 15), "C"),
        (fn.col(grade_column).between(16, 20), "D"),
        (fn.col(grade_column).between(21, 25), "E"),
        (fn.col(grade_column).between(26, 30), "F"),
        (fn.col(grade_column).between(31, 35), "G"),
    ]
    
    letter_grade_column = fn.when(*grade_mapping[0])
    for condition, letter in grade_mapping[1:]:
        letter_grade_column = letter_grade_column.when(condition, letter)
    letter_grade_column = letter_grade_column.otherwise("Unknown")

    df = df.withColumn("letter_grade", letter_grade_column)
    return df


In [43]:
encoded_fintech_df = create_letter_grade(label_fintech_df, "grade")

[Row(original_column='grade', original_value='1', encoded_value='A'), Row(original_column='grade', original_value='2', encoded_value='A'), Row(original_column='grade', original_value='3', encoded_value='A'), Row(original_column='grade', original_value='4', encoded_value='A'), Row(original_column='grade', original_value='5', encoded_value='A'), Row(original_column='grade', original_value='6', encoded_value='B'), Row(original_column='grade', original_value='7', encoded_value='B'), Row(original_column='grade', original_value='8', encoded_value='B'), Row(original_column='grade', original_value='9', encoded_value='B'), Row(original_column='grade', original_value='10', encoded_value='B'), Row(original_column='grade', original_value='11', encoded_value='C'), Row(original_column='grade', original_value='12', encoded_value='C'), Row(original_column='grade', original_value='13', encoded_value='C'), Row(original_column='grade', original_value='14', encoded_value='C'), Row(original_column='grade',

In [44]:
encoded_fintech_df.select("grade", "letter_grade").distinct().sort('grade').show(40)

+-----+------------+
|grade|letter_grade|
+-----+------------+
|    1|           A|
|    2|           A|
|    3|           A|
|    4|           A|
|    5|           A|
|    6|           B|
|    7|           B|
|    8|           B|
|    9|           B|
|   10|           B|
|   11|           C|
|   12|           C|
|   13|           C|
|   14|           C|
|   15|           C|
|   16|           D|
|   17|           D|
|   18|           D|
|   19|           D|
|   20|           D|
|   21|           E|
|   22|           E|
|   23|           E|
|   24|           E|
|   25|           E|
|   26|           F|
|   27|           F|
|   28|           F|
|   29|           F|
|   30|           F|
|   31|           G|
|   32|           G|
|   33|           G|
|   34|           G|
|   35|           G|
+-----+------------+



## Emp_length

In [45]:
encoded_fintech_df.select("emp_length").distinct().show()

+----------+
|emp_length|
+----------+
|   5 years|
|   9 years|
|    1 year|
|   2 years|
|   7 years|
|   8 years|
|   4 years|
|   6 years|
|   3 years|
| 10+ years|
|  < 1 year|
+----------+



In [46]:
from pyspark.sql import functions as fn

def convert_emp_length_to_numeric(df: pyspark.sql.dataframe.DataFrame, column: str = "emp_length") -> pyspark.sql.dataframe.DataFrame:
    """
    Converts the 'emp_length' column to numeric values using string manipulation.
    
    Args:
    df (DataFrame): Input PySpark DataFrame.
    column (str): Column name containing employment length data.
    
    Returns:
    DataFrame: DataFrame with 'emp_length' column converted to numeric values.
    """
    df = df.withColumn(column, fn.regexp_replace(fn.col(column), "years|year", ""))
    df = df.withColumn(column, fn.when(fn.col(column).like("%<%"), "0.5")
                                 .when(fn.col(column).like("%+%"), "11")
                                 .otherwise(fn.col(column)))
    df = df.withColumn(column, fn.col(column).cast("float"))
    
    return df


In [47]:
encoded_fintech_df_2 = convert_emp_length_to_numeric(encoded_fintech_df, "emp_length")

In [48]:
encoded_fintech_df_2.select("emp_length").distinct().sort("emp_length").show()

+----------+
|emp_length|
+----------+
|       0.5|
|       1.0|
|       2.0|
|       3.0|
|       4.0|
|       5.0|
|       6.0|
|       7.0|
|       8.0|
|       9.0|
|      11.0|
+----------+



In [49]:
encoded_fintech_df_2.show()

+--------------------+--------------------+----------+----------+----------------+--------+----------+-----------+-----------+-------+---------------+-----------+-----+-------------+----------+--------+-----+-----------------+----------+-------+--------------------+------------------+-------------------+-----------------------+------------------+----------------------------+-----------------------------------+--------------------------------+--------------+---------------+---------------+----------+------------+
|         customer_id|           emp_title|emp_length|annual_inc|annual_inc_joint|zip_code|addr_state|avg_cur_bal|tot_cur_bal|loan_id|    loan_status|loan_amount|state|funded_amount|      term|int_rate|grade|       issue_date|pymnt_plan|purpose|         description|home_ownership_OWN|home_ownership_RENT|home_ownership_MORTGAGE|home_ownership_ANY|verification_status_Verified|verification_status_Source Verified|verification_status_Not Verified|type_Joint App|type_INDIVIDUAL|type_

In [50]:
GLOBAL_LOOKUP_TABLE.show(50)

+-------------------+---------------+--------------------+
|    original_column| original_value|      encoded_column|
+-------------------+---------------+--------------------+
|     home_ownership|            OWN|  home_ownership_OWN|
|     home_ownership|           RENT| home_ownership_RENT|
|     home_ownership|       MORTGAGE|home_ownership_MO...|
|     home_ownership|            ANY|  home_ownership_ANY|
|verification_status|       Verified|verification_stat...|
|verification_status|Source Verified|verification_stat...|
|verification_status|   Not Verified|verification_stat...|
|               type|      Joint App|      type_Joint App|
|               type|     Individual|     type_Individual|
|               type|     DIRECT_PAY|     type_DIRECT_PAY|
|               type|          JOINT|          type_JOINT|
|               type|     INDIVIDUAL|     type_INDIVIDUAL|
|              state|             AK|                   0|
|              state|             AL|                   

# Part 4: Feature Engineering

Write a function that adds the 3 following features. Try as much as you can to use
built in fucntions in PySpark (from the functions library) check lab 8, Avoid writing
UDFs from scratch.
- Previous loan issue date from the same grade
- Previoius Loan amount from the same grade
- Previous loan date from the same state and grade combined
- Previous loan amount from the same state and grade combined

## Previous loan issue date from the same grade

In [51]:
def add_previous_loan_issue_date_form_same_grade(df: pyspark.sql.dataframe.DataFrame) -> pyspark.sql.dataframe.DataFrame:
    """
    Add a new column 'previous_loan_issue_date_same_grade' that contains the issue date of the previous loan with the same grade.
    
    Args:
    df (DataFrame): Input PySpark DataFrame.
    
    Returns:
    DataFrame: DataFrame with the new column 'previous_loan_issue_date_same_grade'.
    """
    df = df.withColumn("issue_date_preprocessed", fn.to_date(fn.col("issue_date"), "dd MMMM yyyy"))
    window_spec = Window.partitionBy("grade").orderBy("issue_date_preprocessed")
    prev_issue_date = fn.lag("issue_date", 1).over(window_spec)
    df = df.withColumn("previous_loan_issue_date_same_grade", prev_issue_date)
    df = df.drop("issue_date_preprocessed")
    return df

In [52]:
lagged_fintech_df = add_previous_loan_issue_date_form_same_grade(encoded_fintech_df_2)

In [53]:
lagged_fintech_df.select("issue_date", "grade", "previous_loan_issue_date_same_grade").show()

+-----------------+-----+-----------------------------------+
|       issue_date|grade|previous_loan_issue_date_same_grade|
+-----------------+-----+-----------------------------------+
|12 September 2012|    1|                               NULL|
|12 September 2012|    1|                  12 September 2012|
|12 September 2012|    1|                  12 September 2012|
|  12 October 2012|    1|                  12 September 2012|
| 12 November 2012|    1|                    12 October 2012|
| 12 December 2012|    1|                   12 November 2012|
| 12 December 2012|    1|                   12 December 2012|
|  13 January 2013|    1|                   12 December 2012|
|  13 January 2013|    1|                    13 January 2013|
| 13 February 2013|    1|                    13 January 2013|
| 13 February 2013|    1|                   13 February 2013|
|    13 March 2013|    1|                   13 February 2013|
|    13 March 2013|    1|                      13 March 2013|
|    13 

## Previoius Loan amount from the same grade

In [54]:
def add_prev_loan_amount_from_same_grade(df: pyspark.sql.dataframe.DataFrame) -> pyspark.sql.dataframe.DataFrame:
    """
    Add a new column 'previous_loan_amount_same_grade' that contains the loan amount of the previous loan with the same grade.
    
    Args:
    df (DataFrame): Input PySpark DataFrame.
    
    Returns:
    DataFrame: DataFrame with the new column 'previous_loan_amount_same_grade'.
    """
    df = df.withColumn("issue_date_preprocessed", fn.to_date(fn.col("issue_date"), "dd MMMM yyyy"))
    window_spec = Window.partitionBy("grade").orderBy("issue_date_preprocessed")
    prev_loan_amount = fn.lag("loan_amount", 1).over(window_spec)
    df = df.withColumn("previous_loan_amount_same_grade", prev_loan_amount)
    df = df.drop("issue_date_preprocessed")
    return df

In [55]:
lagged_fintech_df_2 = add_prev_loan_amount_from_same_grade(lagged_fintech_df)

In [56]:
lagged_fintech_df_2.select("issue_date","loan_amount", "grade", "previous_loan_amount_same_grade").show()

+-----------------+-----------+-----+-------------------------------+
|       issue_date|loan_amount|grade|previous_loan_amount_same_grade|
+-----------------+-----------+-----+-------------------------------+
|12 September 2012|    12000.0|    1|                           NULL|
|12 September 2012|    11200.0|    1|                        12000.0|
|12 September 2012|     6500.0|    1|                        11200.0|
|  12 October 2012|     8000.0|    1|                         6500.0|
| 12 November 2012|    10000.0|    1|                         8000.0|
| 12 December 2012|    24000.0|    1|                        10000.0|
| 12 December 2012|    24000.0|    1|                        24000.0|
|  13 January 2013|    16850.0|    1|                        24000.0|
|  13 January 2013|    12000.0|    1|                        16850.0|
| 13 February 2013|    10000.0|    1|                        12000.0|
| 13 February 2013|    28000.0|    1|                        10000.0|
|    13 March 2013| 

## Previous loan date from the same state and grade combined

In [57]:
def add_prev_loan_date_from_same_state_and_grade(df: pyspark.sql.dataframe.DataFrame) -> pyspark.sql.dataframe.DataFrame:
    """
    Add a new column 'previous_loan_date_same_state_and_grade' that contains the issue date of the previous loan with the same state and grade.
    
    Args:
    df (DataFrame): Input PySpark DataFrame.
    
    Returns:
    DataFrame: DataFrame with the new column 'previous_loan_amount_same_grade'.
    """
    df = df.withColumn("issue_date_preprocessed", fn.to_date(fn.col("issue_date"), "dd MMMM yyyy"))
    window_spec = Window.partitionBy("state","grade").orderBy("issue_date_preprocessed")
    prev_loan_amount = fn.lag("issue_date", 1).over(window_spec)
    df = df.withColumn("previous_loan_date_same_state_and_grade", prev_loan_amount)
    df = df.drop("issue_date_preprocessed")
    return df

In [58]:
lagged_fintech_df_3 = add_prev_loan_date_from_same_state_and_grade(lagged_fintech_df_2)

In [59]:
lagged_fintech_df_3.select("issue_date","state","grade", "previous_loan_date_same_state_and_grade").show()

+-----------------+-----+-----+---------------------------------------+
|       issue_date|state|grade|previous_loan_date_same_state_and_grade|
+-----------------+-----+-----+---------------------------------------+
|  14 October 2014|    0|    1|                                   NULL|
|  19 January 2019|    0|    1|                        14 October 2014|
| 19 November 2019|    0|    1|                        19 January 2019|
|  14 October 2014|    0|    2|                                   NULL|
|     17 June 2017|    0|    2|                        14 October 2014|
| 17 November 2017|    0|    2|                           17 June 2017|
|    18 March 2018|    0|    2|                       17 November 2017|
|15 September 2015|    0|    3|                                   NULL|
|  18 October 2018|    0|    3|                      15 September 2015|
|    19 April 2019|    0|    3|                        18 October 2018|
|     16 June 2016|    0|    4|                                 

## Previous loan amount from the same state and grade combined

In [60]:
def add_prev_loan_amount_from_same_state_and_grade(df: pyspark.sql.dataframe.DataFrame) -> pyspark.sql.dataframe.DataFrame:
    """
    Add a new column 'previous_loan_amount_same_state_and_grade' that contains the loan amount of the previous loan with the same state and grade.
    
    Args:
    df (DataFrame): Input PySpark DataFrame.
    
    Returns:
    DataFrame: DataFrame with the new column 'previous_loan_amount_same_state_and_grade'.
    """
    df = df.withColumn("issue_date_preprocessed", fn.to_date(fn.col("issue_date"), "dd MMMM yyyy"))
    window_spec = Window.partitionBy("state","grade").orderBy("issue_date_preprocessed")
    prev_loan_amount = fn.lag("loan_amount", 1).over(window_spec)
    df = df.withColumn("previous_loan_amount_same_state_and_grade", prev_loan_amount)
    df = df.drop("issue_date_preprocessed")
    return df

In [61]:
lagged_fintech_df_4 = add_prev_loan_amount_from_same_state_and_grade(lagged_fintech_df_3)

In [62]:
lagged_fintech_df_4.select("issue_date","state","loan_amount", "grade", "previous_loan_amount_same_state_and_grade").show()

+-----------------+-----+-----------+-----+-----------------------------------------+
|       issue_date|state|loan_amount|grade|previous_loan_amount_same_state_and_grade|
+-----------------+-----+-----------+-----+-----------------------------------------+
|  14 October 2014|    0|    10000.0|    1|                                     NULL|
|  19 January 2019|    0|    10000.0|    1|                                  10000.0|
| 19 November 2019|    0|     6000.0|    1|                                  10000.0|
|  14 October 2014|    0|    24000.0|    2|                                     NULL|
|     17 June 2017|    0|    30000.0|    2|                                  24000.0|
| 17 November 2017|    0|    21375.0|    2|                                  30000.0|
|    18 March 2018|    0|    35000.0|    2|                                  21375.0|
|15 September 2015|    0|    15000.0|    3|                                     NULL|
|  18 October 2018|    0|    25000.0|    3|           

# Part 6: Lookup Table & Saving the dataset

## load (save) the cleaned PySpark df and the lookup table to parquet files

In [63]:
def save_df_as_parquet(df: pyspark.sql.dataframe.DataFrame, path: str) -> None:
    """
    Save a PySpark DataFrame to disk in Parquet format.
    
    Args:
    df (DataFrame): Input PySpark DataFrame.
    path (str): Output path for the Parquet file.
    
    Returns:
    None
    """
    df.write.mode("overwrite").parquet(path)
    print(f"DataFrame saved to {path}")

In [67]:
save_df_as_parquet(lagged_fintech_df_4, "./work/fintech_spark_52_1008_clean.parquet")
save_df_as_parquet(GLOBAL_LOOKUP_TABLE, "./work/lookup_spark_52_1008.parquet")

DataFrame saved to ./work/fintech_spark_52_1008_clean.parquet
DataFrame saved to ./work/lookup_spark_52_1008.parquet


In [64]:
def load_df_from_parquet(path: str) -> pyspark.sql.dataframe.DataFrame:
    """
    Load a PySpark DataFrame from disk in Parquet format.
    
    Args:
    path (str): Path to the Parquet file.
    
    Returns:
    DataFrame: Loaded PySpark DataFrame.
    """
    return spark.read.parquet(path)

In [68]:
loaded_fintech_df = load_df_from_parquet("./work/fintech_spark_52_1008_clean.parquet")
loaded_lookup_table = load_df_from_parquet("./work/lookup_spark_52_1008.parquet")

In [71]:
loaded_fintech_df.show(5)

+--------------------+-------------------+----------+----------+----------------+--------+----------+-----------+-----------+-------+-----------+-----------+-----+-------------+----------+--------+-----+----------------+----------+-------+--------------------+------------------+-------------------+-----------------------+------------------+----------------------------+-----------------------------------+--------------------------------+--------------+---------------+---------------+----------+------------+-----------------------------------+-------------------------------+---------------------------------------+-----------------------------------------+
|         customer_id|          emp_title|emp_length|annual_inc|annual_inc_joint|zip_code|addr_state|avg_cur_bal|tot_cur_bal|loan_id|loan_status|loan_amount|state|funded_amount|      term|int_rate|grade|      issue_date|pymnt_plan|purpose|         description|home_ownership_OWN|home_ownership_RENT|home_ownership_MORTGAGE|home_ownership_

In [73]:
loaded_lookup_table.show(5)

+-------------------+---------------+--------------------+
|    original_column| original_value|      encoded_column|
+-------------------+---------------+--------------------+
|verification_status|Source Verified|verification_stat...|
|verification_status|   Not Verified|verification_stat...|
|verification_status|       Verified|verification_stat...|
|     home_ownership|       MORTGAGE|home_ownership_MO...|
|     home_ownership|           RENT| home_ownership_RENT|
+-------------------+---------------+--------------------+
only showing top 5 rows



# BONUS: Loading to Postgres

- Load the cleaned parquet file and lookup table into a Postgres database.
- Take Screenshots showing the newly added features in the feature engineering section
- Take a screenshot from the lookup table

In [65]:
def save_to_db(df: pyspark.sql.dataframe.DataFrame, table_name: str) -> None:
    """
    Save a PySpark DataFrame to a database table.
    
    Args:
    df (DataFrame): Input PySpark DataFrame.
    table_name (str): Name of the database table.
    
    Returns:
    None
    """
    postgres_url = "jdbc:postgresql://pgdatabase:5432/testdb"
    postgres_properties = {
        "user": "root",
        "password": "root",
        "driver": "org.postgresql.Driver"
    }

    df.write.jdbc(
        url=postgres_url,
        table=table_name,
        mode="overwrite",  # Options: 'overwrite', 'append', 'ignore', 'error'
        properties=postgres_properties
    )



In [66]:
# save_to_db(lagged_fintech_df_4, "fintech_data")