In [None]:
#from google.colab import drive
#drive.mount('/content/drive')

In [None]:
#!pip install -q findspark
#!pip install pyspark
#!pip install py4j

In [None]:
from pyspark.sql import SparkSession
import pandas as pd
from pyspark.sql.functions import date_format
from pyspark.sql import Window
import pyspark.sql.functions as F

# Lift the pandas display limits so frames are rendered without row or column truncation.
for _display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(_display_option, None)


# Start (or reuse) a Spark session running in local mode.
# NOTE(review): the driver is configured with a 100g heap -- confirm the host
# actually has that much memory available.
spark = (
    SparkSession.builder
    .master('local[*]')
    .config("spark.driver.memory", "100g")
    .appName('stackeddataata')
    .getOrCreate()
)

# With eager evaluation on, a DataFrame left as the last expression of a cell
# is rendered as a table instead of just its schema repr.
spark.conf.set("spark.sql.repl.eagerEval.enabled", True)

# Read the loan-performance sample, normalise the reporting period to a
# 'yyyy-MM' string and discard the first-payment-date column.
df_merged = (
    spark.read.csv('/content/drive/MyDrive/Final data/Sample_2005.csv', header=True, inferSchema=True)
    .withColumn('MONTHLY REPORTING PERIOD', date_format('MONTHLY REPORTING PERIOD', 'yyyy-MM'))
    .drop("FIRST PAYMENT DATE")
)

# Preview the first ten records, one field per line, without truncating values.
df_merged.show(10, truncate=False, vertical=True)

-RECORD 0----------------------------------------------
 _c0                                    | 0            
 LOAN SEQUENCE NUMBER                   | F05Q10000128 
 MONTHLY REPORTING PERIOD               | 2005-02      
 CURRENT ACTUAL UPB                     | 161000.0     
 CURRENT LOAN DELINQUENCY STATUS        | 0            
 LOAN AGE                               | 0            
 CURRENT INTEREST RATE                  | 5.875        
 ESTIMATED LOAN TO VALUE (ELTV)         | Undefined    
 DEFAULT                                | 0            
 CREDIT SCORE                           | 810          
 FIRST TIME HOMEBUYER FLAG              | N            
 NUMBER OF UNITS                        | 1            
 OCCUPANCY STATUS                       | P            
 ORIGINAL COMBINED LOAN-TO-VALUE (CLTV) | 49           
 ORIGINAL DEBT-TO-INCOME (DTI) RATIO    | 26           
 ORIGINAL LOAN-TO-VALUE (LTV)           | 49           
 CHANNEL                                | R     

In [None]:
# Sanity check: does any loan report the same LOAN AGE more than once?
#
# The running count below is taken per (loan, loan age) pair, so its maximum
# within a pair equals the number of rows that share that pair.
windowSpec = Window.partitionBy("LOAN SEQUENCE NUMBER", "LOAN AGE").rowsBetween(Window.unboundedPreceding, Window.currentRow)

# Work on a derived frame so df_merged itself never carries the helper column.
loan_age_counts = df_merged.withColumn("LOAN_AGE_COUNT", F.count("LOAN AGE").over(windowSpec))

# One row per loan: True when at least one LOAN AGE value occurs more than once.
# The aggregate is aliased explicitly: Spark auto-names the expression
# "(max(LOAN_AGE_COUNT) > 1)", so renaming it afterwards via
# withColumnRenamed("max(LOAN_AGE_COUNT) > 1", ...) silently does nothing.
loan_age_reset = loan_age_counts.groupby("LOAN SEQUENCE NUMBER").agg(
    (F.max("LOAN_AGE_COUNT") > 1).alias("LOAN_AGE_RESET")
)

# True when at least one loan has a repeated LOAN AGE. On an empty frame the
# aggregate is NULL, which compares unequal to 1 and therefore yields False.
has_true_value = loan_age_reset.select(F.max(F.col("LOAN_AGE_RESET").cast("int"))).collect()[0][0] == 1

# Report the outcome of the check.
if has_true_value:
    print("Multiple of the same loan age has been detected for a unique loan sequence number at two different points in time.")
else:
    print("Each loan increases in age across its lifespan, where age does not reset.")

Multiple of the same loan age has been detected for a unique loan sequence number at two different points in time.


In [None]:
# Rebuild LOAN AGE as a gap-free, strictly increasing sequence per loan that
# starts at the smallest age reported for that loan.

# Smallest reported LOAN AGE per loan: the starting value of the new sequence.
min_loan_age = df_merged.groupBy("LOAN SEQUENCE NUMBER").agg(F.min("LOAN AGE").alias("min_loan_age"))

# Attach the starting value to every row of the loan.
df_merged = df_merged.join(min_loan_age, on="LOAN SEQUENCE NUMBER")

# Number the rows of each loan in chronological order. Ordering by a constant
# (F.lit(1)) leaves the row order up to Spark after the shuffle, so ages would
# be handed out arbitrarily. 'MONTHLY REPORTING PERIOD' is a 'yyyy-MM' string,
# which sorts chronologically; the reported LOAN AGE only breaks ties between
# rows that share a reporting period.
windowSpec = Window.partitionBy("LOAN SEQUENCE NUMBER").orderBy("MONTHLY REPORTING PERIOD", "LOAN AGE")

# min_loan_age is constant within a loan, so it can be added directly.
df_merged = df_merged.withColumn("LOAN AGE", F.row_number().over(windowSpec) + F.col("min_loan_age") - 1)
df_merged = df_merged.drop("min_loan_age")

In [None]:
from pyspark.sql.functions import lit, expr, row_number, monotonically_increasing_id, when, col, first
from pyspark.sql.window import Window
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
import numpy as np
#pip install tqdm
from tqdm import tqdm

# Keep only the columns needed to build the stacked (horizon) panel.
df_merged_skinny = df_merged.select("LOAN SEQUENCE NUMBER", "CREDIT SCORE", "LOAN AGE", "DEFAULT")

# Give every row a consecutive id 0..N-1. monotonically_increasing_id() only
# guarantees unique, increasing values: the partition id is encoded in the
# upper bits, so ids jump by billions between partitions and would not line up
# with a 0..N-1 range built elsewhere.
# NOTE: a window without partitionBy pulls all rows into a single partition to
# number them; that is the price of consecutive ids.
group_order = Window.orderBy("LOAN SEQUENCE NUMBER", "LOAN AGE")
df_merged_skinny = df_merged_skinny.withColumn(
    "Group", (row_number().over(group_order) - 1).cast("long")
)

# Calculate the number of rows needed: each row plus 24 lagged copies of it.
num_rows = df_merged_skinny.count()
new_num_rows = (num_rows * 24) + num_rows

In [None]:
# Define the schema of the DataFrame
schema = df_merged_skinny.schema

# Fan every row out into 25 copies: Horizon 0 is the original observation
# ('orig'), Horizons 1-24 are the copies ('Duplicated').
#
# The horizon table is built in Spark and cross-joined instead of creating a
# 25 x N-row pandas frame on the driver and joining it on 'Group'. That avoids
# the driver-side allocation and no longer requires the 'Group' ids to be
# exactly 0..N-1 (monotonically_increasing_id() does not guarantee that, and
# rows whose id found no match were left without Horizon/Source).
result_df = spark.range(25).select(
    F.col("id").alias("Horizon"),
    F.when(F.col("id") == 0, "orig").otherwise("Duplicated").alias("Source"),
)

# Keep the column layout a join on 'Group' would give: key first, then the
# remaining columns, then Horizon and Source.
skinny_columns = [name for name in df_merged_skinny.columns if name != "Group"]
df_merged_skinny = (
    df_merged_skinny
    .crossJoin(result_df)
    .select("Group", *skinny_columns, "Horizon", "Source")
)

In [None]:
# Calculating Correct LOAN AGE for Duplicated rows
from pyspark.sql.functions import when, col, lit
from pyspark.sql.window import Window

# Shift LOAN AGE back by the horizon for every duplicated row.
#
# Each source row was fanned out into one copy per Horizon, and every copy
# still carries the LOAN AGE of its source row. The lagged age is therefore
# simply LOAN AGE - Horizon; for the 'orig' row Horizon is 0, so its age is
# unchanged. Looking the original age up again through an aggregate and a
# self-join on 'Group' yields the same value at the cost of two extra shuffles.
result_df1 = df_merged_skinny.withColumn("LOAN AGE", col("LOAN AGE") - col("Horizon"))

# Keep rows where loan age >= 0. Rows with a NULL Horizon produce a NULL age
# and are removed by this filter as well.
result_df1 = result_df1.filter(col("LOAN AGE") >= 0)

df_merged_skinny = result_df1

In [None]:
# Suffix the skinny frame's columns so they stay distinguishable from the
# identically named columns of df_merged once the two frames are joined.
df_merged_skinny = (
    df_merged_skinny
    .withColumnRenamed("LOAN SEQUENCE NUMBER", "LOAN SEQUENCE NUMBER 1")
    .withColumnRenamed("CREDIT SCORE", "CREDIT SCORE 1")
    .withColumnRenamed("LOAN AGE", "LOAN AGE 1")
    .withColumnRenamed("DEFAULT", "DEFAULT_orig")
)

# Match every (loan, lagged age) pair to the full record observed at that age.
same_loan = df_merged_skinny["LOAN SEQUENCE NUMBER 1"] == df_merged["LOAN SEQUENCE NUMBER"]
same_age = df_merged_skinny["LOAN AGE 1"] == df_merged["LOAN AGE"]

df_timeseries = (
    df_merged_skinny.join(df_merged, same_loan & same_age, "left")
    # Discard the helper key columns and df_merged's own DEFAULT ...
    .drop("LOAN SEQUENCE NUMBER 1", "CREDIT SCORE 1", "LOAN AGE 1", "DEFAULT")
    # ... so the DEFAULT carried by the skinny frame becomes the DEFAULT column.
    .withColumnRenamed("DEFAULT_orig", "DEFAULT")
)

# Remove rows for which the left join found no matching record in df_merged.
df_timeseries = df_timeseries.filter(df_timeseries['LOAN SEQUENCE NUMBER'].isNotNull())

# Sort by group, then by horizon within each group.
df_timeseries = df_timeseries.orderBy("Group", "Horizon")


In [None]:
from pyspark.sql.functions import isnan, when, count, col
# Count the NaN values in each column.
# Only float/double columns can hold NaN, so isnan() is applied to those alone;
# every other column reports 0. Calling isnan() on string columns depends on an
# implicit string -> double cast.
# NOTE(review): that cast is expected to fail on non-numeric text when ANSI SQL
# mode is enabled -- confirm against the Spark version in use.
nan_capable = {name for name, dtype in df_timeseries.dtypes if dtype in ("float", "double")}
df_timeseries.select([
    count(when(isnan(col(c)) if c in nan_capable else F.lit(False), 1)).alias(c)
    for c in df_timeseries.columns
])

Group,DEFAULT,Horizon,Source,LOAN SEQUENCE NUMBER,_c0,MONTHLY REPORTING PERIOD,CURRENT ACTUAL UPB,CURRENT LOAN DELINQUENCY STATUS,LOAN AGE,CURRENT INTEREST RATE,ESTIMATED LOAN TO VALUE (ELTV),CREDIT SCORE,FIRST TIME HOMEBUYER FLAG,NUMBER OF UNITS,OCCUPANCY STATUS,ORIGINAL COMBINED LOAN-TO-VALUE (CLTV),ORIGINAL DEBT-TO-INCOME (DTI) RATIO,ORIGINAL LOAN-TO-VALUE (LTV),CHANNEL,AMORTIZATION TYPE,PROPERTY STATE,PROPERTY TYPE,LOAN PURPOSE,ORIGINAL LOAN TERM,NUMBER OF BORROWERS,OrigYear,OrigQuarter,OrigDate,index_sa,UNRATE,inflation,% Change in UPB
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [None]:
# Count the missing values (NULL or NaN) in each column.
# NULL is checked for every column; NaN only for float/double columns, the only
# types that can hold it. Calling isnan() on string columns depends on an
# implicit string -> double cast.
# NOTE(review): that cast is expected to fail on non-numeric text when ANSI SQL
# mode is enabled -- confirm against the Spark version in use.
float_columns = {name for name, dtype in df_timeseries.dtypes if dtype in ("float", "double")}
df_timeseries.select([
    count(
        when((col(c).isNull() | isnan(col(c))) if c in float_columns else col(c).isNull(), 1)
    ).alias(c)
    for c in df_timeseries.columns
])

Group,DEFAULT,Horizon,Source,LOAN SEQUENCE NUMBER,_c0,MONTHLY REPORTING PERIOD,CURRENT ACTUAL UPB,CURRENT LOAN DELINQUENCY STATUS,LOAN AGE,CURRENT INTEREST RATE,ESTIMATED LOAN TO VALUE (ELTV),CREDIT SCORE,FIRST TIME HOMEBUYER FLAG,NUMBER OF UNITS,OCCUPANCY STATUS,ORIGINAL COMBINED LOAN-TO-VALUE (CLTV),ORIGINAL DEBT-TO-INCOME (DTI) RATIO,ORIGINAL LOAN-TO-VALUE (LTV),CHANNEL,AMORTIZATION TYPE,PROPERTY STATE,PROPERTY TYPE,LOAN PURPOSE,ORIGINAL LOAN TERM,NUMBER OF BORROWERS,OrigYear,OrigQuarter,OrigDate,index_sa,UNRATE,inflation,% Change in UPB
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [None]:
# Materialise the stacked Spark frame as a pandas DataFrame.
# toPandas() collects every row onto the driver, so the whole result has to
# fit in driver memory.
df = df_timeseries.toPandas()

In [None]:
# Write the stacked panel to Drive as UTF-8 CSV with a header row.
# NOTE(review): the pandas index is written too (index is not disabled), which
# adds an unnamed leading column to the file -- confirm that is intended.
stacked_csv_path = '/content/drive/MyDrive/Final data/Stacked_2005.csv'
df.to_csv(stacked_csv_path, encoding='utf-8', header=True)