In [1]:
import warnings

import numpy as np
import pandas as pd
from pandas_schema import pandas_schema

In [2]:
# Column names and their declared dtypes, both in the order pandas_schema lists them.
columns = [*pandas_schema.keys()]
datatypes = [*pandas_schema.values()]

# (column_name, datatype) pairs; position in this list is the column's position in the file.
schema_list = [(name, datatype) for name, datatype in zip(columns, datatypes)]

#file_path = '/Users/matthewtryba/Desktop/subsampled_data.csv'
file_path = "Y:\\FannieMaeMortgageData\\subsampled_data_00125.csv"

# The file is pipe-delimited and has no header row, so names come from the schema.
sf_loan_performance = pd.read_csv(
    file_path,
    sep='|',
    header=None,
    names=columns,
    low_memory=False,
)

## ETL (Extract, Transform, Load)

In [3]:
def get_indices_by_type(schema_list):
    """
    Groups column positions by their declared datatype.

    The result is built solely from ``schema_list`` (no module-level state is read),
    and a column is filed only under the datatype it is declared with exactly, so a
    datatype whose name is a substring of another (e.g. 'int64' / 'uint64') does not
    pick up the other's columns.

    :param schema_list: Sequence of entries whose second element is the datatype,
        e.g. (column_name, datatype) pairs, in column order.
    :return: Dictionary mapping each datatype found in schema_list to the ascending
        list of positions of the entries declared with that datatype.
    """
    indices_by_type = {}

    # Single pass: positions are appended in order, so each list stays sorted.
    for i, entry in enumerate(schema_list):
        indices_by_type.setdefault(entry[1], []).append(i)

    return indices_by_type


def process_dates(df, indices_by_type):
    """
    Converts date columns from MMYYYY to datetime (YYYY-MM-DD), safely handling NaN values.

    The DataFrame is modified in place and also returned. Values that cannot be parsed
    as MMYYYY (including missing values) become NaT. Columns that already have a
    datetime dtype are left untouched, so calling this twice is harmless.

    :param df: DataFrame with data.
    :param indices_by_type: Dictionary with 'datetime64[ns]' key pointing to list of column indices.
        A missing key is treated as "no date columns".
    :return: The same DataFrame object that was passed in.
    """
    error_col_indices = []

    for col in indices_by_type.get('datetime64[ns]', []):
        raw = df.iloc[:, col]

        # Already converted (e.g. the cell was re-run): re-parsing the rendered
        # timestamps as MMYYYY would turn every value into NaT.
        if pd.api.types.is_datetime64_any_dtype(raw):
            continue

        try:
            # A column holding missing values is typically stored as float, so its
            # values render as '102006.0'; strip that suffix before zero-filling,
            # otherwise nothing matches the MMYYYY format.
            digits = (
                raw.astype(str)
                .str.replace(r'\.0+$', '', regex=True)
                .str.zfill(6)
            )

            # Convert to datetime format, specifying the original format to speed up parsing
            parsed = pd.to_datetime(digits, format='%m%Y', errors='coerce')

        except ValueError:
            error_col_indices.append(col)
            continue

        # Replace the column by label so it takes the datetime dtype instead of
        # being written into the old column's storage. Assumes column labels are unique.
        df[df.columns[col]] = parsed

    # Report failures instead of silently dropping them.
    if error_col_indices:
        warnings.warn(f'Could not convert date columns at positions {error_col_indices}')

    return df


def preprocess_booleans(df, indices_by_type):
    """
    Runs every column listed under the 'bool' key through convert_to_bool,
    turning "Y"/"N" flags into True/False and everything else into None.

    The DataFrame is updated in place and handed back to the caller.
    :param df: DataFrame with data.
    :param indices_by_type: Dictionary whose 'bool' key holds the list of column indices to convert.
    :return: The same DataFrame that was passed in.
    """
    bool_positions = indices_by_type['bool']

    for position in bool_positions:
        # Element-wise mapping of the raw flag values.
        converted = df.iloc[:, position].apply(convert_to_bool)
        df.iloc[:, position] = converted

    return df


def convert_to_bool(x):
    """
    Translates a single Y/N flag into a boolean.

    :param x: Raw value; 'Y'/'y' mean True and 'N'/'n' mean False.
    :return: True, False, or None for any other value.
    """
    if x in ('N', 'n'):
        return False
    if x in ('Y', 'y'):
        return True
    return None

In [4]:
# Column positions grouped by declared dtype; drives the two in-place converters below.
indices_by_type = get_indices_by_type(schema_list)
process_dates(sf_loan_performance, indices_by_type)
preprocess_booleans(sf_loan_performance, indices_by_type)

# Create dictionary of non-numeric features to be used in .astype
non_numeric_schema = {
    key: value
    for key, value in pandas_schema.items()
    if value not in ['float64', 'int64', 'Int64']
}

# Set schema for non-numeric features.
# astype returns a new DataFrame, so the result has to be assigned back; otherwise the
# cast is only displayed and every later cell still sees the un-cast columns.
# NOTE(review): a plain 'bool' dtype cannot represent missing values - if nulls in the
# flag columns must survive the cast, confirm whether pandas_schema should use 'boolean'.
sf_loan_performance = sf_loan_performance.astype(non_numeric_schema)

sf_loan_performance

Unnamed: 0,Reference_Pool_ID,Loan_Identifier,Monthly_Reporting_Period,Channel,Seller_Name,Servicer_Name,Master_Servicer,Original_Interest_Rate,Current_Interest_Rate,Original_UPB,...,ARM_Plan_Number,Borrower_Assistance_Plan,High_Loan_to_Value_HLTV_Refinance_Option_Indicator,Deal_Name,Repurchase_Make_Whole_Proceeds_Flag,Alternative_Delinquency_Resolution,Alternative_Delinquency_Resolution_Count,Total_Deferral_Amount,Payment_Deferral_Modification_Event_Indicator,Interest_Bearing_UPB
0,,100891825768,2006-10-01,R,Other,Other,,8.500,8.500,54000.0,...,,,False,,False,,,,7,
1,,100392032458,2001-08-01,R,"Citimortgage, Inc.",,,7.000,7.000,35000.0,...,,,False,,False,,,,7,
2,,100775189711,2001-08-01,R,Bishops Gate Residential Mortgage Trust,,,8.375,8.375,114000.0,...,,,False,,False,,,,7,
3,,100278675059,2000-04-01,C,"Bank Of America, N.A.",,,8.375,8.375,70000.0,...,,,False,,False,,,,7,
4,,100733360727,2002-02-01,C,"Jpmorgan Chase Bank, Na","Jpmorgan Chase Bank, Na",,7.500,7.500,55000.0,...,,,False,,False,,,,7,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3469958,,136871031,2023-09-01,R,Other,Other,,7.375,7.375,238000.0,...,,7,False,,False,7,,,7,
3469959,,136879797,2023-09-01,R,"Pulte Mortgage, L.L.C.",Other,,6.375,6.375,207000.0,...,,7,False,,False,7,,,7,
3469960,,136880136,2023-09-01,C,"PennyMac Loan Services, LLC","PennyMac Loan Services, LLC",,7.375,7.375,510000.0,...,,7,False,,False,7,,,7,
3469961,,136880566,2023-09-01,R,Other,"Specialized Loan Servicing, LLC",,7.500,7.500,195000.0,...,,7,False,,False,7,,,7,


In [18]:
# Remove every column that has no data at all.
# A column qualifies when each of its values is NA.
all_missing_mask = sf_loan_performance.isna().all(axis=0)
columns_to_drop = sf_loan_performance.columns[all_missing_mask].tolist()

# Dropped in place, so the same DataFrame object keeps being used downstream.
sf_loan_performance.drop(columns=columns_to_drop, inplace=True)

# EDA (Exploratory Data Analysis)

In [19]:
# Summary statistics (count, mean, std, min, quartiles, max) per column; with default
# arguments describe() reports on the numeric columns of a mixed-dtype frame.
sf_loan_performance.describe()

Unnamed: 0,Loan_Identifier,Original_Interest_Rate,Current_Interest_Rate,Original_UPB,Current_Actual_UPB,Original_Loan_Term,Loan_Age,Remaining_Months_to_Legal_Maturity,Remaining_Months_to_Maturity,Original_Loan_to_Value_Ratio_LTV,...,Credit_Enhancement_Proceeds,Repurchase_Make_Whole_Proceeds,Other_Foreclosure_Proceeds,Modification_Related_Non_Interest_Bearing_UPB,Principal_Forgiveness_Amount,Mortgage_Insurance_Type,Foreclosure_Principal_Write_off_Amount,Alternative_Delinquency_Resolution_Count,Total_Deferral_Amount,Payment_Deferral_Modification_Event_Indicator
count,3469963.0,3469963.0,3422378.0,3469963.0,3469963.0,3469963.0,3422354.0,3422168.0,3369957.0,3469963.0,...,574.0,499.0,653.0,50843.0,22147.0,631185.0,22161.0,12713.0,12530.0,3469963.0
mean,482734000000.0,4.836708,4.803512,190805.8,151016.6,304.6243,44.41576,262.2492,253.4742,69.59998,...,16017.357247,7996.858016,4605.034916,7249.33164,0.0,1.065054,16.536959,1.048533,12374.312808,7.0
std,302828600000.0,1.284084,1.285489,114819.7,118209.2,83.12776,40.98809,95.34768,97.97084,17.75971,...,26703.877951,31350.126371,16611.643428,22004.246295,0.0,0.248286,733.777635,0.233158,11674.682909,0.0
min,97473120.0,1.5,1.5,5000.0,0.0,60.0,-1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,89.78,7.0
25%,230928500000.0,3.81,3.75,105000.0,67258.98,180.0,13.0,172.0,167.0,59.0,...,0.0,0.0,3.55,0.0,0.0,1.0,0.0,1.0,4345.495,7.0
50%,487223000000.0,4.826,4.75,163000.0,127010.1,360.0,32.0,303.0,294.0,75.0,...,0.0,0.0,632.31,0.0,0.0,1.0,0.0,1.0,8291.92,7.0
75%,743640500000.0,5.875,5.75,250000.0,212402.1,360.0,65.0,341.0,338.0,80.0,...,25245.94,0.0,2732.5,0.0,0.0,1.0,0.0,1.0,16526.7475,7.0
max,999999900000.0,11.5,11.5,1562000.0,1537338.0,360.0,293.0,481.0,480.0,97.0,...,158077.85,362476.52,187746.6,338000.0,0.0,3.0,82093.33,5.0,101706.3,7.0


In [24]:
# Boolean Series marking rows that repeat an earlier row across all columns.
duplicated_rows = sf_loan_performance.duplicated()
# Number of such repeated rows (True counts as 1).
print(duplicated_rows.sum())

0


0
