In [1]:
import pandas as pd
import numpy as np
from pandas_schema import pandas_schema

In [2]:
# Column names and declared dtypes, both in the order pandas_schema lists them.
columns = [*pandas_schema]
datatypes = [*pandas_schema.values()]
# (column name, dtype) pairs; a column's position in this list is its position in the frame.
schema_list = list(pandas_schema.items())

# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
file_path = '/Users/matthewtryba/Desktop/subsampled_data_0.1225_pct_.csv'
#file_path = "Y:\\FannieMaeMortgageData\\subsampled_data_00125.csv"

# The file is pipe-delimited and has no header row, so names come from the schema.
sf_loan_performance = pd.read_csv(
    file_path,
    sep='|',
    header=None,
    names=columns,
    low_memory=False,
    index_col=False,
)

In [4]:
# Display the frame as loaded, before any type conversion.
sf_loan_performance

Unnamed: 0,Reference_Pool_ID,Loan_Identifier,Monthly_Reporting_Period,Channel,Seller_Name,Servicer_Name,Master_Servicer,Original_Interest_Rate,Current_Interest_Rate,Original_UPB,...,High_Loan_to_Value_HLTV_Refinance_Option_Indicator,Deal_Name,Repurchase_Make_Whole_Proceeds_Flag,Alternative_Delinquency_Resolution,Alternative_Delinquency_Resolution_Count,Total_Deferral_Amount,Payment_Deferral_Modification_Event_Indicator,Interest_Bearing_UPB,From_File,From_File_Numeric
0,,100036915856,32000,R,"Wells Fargo Bank, N.A.",,,7.750,7.750,135000.0,...,N,,,,,,7,,2000Q1,2000.0
1,,100151690382,42002,R,"Jpmorgan Chase Bank, Na","Jpmorgan Chase Bank, Na",,8.125,8.125,155000.0,...,N,,,,,,7,,2000Q1,2000.0
2,,100207354995,62010,B,"Jpmorgan Chase Bank, Na","Jpmorgan Chase Bank, Na",,7.875,7.875,74000.0,...,N,,,,,,7,,2000Q1,2000.0
3,,100292236815,82000,R,Other,,,8.500,8.500,57000.0,...,N,,,,,,7,,2000Q1,2000.0
4,,100393376629,52016,R,"Wells Fargo Bank, N.A.",Other,,7.625,7.625,50000.0,...,N,,,,,,7,,2000Q1,2000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2891617,,136878170,92023,R,Other,Other,,6.125,6.125,130000.0,...,N,,,7,,,7,,2023Q3,2023.5
2891618,,136879170,92023,R,Other,Other,,7.625,7.625,668000.0,...,N,,,7,,,7,,2023Q3,2023.5
2891619,,136880170,92023,C,"PennyMac Loan Services, LLC","PennyMac Loan Services, LLC",,7.125,7.125,454000.0,...,N,,,7,,,7,,2023Q3,2023.5
2891620,,136881170,92023,C,PHH Mortgage Corporation,PHH Mortgage Corporation,,6.000,6.000,293000.0,...,N,,,7,,,7,,2023Q3,2023.5


In [None]:
# # Check Column numbers against fannie-mae-loan-layout-and-glossary.pdf
# i = 0
# for col in columns:
#     i+=1
#     print(i, col)

## ETL

In [5]:
def get_indices_by_type(schema_list):
    """
    Groups column positions by their declared datatype.

    The grouping is derived from schema_list itself, so the function does not
    depend on any module-level schema object.

    :param schema_list: Sequence of (column_name, datatype) pairs in column order.
    :return: Dictionary mapping each distinct datatype in schema_list to the
        ascending list of positional indices of the columns declared with it.
        An empty schema_list gives an empty dictionary.
    """
    indices_by_type = {}

    # Match datatypes by equality. A substring test would file a column under
    # every datatype whose name is contained in its own (e.g. 'int' in 'int64').
    for position, entry in enumerate(schema_list):
        indices_by_type.setdefault(entry[1], []).append(position)

    return indices_by_type


def process_dates(df, indices_by_type):
    """
    Converts date columns from MMYYYY to datetime (first day of the month), in place.

    Each value is normalised to a zero-padded six-digit string before parsing.
    Columns that pandas read as float64 because they contain missing values
    (e.g. 32000.0) are therefore parsed the same way as integer columns (32000).
    Missing, non-integral or unparseable values become NaT. Columns that are
    already datetime are skipped, so the function can be re-run safely.

    :param df: DataFrame with data. Modified in place.
    :param indices_by_type: Dictionary with 'datetime64[ns]' key pointing to list of column indices.
        A missing key means there is nothing to convert.
    :return: The same DataFrame object, with the date columns converted.
    """
    import warnings

    error_col_indices = []

    for col in indices_by_type.get('datetime64[ns]', []):
        raw = df.iloc[:, col]

        # Already converted (e.g. the cell was run twice): leave untouched.
        if pd.api.types.is_datetime64_any_dtype(raw):
            continue

        try:
            # Go through a numeric representation so that 32000, 32000.0 and
            # '032000' all normalise to the same digits.
            numeric = pd.to_numeric(raw, errors='coerce')
            # Non-integral values cannot be an MMYYYY code; treat them as missing.
            numeric = numeric.where(numeric.mod(1).eq(0))
            month_year = numeric.astype('Int64').astype('string').str.zfill(6)

            # Assign by label so the column is replaced and takes the datetime
            # dtype, rather than being written into the existing column.
            # Assumes column labels are unique.
            df[df.columns[col]] = pd.to_datetime(month_year, format='%m%Y', errors='coerce')

        except (ValueError, TypeError):
            error_col_indices.append(col)

    # Report failures instead of silently dropping them; conversion stays best-effort.
    if error_col_indices:
        warnings.warn(f'Could not convert date columns at indices {error_col_indices}')

    return df


def preprocess_booleans(df, indices_by_type):
    """
    Rewrites every boolean-typed column by passing each value through convert_to_bool.
    :param df: DataFrame with data. Modified in place.
    :param indices_by_type: Dictionary with 'bool' key pointing to list of column indices.
    :return: The same DataFrame object.
    """
    bool_positions = indices_by_type['bool']

    for position in bool_positions:
        flags = df.iloc[:, position]
        df.iloc[:, position] = flags.apply(convert_to_bool)

    return df


def convert_to_bool(x):
    """
    Maps a yes/no flag to a boolean.
    :param x: Value to convert.
    :return: True for 'Y' or 'y', False for 'N' or 'n', None for anything else.
    """
    if x in ('N', 'n'):
        return False
    if x in ('Y', 'y'):
        return True
    return None

In [6]:
# Both helpers modify sf_loan_performance in place; their return values are not needed here.
indices_by_type = get_indices_by_type(schema_list)
process_dates(sf_loan_performance, indices_by_type)
preprocess_booleans(sf_loan_performance, indices_by_type)

# Dtypes for every column that is not declared as a numeric type, for use in .astype
numeric_dtypes = ['float64', 'int64', 'Int64']
non_numeric_schema = {
    column: dtype
    for column, dtype in pandas_schema.items()
    if dtype not in numeric_dtypes
}

# Show the frame with the non-numeric schema applied.
# NOTE(review): .astype returns a new frame and the result is not assigned, so
# sf_loan_performance itself keeps its previous dtypes — confirm this is intended.
sf_loan_performance.astype(non_numeric_schema)

Unnamed: 0,Reference_Pool_ID,Loan_Identifier,Monthly_Reporting_Period,Channel,Seller_Name,Servicer_Name,Master_Servicer,Original_Interest_Rate,Current_Interest_Rate,Original_UPB,...,High_Loan_to_Value_HLTV_Refinance_Option_Indicator,Deal_Name,Repurchase_Make_Whole_Proceeds_Flag,Alternative_Delinquency_Resolution,Alternative_Delinquency_Resolution_Count,Total_Deferral_Amount,Payment_Deferral_Modification_Event_Indicator,Interest_Bearing_UPB,From_File,From_File_Numeric
0,,100036915856,2000-03-01,R,"Wells Fargo Bank, N.A.",,,7.750,7.750,135000.0,...,False,,False,,,,7,,2000Q1,2000.0
1,,100151690382,2002-04-01,R,"Jpmorgan Chase Bank, Na","Jpmorgan Chase Bank, Na",,8.125,8.125,155000.0,...,False,,False,,,,7,,2000Q1,2000.0
2,,100207354995,2010-06-01,B,"Jpmorgan Chase Bank, Na","Jpmorgan Chase Bank, Na",,7.875,7.875,74000.0,...,False,,False,,,,7,,2000Q1,2000.0
3,,100292236815,2000-08-01,R,Other,,,8.500,8.500,57000.0,...,False,,False,,,,7,,2000Q1,2000.0
4,,100393376629,2016-05-01,R,"Wells Fargo Bank, N.A.",Other,,7.625,7.625,50000.0,...,False,,False,,,,7,,2000Q1,2000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2891617,,136878170,2023-09-01,R,Other,Other,,6.125,6.125,130000.0,...,False,,False,7,,,7,,2023Q3,2023.5
2891618,,136879170,2023-09-01,R,Other,Other,,7.625,7.625,668000.0,...,False,,False,7,,,7,,2023Q3,2023.5
2891619,,136880170,2023-09-01,C,"PennyMac Loan Services, LLC","PennyMac Loan Services, LLC",,7.125,7.125,454000.0,...,False,,False,7,,,7,,2023Q3,2023.5
2891620,,136881170,2023-09-01,C,PHH Mortgage Corporation,PHH Mortgage Corporation,,6.000,6.000,293000.0,...,False,,False,7,,,7,,2023Q3,2023.5


In [12]:
# Remove columns in which every value is missing
all_missing = sf_loan_performance.isna().all(axis=0)
columns_to_drop = all_missing[all_missing].index.tolist()

sf_loan_performance.drop(columns=columns_to_drop, inplace=True)

# Remove rows that are exact duplicates of an earlier row
sf_loan_performance.drop_duplicates(inplace=True)

# Adding Target Feature: Conditional Prepayment Rate (CPR)

- Features to investigate:  
    - Zero_Balance_Code



In [13]:
# Rows whose Zero_Balance_Code equals 1, and their share of all rows
is_code_one = sf_loan_performance['Zero_Balance_Code'] == 1
prepaid = sf_loan_performance[is_code_one]

print(len(prepaid) / len(sf_loan_performance))

0.013524935140208506


# EDA

In [16]:
# Write the first 1000 rows to a CSV file (row index omitted).
sf_loan_performance_1000 = sf_loan_performance.iloc[:1000]
sf_loan_performance_1000.to_csv("sample_1000_rows.csv", index=False)