In [1]:
import pandas as pd
import numpy as np
from pandas_schema import pandas_schema

In [2]:
# Ordered (column name, dtype) pairs from the project schema, plus the two
# parallel lists derived from them.
schema_list = list(pandas_schema.items())
columns = [name for name, _ in schema_list]
datatypes = [dtype for _, dtype in schema_list]

file_path = '/Users/matthewtryba/Desktop/subsampled_data_0.1225_pct_.csv'
#file_path = "Y:\\FannieMaeMortgageData\\subsampled_data_00125.csv"

# Read as pipe-delimited with no header row; column names come from the schema.
sf_loan_performance = pd.read_csv(
    file_path,
    sep='|',
    header=None,
    names=columns,
    low_memory=False,
)

In [None]:
# # Check Column numbers against fannie-mae-loan-layout-and-glossary.pdf
# i = 0
# for col in columns:
#     i+=1
#     print(i, col)

## ETL

In [None]:
def get_indices_by_type(schema_list):
    """
    Groups column positions by their declared datatype.

    The grouping is derived only from ``schema_list`` and uses exact equality on
    the datatype, so e.g. 'bool' and 'boolean' end up under separate keys.

    :param schema_list: Ordered list of (column_name, datatype) pairs; the position
        of a pair in the list is the column index that gets recorded.
    :return: Dictionary mapping each datatype found in schema_list to the list of
        column indices (ascending) declared with that datatype.
    """
    indices_by_type = {}

    # Single pass: file each position under its own datatype.
    for i, (_, datatype) in enumerate(schema_list):
        indices_by_type.setdefault(datatype, []).append(i)

    return indices_by_type


def process_dates(df, indices_by_type):
    """
    Converts date columns from MMYYYY to datetime values, modifying df in place.

    Each value is normalised to a six-character MMYYYY string before parsing:
    a trailing ".0" (present when the column was read as float64, e.g. because
    it contains NaN) is stripped and a missing leading zero is restored.
    Values that still do not parse, including NaN, become NaT.
    Columns that already hold datetimes are skipped, so calling this function
    again on the same frame does not wipe previously converted dates.

    :param df: DataFrame with data. Modified in place.
    :param indices_by_type: Dictionary with 'datetime64[ns]' key pointing to list of column indices.
        A missing key is treated as "no date columns".
    :return: The same DataFrame object that was passed in.
    """
    import warnings  # local import: only needed to report failed columns

    error_col_indices = []

    for col in indices_by_type.get('datetime64[ns]', []):
        raw = df.iloc[:, col]

        # Already converted (e.g. the cell was re-run): leave untouched.
        if pd.api.types.is_datetime64_any_dtype(raw):
            continue

        try:
            # '12020.0' -> '12020' -> '012020'; 'nan' stays unparsable and is coerced to NaT.
            mmyyyy = (
                raw.astype(str)
                .str.strip()
                .str.replace(r'\.0$', '', regex=True)
                .str.zfill(6)
            )

            # Explicit format speeds up parsing and avoids ambiguous inference.
            parsed = pd.to_datetime(mmyyyy, format='%m%Y', errors='coerce')
        except ValueError:
            error_col_indices.append(col)
            continue

        # Replace the column by label so it takes the datetime dtype; positional
        # .iloc assignment may keep the old dtype.
        # Assumes column labels are unique.
        df[df.columns[col]] = parsed

    if error_col_indices:
        warnings.warn(f'Could not convert date columns at indices {error_col_indices}')

    return df


def preprocess_booleans(df, indices_by_type):
    """
    Rewrites the boolean-typed columns of df in place, passing every value
    through convert_to_bool.

    :param df: DataFrame with data. Modified in place.
    :param indices_by_type: Dictionary with 'bool' key pointing to list of column indices.
    :return: The same DataFrame object that was passed in.
    """
    bool_positions = indices_by_type['bool']

    for position in bool_positions:
        converted = df.iloc[:, position].apply(convert_to_bool)
        df.iloc[:, position] = converted

    return df


def convert_to_bool(x):
    """
    Maps a single raw flag value to True, False or None.

    'Y'/'y' -> True and 'N'/'n' -> False. Values that are already booleans are
    returned unchanged, so re-applying the conversion to a converted column
    does not erase it. Everything else (NaN, pd.NA, None, numbers, other
    strings) maps to None.

    :param x: Raw cell value.
    :return: True, False, or None when the value is not a recognised flag.
    """
    # Pass through values that were already converted.
    if isinstance(x, (bool, np.bool_)):
        return bool(x)

    # Only compare strings: avoids the ambiguous truth value of `pd.NA == 'N'`.
    if isinstance(x, str):
        if x in ('N', 'n'):
            return False
        if x in ('Y', 'y'):
            return True

    return None

In [None]:
indices_by_type = get_indices_by_type(schema_list)
process_dates(sf_loan_performance, indices_by_type)
preprocess_booleans(sf_loan_performance, indices_by_type)

# Create dictionary of non-numeric features to be used in .astype
non_numeric_schema = {
    key: value
    for key, value in pandas_schema.items()
    if value not in ['float64', 'int64', 'Int64']
}

# Plain 'bool' cannot hold missing values: casting would silently turn None
# into False and NaN into True. Cast those columns to pandas' nullable
# 'boolean' dtype instead so unknown flags stay missing.
cast_schema = {
    column: ('boolean' if dtype == 'bool' else dtype)
    for column, dtype in non_numeric_schema.items()
}

# Set schema for non-numeric features.
# astype returns a new DataFrame, so the result must be assigned back;
# otherwise the cast is computed, displayed, and thrown away.
sf_loan_performance = sf_loan_performance.astype(cast_schema)

sf_loan_performance.head()

In [None]:
# # Drop columns with 100% missing values
# columns_to_drop = []

# for col in sf_loan_performance.columns:

#     # Check if total na values is equal to length of dataframe
#     if sf_loan_performance[col].isna().sum() == sf_loan_performance.shape[0]:
#         columns_to_drop.append(col)

# sf_loan_performance.drop(columns=columns_to_drop, inplace=True)

# # Drop duplicate Rows
# sf_loan_performance.drop_duplicates(inplace=True)

# Adding Target Feature: Conditional Prepayment Rate (CPR)

- Features to investigate:  
    - Zero_Balance_Code



In [None]:
# Show the dtype the Zero_Balance_Code column currently has.
print(sf_loan_performance.loc[:, 'Zero_Balance_Code'].dtype)

In [None]:
# Rows whose Zero_Balance_Code equals 1.
# NOTE(review): presumably code 1 marks prepaid loans; confirm against the
# glossary, and confirm the column is numeric so the comparison with 1 matches.
prepaid = sf_loan_performance.loc[sf_loan_performance['Zero_Balance_Code'].eq(1)]

# Frequency of each Zero_Balance_Code value across the full frame.
sf_loan_performance['Zero_Balance_Code'].value_counts()

# EDA

In [None]:
# Summary statistics for the loan-performance frame, using describe() defaults.
sf_loan_performance.describe()