In [7]:
import pandas as pd
from pandas_schema import pandas_schema

In [8]:
columns = list(pandas_schema.keys())
datatypes = list(pandas_schema.values())
schema_list = list(zip(columns, datatypes))
file_path = '/Users/matthewtryba/Desktop/subsampled_data.csv'
sf_loan_performance = pd.read_csv(file_path, sep='|', header=None, names=columns, low_memory=False)

## Pre-Processing Functions

In [9]:
def get_indices_by_type(schema_list):
    """
    Creates dictionary of set of datatypes in pandas_schema
    """
    indices_by_type = {datatype:[] for datatype in list(set(pandas_schema.values()))}

    for key,_ in indices_by_type.items():
        for i,_ in enumerate(schema_list):
            if key in schema_list[i][1]:
                indices_by_type[key].append(i)
    
    return indices_by_type


def process_dates(df, indices_by_type):
    """
    Converts date columns from MMYYYY to YYYY-MM-DD format, safely handling NaN values.
    :param df: DataFrame with data.
    :param indices_by_type: Dictionary with 'datetime64[ns]' key pointing to list of column indices.
    """
    error_col_indices = []

    for col in indices_by_type['datetime64[ns]']:
        try:
            # Direct conversion to string and zero-filling
            df.iloc[:, col] = df.iloc[:, col].astype(str).str.zfill(6)

            # Convert to datetime format, specifying the original format to speed up parsing
            df.iloc[:, col] = pd.to_datetime(df.iloc[:, col], format='%m%Y', errors='coerce')
        
        except ValueError:
            error_col_indices.append(col)
        
    # Optionally, log errors (if frequent):
    # if error_col_indices:
    #     print(f'Errors on columns  {error_col_indices}')
    
    return df


def preprocess_booleans(df, indices_by_type):
    """
    Converts columns with "Y" and "N" values to boolean. All other values are set to NULL.
    :param df: DataFrame with data.
    :param bool_columns: List of columns indices to be converted.
    """
    for col in indices_by_type['bool']:
        df.iloc[:,col] = df.iloc[:,col].apply(convert_to_bool)

    return df


def convert_to_bool(x):
    if x == 'N' or x == 'n':
        return False
    elif x == "Y" or x == 'y':
        return True
    else:
        return None    

# use the pandas schema instead!!! it will be more efficient
def cast_types(df, indices_by_type):
    for type in indices_by_type: #.remove(['bool', 'datetime64[ns]']):
        for col in indices_by_type[type]:
            df.iloc[:, col] = df.iloc[:, col].astype(type)

In [10]:
indices_by_type = get_indices_by_type(schema_list)
process_dates(sf_loan_performance, indices_by_type)

Unnamed: 0,Reference_Pool_ID,Loan_Identifier,Monthly_Reporting_Period,Channel,Seller_Name,Servicer_Name,Master_Servicer,Original_Interest_Rate,Current_Interest_Rate,Original_UPB,...,ARM_Plan_Number,Borrower_Assistance_Plan,High_Loan_to_Value_HLTV_Refinance_Option_Indicator,Deal_Name,Repurchase_Make_Whole_Proceeds_Flag,Alternative_Delinquency_Resolution,Alternative_Delinquency_Resolution_Count,Total_Deferral_Amount,Payment_Deferral_Modification_Event_Indicator,Interest_Bearing_UPB
0,,100936387698,2002-07-01 00:00:00,R,Usaa Federal Savings Bank,Other,,8.0,8.0,140000.0,...,,,N,,,,,,7,
1,,100416816452,2000-07-01 00:00:00,R,Other,,,8.0,8.0,102000.0,...,,,N,,,,,,7,
2,,100399640918,2000-06-01 00:00:00,R,Suntrust Mortgage Inc.,,,7.625,7.625,164000.0,...,,,N,,,,,,7,
3,,100394926109,2004-08-01 00:00:00,B,Other,Other,,8.0,8.0,90000.0,...,,,N,,,,,,7,
4,,100650934438,2002-06-01 00:00:00,R,"Jpmorgan Chase Bank, Na","Jpmorgan Chase Bank, Na",,7.625,7.625,205000.0,...,,,N,,,,,,7,


In [11]:
preprocess_booleans(sf_loan_performance, indices_by_type)


Unnamed: 0,Reference_Pool_ID,Loan_Identifier,Monthly_Reporting_Period,Channel,Seller_Name,Servicer_Name,Master_Servicer,Original_Interest_Rate,Current_Interest_Rate,Original_UPB,...,ARM_Plan_Number,Borrower_Assistance_Plan,High_Loan_to_Value_HLTV_Refinance_Option_Indicator,Deal_Name,Repurchase_Make_Whole_Proceeds_Flag,Alternative_Delinquency_Resolution,Alternative_Delinquency_Resolution_Count,Total_Deferral_Amount,Payment_Deferral_Modification_Event_Indicator,Interest_Bearing_UPB
0,,100936387698,2002-07-01 00:00:00,R,Usaa Federal Savings Bank,Other,,8.0,8.0,140000.0,...,,,False,,,,,,7,
1,,100416816452,2000-07-01 00:00:00,R,Other,,,8.0,8.0,102000.0,...,,,False,,,,,,7,
2,,100399640918,2000-06-01 00:00:00,R,Suntrust Mortgage Inc.,,,7.625,7.625,164000.0,...,,,False,,,,,,7,
3,,100394926109,2004-08-01 00:00:00,B,Other,Other,,8.0,8.0,90000.0,...,,,False,,,,,,7,
4,,100650934438,2002-06-01 00:00:00,R,"Jpmorgan Chase Bank, Na","Jpmorgan Chase Bank, Na",,7.625,7.625,205000.0,...,,,False,,,,,,7,


In [12]:
cast_types(sf_loan_performance, indices_by_type)
sf_loan_performance.head()

KeyboardInterrupt: 