# Transforming the data, so that a model can be trained

We need to transform the dataframe so that each row represents twelve months of information.

This is achieved by creating one dataframe per borrower id. Each of those dataframes is split into twelve-month windows, and every window is turned into a single row for which the result of the loan is known.

For example, rows 12 to 23 may form one window. The values of a randomly chosen row from this range (any row except the last) are copied into a new row, and the final result, taken from the last row of the window, is added as an extra column.

Each new row is added to a new dataset, which is then split into training and testing data. Cross-validation may be implemented later.

In [10]:
import random

import pandas as pd
import numpy as np

# Load the raw mortgage sample into a dataframe
path_to_csv = "../data/mortgage_sample.csv"
df = pd.read_csv(filepath_or_buffer=path_to_csv)

In [3]:
# Select every row that belongs to one borrower
def create_subset(df, borrower_id):
    """Return the rows of ``df`` whose ``id`` column equals ``borrower_id``.

    The original index labels and column order are kept; an unknown id
    yields an empty dataframe with the same columns.
    """
    belongs_to_borrower = df["id"] == borrower_id
    return df[belongs_to_borrower]

# Split subset into consecutive windows of `window_length` rows (the last may be shorter)
def split_subset(subset_df, window_length=12):
    """Cut ``subset_df`` into consecutive, non-overlapping row windows.

    Args:
        subset_df: Dataframe to split; rows are taken in their current order.
        window_length: Number of rows per window. Defaults to 12.

    Returns:
        A list of dataframes. Every window has ``window_length`` rows except
        possibly the last one, which holds the remaining rows. An empty
        input yields an empty list.

    Raises:
        ValueError: If ``window_length`` is smaller than 1.
    """
    if window_length < 1:
        raise ValueError("window_length must be a positive integer")

    subset_len = subset_df.shape[0]

    # Step by window_length (not a fixed 12) so windows never overlap or
    # skip rows. iloc slicing clamps at the end of the frame, so the final
    # window is simply shorter and needs no special case.
    return [
        subset_df.iloc[start:start + window_length]
        for start in range(0, subset_len, window_length)
    ]
        
def create_usable_row(window_df, include_result=True):
    """Collapse one window into a single-row dataframe.

    One row is drawn at random from all rows of the window except the last.
    When ``include_result`` is true, the ``default_time`` value of the
    window's last row is attached to the drawn row as a ``result`` column.

    NOTE(review): the original comment describes the result coding as
    1 = default, 2 = payoff, 0 = nothing changed. Whether ``default_time``
    actually carries all three codes cannot be seen from here -- confirm
    against the dataset documentation.

    Args:
        window_df: Rows of one window, in chronological order (assumed).
        include_result: Whether to add the ``result`` column.

    Returns:
        A one-row dataframe that keeps the drawn row's index label, or
        ``None`` when the window has two rows or fewer.
    """
    window_len = window_df.shape[0]
    if window_len <= 2:
        # Too short to be used; callers must skip the None.
        return None

    # randrange(n - 1) draws from 0..n-2, so the last row (which supplies
    # the result) is never the one that is sampled.
    chosen_position = random.randrange(window_len - 1)
    # Copy so that adding the result column never touches the caller's data.
    chosen_row = window_df.iloc[[chosen_position]].copy()

    if include_result:
        chosen_row["result"] = window_df["default_time"].iloc[-1]

    return chosen_row

In [11]:
# Discard every row that contains at least one missing value
df = df.dropna(how="any")
# df = df[df["sample"] == "public"]

In [7]:
unique_ids = list(df.id.unique())

# Gather the single-row frames in a list and concatenate once at the end:
# DataFrame.append no longer exists in pandas 2.x, and growing a dataframe
# row by row copies it on every iteration.
usable_rows = []

for borrower_id in unique_ids:
    subset_df = create_subset(df=df, borrower_id=borrower_id)
    for window_df in split_subset(subset_df=subset_df):
        usable_df = create_usable_row(window_df=window_df)
        # Windows that are too short produce None and are skipped.
        if usable_df is not None:
            usable_rows.append(usable_df)

# pd.concat raises on an empty list, so fall back to an empty dataframe.
output_df = pd.concat(usable_rows, ignore_index=True) if usable_rows else pd.DataFrame()


In [8]:
# Persist the transformed dataset as a CSV file
path_to_output = "../data/mortgage_default_transformed.csv"
output_df.to_csv(path_or_buf=path_to_output)