In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt
import seaborn as sns

# import missingno as msno
# import pandas_profiling

In [2]:
# Path (relative to the notebook's working directory) of the gzip-compressed
# CSV that the rest of the notebook samples from.
filepath = "./Completed Loans_Outliers Clipped.csv.gz"
data_completed = pd.read_csv(
    filepath,
    compression="gzip",
)

In [3]:
# Quick sanity check: display the (rows, columns) shape of the loaded frame.
data_completed.shape

(1345774, 55)

In [4]:
def sampling(df, by = ['issue_d', 'grade'], imbalance = 'loan_status', fraction = 0.1, random_seed = 1, compromise_size = True, default_label = 'Default'):
    """Draw a stratified random sample from ``df`` while countering class imbalance.

    ``df`` is grouped by the ``by`` columns. For every subgroup a target size of
    ``int(len(subgroup) * fraction)`` rows is computed, and the function tries to
    fill half of that target with rows whose ``imbalance`` column equals
    ``default_label`` and the other half with all remaining rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to sample from. Any index is supported (it does not have to be the
        default 0..n-1 range, and labels do not have to be unique).
    by : str or list of str, default ['issue_d', 'grade']
        Column(s) that define the subgroups.
    imbalance : str, default 'loan_status'
        Column holding the class label to balance on.
    fraction : float, default 0.1
        Share of each subgroup targeted for sampling.
    random_seed : int, default 1
        Seed passed as ``random_state`` to every ``DataFrame.sample`` call.
    compromise_size : bool, default True
        Only matters when a subgroup has too few ``default_label`` rows to fill
        half of its target. If True, draw just as many other rows as there are
        ``default_label`` rows (balanced, but smaller than the target). If False,
        top the subgroup up to the full target with other rows (full size, but
        not balanced).
    default_label : str, default 'Default'
        Value of ``imbalance`` that marks the minority class.

    Returns
    -------
    pandas.DataFrame
        The sampled rows, subgroup by subgroup (``default_label`` rows first
        within each subgroup), keeping their original index labels. Empty (same
        columns as ``df``) when nothing is sampled.
    """
    sampled_parts = []

    # Iterating the GroupBy yields every sub-frame directly, so no key lookup is needed.
    for _, df_grp in df.groupby(by):
        df_def = df_grp[df_grp[imbalance] == default_label]
        df_paid = df_grp[df_grp[imbalance] != default_label]

        def_size = len(df_def)

        # Number of rows targeted for this subgroup, and the per-class half of it.
        grp_sample_size = int(len(df_grp) * fraction)
        half_size = grp_sample_size // 2

        if def_size <= half_size:
            # Too few minority rows to fill their half: keep all of them.
            part_def = df_def
            if compromise_size:
                # Stay balanced -> this subgroup contributes fewer rows than targeted.
                paid_size = def_size
            else:
                # Keep the targeted size -> the remainder is filled with other rows.
                paid_size = grp_sample_size - def_size
        else:
            # Enough minority rows: split the target evenly between the two classes.
            part_def = df_def.sample(n = half_size, random_state = random_seed)
            paid_size = half_size

        # Never request more rows than exist; an unclamped request makes
        # DataFrame.sample raise when a subgroup is dominated by the minority class.
        paid_size = min(paid_size, len(df_paid))
        part_paid = df_paid.sample(n = paid_size, random_state = random_seed)

        # Skip empty pieces so the final concat only sees real data.
        sampled_parts.extend(part for part in (part_def, part_paid) if len(part) > 0)

    if not sampled_parts:
        return df.iloc[0:0].copy()

    # Concatenating the sampled sub-frames avoids any label/position lookup on
    # ``df`` (index labels are not row positions unless the index is 0..n-1).
    return pd.concat(sampled_parts)

In [5]:
# Draw the sample with sampling()'s defaults: grouped by issue_d and grade,
# fraction = 0.1, and compromise_size = True (balanced, possibly smaller than 10%).
data_sample = sampling(data_completed)
# Display the resulting (rows, columns) shape.
data_sample.shape

(132894, 55)

In [None]:
# Variant with compromise_size = False: subgroups with too few 'Default' rows are
# topped up with non-default rows so each subgroup still reaches its target size.
data_sample_no_compromise = sampling(data_completed, compromise_size = False)
# Display the resulting (rows, columns) shape.
data_sample_no_compromise.shape

### Save Sampled Dataset

In [6]:
# Persist the sample as a gzip-compressed CSV; the DataFrame index is not written.
data_sample.to_csv(
    'Sampled Dataset_Balanced.csv.gz',
    compression = 'gzip',
    index = False,
)

In [None]:
# Persist the compromise_size = False sample as a gzip-compressed CSV (index not written).
# NOTE(review): the file name says "Size Compromised", yet this frame was built with
# compromise_size = False (size kept, balance relaxed) -- confirm the intended name
# before changing it, since other notebooks may read this path.
data_sample_no_compromise.to_csv(
    'Sampled Dataset_Balanced (Size Compromised).csv.gz',
    compression = 'gzip',
    index = False,
)

##### Save Default & Fully-Paid Observations From Sampled Dataset

In [7]:
# Write only the sampled rows whose loan_status is 'Default' (gzip CSV, index not written).
data_sample.loc[data_sample['loan_status'] == 'Default'].to_csv(
    'Sampled Dataset_Default.csv.gz',
    compression = 'gzip',
    index = False,
)

In [8]:
# Write every sampled row whose loan_status is anything other than 'Default'
# (gzip CSV, index not written).
data_sample.loc[data_sample['loan_status'] != 'Default'].to_csv(
    'Sampled Dataset_Fully Paid.csv.gz',
    compression = 'gzip',
    index = False,
)

### Save Holdout Dataset

In [9]:
# Holdout = every row of the full dataset whose index label was NOT drawn into the sample.
# Membership is tested on index labels and applied as a boolean mask, because
# labels are only valid row positions for `.iloc` when the index is the default
# 0..n-1 range. The mask also keeps the rows in their original order instead of
# relying on set iteration order.
holdout_mask = ~data_completed.index.isin(data_sample.index)
# Kept as a plain list of index labels, as before.
index_holdout = data_completed.index[holdout_mask].tolist()
data_holdout = data_completed.loc[holdout_mask]

In [11]:
# Save a seeded random subset of the holdout rows. Its size is 5% of the FULL
# dataset's row count (not 5% of the holdout itself). Gzip CSV, index not written.
(
    data_holdout
    .sample(n = int(0.05 * len(data_completed)), random_state = 1)
    .to_csv('Holdout Dataset.csv.gz', compression = 'gzip', index = False)
)