# Can household characteristics predict who is having trouble with bills?

In [63]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

## Functions

In [34]:
#function from Kenny
def resample_data(data):
    # calculate the number of times to duplicate each sample
    weights_scaled = ((data['NWEIGHT']/data['NWEIGHT'].min())).astype(int)
    # duplicate the original indices based on weights_scaled
    resampled_idx = data.index.repeat(weights_scaled.values)
    # create dummy dataframe with duplicated index and join original data
    resampled_data = pd.DataFrame(index=resampled_idx, columns=['dummy']).join(data)
    # delete dummy column and reset index
    resampled_data = resampled_data.drop('dummy', axis=1).reset_index(drop=True)
    return resampled_data

In [49]:
df = pd.read_csv("recs2015_public_v4.csv")
df.shape

(5686, 759)

In [50]:
#remove columns with imputation codes
impute_columns = []
cols = df.columns

#Columns with impute binary code all start with Z
for i in enumerate(cols.str.contains('Z')):
    if i[1] == True:
        impute_columns.append(cols[i[0]])
        
df = df.loc[:, ~df.columns.isin(impute_columns)]
df.shape

(5686, 528)

In [48]:
# I want to try running the model without resampling to test whether or not 
# the expansion of the dataset is hindering our ability to predict without overfitting.
# Potentially there is some unintentional induced reinforcement that is causing overfit
# (perfect predictions in the trainset).

#df = resample_data(df)
#df.shape

In [51]:
# reorganizing the categories in the three columns that were identified as indicating  
# hardship so that they are ordinal
code_xform = {0:0, 1:3, 2:2, 3:1}

df['SCALEEf'] = df['SCALEE'].map(code_xform)
df['SCALEGf'] = df['SCALEG'].map(code_xform)
df['SCALEBf'] = df['SCALEB'].map(code_xform)

In [52]:
mask = (
    (df['SCALEEf'] > 0) |
    (df['SCALEGf'] > 0) |
    (df['SCALEBf'] > 0) 
)

# Defining a simple HARDSHIP variable for modeling
df.loc[:, 'HARDSHIP'] = 0
df.loc[mask, 'HARDSHIP'] = 1

In [53]:
df['HARDSHIP'].value_counts()

0    4116
1    1570
Name: HARDSHIP, dtype: int64

In [79]:
for col in df.select_dtypes('object').columns:
    hold = pd.get_dummies(df[col], prefix='{}'.format(col))
    df = pd.concat([df, hold], axis=1)

In [80]:
df.shape

(5686, 554)

In [98]:
df.dropna(inplace=True)

In [99]:
targets = ['SCALEEf', 'SCALEGf', 'SCALEBf', 'HARDSHIP']
drop_cols = [i for i in df.select_dtypes('object').columns] + targets
X = df.drop(drop_cols, axis=1)
Y = df['HARDSHIP']

# splitting the dataset into three subsets so that overfit can be evaluated and handled
# without there being any bleed over into the testset
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=1)
X_train, X_val, Y_train, Y_val = train_test_split(X_train, Y_train, test_size=0.3, random_state=1)

print('Training Rows: {}'.format(len(X_train)))
print('Validation Rows: {}'.format(len(X_val)))
print('Test Rows: {}'.format(len(X_test)))

Training Rows: 1618
Validation Rows: 694
Test Rows: 992


In [106]:
rfc = RandomForestClassifier()
rfc.fit(X_train, Y_train)

Y_train_pred = rfc.predict(X_train)
Y_val_pred = rfc.predict(X_val)

print('Training Classification Metrics')
print(confusion_matrix(Y_train, Y_train_pred))
print(classification_report(Y_train, Y_train_pred))

print('Validation Classification Metrics')
print(confusion_matrix(Y_val, Y_val_pred))
print(classification_report(Y_val, Y_val_pred))

Training Classification Metrics
[[1193    0]
 [   2  423]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1193
           1       1.00      1.00      1.00       425

   micro avg       1.00      1.00      1.00      1618
   macro avg       1.00      1.00      1.00      1618
weighted avg       1.00      1.00      1.00      1618

Validation Classification Metrics
[[522   8]
 [ 37 127]]
              precision    recall  f1-score   support

           0       0.93      0.98      0.96       530
           1       0.94      0.77      0.85       164

   micro avg       0.94      0.94      0.94       694
   macro avg       0.94      0.88      0.90       694
weighted avg       0.94      0.94      0.93       694



