In [30]:
# Author: Alex Trostanovsky, 31-08-2018
#
# This notebook contains the Exploratory Data Analysis I've conducted on Lending Club data
# Using third party data sources from:
# -  United States Department of Labor
# -  International Labour Organization
#
# This notebook:
#
# 1) categorizes 'emp_title' data into discrete categories outlined by the International Labour Organization's (ILO) 
#    International Standard Classification of Occupations (ISCO)
#
# 2) splits into training/testing datasets, and trains a LightGBM classifier to identify loan candidates who are likely
#    to default (i.e. 'Charged Off', 'Default', or 'Late (31-120 days)')
#
# 3) produces a cross-validation (ROC AUC) metric on the testing dataset with the trained model
#
# 4) generates a '_feature_importances' table for the trained model

from contextlib               import contextmanager
from lightgbm                 import LGBMClassifier
from lightgbm                 import LGBMRegressor
from sklearn.metrics          import roc_auc_score, roc_curve
from sklearn.metrics          import mean_squared_error
from sklearn.model_selection  import KFold, StratifiedKFold
from sklearn.cross_validation import train_test_split
from math                     import sqrt
from tqdm                     import tqdm

import numpy as np
import pandas as pd
import gc
import time
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [31]:
import string

invalidChars = set(string.punctuation)


def sanitize(job):
    """Replace punctuation in a job title with spaces.

    Every character of ``job`` that appears in ``invalidChars`` (the
    characters of ``string.punctuation``) is swapped for a single space;
    all other characters, digits included, are kept as they are.

    If the resulting string is at most three characters long, its spaces
    are removed as well, so e.g. 'R.N' becomes 'RN'. Longer strings keep
    the spaces that replaced the punctuation.
    """
    cleaned = ''.join(' ' if ch in invalidChars else ch for ch in job)

    # longer titles are returned with their spacing intact
    if len(cleaned) > 3:
        return cleaned

    # short strings: squeeze out every space
    return ''.join(piece for piece in cleaned.split(' ') if piece)

In [32]:
# expand acronyms found in the self-developed `acronyms` dict into their explicit strings

def explicate(occ):
    """Expand known acronyms inside an occupation string.

    ``occ`` is split on single spaces. Every token that is a key of the
    global ``acronyms`` mapping is swapped for its mapped value, all other
    tokens pass through untouched, and the tokens are re-joined with
    single spaces.
    """
    expanded = (
        acronyms[token] if token in acronyms else token
        for token in occ.split(' ')
    )
    return ' '.join(expanded)

In [33]:
# read in globally declared dicts necessary for translation and lookups performed in data preprocessing

import ast

def _read_literal(path, strip_prefix=None):
    """Return the Python literal stored in the text file at ``path``.

    When ``strip_prefix`` is given, every occurrence of that text is removed
    from the file contents before they are handed to ``ast.literal_eval``.
    """
    with open(path) as infile:
        text = infile.read()
    if strip_prefix is not None:
        text = text.replace(strip_prefix, '')
    return ast.literal_eval(text)


# widely used acronyms in LC data
# (presumably the file is written as an assignment, hence the stripped 'acronyms = ' text)
acronyms = _read_literal('../data/lc/acronyms.txt', strip_prefix='acronyms = ')

# top 200 most common 'emp_title' strings in LC data with their corresponding 2 digit isco_08 code
sub_major_dict = _read_literal('../data/lc/top200occs_lc.txt')

# all 2-digit sub-major isco_08 codes with their corresponding titles
sub_major_title_dict = _read_literal('../data/third_party/sub_major_titles.txt')

In [34]:
# keeps only the columns that the train and test dataframes have in common

def filter_common_columns(train_df, test_df):
    """Drop, in place, every column that is not present in both frames.

    Columns found only in ``test_df`` are dropped from ``test_df`` and
    columns found only in ``train_df`` are dropped from ``train_df``.
    Both frames are mutated (``inplace=True``), and the same two objects
    are returned as ``(train_df, test_df)``.
    """
    train_cols = set(train_df.columns)
    test_cols = set(test_df.columns)

    # dict.fromkeys keeps first-seen order while collapsing repeated labels
    test_only = [c for c in dict.fromkeys(test_df.columns) if c not in train_cols]
    train_only = [c for c in dict.fromkeys(train_df.columns) if c not in test_cols]

    if test_only:
        test_df.drop(columns = test_only, inplace = True)
    if train_only:
        train_df.drop(columns = train_only, inplace = True)

    return train_df, test_df

In [35]:
# Plot feature importance and save the figure
def display_importances(feature_importance_df_):
    """Save a bar plot of the 40 features with the highest mean importance.

    ``feature_importance_df_`` must provide 'feature' and 'importance'
    columns. Importances are averaged per feature to pick the top 40;
    every row belonging to those features is then passed to seaborn's
    ``barplot``, ordered by descending importance. The figure is written
    to 'lgbm_importances01.png' in the working directory.
    """
    mean_importance = (
        feature_importance_df_[["feature", "importance"]]
        .groupby("feature")
        .mean()
        .sort_values(by="importance", ascending=False)
    )
    top_features = mean_importance[:40].index

    is_top = feature_importance_df_.feature.isin(top_features)
    plot_rows = feature_importance_df_.loc[is_top].sort_values(by="importance", ascending=False)

    plt.figure(figsize=(8, 10))
    sns.barplot(x="importance", y="feature", data=plot_rows)
    plt.title('LightGBM Features (avg over folds)')
    plt.tight_layout()
    plt.savefig('lgbm_importances01.png')

In [36]:
# read in all files other than 2007-2011 
#(07-11 df contains employee titles which describe the actual organization name (KFC)
# as opposed to the occupation type (deep fryer))
#
# *** would be interesting to conduct a similar categorization on 2007-2011 dataset based on organization names and types

import os

files = os.listdir('../data/lc/secure')

#files = os.listdir('../data/YOUR_LC_DATA_DIRECTORY')

# Read every export except the 2007-2011 file and stack them into one frame.
# The per-file frames are collected in a list and concatenated once:
# DataFrame.append inside the loop re-copied the accumulated frame on every
# iteration and no longer exists in pandas >= 2.0.
frames = []
# os.listdir returns names in arbitrary order; sort so the row order (and the
# regenerated index) is the same on every run
for name in sorted(files):
    if name != 'LoanStats3a_securev1.csv':
        print('reading ' + name)
        # skip the first line of the file; the column header is read from the second line
        temp = pd.read_csv('../data/lc/secure/' + name, skiprows = [0])
        frames.append(temp)

# pd.concat raises on an empty list, so fall back to an empty frame as before
df = pd.concat(frames, ignore_index = True) if frames else pd.DataFrame()

reading LoanStats3b_securev1.csv


  interactivity=interactivity, compiler=compiler, result=result)


reading LoanStats3c_securev1.csv


  interactivity=interactivity, compiler=compiler, result=result)


reading LoanStats3d_securev1.csv


  interactivity=interactivity, compiler=compiler, result=result)


reading LoanStats_securev1_2016Q1.csv
reading LoanStats_securev1_2016Q2.csv
reading LoanStats_securev1_2016Q3.csv
reading LoanStats_securev1_2016Q4.csv
reading LoanStats_securev1_2017Q1.csv


  interactivity=interactivity, compiler=compiler, result=result)


reading LoanStats_securev1_2017Q2.csv


  interactivity=interactivity, compiler=compiler, result=result)


reading LoanStats_securev1_2017Q3.csv


  interactivity=interactivity, compiler=compiler, result=result)


reading LoanStats_securev1_2017Q4.csv


  interactivity=interactivity, compiler=compiler, result=result)


reading LoanStats_securev1_2018Q1.csv


  interactivity=interactivity, compiler=compiler, result=result)


reading LoanStats_securev1_2018Q2.csv


In [37]:
# tally how many loans fall under each raw 'loan_status' value
df['loan_status'].value_counts()

Current               902986
Fully Paid            788945
Charged Off           202993
Issued                 27596
Late (31-120 days)     20178
In Grace Period        11372
Late (16-30 days)       6179
Default                 1278
Name: loan_status, dtype: int64

In [38]:
print('translating employee titles')
    
# NOTE(review): categorize_emp_titles is not defined in any cell visible in this part of the
# notebook; it has to exist in the kernel before this cell runs.
# Presumably it adds the 'job' column that is read below - TODO confirm.
df = categorize_emp_titles(df)

# translate (categorize) 'emp_title' column
# Series.replace with a dict swaps values that match a key and leaves every other value as it is;
# rows removed by dropna() come back as NaN when the result is aligned on assignment.
df['sub_mjr_grp_isco_08_code']  = df['job'].dropna().replace(sub_major_dict)
# same lookup again, this time from the code column to the titles held in sub_major_title_dict
df['sub_mjr_grp_isco_08_title'] = df['sub_mjr_grp_isco_08_code'].dropna().replace(sub_major_title_dict)

translating employee titles


100%|██████████| 1831568/1831568 [00:01<00:00, 1027815.21it/s]
100%|██████████| 1831568/1831568 [00:04<00:00, 415887.30it/s]
100%|██████████| 1831568/1831568 [00:03<00:00, 577778.00it/s]


In [39]:
# get rid of all rows which were not categorized
# (assumes categorized rows hold an all-digit code string, while uncategorized rows still
#  hold their original text - TODO confirm against sub_major_dict)

# True where the code consists only of numeric characters; rows without a code are removed by
# dropna() and therefore end up as NaN in the new column
df['isnumeric'] = df['sub_mjr_grp_isco_08_code'].dropna().str.isnumeric()
# NaN == True evaluates to False, so rows without a code are filtered out as well
df = df[df['isnumeric'] == True]
# no drop=True: the previous index is kept as a new 'index' column
df.reset_index(inplace = True)

In [40]:
# code 'Charged Off', 'Default' and 'Late (31-120 days)' as 'target' == 1 (i.e. defaulted on loan)
# and every other status as 0

default_statuses = ['Charged Off', 'Default', 'Late (31-120 days)']
df['status'] = df['loan_status'].isin(default_statuses)
df['target'] = df['status'].astype(int)

# remove the helper columns and the raw columns that are no longer needed

droplist = ['status', 'isnumeric', 'sub_mjr_grp_isco_08_code', 'job', 'emp_title', 'index', 'loan_status']
df.drop(columns = droplist, inplace = True)

In [41]:
# export df prior to preprocessing of 3rd party data sources
# (index is not disabled, so to_csv also writes the row index as the first column)

df.to_csv('../data/exports/coded_emp_target.csv')