In [None]:
#%matplotlib notebook
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
matplotlib.style.use('ggplot')
import random
from sklearn import svm, ensemble
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, BaggingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, auc, precision_recall_curve
from sklearn.preprocessing import StandardScaler

In [None]:
# Human-readable label for every dataset column and every derived "bins_*"
# column. go() uses the label as the histogram x-axis text, and histo_plot()
# also uses it as the file name of the saved PNG ("results/<label>.png").
ATTRIBUTE_DICT = {
    "SeriousDlqin2yrs": "Has Had Serious Delinquency in Past 2 Years",
    "NumberOfTimes90DaysLate": "Number of Times Person was 90+ Days Late",
    "RevolvingUtilizationOfUnsecuredLines": "Credit Card Usage",
    "NumberOfTime30-59DaysPastDueNotWorse": "Number of Times 30-59 Days Past Due",
    "DebtRatio": "Debt Ratio",
    "NumberOfDependents": "Number of Dependents",
    "MonthlyIncome": "Monthly Income",
    "NumberOfOpenCreditLinesAndLoans": "Number of Open Credit Lines and Loans",
    "NumberRealEstateLoansOrLines": "Number of Real Estate Loans or Lines",
    "NumberOfTime60-89DaysPastDueNotWorse": "Number of Times 60-89 Days Past Due",
    "age": "Age",
    "bins_age": "Age Range",
    "bins_MonthlyIncome": "Income Range",
    "bins_RevolvingUtilizationOfUnsecuredLines": "Credit Card Usage Range",
    "bins_DebtRatio": "Debt Ratio Range",
}

In [None]:
# Per-column histogram settings, keyed by column name:
#   "title"      - plot title
#   "bins"       - nominal number of bins
#   "y_label"    - y-axis text
#   "bin_labels" - (derived "bins_*" columns only) placeholder list of labels
HISTO_PARAMS = {
    "SeriousDlqin2yrs": {
        "title": "Individuals With and Without Serious Delinquency in Past 2 Years",
        "bins": 2, "y_label": "Number of Individuals"},
    "NumberOfTimes90DaysLate": {
        "title": "Individuals by Number of Times Person was 90+ Days Late",
        "bins": 3, "y_label": "Number of Individuals"},
    "RevolvingUtilizationOfUnsecuredLines": {
        "title": "Individuals by Credit Card Usage",
        "bins": 10, "y_label": "Number of Individuals"},
    "NumberOfTime30-59DaysPastDueNotWorse": {
        "title": "Individuals by Number of Times 30-59 Days Past Due",
        "bins": 50, "y_label": "Number of Individuals"},
    "DebtRatio": {
        "title": "Individuals by Debt Ratio",
        "bins": 20, "y_label": "Number of Individuals"},
    "NumberOfDependents": {
        "title": "Individuals by Number of Dependents",
        "bins": 20, "y_label": "Number of Individuals"},
    "MonthlyIncome": {
        "title": "Individuals by Monthly Income",
        "bins": 30, "y_label": "Number of Individuals"},
    "NumberOfOpenCreditLinesAndLoans": {
        "title": "Individuals by Number of Open Credit Lines and Loans",
        "bins": 20, "y_label": "Number of Individuals"},
    "NumberRealEstateLoansOrLines": {
        "title": "Individuals by Number of Real Estate Loans or Lines",
        "bins": 20, "y_label": "Number of Individuals"},
    "NumberOfTime60-89DaysPastDueNotWorse": {
        "title": "Individuals by Number of Times 60-89 Days Past Due",
        "bins": 4, "y_label": "Number of Individuals"},
    "age": {
        "title": "Individuals by Age",
        "bins": 11, "y_label": "Number of Individuals"},
    "bins_age": {
        "title": "Individuals by Age Range",
        "bins": 11, "bin_labels": [], "y_label": "Number of Individuals"},
    "bins_MonthlyIncome": {
        "title": "Individuals by Monthly Income Range",
        "bins": 11, "bin_labels": [], "y_label": "Number of Individuals"},
    "bins_RevolvingUtilizationOfUnsecuredLines": {
        "title": "Individuals by Credit Card Usage Range",
        "bins": 11, "bin_labels": [], "y_label": "Number of Individuals"},
    "bins_DebtRatio": {
        "title": "Individuals by Debt Ratio Range",
        "bins": 11, "bin_labels": [], "y_label": "Number of Individuals"},
}

In [None]:
# Classifiers evaluated by model_loop(), keyed by the short name that becomes
# the row label of the results table.
MODELS = {
    'RF': RandomForestClassifier(n_estimators=50, n_jobs=-1),
    # An L1 penalty needs a solver that supports it; the solver is pinned
    # explicitly because the library default ('lbfgs' in current scikit-learn)
    # rejects penalty='l1' when fit() is called.
    'LR': LogisticRegression(penalty='l1', C=1e5, solver='liblinear'),
    'SVM': svm.SVC(kernel='linear', probability=True, random_state=0),
    'GB': GradientBoostingClassifier(learning_rate=0.05, subsample=0.5, max_depth=6, n_estimators=10),
    'DT': DecisionTreeClassifier(),
    'KNN': KNeighborsClassifier(n_neighbors=3),
    'BAG': BaggingClassifier(),
}

In [None]:
def read_in(path, source_type, has_index=False):
    '''
    Read a data file into a pandas DataFrame.

    path: location of the data file
    source_type: "csv", "excel", "json" or "stata" (case-insensitive)
    has_index: when True, the first column of the file becomes the index
        (ignored for json)

    Returns pandas DataFrame
    Raises ValueError when source_type is not one of the supported formats.
    '''
    kind = source_type.lower()
    index_col_number = 0 if has_index else None

    if kind == "csv":
        return pd.read_csv(path, index_col=index_col_number, header=0)
    if kind == "excel":
        return pd.read_excel(path, index_col=index_col_number)
    if kind == "json":
        return pd.read_json(path)
    if kind == "stata":
        # read_stata's index_col takes a column *name*, not a position, so the
        # first column is promoted to the index after loading instead.
        frame = pd.read_stata(path)
        if has_index:
            frame = frame.set_index(frame.columns[0])
        return frame

    raise ValueError(
        "Unsupported source_type %r; expected 'csv', 'excel', 'json' or 'stata'"
        % (source_type,)
    )

In [None]:
def get_description(df, save_to_file_name):
    '''
    Write summary statistics for df to a CSV file.

    The output has one row per column described by DataFrame.describe()
    (count, mean, std, min, quartiles, max) plus a "mode" and a "median"
    column.

    df: pandas DataFrame to summarise
    save_to_file_name: path of the CSV file to write

    Returns None
    '''
    summary = df.describe().T

    # DataFrame.mode() returns one row per tied mode. Only the first row is
    # kept so the output always has a single "mode" column rather than extra
    # columns named 1, 2, ... whenever some column is multimodal.
    modes = df.mode().head(1).T.rename(columns={0: "mode"})

    # numeric_only matches describe(), which summarises numeric columns only,
    # and avoids a TypeError on non-numeric columns.
    medians = df.median(numeric_only=True).to_frame("median")

    stats_final = summary.join(modes, how="left").join(medians, how="left")
    stats_final.to_csv(save_to_file_name)

In [None]:
def histo_plot(df, x_label, y_label, title, bins, bin_locs, limits, axis):
    '''
    Draw a histogram on the current figure and save it as
    "results/<x_label>.png".

    df: the values to plot (passed straight to plt.hist)
    x_label, y_label, title: axis labels and plot title; x_label is also used
        as the file name of the saved image
    bins: bin specification passed to plt.hist
    bin_locs: x tick locations
    limits: x-axis limits, applied after `axis` and therefore overriding the
        x range given there
    axis: [xmin, xmax, ymin, ymax] passed to plt.axis
    '''
    import os

    plt.clf()
    # rcParams only sizes figures created from now on; the figure being drawn
    # already exists, so it has to be resized explicitly as well.
    plt.rcParams["figure.figsize"] = [18.0, 8.0]
    plt.gcf().set_size_inches(18.0, 8.0)

    plt.hist(df, bins)
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.title(title)
    plt.axis(axis)
    plt.grid(True)
    plt.xticks(bin_locs)
    plt.xlim(limits)

    # savefig does not create missing directories.
    os.makedirs("results", exist_ok=True)
    plt.savefig("results/" + x_label + ".png")

In [None]:
def build_model(current_model, train_data, train_results):
    '''
    Fit current_model on the training data and hand the same object back.

    The estimator is fitted in place; no copy is made.
    '''
    current_model.fit(train_data, train_results)
    return current_model

In [None]:
def test_model(model, test_data):
    predicted_results = model.predict(test_data)
    return predicted_results

In [None]:
def eval_model(predicted_results, actual_results):
    '''
    Score a set of predictions against the true labels.

    predicted_results: labels predicted by the model
    actual_results: true labels
    (Note the argument order: predictions first, truth second.)

    Returns a tuple (accuracy, precision, recall, f1)
    '''
    acc = accuracy_score(actual_results, predicted_results)
    prec = precision_score(actual_results, predicted_results)
    rec = recall_score(actual_results, predicted_results)
    f1 = f1_score(actual_results, predicted_results)
    return acc, prec, rec, f1

In [None]:
def splits(df, train_portion, seed=None):
    '''
    Randomly split the rows of df into a training and a testing DataFrame.

    df: pandas DataFrame to split
    train_portion: fraction of rows assigned to the training set; the cut-off
        is int(train_portion * len(df)), i.e. rounded down
    seed: optional seed for a reproducible split. When None (the default) the
        shared `random` module state is used, so random.seed() set by the
        caller is still honoured.

    Returns (train_df, test_df)
    '''
    n_rows = len(df)
    rng = random if seed is None else random.Random(seed)

    # A full-length sample without replacement is a random permutation of the
    # row positions.
    shuffled_positions = rng.sample(range(n_rows), n_rows)
    divide_point = int(train_portion * n_rows)

    train_df = df.iloc[shuffled_positions[:divide_point], :]
    test_df = df.iloc[shuffled_positions[divide_point:], :]
    return train_df, test_df

In [None]:
def impute(data, column, imp_data, method):
    '''
    Fill the missing values of data[column] with a statistic of imp_data.
    (adapted from Dani)

    data: DataFrame to modify in place
    column: name of the column whose missing values are filled
    imp_data: Series the fill value is computed from (it may come from a
        different DataFrame than `data`)
    method: 'median', 'mode', or anything else for the mean

    Returns the fill value that was used. For 'mode' this is the first mode
    cast to int.
    '''
    if method == 'median':
        fill_value = imp_data.median()
    elif method == 'mode':
        fill_value = int(imp_data.mode()[0])
    else:
        fill_value = imp_data.mean()

    # Assign the filled column back rather than calling
    # data[column].fillna(..., inplace=True): that form works on a temporary
    # Series and leaves `data` unchanged under pandas Copy-on-Write.
    data[column] = data[column].fillna(fill_value)
    return fill_value

In [None]:
def model_loop(train_data, test_data, train_results, test_results, model_dict=MODELS):
    '''
    Fit and evaluate every model in model_dict.

    train_data, train_results: training features and labels
    test_data, test_results: testing features and labels
    model_dict: mapping of short name -> estimator. The estimators are fitted
        in place, so the objects in the mapping are modified.

    Returns a DataFrame with one row per model (indexed by its short name) and
    the columns accuracy, precision, recall, F1 and AUC. "AUC" is the area
    under the precision-recall curve, not the ROC curve.
    '''
    results_header = ["accuracy", "precision", "recall", "F1", "AUC"]
    results_index = []
    results = []
    for abbr, model in model_dict.items():
        print(abbr)  # progress indicator
        fitted = build_model(model, train_data, train_results)
        predicted_results = test_model(fitted, test_data)
        acc, prec, rec, f1 = eval_model(predicted_results, test_results)

        # Continuous scores for the precision-recall curve: positive-class
        # probability when available, otherwise the decision-function value.
        if hasattr(fitted, 'predict_proba'):
            scores = fitted.predict_proba(test_data)[:, 1]
        else:
            scores = fitted.decision_function(test_data)

        precision_curve, recall_curve, _ = precision_recall_curve(test_results, scores)
        # The final curve point is left out of the area, as in the original
        # implementation.
        pr_auc = auc(recall_curve[:-1], precision_curve[:-1])

        results_index.append(abbr)
        results.append([acc, prec, rec, f1, pr_auc])
    return pd.DataFrame(results, index=results_index, columns=results_header)

In [None]:
def generate_features(train, test):
    '''
    Add binned ("bins_*") versions of four continuous columns to both frames.
    Adapted from Dani

    The same bin edges are applied to train and test, and both DataFrames are
    modified in place by create_bins().

    train, test: DataFrames containing the columns 'age', 'MonthlyIncome',
        'RevolvingUtilizationOfUnsecuredLines' and 'DebtRatio'

    Returns a tuple with the names of the four new columns, in the order
    age, income, revolving utilization, debt ratio.
    '''
    # NOTE(review): the last edge of these two lists looks like a hard-coded
    # dataset maximum; larger values would fall outside every bin - confirm.
    revolve_bins = [0, 0.029867442, 0.154180737, 0.559046248, 50709]
    debt_bins = [0, 0.175073832, 0.366507841, 0.868253773, 329664]
    # The top income bucket is open-ended. Capping it at the training maximum
    # gave NaN bins to any test income above that maximum, and produced
    # non-increasing edges whenever the training maximum was 9000 or less.
    income_bins = list(range(0, 10000, 1000)) + [float("inf")]
    age_bins = [0] + list(range(20, 80, 5)) + [120]

    bin_spec = [
        ('age', age_bins),
        ('MonthlyIncome', income_bins),
        ("RevolvingUtilizationOfUnsecuredLines", revolve_bins),
        ("DebtRatio", debt_bins),
    ]

    new_columns = []
    for column, edges in bin_spec:
        create_bins(train, column, edges)
        new_columns.append(create_bins(test, column, edges))
    return tuple(new_columns)

In [None]:
def create_bins(data, column, bins, verbose=False):
    '''
    Add a binned version of data[column] to data as 'bins_<column>'.
    Adapted from Dani

    data: DataFrame to modify in place
    column: name of the column to bin
    bins: bin edges passed to pd.cut; the lowest edge is inclusive
    verbose: when True, print how many rows fell into each bin

    The new column holds integer bin codes (labels=False), not interval
    labels.

    Returns the name of the new column
    '''
    new_col = 'bins_' + str(column)

    data[new_col] = pd.cut(data[column], bins=bins, include_lowest=True, labels=False)

    if verbose:
        # Series.value_counts replaces the deprecated top-level pd.value_counts
        print(data[new_col].value_counts())

    return new_col

In [None]:
def _plot_histograms(df):
    '''Save one histogram per column of df through histo_plot().'''
    for attribute in df.columns:
        column_data = df[attribute].dropna()
        if column_data.empty:
            # No values left to plot (min/max and the bin width are undefined).
            continue

        x_label = ATTRIBUTE_DICT[attribute]
        y_label = HISTO_PARAMS[attribute]["y_label"]
        title = HISTO_PARAMS[attribute]["title"]

        val_counts = column_data.value_counts()
        col_max = column_data.max()
        col_min = column_data.min()
        # y range: 20% headroom above the count of the most frequent value
        axis = [col_min, col_max, 0, val_counts.iloc[0]*1.2]

        # One bin per distinct value, capped at 20
        bin_count = min(column_data.nunique(), 20)
        inc = (col_max-col_min)/bin_count
        # Edges are shifted down by half a bin width so that every tick in
        # bin_locs sits in the middle of a bar; hence one extra edge.
        bins = np.array([x*inc + col_min for x in range(bin_count + 2)]) - inc/2
        bin_locs = np.array([x*inc + col_min for x in range(bin_count + 1)])
        limits = [col_min-inc, col_max+inc]
        histo_plot(column_data, x_label, y_label, title, bins, bin_locs, limits, axis)


def go(read_file, write_desc, write_results, train_portion, predicted_col):
    '''
    Run the whole pipeline: load, describe, split, impute, bin, plot, model.

    read_file: path of the input CSV; its first column is used as the index
    write_desc: path of the CSV the summary statistics are written to
    write_results: path of the CSV the per-model metrics are written to
    train_portion: fraction of rows used for training
    predicted_col: name of the column the models predict

    Besides write_desc and write_results, this writes
    results/train_after_impute.csv, results/train_after_alter.csv,
    results/test_after_alter.csv and one histogram PNG per input column.
    '''
    # Read in data to DataFrame
    df = read_in(read_file, "csv", has_index=True)

    # Generate description of data (at the path the caller asked for)
    get_description(df, write_desc)

    # Split data, prepare for use
    train_df, test_df = splits(df, train_portion)
    train_data = train_df.drop(predicted_col, axis=1)
    train_results = train_df[predicted_col]
    test_data = test_df.drop(predicted_col, axis=1)
    test_results = test_df[predicted_col]

    # Impute missing values; the fill values always come from the training set
    impute(train_data, "MonthlyIncome", train_data["MonthlyIncome"], "median")
    impute(test_data, "MonthlyIncome", train_data["MonthlyIncome"], "median")
    impute(train_data, "NumberOfDependents", train_data["NumberOfDependents"], "mode")
    impute(test_data, "NumberOfDependents", train_data["NumberOfDependents"], "mode")
    train_data.to_csv("results/train_after_impute.csv")

    # Replace the continuous columns with their binned versions
    generate_features(train_data, test_data)
    binned_sources = ['MonthlyIncome', 'age', 'DebtRatio', 'RevolvingUtilizationOfUnsecuredLines']
    train_data.drop(binned_sources, axis=1, inplace=True)
    test_data.drop(binned_sources, axis=1, inplace=True)
    print(train_data.columns)
    train_data.to_csv("results/train_after_alter.csv")
    test_data.to_csv("results/test_after_alter.csv")

    # Generate histograms for all columns of the raw data
    _plot_histograms(df)

    # Loop through models
    results = model_loop(train_data, test_data, train_results, test_results)
    results.to_csv(write_results)

In [None]:
# Run the whole pipeline on cs-training.csv: 80% of the rows are used for
# training, "SeriousDlqin2yrs" is the column to predict, and the per-model
# metrics table is written to results/model_results.csv.
go("cs-training.csv", "results/description2.csv", "results/model_results.csv", .8, "SeriousDlqin2yrs")