In [3]:
import numpy as np
import pandas as pd
import math
import os
from pathlib import Path

def features_probability(pan_series, lenth):
    """
    Compute a specific feature probability given that panda series
    :param pan_series:  'A panda series contains all data for that feature'
    :param lenth： lenth of the pandas series
    Output: feature_prob  'A dictionay with key as data value and a probaility corresponding to probability of occurance'
            that repesent 'P(xi|y)' in Naive Bayes
    Example:
    '>>> features_probability(df['loan_type'])
    '>>> '{1: 0.7726327301673637,
         2: 0.148994405648276,
         3: 0.07526288084019156,
         4: 0.0031099833441687666}'
    """
    assert isinstance(pan_series, pd.core.series.Series)
    return dict(pan_series.value_counts() / lenth)


def train_feature_probability(df):
    """
    Compute all features'probability as a list of dictionaries
    :param 'df' training_data dataframe
    Output: List of dictionaries that represens joint distribution of all features 'P(x|y)'
    """
    assert isinstance(df, pd.core.frame.DataFrame)
    train_features_list = [df['loan_type'], df['property_type'], df['loan_purpose'],
                           df['owner_occupancy'],
                           df['preapproval'],
                           df['applicant_ethnicity'], df['applicant_race_1'], df['applicant_sex'],
                           df['loan_amount_000s']]  # ,df['action_taken']]
    df = pd.concat(train_features_list, axis=1)
    df = df.dropna()
    return [features_probability(df.iloc[:, i], len(df)) for i in range(len(df.columns) - 1)], df


def loan_prediction(x, approved_dict, not_approved_dict, df_a, df_na, P_a, P_na):
    """
    Given a loan applicant information as a pd.series. Compute how likely he/she will get the loan
    Input: 'df_a' A dataframe that contians all Approved datasets
            'df_na' A dataframe that contians all Unapproved datasets
    Output: A string that that represents the predicted loan application result
    """
    assert isinstance(x, pd.core.series.Series)
    assert isinstance(approved_dict,list)
    assert isinstance(not_approved_dict, list)
    assert isinstance(df_na,pd.core.frame.DataFrame)
    assert isinstance(df_a,pd.core.frame.DataFrame)
    assert isinstance(P_a, int) or isinstance(P_a, float)
    assert isinstance(P_na, int) or isinstance(P_a, float)
    assert P_a + P_na == 1 and P_a >=0 and P_na >= 0

    # print(P_a,P_na)
    def get_prob_of_loan_from_approved(x):
        """
        Get the probability of loan amount x from approved datasets
        """
        d = np.array(df_a['loan_amount_000s'].astype(int))
        return ((d > math.floor(x) - 5).sum() - (d > math.ceil(x) + 5).sum()) / len(d)

    def get_prob_of_loan_from_unapproved(x):
        """
        Get the probability of loan amount x from un_approved datasets
        """
        d = np.array(df_na['loan_amount_000s'].astype(int))
        return ((d > math.floor(x) - 5).sum() - (d > math.ceil(x) + 5).sum()) / len(d)

    test_features_list = ['loan_type', 'property_type', 'loan_purpose', 'owner_occupancy', 'preapproval',
                          'applicant_ethnicity', 'applicant_race_1', 'applicant_sex', 'loan_amount_000s']

    array_of_x = [x[i] for i in test_features_list]
    p_x_a = 1
    p_x_na = 1
    for count, item in enumerate(array_of_x):
        if count != len(array_of_x) - 2:
            p_x_a *= approved_dict[count][item]
            p_x_na *= not_approved_dict[count][item]
        else:
            p_x_a *= get_prob_of_loan_from_approved(item)
            p_x_na *= get_prob_of_loan_from_unapproved(item)
            break
    p_x_a *= P_a
    p_x_na *= P_na
    # return  "Higher Probability You wil Get the Loan" if p_x_a> p_x_na else "Higher Probability You wil not Get the Loan"
    return 1 if p_x_a > p_x_na else 0


In [5]:
df = pd.read_csv('hmda_2017_ca_all-records_labels.csv')


  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [7]:
# groupby action taken
a = df.groupby('action_taken')
t1 = a.get_group(1)
t2 = a.get_group(2)
t8 = a.get_group(8)
t3 = a.get_group(3)
t7 = a.get_group(7)

# generate approve dataframe and not-approved dataframe
df_a = pd.concat([t1, t2, t8], ignore_index=True)
df_na = pd.concat([t3, t7], ignore_index=True)

In [12]:
#get the posterior proability P(x|y) for each class
approved_dict, df_a1 = train_feature_probability(df_a)
not_approved_dict, df_na1 = train_feature_probability(df_na)

P_a = len(df_a1) / (len(df_a1) + len(df_na1))
P_na = len(df_na1) / (len(df_a1) + len(df_na1))

# training error using only d_na and d_a
df3 = df_a.append(df_na, ignore_index=True)
df_4 = df3.sample(frac=1).reset_index(drop=True)
print('Approved:', approved_dict)
print('Not Approved:', not_approved_dict)

Approved: [{1: 0.7987603860634738, 2: 0.1296041076981744, 3: 0.0689145858733424, 4: 0.0027209203650093725}, {1: 0.9781069569841073, 2: 0.011388550221014357, 3: 0.010504492794878268}, {3: 0.49189793057517445, 1: 0.43952477885136987, 2: 0.06857729057345571}, {1: 0.871660212581974, 2: 0.11817688667844693, 3: 0.010162900739579026}, {3: 0.8326574895132314, 2: 0.14282737248035576, 1: 0.024515138006412907}, {2: 0.6550178046802408, 1: 0.19871634432049498, 3: 0.13046560715839453, 4: 0.015800243840869664}, {5: 0.6307808815867918, 6: 0.15342102295004484, 2: 0.14330646070885722, 3: 0.03896620064773588, 7: 0.01546294854098299, 4: 0.009699925343874706, 1: 0.008362560221712579}, {1: 0.6425636591169094, 2: 0.26857621638460255, 3: 0.07335528259223253, 4: 0.01550484190625554}]
Not Approved: [{1: 0.7919277348506653, 2: 0.1315429406425045, 3: 0.07441273496136559, 4: 0.002116589545464609}, {1: 0.9550601102574888, 2: 0.04105209555649036, 3: 0.0038877941860207673}, {3: 0.6474284322624926, 1: 0.23238204884096

In [14]:
# training error using only d_na and d_a

#combine two dataset and shuffle it
df3 = df_a.append(df_na, ignore_index=True)
df_4 = df3.sample(frac=1).reset_index(drop=True)
#using 2000000 training data to get the error rate
truth = df_4['action_taken'][:200000]
truth = np.array(truth)
truth = truth[:, None]

def foo(x):
    if x == 1 or x == 2 or x == 8:
        return 1
    else:
        return 0

truth = np.apply_along_axis(foo, 1, truth)

result = []
lenth = 200000
for i in range(lenth):
    result.append(loan_prediction(df_4.iloc[i],approved_dict,not_approved_dict,df_a1,df_na1,P_a,P_na))
result = np.array(result)
error = np.logical_xor(result, truth).sum()/lenth
print(error)

0.29
