# 2. (20 points) 
A bank is implementing a system to identify potential customers who have higher probablity of availing loans to increase its profit.  
**Implement Naive Bayes classifier** on this dataset to help bank achieve its goal.  
Report your observations and accuracy of the model.
Data is available at http://preon.iiit.ac.in/~sanjoy_chowdhury/LoanDataset.zip

In [113]:
import pandas as pd
import numpy as np
import seaborn as sns

from utils import train_test_split

In [114]:
df = pd.read_csv("../data/loan/data.csv", header=None, sep=',')

# add column names header
df.columns = ["ID", "age", "experience", "income", "zipcode", "family size", 
              "avg spends per month","education level", "mortgage value of house", "label",
              "securities account", "CD account", "netbanking", "credit card"]

df.drop(["ID", "zipcode"], axis=1, inplace=True)

cols = df.columns.tolist()
cols.remove("label")
df = df[cols +["label"]]

df.head()

Unnamed: 0,age,experience,income,family size,avg spends per month,education level,mortgage value of house,securities account,CD account,netbanking,credit card,label
0,25,1.1,49,4,1.6,1.2,0,1.3,0.2,0.3,0.4,0.1
1,31,5.0,39,4,2.2,2.0,0,0.0,0.0,1.0,1.0,0.0
2,42,18.0,54,1,1.8,1.0,0,0.0,0.0,1.0,0.0,0.0
3,59,35.0,40,4,0.4,1.0,0,0.0,0.0,0.0,0.0,0.0
4,38,14.0,80,2,2.7,1.0,0,0.0,0.0,1.0,0.0,0.0


In [115]:
categorical_features = ["education level", "securities account", "CD account", "netbanking",
                        "credit card"] + ["family size"]
numerical_features = ["age", "experience", "income", "avg spends per month", "mortgage value of house",]

# Data Inspection

In [116]:
def inspect_data(df):
    for col in df.columns:
        unique_vals, counts=np.unique(df[col].values, return_counts=True)
        print(col,len(unique_vals),"min:", min(unique_vals), "max:", max(unique_vals))
        if len(unique_vals)<=10:
            print(unique_vals)
            print(counts)
        print("------------------")

    print("Datframe size:", len(df))
inspect_data(df)

age 45 min: 23 max: 67
------------------
experience 48 min: -3.0 max: 43.0
------------------
income 162 min: 8 max: 224
------------------
family size 4 min: 1 max: 4
[1 2 3 4]
[1317 1173  915 1095]
------------------
avg spends per month 105 min: 0.0 max: 10.0
------------------
education level 4 min: 1.0 max: 3.0
[1.  1.2 2.  3. ]
[1882    1 1262 1355]
------------------
mortgage value of house 338 min: 0 max: 635
------------------
securities account 3 min: 0.0 max: 1.3
[0.  1.  1.3]
[4021  478    1]
------------------
CD account 3 min: 0.0 max: 1.0
[0.  0.2 1. ]
[4226    1  273]
------------------
netbanking 3 min: 0.0 max: 1.0
[0.  0.3 1. ]
[1791    1 2708]
------------------
credit card 3 min: 0.0 max: 1.0
[0.  0.4 1. ]
[3183    1 1316]
------------------
label 3 min: 0.0 max: 1.0
[0.  0.1 1. ]
[4065    1  434]
------------------
Datframe size: 4500


# Clean data

In [117]:
# first we cleanup categorical data
df.loc[df["education level"] == 1.2, "education level"] = df['education level'].value_counts().idxmax()
df.loc[df["label"] == 0.1, "label"] = df['label'].value_counts().idxmax()
df.loc[df["securities account"] == 1.3, "securities account"] = df['securities account'].value_counts().idxmax()
df.loc[df["CD account"] == 0.2, "CD account"] = df['CD account'].value_counts().idxmax()
df.loc[df["netbanking"] == 0.3, "netbanking"] = df['netbanking'].value_counts().idxmax()
df.loc[df["credit card"] == 0.4, "credit card"] = df['credit card'].value_counts().idxmax()


In [118]:
# Fix experience. It can't be negative
df.loc[df.experience<0, "experience"] = df.loc[df.experience>=0, "experience"].mean()

In [129]:
inspect_data(df)

age 45 min: 23 max: 67
------------------
experience 46 min: 0.0 max: 43.0
------------------
income 162 min: 8 max: 224
------------------
family size 4 min: 1 max: 4
[1 2 3 4]
[1317 1173  915 1095]
------------------
avg spends per month 105 min: 0.0 max: 10.0
------------------
education level 3 min: 1.0 max: 3.0
[1. 2. 3.]
[1883 1262 1355]
------------------
mortgage value of house 338 min: 0 max: 635
------------------
securities account 2 min: 0.0 max: 1.0
[0. 1.]
[4022  478]
------------------
CD account 2 min: 0.0 max: 1.0
[0. 1.]
[4227  273]
------------------
netbanking 2 min: 0.0 max: 1.0
[0. 1.]
[1791 2709]
------------------
credit card 2 min: 0.0 max: 1.0
[0. 1.]
[3184 1316]
------------------
label 2 min: 0.0 max: 1.0
[0. 1.]
[4066  434]
------------------
Datframe size: 4500


In [120]:
train_df, validate_df = train_test_split(df, test_size = 0.2, random_state=0)

In [121]:
prob_y_is_1 = len(train_df[train_df.label==1]) / len(train_df)
prob_y_is_0 = len(train_df[train_df.label==0]) / len(train_df)
print(prob_y_is_0)
print(prob_y_is_1)

0.9044444444444445
0.09555555555555556


{
    feature_name:
         {
             feat_val1: 
                {
                    Class1:Prob,
                    Class2:prob
                }
         },
    
    numerical_feature_name:
        {
            mean:
                {
                    class1: Prob,
                    class2: Prob
                },
            std:
                {
                    
                }
        }
}

In [None]:
# prob education level = 1 | Yes


In [122]:
def conditional_probability(feature, class_label):
    """Returns conditional probability P(A|B) = P(A and B) / P(B) = n(A and B) / n(B)"""
    feature_name, feature_value = feature
    class_name, class_value = class_label # for our purposes class_name will always be "label" and value can be 0 or 1
    n_A_and_B = len(df[(df[feature_name]==feature_value) & (df[class_name]==class_value)]) 
    n_B = len(df[df[class_name]==class_value])
    return n_A_and_B / n_B
    
    
    
def probability(df, feature_name, feature_value):
    # P(A) = n(A) / n(Sample space)
    return len(df[df[feature_name]==feature_value]) / len(df)

In [92]:
# df[(df["education level"]==1) & (df["label"]==1)]
len(df[df["label"]==0])

4066

In [103]:
def get_probability(train_df, feature):
    if feature in categorical_features:
        unique_values = np.unique(train_df[feature].values)
        pass
    elif feature in numerical_features:
        pass
    else: # skip it's unneccesary feature
        pass

In [102]:
def classify_example(train_df, example):
    predicted_label = None
    
    for feature in example.columns:
        get_probability(train_df, feature)
        

SyntaxError: unexpected EOF while parsing (<ipython-input-102-28af916ee18f>, line 4)

In [187]:
# prob_yes = probability(df, "label", 1)
# prob_no = probability(df, "label", 0)

probability_matrix = {}

unique_classes = np.unique(df["label"])

for feature_name in df.columns:
    if feature_name in categorical_features:
        if feature_name not in probability_matrix:
            probability_matrix[feature_name] = {}

        unique_values = np.unique(df[feature_name].values)
        for feature_val in unique_values:
            if feature_val not in probability_matrix[feature_name]:
                probability_matrix[feature_name][feature_val] = {}

            for class_value in unique_classes:
                if class_value not in probability_matrix[feature_name][feature_val]:
                    probability_matrix[feature_name][feature_val][class_value] = 0

                probability_matrix[feature_name][feature_val][class_value] = conditional_probability((feature_name, feature_val), ("label", class_value))
    
    elif feature_name in numerical_features:
        if feature_name not in probability_matrix:
            probability_matrix[feature_name] = {"mean":{}, "std":{}}
            # find mean and standard deviation
            for class_value in  unique_classes:
                probability_matrix[feature_name]["mean"][class_value] = df[df["label"]==class_value][feature_name].mean()
            
            for class_value in  unique_classes:
                probability_matrix[feature_name]["std"][class_value] = df[df["label"]==class_value][feature_name].mean()



In [185]:
df[df["label"]==1]["experience"].std()

11.578392755828805

In [188]:
from pprint import pprint
pprint(probability_matrix)

{'CD account': {0.0: {0.0: 0.9638465322183964, 1.0: 0.7096774193548387},
                1.0: {0.0: 0.03615346778160354, 1.0: 0.2903225806451613}},
 'age': {'mean': {0.0: 45.41195277914412, 1.0: 44.97004608294931},
         'std': {0.0: 45.41195277914412, 1.0: 44.97004608294931}},
 'avg spends per month': {'mean': {0.0: 1.7277471716674866,
                                   1.0: 3.914400921658986},
                          'std': {0.0: 1.7277471716674866,
                                  1.0: 3.914400921658986}},
 'credit card': {0.0: {0.0: 0.7075750122970978, 1.0: 0.7073732718894009},
                 1.0: {0.0: 0.2924249877029021, 1.0: 0.2926267281105991}},
 'education level': {1.0: {0.0: 0.44318740777176585, 1.0: 0.18663594470046083},
                     2.0: {0.0: 0.2705361534677816, 1.0: 0.37327188940092165},
                     3.0: {0.0: 0.28627643876045256, 1.0: 0.4400921658986175}},
 'experience': {'mean': {0.0: 20.411061625901528, 1.0: 19.74884792626728},
                

In [191]:
import math
def normpdf(x, mean, sd):
    var = float(sd)**2
    denom = (2*math.pi*var)**.5
    num = math.exp(-(float(x)-float(mean))**2/(2*var))
    return num/denom

In [195]:
example = validate_df.head(1)
class_label=0
proposition_prior_prob=probability(df, "label",class_label)

likelihood = 1
for feature_name in example.columns:
    feature_value = example[feature_name].values[0]
    if feature_name == "label":
        continue # skip
    if feature_name in categorical_features:
        prob = probability_matrix[feature_name][feature_value][class_label]
    elif feature_name in numerical_features:
        mean = probability_matrix[feature_name]["mean"][class_label]
        std = probability_matrix[feature_name]["std"][class_label]
        prob = normpdf(feature_value,mean, std)
        likelihood *= prob
        
likelihood *= proposition_prior_prob
likelihood

6.677897177371906e-12

In [158]:
prob_yes = probability(df, "label", 1)
prob_no = probability(df, "label", 0)
