# House Type Classification Using Naive Bayes 

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time
import math
import calendar
from sklearn.naive_bayes import GaussianNB 

## Goal

Use Naive Bayes to predict the house type of homes in Melbourne and compare the results of our classifier to scikit-learn's Naive Bayes classifier. 

## Load data

In [2]:
# Training split of the Melbourne housing data (read straight from the gist URL).
TRAIN_CSV_URL = 'https://gist.githubusercontent.com/yanyanzheng96/81b236aecee57f6cf65e60afd865d2bb/raw/56ddb53aa90c26ab1bdbfd0b8d8229c8d08ce45a/melb_data_train.csv'
df_melb = pd.read_csv(TRAIN_CSV_URL)

# Name of the column that holds the class label we want to predict.
target_col = 'Type'

## Change date attribute to be numeric 

In [3]:
# function to normalise a 'd/m/yy' or 'd/m/yyyy' date into the string 'd m yy'
# (the conversion to seconds since epoch happens afterwards, via time.strptime/time.mktime)
def standardize_date(d):
    """Return the date `d` as 'day month yy', separated by single spaces.

    `d` is a '/'-separated string whose third field is the year. A two-character
    year is kept as is; otherwise the third and fourth characters of the year are
    used (i.e. the last two digits of a four-digit year).
    """
    parts = d.split('/')
    year = parts[2]
    if len(year) != 2:
        # e.g. '2016' -> '16'
        year = year[2] + year[3]
    return ' '.join((parts[0], parts[1], year))

In [4]:
# Rewrite every raw date as 'd m yy', then parse it into whole seconds since the epoch.
# NOTE: time.mktime interprets the parsed time as local time, so the resulting
# numbers depend on the timezone of the machine running the notebook.
train_date_strings = df_melb['Date'].apply(standardize_date)
df_melb['Date'] = train_date_strings
df_melb['unixtime'] = train_date_strings.apply(
    lambda date_str: int(time.mktime(time.strptime(date_str, "%d %m %y")))
)

# The textual date is no longer needed once the numeric column exists.
df_melb = df_melb.drop(columns="Date")

unixtime_min = df_melb['unixtime'].min()
unixtime_max = df_melb['unixtime'].max()
print("The min unixtime is {:d} and the max unixtime is {:d}".format(unixtime_min, unixtime_max))

The min unixtime is 1454565600 and the max unixtime is 1506142800


## Calculate prior probabilities for each possible type

In [5]:
# Prior of each house type = number of training rows with that type / total training rows.
counts = df_melb['Type'].value_counts()
n_train_rows = len(df_melb)
dict_priors = {
    house_type: (counts[house_type] / n_train_rows)
    for house_type in counts.index
}

# show the priors
dict_priors

{'h': 0.452, 'u': 0.418, 't': 0.13}

## Create a model for the distribution of all the numeric attributes
For each house type and each attribute, the sample mean and sample standard deviation are calculated and stored in the nested dictionary 'dict_nb_model'. 

In [6]:
# dict_nb_model[type][attribute] -> (sample mean, sample standard deviation),
# computed from the training rows of that type only.
dict_nb_model = {}
df2 = df_melb.drop('Type', axis = 1)  # attribute columns only (label removed)

for house_type in dict_priors:
    # select this type's rows once and reuse them for every attribute
    type_rows = df_melb[df_melb['Type'] == house_type]
    dict_nb_model[house_type] = {
        attribute: (type_rows[attribute].mean(), type_rows[attribute].std())
        for attribute in df2.columns
    }

In [7]:
# Show the fitted model: one (mean, sd) pair per attribute, grouped by house type.
display(dict_nb_model)

{'h': {'Rooms': (3.269911504424779, 0.7258264201127756),
  'Price': (1189022.3451327435, 586296.5794417895),
  'Distance': (12.086725663716816, 7.397501132737295),
  'Postcode': (3103.8982300884954, 98.35750345419703),
  'Bathroom': (1.5619469026548674, 0.6720871086493075),
  'Car': (1.7777777777777777, 0.932759177140425),
  'Landsize': (932.9646017699115, 3830.7934157687164),
  'BuildingArea': (156.2433962264151, 54.62662837301434),
  'YearBuilt': (1954.900826446281, 32.461876347154686),
  'unixtime': (1485736247.7876105, 13838422.086267859)},
 'u': {'Rooms': (2.0430622009569377, 0.5908453859944267),
  'Price': (634207.1770334928, 217947.32866736987),
  'Distance': (8.760287081339714, 5.609778714430755),
  'Postcode': (3120.4545454545455, 87.18475679946482),
  'Bathroom': (1.1818181818181819, 0.42228151548662185),
  'Car': (1.1483253588516746, 0.47231993860296956),
  'Landsize': (436.23444976076553, 1394.3403794653254),
  'BuildingArea': (83.85585585585585, 45.959438015166604),
  'Yea

## Write a function that calculates the probability of a Gaussian
Given the mean, SD, and observed point (x), this function returns the probability density at x using the formula $p(x) = \frac{1}{\sigma \sqrt{2 \pi}} e^{-\frac{1}{2}(\frac{x-\mu}{\sigma})^2}$. 

In [8]:
def get_p( mu, sigma, x):
    """Evaluate the Gaussian density with mean `mu` and standard deviation `sigma` at `x`.

    Implements 1 / (sigma * sqrt(2*pi)) * exp(-0.5 * ((x - mu) / sigma) ** 2).
    The result is a density, so it can be larger than 1 for small `sigma`.
    """
    z_score = (x - mu) / sigma
    normaliser = 1 / (sigma * math.sqrt(2*math.pi))
    return normaliser * np.exp((-0.5) * (z_score**2))

## Generate the Naive Bayes classifier function
This function takes in the prior probability dictionary, the dictionary containing all of the Gaussian distribution information for each attribute, and a single observation row of the test dataframe. It returns the target classification. 

In [9]:
def nb_class( dict_priors, dict_nb_model, observation):
    """Classify a single observation with Gaussian Naive Bayes.

    Every class is scored by log(prior) plus the sum, over the usable attributes,
    of the log Gaussian density of the observed value. Scoring in log space
    (instead of multiplying raw densities) keeps the comparison meaningful when
    the densities are so small that their product would underflow to 0.0 for
    every class, in which case the argmax would silently fall back to the first
    class.

    An attribute is used only if
      * it is not the 'Type' label,
      * the observation has a non-NaN value for it, and
      * every class has a finite mean and a finite, positive SD for it.
    The same attribute set is used for all classes so the scores stay comparable.

    Parameters
    ----------
    dict_priors : dict
        Class label -> prior probability.
    dict_nb_model : dict
        Class label -> {attribute: (mean, sd)}.
    observation : mapping-like (e.g. a pandas Series or dict)
        Indexable by attribute name; holds the values of one row.

    Returns
    -------
    The class label with the highest score. Ties go to the label that comes
    first in `dict_nb_model`.
    """
    labels = list(dict_nb_model.keys())
    columns = dict_nb_model[labels[0]].keys()

    def _params_usable(col):
        # A NaN/zero SD (e.g. a class with a single or constant value) or a NaN
        # mean would turn the score into NaN/inf, so such attributes are skipped.
        for label in labels:
            mu = dict_nb_model[label][col][0]
            sigma = dict_nb_model[label][col][1]
            if not (math.isfinite(mu) and math.isfinite(sigma) and sigma > 0):
                return False
        return True

    usable_cols = [
        col for col in columns
        if col != 'Type'
        and not math.isnan(observation[col])
        and _params_usable(col)
    ]

    log_norm = 0.5 * math.log(2 * math.pi)
    scores = []
    for label in labels:
        prior = dict_priors[label]
        # a non-positive prior can never win
        score = math.log(prior) if prior > 0 else -math.inf
        for col in usable_cols:
            mu = dict_nb_model[label][col][0]
            sigma = dict_nb_model[label][col][1]
            z = (observation[col] - mu) / sigma
            # log of 1/(sigma*sqrt(2*pi)) * exp(-z^2/2)
            score += -math.log(sigma) - log_norm - 0.5 * z * z
        scores.append(score)

    return labels[int(np.argmax(scores))]

## Calculate the accuracy of our classifier 

In [10]:
# Test split of the Melbourne housing data (read straight from the gist URL).
TEST_CSV_URL = 'https://gist.githubusercontent.com/yanyanzheng96/c3d53303cebbd986b166591d19254bac/raw/94eb3b2d500d5f7bbc0441a8419cd855349d5d8e/melb_data_test.csv'
df_test = pd.read_csv(TEST_CSV_URL)

# Same date handling as for the training data: 'd m yy' string -> whole seconds since the epoch.
# NOTE: time.mktime interprets the parsed time as local time (timezone dependent).
test_date_strings = df_test['Date'].apply(standardize_date)
df_test['Date'] = test_date_strings
df_test['unixtime'] = test_date_strings.apply(
    lambda date_str: int(time.mktime(time.strptime(date_str, "%d %m %y")))
)
df_test = df_test.drop(columns="Date")

In [11]:
# Classify every test row with our own Naive Bayes implementation (one prediction per row, in row order).
predictions = [
    nb_class(dict_priors, dict_nb_model, test_row)
    for _, test_row in df_test.iterrows()
]

In [13]:
# Compare predictions with the true labels position by position.
# The labels are pulled out as a plain list because df_test['Type'][i] is a
# label-based lookup: it only matches the i-th prediction while df_test keeps
# the default 0..n-1 index.
actual_types = df_test['Type'].tolist()
correct = sum(1 for actual, pred in zip(actual_types, predictions) if actual == pred)

acc = correct / len(predictions)
print('Accuracy is {:.2f}%'.format(acc*100))

Accuracy is 57.00%


## Compare our classifier to scikit-learn's classifier

In [14]:
# Imputation training
dict_imputation = dict()
for col in df_melb.columns:
    if col != target_col:
        dict_imputation[col] = df_melb[col].mean()
        
# Imputation - apply on the test data
df_melb.fillna(value=dict_imputation, inplace=True)

# Seperate the attributes from the target_col
df_X = df_melb.drop('Type', axis = 1)
s_y = df_melb['Type']

In [15]:
# Fill the gaps in the test data with the means learned from the training data.
df_test.fillna(value=dict_imputation, inplace=True)

# Train scikit-learn's Gaussian Naive Bayes on the imputed training attributes.
gnb = GaussianNB()
fit = gnb.fit(df_X, s_y)

# True labels and the model's predictions for the test rows.
y_actual = df_test['Type']
y_pred = fit.predict(df_test.drop('Type', axis = 1))

In [16]:
# Compare scikit-learn's predictions with the true labels position by position.
# y_actual is a pandas Series, so y_actual[i] is a label-based lookup that only
# matches the i-th prediction under the default 0..n-1 index; a plain list avoids that.
actual_types = y_actual.tolist()
correct = sum(1 for pred, actual in zip(y_pred, actual_types) if pred == actual)

acc = correct / len(y_actual)
print('Accuracy is {:.2f}%'.format(acc*100))

Accuracy is 37.00%


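- Based on the fact that the accuracy decreased by 20 percentage points (from 57% to 37%) when using imputation, I would say that the imputation process hurt the classifier. This could be because the mean values used to fill in missing values might not accurately represent that observation, causing it to be misclassified. Note, however, that the two runs differ in more than imputation: scikit-learn's `GaussianNB` also adds a variance-smoothing term (`var_smoothing`, a fraction of the largest attribute variance) to every attribute's variance, so with unscaled attributes such as `unixtime` and `Price` the drop cannot be attributed to imputation alone. 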