In [82]:
from __future__ import division
import pandas as pd
from collections import Counter
from itertools import chain, groupby
from sklearn.preprocessing import Imputer
from sklearn.model_selection import train_test_split
from sklearn import linear_model
import numpy as np
import random
import math

In [83]:
class NaiveBayesClassifier:
    """Categorical Naive Bayes classifier estimated from a pandas DataFrame.

    Priors and per-feature conditional probabilities are plain relative
    frequencies computed from ``df``; no smoothing is applied, so a feature
    value never seen together with a label gives that label probability 0.
    """

    def __init__(self, df, output_col_name, features):
        """Store the training data and configuration; nothing is fitted here.

        df              -- training DataFrame holding the features and the label
        output_col_name -- name of the label column in ``df``
        features        -- iterable of column names; the label column may be
                           listed and is skipped wherever features are used
        """
        self.df = df
        self.conditional_probabilites = {}
        self.prior_probabilites = {}
        self.output = output_col_name
        self.features = features

    def calculate_prior_probabilites(self):
        """Fill ``prior_probabilites`` with P(label) for each label in ``self.df``."""
        # Normalise by the classifier's own training frame (not a global one),
        # so the priors are correct when training on a subset.
        total_rows = float(len(self.df))
        groups = self.df.groupby(self.output).groups
        for label in groups:
            self.prior_probabilites[label] = len(groups[label]) / total_rows

    def calculate_conditional_probabilites(self):
        """Fill ``conditional_probabilites[feature]`` with P(value | label).

        Each entry is a Series indexed by (label, value).
        """
        grouped = self.df.groupby(self.output)
        for feature in self.features:
            if feature == self.output:
                continue
            per_label = grouped[feature]
            self.conditional_probabilites[feature] = (
                per_label.value_counts() / per_label.count()
            )

    def train(self):
        """Estimate priors and conditional probabilities from ``self.df``."""
        self.calculate_prior_probabilites()
        self.calculate_conditional_probabilites()

    def predict(self, row):
        """Return the label with the highest unnormalised posterior for ``row``.

        ``row`` must support ``row[feature]`` for every non-label feature.
        ``train()`` has to be called first; with no priors ``max`` raises
        ValueError.
        """
        # Start every label's score at its prior.
        posterior_prob = dict(self.prior_probabilites)

        for label in posterior_prob:
            for feature in self.features:
                if feature == self.output:
                    continue
                table = self.conditional_probabilites[feature]
                value = row[feature]
                if label in table and value in table.get(label):
                    posterior_prob[label] *= table.get(label).get(value)
                else:
                    # Unseen (label, value) pair: the label is ruled out.
                    posterior_prob[label] = 0
                    break
        return max(posterior_prob, key=posterior_prob.get)

In [71]:
class LogisticRegression:
    """Binary logistic regression (labels 1 / -1) fitted by batch gradient ascent.

    The model is sigmoid(w . x) with no intercept term; a caller that wants a
    bias must supply a constant column in ``X``.
    """

    def __init__(self, X, output, w, alpha):
        """Store training data and hyper-parameters.

        X      -- 2-D array-like, one row per sample
        output -- labels (1 / -1), either 1-D or a single column of shape (n, 1)
        w      -- initial weights, one per column of ``X``
        alpha  -- learning rate
        """
        self.X = X
        self.output = output
        self.alpha = alpha
        self.w = w

    @staticmethod
    def sigmoid_function(x):
        """Return the logistic function 1 / (1 + e^-x) of ``x``."""
        return 1.0 / (1.0 + np.exp(-x))

    @staticmethod
    def compute_regressor(w, X):
        """Return sigmoid(w . X) for a weight vector and one sample."""
        return LogisticRegression.sigmoid_function(np.dot(w, X))

    def _scaled_gradient(self):
        """Return alpha/n * X^T (y - sigmoid(Xw)) for the current weights."""
        # Flatten so both 1-D labels and an (n, 1) column are accepted.
        targets = np.asarray(self.output, dtype=float).ravel()
        # Labels are 1 / -1; the likelihood gradient needs 1 / 0.
        targets = np.where(targets == -1, 0.0, targets)
        constant = float(self.alpha) / float(len(targets))

        samples = np.asarray(self.X, dtype=float)
        weights = np.asarray(self.w, dtype=float)
        predicted = LogisticRegression.sigmoid_function(np.dot(samples, weights))
        return constant * np.dot(samples.T, targets - predicted)

    def gradient(self, j):
        """Return the learning-rate-scaled gradient for weight ``j`` as a scalar."""
        return self._scaled_gradient()[j]

    def gradient_ascent(self):
        """Return the weights after one simultaneous gradient-ascent step.

        ``self.w`` is not modified; the new weights come back as a list.
        """
        # One pass over the data for all weights instead of one pass per weight.
        step = self._scaled_gradient()
        return [self.w[j] + step[j] for j in range(len(self.w))]

    def train(self, max_iters):
        """Run ``max_iters`` gradient-ascent steps, updating ``self.w``."""
        for _ in range(max_iters):
            self.w = self.gradient_ascent()

    def predict(self, x):
        """Return 1 if sigmoid(w . x) >= 0.5 for sample ``x``, otherwise -1."""
        posterior_prob = LogisticRegression.compute_regressor(self.w, x)
        if posterior_prob >= 0.5:
            return 1
        else:
            return -1

In [84]:
# Load the raw file, name its columns, and make x6 numeric with missing
# entries filled in.
dataPath = './DataSets/breast-cancer-wisconsin.data.txt'
columns = ['id', 'x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8', 'x9', 'output']
# 'output' is kept in the feature list so frames sliced with it still carry
# the label column.
features = ['x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8', 'x9', 'output']
df = pd.read_csv(dataPath, names=columns)
# '?' is treated as the missing-value marker.
df = df.replace('?', np.nan)
df['x6'] = pd.to_numeric(df['x6'])
# Impute x6 only, with its own mean; a NaN in any other column is left
# visible in the check below instead of being filled with x6's mean.
df['x6'] = df['x6'].fillna(df['x6'].mean())
df.isnull().sum()

id        0
x1        0
x2        0
x3        0
x4        0
x5        0
x6        0
x7        0
x8        0
x9        0
output    0
dtype: int64

In [85]:
# Recode the label: raw value 2 becomes 1, everything else becomes -1.
# 1 is accepted as well so that re-running this cell on already-recoded data
# is a no-op instead of turning every label into -1.
df['output'] = np.where(df['output'].isin([2, 1]), 1, -1)

In [86]:
# Hold out 33% of the rows for testing. `features` includes 'output', so the
# X frames carry the label column alongside the predictors.
df_X_train, df_X_test, df_y_train, df_y_test = train_test_split(
    df[features], df['output'], test_size=0.33, random_state=42)
print(len(df_X_train))
print(len(df_y_train))

468
468


In [87]:
# Train Naive Bayes on growing fractions of the training split and print the
# number of misclassified test rows for each fraction.
fractions = [0.01, 0.02, 0.03, 0.125, 0.625, 1]
training_data_size = []
for fraction in fractions:
    # Fixed seed (same value as the train/test split) so the sampled subset,
    # and therefore the printed count, is reproducible across runs.
    df_train = df_X_train.sample(frac=fraction, random_state=42)
    training_data_size.append(len(df_train))
    nbclassifier = NaiveBayesClassifier(df=df_train, output_col_name='output', features=features)
    nbclassifier.train()
    df_X_test['prediction'] = df_X_test.apply(nbclassifier.predict, axis=1)
    accuracy_results = pd.crosstab(df_X_test['prediction'], df_y_test)

    # Count every test row whose predicted label differs from the true label.
    misclassifcation = int((df_X_test['prediction'] != df_y_test).sum())

    print(misclassifcation)

77
77
77
53
10
7


In [81]:
# Train the hand-written LogisticRegression on growing fractions of the
# training split and print the confusion table for each fraction.
fractions = [0.01, 0.02, 0.03, 0.125, 0.625, 1]
training_data_size = []
cols = ['x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8', 'x9']
# No constant x0 column is added, so the model is fitted without an intercept.

for fraction in fractions:
    # Fixed seed (same value as the train/test split) for reproducible subsets.
    df_train = df_X_train.sample(frac=fraction, random_state=42)
    training_data_size.append(len(df_train))
    w = [0] * len(cols)

    # `.values` replaces the deprecated `as_matrix`; the labels are passed as
    # an (n, 1) column, the same shape `as_matrix(columns=['output'])` produced.
    lr = LogisticRegression(df_train[cols].values, df_train[['output']].values, w, 0.1)
    lr.train(100)
    df_X_test['lprediction'] = df_X_test[cols].apply(lambda row: lr.predict(row.values), axis=1)
    accuracy_results = pd.crosstab(df_X_test['lprediction'], df_y_test)
    print(accuracy_results)

output       -1    1
lprediction         
-1           47   11
 1           30  143
output       -1    1
lprediction         
1            77  154
output       -1    1
lprediction         
-1           35   19
 1           42  135
output       -1    1
lprediction         
-1           64   30
 1           13  124
output       -1    1
lprediction         
-1           62   23
 1           15  131
output       -1    1
lprediction         
-1           60   24
 1           17  130


In [78]:
# Train scikit-learn's LogisticRegression on growing fractions of the training
# split and print the confusion table for each fraction.
fractions = [0.01, 0.02, 0.03, 0.125, 0.625, 1]
training_data_size = []
cols = ['x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8', 'x9']

for fraction in fractions:
    # Fixed seed (same value as the train/test split) for reproducible subsets.
    # NOTE(review): a very small sample may contain a single class, which the
    # estimator is expected to reject - confirm the smallest fraction fits.
    df_train = df_X_train.sample(frac=fraction, random_state=42)
    training_data_size.append(len(df_train))
    lr = linear_model.LogisticRegression()
    lr.fit(df_train[cols], df_train['output'])
    df_X_test['lprediction'] = lr.predict(df_X_test[cols])

    accuracy_results = pd.crosstab(df_X_test['lprediction'], df_y_test)
    print(accuracy_results)

output       -1    1
lprediction         
-1           24    4
 1           53  150
output       -1    1
lprediction         
-1           38   11
 1           39  143
output       -1    1
lprediction         
-1           37    2
 1           40  152
output       -1    1
lprediction         
-1           61    7
 1           16  147
output       -1    1
lprediction         
-1           69    3
 1            8  151
output       -1    1
lprediction         
-1           71    2
 1            6  152
