In [1]:
%matplotlib inline
import math
import pandas as pd
import matplotlib as plt
import numpy as np
import seaborn as sns; sns.set(style='ticks', color_codes=True)

from pandas.plotting import parallel_coordinates
from sklearn import preprocessing
from operator import itemgetter 

In [2]:
#Distance calculation

def bhattacharyyan_distance(v1, v2):
    """Return the sum of ``sqrt(a * b)`` over corresponding elements of v1 and v2.

    Parameters
    ----------
    v1, v2 : sequences of numbers of equal length (list, tuple, NumPy array,
        pandas Series, ...). Elements are paired by position.

    Returns
    -------
    float
        ``sum(sqrt(v1[k] * v2[k]))`` over all positions ``k``, or the sentinel
        ``-1`` when the two inputs differ in length.

    Raises
    ------
    ValueError
        From ``math.sqrt`` when a pairwise product is negative.

    NOTE(review): the quantity computed here is the Bhattacharyya
    *coefficient*; the Bhattacharyya distance is conventionally ``-ln`` of
    that coefficient. The return value is left as-is so existing results do
    not change -- confirm which one callers expect.
    """
    if len(v1) != len(v2):
        return -1

    # Iterate pairwise instead of indexing with v1[i]: integer lookups on a
    # pandas Series are label-based, so v1[i] fails (or is deprecated) when
    # the index is not 0..n-1.
    return sum(math.sqrt(a * b) for a, b in zip(v1, v2))

def euclidean_distance(v1, v2):
    """Return the Euclidean (L2) distance between two equal-length vectors.

    Parameters
    ----------
    v1, v2 : sequences of numbers of equal length (list, tuple, NumPy array,
        pandas Series, ...). Elements are paired by position.

    Returns
    -------
    float
        ``sqrt(sum((v1[k] - v2[k]) ** 2))`` over all positions ``k``, or the
        sentinel ``-1`` when the two inputs differ in length.
    """
    if len(v1) != len(v2):
        return -1

    # Iterate pairwise instead of indexing with v1[i]: integer lookups on a
    # pandas Series are label-based, so v1[i] fails (or is deprecated) when
    # the index is not 0..n-1 -- e.g. a DataFrame row indexed by column names.
    return math.sqrt(sum((a - b) ** 2 for a, b in zip(v1, v2)))

In [3]:
#Normalization

def normalize_max_unknown(base_df):
    """Min-max scale every column except the first into the range [0, 1].

    Parameters
    ----------
    base_df : pandas.DataFrame
        Input frame; it is not modified. Column 0 is passed through untouched
        (presumably an id/label column -- confirm against callers); every
        other column must be numeric.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index and column labels. Each column
        ``i >= 1`` holds ``(x - min) / (max - min)`` computed from that
        column's own minimum and maximum. A column whose values are all equal
        has no range to scale by and is returned as zeros instead of raising
        ``ZeroDivisionError``.
    """
    df = base_df.copy()
    if df.shape[1] < 2:
        # Nothing beyond the pass-through column to scale.
        return df

    rescaled = [df.iloc[:, 0]]
    for position in range(1, df.shape[1]):
        column = df.iloc[:, position]
        lowest = float(column.min())
        highest = float(column.max())
        span = highest - lowest

        if span == 0:
            # Constant column: x - min is already 0 for every row.
            rescaled.append(column - lowest)
        else:
            rescaled.append((column - lowest) / span)

    # Assemble a fresh frame rather than writing floats into the copy through
    # iloc, which would have to change the dtype of integer columns in place.
    result = pd.concat(rescaled, axis=1)
    result.columns = df.columns
    return result

def normalize_time_series(base_df, start_row=1, return_stats=False):
    """Z-score each row of ``base_df`` independently.

    Every processed row is replaced by ``(x - mean) / stdev`` where ``mean``
    and ``stdev`` (sample standard deviation, ``ddof=1``) are taken over that
    row's own values.

    Parameters
    ----------
    base_df : pandas.DataFrame
        Input frame; it is not modified. All values must be numeric.
    start_row : int, optional
        Position of the first row to normalise. The default of 1 keeps the
        historical behaviour of leaving row 0 untouched; pass 0 to normalise
        every row. NOTE(review): skipping row 0 looks like an off-by-one --
        confirm whether any caller relies on it.
    return_stats : bool, optional
        When true, also return the per-row means and standard deviations.

    Returns
    -------
    pandas.DataFrame, or (DataFrame, list, list) when ``return_stats`` is true
        The normalised frame (same index and columns), followed by the list of
        row means and the list of row standard deviations for the processed
        rows, in row order. A row whose standard deviation is exactly 0 is
        returned as zeros rather than NaN.
    """
    df = base_df.copy()
    mean_list = []
    stdev_list = []

    first = max(int(start_row), 0)
    if first < len(df):
        # Work on a float array so integer columns never receive float values
        # through row-wise setitem.
        values = df.values.astype(float)
        for position in range(first, len(values)):
            row = values[position]
            mean = np.mean(row)
            stdev = np.std(row, ddof=1)
            mean_list.append(mean)
            stdev_list.append(stdev)

            centred = row - mean
            # stdev == 0 means every value equals the mean, so centred is
            # already all zeros; dividing would only turn it into NaN.
            values[position] = centred if stdev == 0 else centred / stdev

        df = pd.DataFrame(values, index=df.index, columns=df.columns)

    if return_stats:
        return df, mean_list, stdev_list
    return df

In [4]:
#Data Visualization

def parallel_lines(df, target_category):
    """Draw a parallel-coordinates chart of ``df`` and save it as a PNG.

    A new figure is created with ``figsize=(30, 10)``, ``df`` is plotted with
    ``target_category`` passed to ``parallel_coordinates`` as the class
    column, and the current figure is written to
    ``'<target_category>_parallel.png'`` (a relative path).
    """
    # ``plt`` is the top-level ``matplotlib`` package in this notebook, hence
    # the explicit ``plt.pyplot`` attribute access.
    canvas_size = (30, 10)
    plt.pyplot.figure(figsize=canvas_size)

    parallel_coordinates(df, target_category)

    output_name = '%s_parallel.png' % target_category
    plt.pyplot.savefig(output_name)
    
def scatter_plot(df, target_category):
    """Draw a seaborn pair plot of ``df`` and save it as a PNG.

    ``target_category`` is passed to ``sns.pairplot`` as ``hue``; the
    resulting grid is written to ``'<target_category>_scatter.png'`` (a
    relative path).
    """
    pair_grid = sns.pairplot(df, hue=target_category)

    output_name = '%s_scatter.png' % target_category
    pair_grid.savefig(output_name)

In [5]:
#Naive Bayes

class NaiveBayes(object):
    """Collects per-class statistics for a data set labelled with 0 and 1.

    After ``train`` has run:

    * ``label_ratio[k]`` is the fraction of training labels equal to ``k``
      (``k`` in ``{0, 1}``).
    * ``mean_variance_dict[k]`` is
      ``{'mean': {column: value}, 'variance': {column: value}}`` holding the
      statistics of every feature column over the rows labelled ``k``.
    """

    def __init__(self):
        self.label_ratio = list()
        self.mean_variance_dict = {}

    def train(self, data, label, label_name):
        """Compute class ratios and per-class feature means and variances.

        Parameters
        ----------
        data : pandas.DataFrame
            Training rows, including the label column ``label_name``. Feature
            columns must be numeric.
        label : array-like
            The labels (0 or 1) used to compute the class ratios.
        label_name : column label
            Name of the label column in ``data``. It is used to split the rows
            by class and is excluded from the feature statistics.

        Returns
        -------
        NaiveBayes
            ``self``, so construction and training can be chained.
        """
        unique, counts = np.unique(label, return_counts=True)
        label_count_dict = dict(zip(unique, counts))
        total = len(label)

        # Start from a clean slate so training twice does not append a second
        # set of ratios or keep statistics from the previous data set.
        self.label_ratio = []
        self.mean_variance_dict = {}

        for class_value in (0, 1):
            # A class missing from ``label`` has ratio 0 instead of raising
            # KeyError.
            count = label_count_dict.get(class_value, 0)
            self.label_ratio.append(float(count) / total if total else 0.0)

            class_rows = data.loc[data[label_name] == class_value]
            means = {}
            variances = {}
            for column in class_rows:
                if column == label_name:
                    # The label is constant within a class; it is not a feature.
                    continue
                means[column] = np.mean(class_rows[column])
                variances[column] = np.var(class_rows[column])

            # Store both statistics for every column; assigning one dict per
            # column (as before) kept only the last column's variance.
            self.mean_variance_dict[class_value] = {
                'mean': means,
                'variance': variances,
            }

        return self
        
# Load the sample data and fit the classifier on it ('y' is the label column).
df = pd.read_csv('sample_data.csv')

# Keep a reference to the instance itself: chaining ``NaiveBayes().train(...)``
# would bind ``clf`` to whatever ``train`` returns instead of the classifier.
clf = NaiveBayes()
clf.train(df, df['y'].values, 'y')

In [6]:
# Euclidean distance between the rows labelled 0 and 1 of the sample data.
# Each row is taken whole, so every column of ``df`` contributes to the result.
v1, v2 = df.loc[0], df.loc[1]

euclidean_distance(v1, v2)

0.45128684260947494