In [202]:
"""We will follow the steps below to execute this tutorial:

* Prepare the Data: Load the data from a file and split it into training and test datasets. One can also load directly from the online database but this should be covered in a different tutorial.
* Split the Data: We split our data, using a given ratio, to create our training set and test set.
* Classify our Data:  We divide the training data into classes (using the given ones) such that we can associate probabilities with each class.
* Build the Model: We use the Gaussian distribution function (equation above) to create our probability calculator. This require to calculate the mean $\mu$ and the standard deviation $\sigma$ of each set of attributes and then we associate that with the given classes. 
* Predict: After training our model we use the outcome (the summary/result of the Gaussian model evaluation for each class), we generate predictions given the test the model outcome.
* Measure Accuracy: We check how accurate is our model
* Compare Accuracy: We compare the prediction made by our model to that made by 'scikit-learn' model."""

import os, csv, random, math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb

from pandas import Series
import matplotlib.pyplot as plt


class Data_Manipulate(object):
    def __init__(self, _folder, _file):
        self._folder = _folder
        self._file = _file
        #self._dataset = _dataset
        
    def data_loader(self,):
        _dir = os.path.join(os.path.dirname(os.path.realpath("__file__")), self._folder)
        file_path = _dir+'/'+self._file
        data = csv.reader(open(file_path, "rb"))
        dataset = list(data)
        for line in range(len(dataset)):
            dataset[line] = [float(x) for x in dataset[line]] 
        dataset = dataset[:-1]
        print "Data loaded from '{0}' with size: {1}".format(self._file, len(dataset))
    
        return self._file, dataset
    
    def data_splitter(self, _dataset, split_ratio):
        train_size = int(len(_dataset)*split_ratio)
        train_set = []
        test_set = list(_dataset)
        while len(train_set) < train_size:
            index = random.randrange(len(test_set))
            train_set.append(test_set.pop(index))
        print "Training set size: {0}, Test set size: {1}".format(abs(len(train_set)), len(test_set))
        
        return [train_set, test_set]
    
class Data_Classify(object):
    
    """Classifications: a simple scenario that we take each vector from our dataset (i.e. row by row) and 
    and test it against our argument of classification: the vectors that contain large number of 
    elements that are a like will be put in same class, and so on"""
    
    def __init__(self, _dataset):
        self._dataset = _dataset
        
    def data_class(self, train_set):
        _class = {}
        for x in range(len(self._dataset)):
            _row = self._dataset[x]
            if (_row[-1] not in _class): 
                #[-1] means takes the last element and classify accordingly
                # the reason of that because our 'class:0,1' is at the end of the row
                _class[_row[-1]] = []
            _class[_row[-1]].append(_row)
            
        print "Classes found in the dataset: {0}".format(_class.keys())
        for i in range(len(_class.keys())):
            print "Data has a class key of {0} with count of {1}".format(i, 
                [item[-1] for item in train_set].count(_class.keys()[i]))
       
        return _class

class Data_Summary(object):
    
    """- Here are our estimation tools, the mean and the standard deviation 
    - We put the training data, after measuring the mean and standard deviation, 
    into summary class, such that we will have [(mean_1, stdev_1), ... (mean_n, stdev_n)]
    - Separat the training dataset into instances grouped by class, then calculate the 
    summaries (which decrib the mean and stdev, from above fucntion) for each attribute 
    this should give {class: (mean, stdev)} such that: 
    {0: [(m1, st1), (m2, st2), ...], 1: [(m1, st1), (m2, st2), ...]}"""
    
    def __init__(self, _dataset):
        #Data_Classify.__init__(self, _dataset)
        self._dataset = _dataset
        
    #@staticmethod
    def mean(self, enteries):
        enteries = enteries#[:-1]
        mean = sum(enteries)/float(len(enteries))
        #print mean
    
        return mean

    #@staticmethod
    def stdev(self, enteries):
        enteries = enteries#[:-1]
        _mean = self.mean(enteries)
        variance = sum([pow(x-_mean,2) for x in enteries])/float(len(enteries)-1)
        _stdev = math.sqrt(variance)
    
        return _stdev
    #@classmethod
    def data_summary(self,):
        summary = [(self.mean(attribute), self.stdev(attribute)) for attribute in zip(*self._dataset)]
        del summary[-1]

#         print '-*'*25, '\nSummary sample:'
#         for i in range(len(summary[:3])): 
#             print('Attributes:{0}, mean = {1}, stdev = {2}').format(
#                 self._dataset[i], 
#                 self.mean(self._dataset[i]), 
#                 self.stdev(self._dataset[i]))
#         print '-*'*25
        
        return summary
    
    def sort_data(self, dataset):
        init = Data_Classify(dataset)
        separated = init.data_class(dataset)
        classes = {}
        for classValue, instances in separated.iteritems():
            classes[classValue] = self.data_summary()
            
        print '-*'*25, '\nSample of summary:'
        for i in range(len(dataset[:3])): 
            print('Attributes:{0}, mean = {1}, stdev = {2}').format(
                dataset[i], 
                self.mean(dataset[i]), 
                self.stdev(dataset[i]))

        #print('\nData sorted by class value: {0}').format(classes)
        print '-*'*25
        
        return classes

class Probability_Calculator(object):
    """
    All that being given, we are now ready to carry our predictions using the training set. 
    The process is now simple, we just need to allow for this scenario: calculate the probability 
    that a given set of attributes will belong to a specific class, then we mark the class with 
    the highest probability scor as our prediction. In steps:
        Calculate Gaussian Probability Density Function per class,
        Calculate all the probabilities for all classes,
        Predict,
        Measure the model accuracy.
        """
#     def __init__(self, _trainSet, _testSet):
#         self._trainSet = _trainSet
#         self._testSet = _testSet
        
    def gaussian_estimator(self, x, mean, stdev):
        base = math.exp(-(math.pow(x-mean,2)/(2*math.pow(stdev,2))))
        estimator = (1.0 / (math.sqrt(2*math.pi) * stdev)) * base
    
        return estimator
    
    def class_probability(self, train, test):
        probabilities = {}
        for target, vector in train.iteritems():
            probabilities[target] = 1 # we need to have initial value here.
            for i in range(len(vector)):
                mean, stdev = vector[i]
                x = test[i]
            probabilities[target] *= self.gaussian_estimator(x, mean, stdev)
            
        return probabilities
    
    def predict(self, train_set, test_set):
        predictions = []
        for i in range(len(test_set)):
            probabilities = self.class_probability(train_set, test_set[i])
            _class, _prob = None, -1
            for Class, probability in probabilities.iteritems():
                if _class is None or probability > _prob:
                    _prob = probability
                    _class = Class
            predictions.append(_class)
            
        print predictions
        return predictions
    
#     def class_probability(self, trainSet, testSet):
#         probabilities = {}
#         for target, vector in trainSet.iteritems():
#             probabilities[target] = 1 # we need to have initial value here.
#             for i in range(len(vector)):
#                 mean, stdev = vector[i]
#                 x = self._testSet[i]
#                 probabilities[target] *= self.gaussian_estimator(x, mean, stdev)
         
#         return probabilities
    
#     def predict(self, trainSet, testSet):
#         predictions = []
#         for i in range(len(testSet)):
#             print testSet[i]
#             probabilities = self.class_probability(trainSet, testSet[i])
#             _class, _prob = None, -1
#             for Class, probability in probabilities.iteritems():
#                 if _class is None or probability > _prob:
#                     _prob = probability
#                     _class = Class
#             predictions.append(_class)
            
#         print predictions
#         return predictions
    
        
# self test:---
if __name__ == '__main__':
    file_ = 'pima-indians-diabetes.csv'
    folder_ = 'data'
    data_manipulate = Data_Manipulate(folder_, file_)
    _file, _dataset = data_manipulate.data_loader()
    split_ratio = 0.69
    train_set, test_set = data_manipulate.data_splitter(_dataset, split_ratio)
    data_classify = Data_Classify(_dataset)
    data_classify.data_class(train_set)
    
    summary = Data_Summary(train_set)  
    summary.data_summary()
    sorted_data = summary.sort_data(_dataset)
    print sorted_data
    test= [[1.0, 85.0, 66.0, 29.0, 0.0, 26.6, 0.351, 31.0,'?'],
            [6.0, 148.0, 72.0, 35.0, 0.0, 33.6, 0.627, 50.0, '?'], 
        [5.0, 116.0, 74.0, 0.0, 0.0, 25.6, 0.201, 30.0, '?'], 
        [3.0, 78.0, 50.0, 32.0, 88.0, 31.0, 0.248, 26.0, '?']]
    trained = {0.0: [(3.1729106628242074, 2.922716172887118), (108.71469740634005, 25.20303421516836), (67.54755043227665, 18.547234296564014), (19.80979827089337, 14.796055431011643), (69.28530259365995, 98.47869924672679), (30.52247838616713, 7.398200192897232), (0.4184351585014407, 0.2815707967066099), (30.746397694524497, 11.60646657969856)], 1.0: [(4.791208791208791, 3.665627963256114), (139.33516483516485, 32.8364905279732), (70.47802197802197, 21.672271676077802), (23.13186813186813, 17.853429521021233), (96.63736263736264, 132.61459270582085), (35.89505494505495, 6.814076083365414), (0.5798076923076926, 0.37589898335371674), (36.043956043956044, 10.462151821080543)]}
    probabilities = Probability_Calculator()
        #probability = probabilities.class_probability()
    probabilities.predict(sorted_data, test_set)
        
        #print("Attribute's probability: {0}" ).format(probability)
    


Data loaded from 'pima-indians-diabetes.csv' with size: 768
Training set size: 529, Test set size: 239
Classes found in the dataset: [0.0, 1.0]
Data has a class key of 0 with count of 337
Data has a class key of 1 with count of 192
Classes found in the dataset: [0.0, 1.0]
Data has a class key of 0 with count of 500
Data has a class key of 1 with count of 268
-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-* 
Sample of summary:
Attributes:[6.0, 148.0, 72.0, 35.0, 0.0, 33.6, 0.627, 50.0, 1.0], mean = 38.4696666667, stdev = 48.2961124833
Attributes:[1.0, 85.0, 66.0, 29.0, 0.0, 26.6, 0.351, 31.0, 0.0], mean = 26.5501111111, stdev = 31.1197437347
Attributes:[8.0, 183.0, 64.0, 0.0, 0.0, 23.3, 0.672, 32.0, 1.0], mean = 34.6635555556, stdev = 59.585319952
-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*
{0.0: [(3.9262759924385633, 3.3360192106287285), (120.77693761814744, 32.11283606046778), (68.8468809073724, 20.725871209961873), (20.75047258979206, 15.602123397508596), (81.15879017013232,