In [1]:
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.base import BaseEstimator, ClassifierMixin, TransformerMixin
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
from sklearn.utils.multiclass import unique_labels
from sklearn.metrics import euclidean_distances
from sklearn import datasets

class Bayes(BaseEstimator):
    dataset = [] #dataset
    targets = [] #array contenedor de las etiquetas de cada registro del dataset
    targets_name = [] #array contenedor de todas las posibles etiquetas
    new_tuple = [] #nuevo registro: Contenedor de F1..Fn (atributos)
    aProbF = [] #array contenedor de probabilidades de Fs (columnas del dataset)
    aProbC = [] #array contenedor de probabilidades de C (todas las posibles etiquetas)
    aProbC_new_tuple = [] #array contenecdor de probabilidades de C (respecto al nuevo registro)
    
    #Constructor recibe como parametros:
    def __init__(self, targets_name):
        self.targets_name = targets_name
    
    def fit(self, X, y):
        self.dataset = self.verifyType(X)
        self.targets = y
        self.initArrays()
        for i in range(0, len(self.targets_name)):
            cont = 0.0
            for j in range(0, len(self.targets)):
                if(self.targets_name[i] == self.targets[j]):
                    cont = cont + 1.0
            self.aProbC[i] = cont / len(self.targets)
        return self
    
    def predict(self, X):
        y = [-1 for d in range( len(X) )]
        for i in range(0, len(X) ):
            y[i] = self.perform(X[i])
        return y
    
    #funcion que verifica si el dataset recibido como parametro es de tipo DataFrame, si no es asi, se convierte a este tipo
    def verifyType(self, X):
        if(not type(X)==pd.DataFrame):
            X = pd.DataFrame(data=X)
        return X
    #Funcion que inicializa los arrays de probabilidades
    def initArrays(self):
        self.aProbF = [0.0 for x in range(self.dataset.shape[1])]
        self.aProbC = [0.0 for x in range(len(self.targets_name))]
        self.aProbC_new_tuple = [0.0 for x in range(len(self.targets_name))]
        
    #Funcion que ejecuta el algoritmo
    def perform(self, new_tuple):
        for x in range(0, len(self.targets_name)):
            cont = 0.0
            self.aProbF = [0.0 for d in range(self.dataset.shape[1])]
            for i in range(0, self.dataset.shape[0]):
                if(self.targets_name[x]==self.targets[i]):
                    cont = cont + 1.0
                    for j in range(0, self.dataset.shape[1]):
                        if(self.dataset.at[i,j] == new_tuple[j]):
                            self.aProbF[j] = self.aProbF[j] + 1.0
            self.exProbFandC(cont,x)
        return self.getLabel()
        
        
    #Funcion que calcula las probabilidades de cada etiqueta posible
    def exProbFandC(self, norm, indC):
        acum = 1.0
        for i in range(0, len(self.aProbF)):
            self.aProbF[i] = self.aProbF[i] / norm
            acum = acum * self.aProbF[i]
        self.aProbC_new_tuple[indC] = self.aProbC[indC] * acum
    
    #Funcion que retorna la etiqueta con la probabilidad mas alta
    def getLabel(self):
        aux=0.0
        ind = -1
        for i in range(0,len(self.aProbC)):
            if(self.aProbC_new_tuple[i] >= aux):
                aux=self.aProbC_new_tuple[i]
                ind = i
        return self.targets_name[ind]
    
    #Funcion que imprime las probabilidades de todas las etiquetas posibles
    def printProb(self):
        print "*Probabilidades de las etiquetas*"
        for i in range(0,len(self.aProbC)):
            print "C[",self.targets_name[i],"| F]"," = ",self.aProbC[i]

In [2]:
#CROSS VALIDATION

# Senado 
#cargar dataset
dataset =pd.read_csv("senado.csv", header=None)
# inicializar matriz contnedora de la transformacion de los votos
Matrix = [[0 for x in range(628)] for y in range(129)]
ii=0
jj=0
#Reocrte de dataset (solo tomar los votos) y transformar los votos representacion numerica
for i in range(1,129):
    for j in range(3,630):
        if(dataset.at[i,j]=="PRO"):
            Matrix[ii][jj]= 1
        elif(dataset.at[i,j]=="CONTRA"):
            Matrix[ii][jj]= -1
        else:
            Matrix[ii][jj] = 0
        jj=jj+1
    jj=0
    ii=ii+1

ii=0
#obtener las etiquetas de cada registro del dataset
#targets = [0 for x in range(129)]
targets = [0 for x in range(129)]
for i in range(1,129):
    if(dataset.at[i,1]=="PRI"):
        targets[ii]=0
    elif(dataset.at[i,1]=="SG"):
        targets[ii]=1
    elif(dataset.at[i,1]=="PAN"):
        targets[ii]=2
    elif(dataset.at[i,1]=="PRD"):
        targets[ii]=3
    elif(dataset.at[i,1]=="PT"):
        targets[ii]=4
    elif(dataset.at[i,1]=="PVEM"):
        targets[ii]=5
    ii=ii+1
    
targets_name = [0,1,2,3,4,5]
bayes = Bayes(targets_name)

#Senado dataset with Naive Bayes

scores = cross_val_score(bayes, Matrix, targets, scoring="accuracy", cv=10)
print "***CROSS VALIDATION SCORE***"
print "***Naive Bayes**"
print ""
print "==Senado=="
print "Scores: ",scores
print "Average: ", scores.mean()
print "std: ", scores.std()


***CROSS VALIDATION SCORE***
***Naive Bayes**

==Senado==
Scores:  [0.69230769 0.76923077 0.69230769 0.38461538 0.53846154 0.46153846
 0.53846154 0.61538462 0.69230769 0.5       ]
Average:  0.5884615384615384
std:  0.11672300696533475


In [3]:
#Iris dataset with Naive Bayes
iris = datasets.load_iris()
iris_target_names = [0,1,2]
bayes_iris = Bayes(iris_target_names)
scores = cross_val_score(bayes_iris, iris.data, iris.target, scoring="accuracy", cv=10)
print "==Iris=="
print "Scores: ",scores
print "Average: ", scores.mean()
print "std: ", scores.std()



==Iris==
Scores:  [0.8        0.73333333 0.66666667 0.53333333 0.53333333 0.53333333
 0.93333333 0.86666667 0.73333333 0.93333333]
Average:  0.7266666666666667
std:  0.15040685563571304


In [4]:
#Digits dataset with Naive Bayes
digits = datasets.load_digits()
bayes_digits = Bayes(digits.target_names)
scores = cross_val_score(bayes_digits, digits.data, digits.target, scoring="accuracy", cv=10)
print "==Digits=="
print "Scores: ",scores
print "Average: ", scores.mean()
print "std: ", scores.std()

==Digits==
Scores:  [0.64444444 0.76666667 0.63333333 0.60555556 0.65       0.63888889
 0.65       0.62569832 0.70391061 0.7150838 ]
Average:  0.6633581626319056
std:  0.0468689023984097
