TODO

1. Conferir perguntas Decision Stump
2. Adicionar Pesos dos pontos

In [1]:
import os, time

import numpy as np
import matplotlib as plt
import pandas as pd

pd.options.display.max_rows = 10

In [2]:
# Load the raw tic-tac-toe data. The file has no header row, so the columns
# are numbered 0-9.
data = pd.read_csv('data/tic-tac-toe.data', header=None)
# Bare expression so the notebook renders a preview of the frame.
data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,x,x,x,x,o,o,x,o,o,positive
1,x,x,x,x,o,o,o,x,o,positive
2,x,x,x,x,o,o,o,o,x,positive
3,x,x,x,x,o,o,o,b,b,positive
4,x,x,x,x,o,o,b,o,b,positive
...,...,...,...,...,...,...,...,...,...,...
953,o,x,x,x,o,o,o,x,x,negative
954,o,x,o,x,x,o,x,o,x,negative
955,o,x,o,x,o,x,x,o,x,negative
956,o,x,o,o,x,x,x,o,x,negative


In [3]:
# Name the target column and encode the two classes as +1 / -1.
data = data.rename(columns = {9:'label'})
# astype(int) pins a numeric dtype instead of relying on replace() to
# downcast the object column, which newer pandas versions no longer do.
data['label'] = data['label'].replace({'positive': 1, 'negative': -1}).astype(int)

# Hold out 20% of the rows for testing. A fixed random_state keeps the split,
# and therefore every number reported below, reproducible across kernel restarts.
train = data.sample(frac = 0.8, random_state = 42)
test = data.drop(train.index)
assert len(train) + len(test) == len(data)

y_train = train.label
y_test  = test.label

# Drop the label column so only the board features remain.
X_train = train.drop("label", axis=1)
X_test  = test.drop("label", axis=1)

In [44]:
class DecisionStump:
    """One-question classifier over categorical columns with labels in {-1, +1}.

    The fitted model asks "is column ``question_column`` equal to
    ``question_cat``?" and answers ``value`` when it is and ``-value``
    otherwise. ``question_column`` is a positional column index.
    """

    def __init__(self):
        # Categories observed in each column during fit (one array per column).
        # They define the candidate questions, e.g. "is there an x at position 0?".
        # For tic-tac-toe they are x, o and b.
        self.unique_categories = []
        # Number of feature columns, i.e. how many positions a question can target.
        self.data_dimension = 0

        self.model = {
            'question_column' : None,
            'question_cat'    : None,
            'value'           : None,
            'train_error'     : None
        }

    def fit(self, X, y, weights=None, verbose=True):
        """Select the question with the lowest weighted training error.

        Parameters
        ----------
        X : pandas.DataFrame
            Categorical features, one row per sample.
        y : pandas.Series or array-like
            Labels. A Series is matched to the rows of X by index label; any
            other sequence is assumed to be in the same row order as X.
        weights : pandas.Series or array-like, optional
            Per-sample weights, aligned like ``y``. ``None`` means uniform
            weights that sum to 1.
        verbose : bool, default True
            Print every improvement found and the final model.
        """
        # Rebuild the per-column categories on every call so that refitting
        # does not keep appending to the list from the previous fit.
        self.data_dimension = X.shape[1]
        self.unique_categories = [X.iloc[:, j].unique()
                                  for j in range(self.data_dimension)]
        self.model = dict.fromkeys(self.model)

        # Start from +inf rather than 1 so the first candidate is always
        # accepted, even when the weights are not normalised.
        best_model_error = float('inf')
        for i in range(0, self.data_dimension):
            # Only ask about categories that actually occur in column i.
            for cat in self.unique_categories[i]:
                for value in (-1, 1):
                    error = self._compute_training_error(X, y, i, cat, value, weights=weights)
                    if (error < best_model_error):
                        best_model_error  = error
                        self.model['question_column'] = i
                        self.model['question_cat']    = cat
                        self.model['value']           = value
                        self.model['train_error']     = error
                        if verbose:
                            print('Best model {}-{}-{} with error {}'.format(
                                i, cat, value, error))

        if verbose:
            print('Best model {} {} {} {}'.format(self.model['question_column'], self.model['question_cat'],
                                            self.model['value'], self.model['train_error']))

        return

    def predict(self, X):
        """Predict a label for every row of X.

        X may be a DataFrame or a 2-D array / list of rows. Returns a list
        with one label per row, or ``None`` (after printing a message) when
        the model has not been fitted.
        """
        if self.model['question_column'] is None:
            print('Model was not fitted')
            return

        column = self.model['question_column']
        category = self.model['question_cat']
        value = self.model['value']

        if isinstance(X, pd.DataFrame):
            # Iterating a DataFrame yields column labels, not rows, so select
            # the question column positionally instead.
            matches = (X.iloc[:, column] == category).to_numpy()
        else:
            # dtype=object keeps the comparison element-wise for any category type.
            rows = np.asarray(X, dtype=object)
            if rows.size == 0:
                return []
            matches = rows[:, column] == category

        return [value if hit else -value for hit in matches]

    def _compute_training_error(self, X, y, question_column, question_cat, value, weights=None):
        """Return the total weight of the samples the given question misclassifies.

        A sample is correct when it matches the category and its label equals
        ``value``, or when it does not match and its label differs from
        ``value``. ``weights=None`` uses uniform weights 1/n.
        """
        nb_samples = len(X)
        if nb_samples == 0:
            return 0.0

        matches = (X.iloc[:, question_column] == question_cat).to_numpy()
        labels = self._align(y, X.index)
        if weights is None:
            sample_weights = np.full(nb_samples, 1.0 / nb_samples)
        else:
            sample_weights = self._align(weights, X.index).astype(float)

        # Correct exactly when "row matches the category" agrees with
        # "label equals value".
        correct = matches == (labels == value)
        return float(sample_weights[~correct].sum())

    @staticmethod
    def _align(values, index):
        """Return ``values`` as an array ordered like ``index``.

        A Series is looked up by label; other sequences are taken as already
        being in row order.
        """
        if isinstance(values, pd.Series):
            return values.loc[index].to_numpy()
        return np.asarray(values)

    def get_empirical_error(self):
        """Return the weighted training error of the fitted model (None if unfitted)."""
        return self.model['train_error']

    def get_empiracal_error(self):
        """Misspelled original name, kept so existing callers keep working."""
        return self.get_empirical_error()
    
# Fit a single stump on the training split with uniform sample weights and
# report its accuracy on the held-out split.
dt = DecisionStump()
# Give the Series an explicit value and dtype: a value-less pd.Series(index=...)
# is deprecated and comes out object-typed on newer pandas.
weights = pd.Series(1.0 / y_train.shape[0], index=y_train.index.copy(), dtype=float)
dt.fit(X_train, y_train, weights = weights)

# The feature rows are passed as a plain 2-D array.
predictions = dt.predict(X_test.values)
# Element-wise comparison replaces the manual counting loop.
right_count = int((np.asarray(predictions) == y_test.values).sum())
print('Accuracy on test: {}'.format(right_count / float(len(y_test.values))) )

Best model 0-x--1 with error 0.5221932114882519
Best model 0-x-1 with error 0.4778067885117514
Best model 0-o--1 with error 0.40078328981723405
Best model 2-o--1 with error 0.3916449086161896
Best model 4-x-1 with error 0.36553524804177695
Best model 4-o--1 with error 0.29895561357702466
Best model 4 o -1 0.29895561357702466
Accuracy on test: 0.6927083333333334


In [None]:
class AdaBoost:
    """AdaBoost ensemble of DecisionStump weak learners for labels in {-1, +1}."""

    def __init__(self, nb_estimators = 30):
        # Maximum number of boosting rounds (one stump per round).
        self.nb_estimators = nb_estimators
        # Fitted stumps, in training order.
        self.models  = []
        # weights[i] is the sample-weight Series used to fit models[i]; the
        # final entry holds the weights a further round would have used.
        self.weights = []
        # alphas[i] is the vote strength of models[i].
        self.alphas  = []

    def fit(self, X, y):
        """Fit up to ``nb_estimators`` stumps, reweighting the samples each round.

        Parameters
        ----------
        X : pandas.DataFrame
            Categorical features, one row per sample.
        y : pandas.Series
            Labels in {-1, +1}. Assumed to be in the same row order as X.
        """
        # Start from scratch so a refit does not mix in rounds from an earlier fit.
        self.models, self.weights, self.alphas = [], [], []

        nb_samples = y.shape[0]
        if nb_samples == 0:
            return

        # Initial weights: uniform and summing to 1.
        self.weights.append(
            pd.Series(1.0 / nb_samples, index=y.index.copy(), dtype=float))

        targets = y.to_numpy().astype(float)
        # Floor for the weighted error so a perfect stump gets a large but
        # finite alpha instead of a division by zero.
        min_error = 1e-10

        for i in range(0, self.nb_estimators):
            dt = DecisionStump()
            dt.fit(X, y, self.weights[i])

            votes = dt.predict(self._as_rows(X))
            if votes is None:
                # The stump could not select any question; nothing to add.
                break
            votes = np.asarray(votes, dtype=float)

            current = self.weights[i].to_numpy().astype(float)
            error = float(current[votes != targets].sum())
            if error >= 0.5:
                # No better than chance on the current weights: more rounds
                # would not improve the ensemble.
                break
            error = max(error, min_error)

            alpha = 0.5 * float(np.log((1.0 - error) / error))
            self.models.append(dt)
            self.alphas.append(alpha)

            # Raise the weight of misclassified samples (targets * votes == -1),
            # lower the rest, then renormalise to sum to 1.
            updated = current * np.exp(-alpha * targets * votes)
            self.weights.append(
                pd.Series(updated / updated.sum(), index=y.index.copy()))

        return

    def predict(self, X):
        """Predict labels as the sign of the alpha-weighted stump votes.

        X may be a DataFrame or a 2-D array / list of rows. Returns a list
        with one label (-1 or 1) per row; an exact tie resolves to 1.
        Returns ``None`` (after printing a message) when no stump was fitted.
        """
        if not self.models:
            print('Model was not fitted')
            return

        rows = self._as_rows(X)
        scores = np.zeros(len(rows), dtype=float)
        for alpha, stump in zip(self.alphas, self.models):
            scores += alpha * np.asarray(stump.predict(rows), dtype=float)

        return [1 if score >= 0 else -1 for score in scores]

    @staticmethod
    def _as_rows(X):
        """Return X as a row-iterable array.

        Iterating a DataFrame yields its column labels rather than its rows,
        so stumps are always handed the underlying 2-D array.
        """
        return X.values if isinstance(X, pd.DataFrame) else X

In [38]:
# Display the current contents of the `weights` Series.
weights

274    0.001305
247    0.001305
249    0.001305
747    0.001305
882    0.001305
         ...   
430    0.001305
603    0.001305
102    0.001305
778    0.001305
89     0.001305
Length: 766, dtype: float64

4      1.0
5      1.0
14     1.0
15     1.0
31     1.0
      ... 
924    1.0
931    1.0
940    1.0
945    1.0
947    1.0
Length: 192, dtype: float64