TODO

1. Conferir perguntas Decision Stump
2. Adicionar Pesos dos pontos

In [45]:
import os, time, math

import numpy as np
import matplotlib as plt
import pandas as pd

pd.options.display.max_rows = 10

In [2]:
data = pd.read_csv('data/tic-tac-toe.data', header=None)
data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,x,x,x,x,o,o,x,o,o,positive
1,x,x,x,x,o,o,o,x,o,positive
2,x,x,x,x,o,o,o,o,x,positive
3,x,x,x,x,o,o,o,b,b,positive
4,x,x,x,x,o,o,b,o,b,positive
...,...,...,...,...,...,...,...,...,...,...
953,o,x,x,x,o,o,o,x,x,negative
954,o,x,o,x,x,o,x,o,x,negative
955,o,x,o,x,o,x,x,o,x,negative
956,o,x,o,o,x,x,x,o,x,negative


In [3]:
# Armazenar o label correto dos numeros
data = data.rename(columns = {9:'label'})
data['label'] = data['label'].replace({'positive': 1, 'negative': -1})

# Separar 20% para teste
train = data.sample(frac = 0.8)
test = data.drop(train.index)

y_train = train.label
y_test  = test.label

# Retirar a primeira coluna referente aos labels
X_train = train.drop("label", axis=1)
X_test  = test.drop("label", axis=1)

In [93]:
class DecisionStump:
    def __init__(self):
        # Possiveis categorias que o dado vai ter para poder realizar as perguntas (Tem x na posição 0?)
        # No caso do tick-tack-toe vai ser x, o e b
        self.unique_categories = []
        # Dimensionalidade do dado, para saber quantas possíveis posições pode ter
        self.data_dimension = 0
        
        self.model = {
            'question_column' : None,
            'question_cat'    : None,
            'value'           : None,
            'train_error'     : None
        }
        
    def fit(self, X, y, weights):
        """Seleciona o modelo com base em X, y e os pesos de cada entrada"""
        
        # Guarda todos os valores possiveis para todas as colunas e a dimensao dos dados
        # No caso do Tick-Tack-Toe, sera x,o,b para todas
        for column in X:
            self.unique_categories.append(X[column].unique())
        self.data_dimension = len(list(X))
        
        best_model_error = 1
        for i in range(0, self.data_dimension):
            for cat in self.unique_categories[0]:
                for value in (-1, 1):
                    error = self._compute_training_error(X, y, i, cat, value, weights=weights)
                    if (error < best_model_error):
                        best_model_error  = error
                        self.model['question_column'] = i
                        self.model['question_cat']    = cat
                        self.model['value']           = value
                        self.model['train_error']     = error
#                         print('Best model {}-{}-{} with error {}'.format(
#                             i, cat, value, error))
        
#         print('Best model {} {} {} {}'.format(self.model['question_column'], self.model['question_cat'],
#                                         self.model['value'], self.model['train_error']))
        
        return
    
    def predict(self, X):
        """Prevê os valores das entradas em X"""
        if (self.model['question_column'] == None):
            print('Model was not fitted')
            return
        
        predictions = []
        for row in X:
            if( row[self.model['question_column']] == self.model['question_cat'] ):
                predictions.append(self.model['value'])
            else:
                predictions.append(-self.model['value'])
        return predictions
    
    def _compute_training_error(self, X, y, question_column, question_cat, value, weights=None):
        nb_samples  = X.count()[0]
        right_count = 0
        error = 0
        for i, row in X.iterrows():
            if( (row[question_column] == question_cat and y[i] == value) or 
                  (row[question_column] != question_cat and y[i] != value) ):
                right_count += 1
            else:
                error += weights[i]
        
        return error
    
    def get_empirical_error(self):
        return self.model['train_error']
    
dt = DecisionStump()
weights = pd.Series(index=y_train.index.copy())
weights[:] = 1.0 / y_train.shape[0]
dt.fit(X_train, y_train, weights = weights)

predictions = dt.predict(X_test.values)
right_count = 0
for i, row in enumerate(predictions):
    if (predictions[i] == y_test.values[i]):
        right_count += 1
print('Accuracy on test: {}'.format(right_count / float(len(y_test.values))) )

Accuracy on test: 0.6927083333333334


In [None]:
class AdaBoost:
    def __init__(self, nb_estimators = 30):
        self.nb_estimators = nb_estimators
        self.models  = []
        self.weights = []
        self.alphas  = []
        
    def fit(self, X, y):
        
        # Calculate initial weights
        weights = pd.Series(index=y.index.copy())
        weights[:] = 1.0 / y.shape[0]
        self.weights.append( weights )
        
        for i in range(0, self.nb_estimators):
            # Calculate new models
            dt = DecisionStump()
            dt.fit(X, y, self.weights[i])
            self.models.append(dt)
            
            # Calculate new alpha
            error = dt.get_empirical_error()
            alpha = 0.5 * math.log( (1 - error) / error )
            self.alphas.append(alpha)
            
            # Calculate new weights
            new_weights = pd.Series(index=y.index.copy())
            for index, _ in new_weights.iteritems():
                prediction = dt.predict(np.array([ X.loc[index].values ]))[0]
                true_y     = y[index]
                new_weights[index] = self.weights[i][index] * math.exp( -alpha * prediction * true_y )
            new_weights = new_weights.apply(lambda x: float(x) / new_weights.sum())
            self.weights.append( new_weights )
        
        print(self.alphas)
        
        return self
    
    def _sign(self, value):
        if (value >= 0):
            return 1
        return -1
    
    def predict(self, X):
        """Prevê os valores das entradas em X"""
        if ( len(self.models) == 0 ):
            print('Model was not fitted')
            return
        
        predictions = []
        for row in X:
            sum_predictions = np.sum([ dt.predict([row])[0] * self.alphas[i] for i,dt in enumerate(self.models)])
            predictions.append( self._sign(sum_predictions))
        return predictions

xs = []
ys = []
for estimators in range(2, 20):
    start = time.time()
    adb = AdaBoost(nb_estimators=estimators)
    adb.fit(X_train, y_train)
    end = time.time()
    predictions = adb.predict(X_test.values)
    right_count = 0
    for i, row in enumerate(predictions):
        if (predictions[i] == y_test.values[i]):
            right_count += 1
    
    accuracy = right_count / float(len(y_test.values))
    xs.append(estimators)
    ys.append(accuracy)
    
    print('Fitting took {} seconds'.format(end-start))
    print('Accuracy on test with {} estimators: {}'.format(estimators, accuracy))

[0.42613804547730977, 0.24846607593288209]
Fitting took 7.076622009277344 seconds
Accuracy on test with 2 estimators: 0.6927083333333334
[0.42613804547730977, 0.24846607593288209, 0.264624011407255]
Fitting took 10.655147552490234 seconds
Accuracy on test with 3 estimators: 0.6770833333333334
[0.42613804547730977, 0.24846607593288209, 0.264624011407255, 0.24796043589861338]
Fitting took 14.834981679916382 seconds
Accuracy on test with 4 estimators: 0.7291666666666666
[0.42613804547730977, 0.24846607593288209, 0.264624011407255, 0.24796043589861338, 0.25246177126406294]
Fitting took 18.870929479599 seconds
Accuracy on test with 5 estimators: 0.8020833333333334
[0.42613804547730977, 0.24846607593288209, 0.264624011407255, 0.24796043589861338, 0.25246177126406294, 0.19544508781436926]
Fitting took 22.331513166427612 seconds
Accuracy on test with 6 estimators: 0.7291666666666666
[0.42613804547730977, 0.24846607593288209, 0.264624011407255, 0.24796043589861338, 0.25246177126406294, 0.195445

In [None]:
def plot_loss_curve(xs, ys):
    fig = plt.figure(figsize=(9, 9), dpi= 80)
    plt.style.use('ggplot')
    plt.title('Loss Curve')
    plt.plot(xs, ys, 'bo-')
    return plt

In [None]:
plot_loss_curve(xs, ys)

4      1.0
5      1.0
14     1.0
15     1.0
31     1.0
      ... 
924    1.0
931    1.0
940    1.0
945    1.0
947    1.0
Length: 192, dtype: float64