In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pre_processing as pp
import analysis as anl
import pca

category = 'test'

hNeg = True #if true, add negative bigrams for negative reviews
noun = False #if true, add nouns

X, Y, vocabulary = pp.bow(category, hNeg, noun)

print("Vocabulário possui " + str(len(vocabulary)) + " palavras!")

Vocabulário possui 1509 palavras!


In [2]:
def normalizar(X):

    m, n = X.shape # m = qtde de objetos e n = qtde de atributos por objeto
    
    # Inicializa as variaves de saída
    X_norm = np.zeros( (m,n) ) #inicializa X_norm (base normalizada)
    #X_norm = lil_matrix((m, n))
    mu = 0 # inicializa a média
    sigma = 1 # inicializa o desvio padrão
      
    mu = np.mean(X, axis=0)
    sigma = np.std(X, axis=0, ddof=1)
    
    for i in range(m):
        for j in range(n):
            X_norm[i][j] = (X[i][j] - mu[j])/(sigma[j])
        
    return X_norm

Separa os dados em treinamento e teste:

In [3]:
# semente usada na randomizacao dos dados.
randomSeed = 10 

# gera os indices aleatorios que irao definir a ordem dos dados
idx_perm = np.random.RandomState(randomSeed).permutation(range(len(Y)))

# ordena os dados de acordo com os indices gerados aleatoriamente
X2, Y2 = X[idx_perm, :], Y[idx_perm]

#X2, Y2 = X[idx_perm, :], Y[idx_perm]

pTrain = 0.8

train_index, test_index = anl.stratified_holdOut(Y, pTrain)

Xtrain, Xval = X2[train_index, :], X2[test_index, :]
Ytrain, Yval = Y2[train_index], Y2[test_index]

Seleciona features com chi-quadrado (a partir dos dados de treinamento):

In [4]:
Xtrain, new_vocabulary, index = pp.chi2(Xtrain, Ytrain, vocabulary)
Xval = Xval[:, index]

Xtrain = Xtrain.toarray()
Xval = Xval.toarray()

In [5]:
print("Número de features antes do chi-quadrado: " + str(len(vocabulary)))
print("----------------------------------------")
print("Número de features após chi-quadrado: " + str(len(new_vocabulary)))
print(new_vocabulary)

Número de features antes do chi-quadrado: 1509
----------------------------------------
Número de features após chi-quadrado: 26
['arab', 'pain', 'good', 'love', 'entir', 'tri', 'said', 'written', 'horribl', 'wast', 'wish', 'hormon', 'explain', 'paleolith', 'die', 'well', 'bad', 'wild', 'legal', 'mean', 'whole', 'true', 'heal', 'not', 'less', 'eat']


In [6]:
def distancia(x, X):
               
    m = X.shape[0] 
    D = np.zeros(m) 

    i = 0
    for amostra in X:
        D[i] = np.linalg.norm(amostra - x)
        i += 1
                    
    return D

In [7]:
def knn(x, X, Y, K):
        
    y = 0 
    
    ind_viz = np.ones(K, dtype=int)
    
    D = distancia(x, X)
    
    votos = np.zeros(len(set(Y)))

    ind_viz = np.argsort(D)[:K]
    
    for indice in ind_viz:
        votos[Y[indice]] += 1 
    y = np.argmax(votos)
        
    return y

In [8]:
def predicao(K, Xtrain, Ytrain, Xval, Yval):
    
    Ypred = []

    for i in range(Xval.shape[0]):
        y = knn(Xval[i], Xtrain, Ytrain, K)
        Ypred.append(y)
        
    acuracia = np.sum(Ypred==Yval)/len(Yval)
    
    return Ypred, acuracia


classes = np.unique(Y)

for k in range(3, 12, 2):
    print("K = " + str(k))
    Ypred, acuracia = predicao(k, Xtrain, Ytrain, Xval, Yval)
    print(acuracia)
    auxResults = anl.relatorioDesempenho(Yval, Ypred, classes, imprimeRelatorio=True)

K = 3
[ 4.12310563  2.44948974  3.16227766  3.          9.          3.
  4.69041576  2.64575131  7.54983444  2.82842712  3.          3.16227766
  4.12310563  5.74456265  2.64575131  3.31662479  2.64575131  3.31662479
  4.79583152  1.73205081  2.44948974  2.64575131  2.44948974  2.64575131
  5.74456265  2.82842712  2.82842712  3.87298335  3.60555128  2.82842712
  2.64575131  2.82842712  2.          2.64575131  2.82842712  2.64575131
  2.44948974  3.60555128  3.46410162  2.64575131  2.64575131  2.64575131
  3.          2.64575131  6.78232998  3.          3.31662479  2.64575131
  3.31662479  5.          7.81024968  2.82842712  4.79583152  2.64575131
  3.          2.64575131  2.64575131  2.64575131  7.34846923  3.16227766
  2.82842712  2.82842712  2.64575131  6.8556546 ]
[19 32  1 20 36 22 56 21 55 16 33 14 30 62 35 39  7 40 41 43 53 47 57 23 31
 34 29 51  9 26 25 61 60 42  5  3 54 10 45  2 59 11 46 15 17 48 38 37 28 27
  0 12  6 52 18 49 24 13 44 63 58  8 50  4]
[ 4.12310563  1.41421356  

[ 5.91607978  4.47213595  2.44948974  4.35889894  6.8556546   4.58257569
  2.44948974  4.35889894  5.19615242  3.74165739  4.58257569  3.16227766
  5.91607978  6.70820393  4.35889894  4.35889894  4.58257569  4.79583152
  3.31662479  4.79583152  3.74165739  4.35889894  3.74165739  4.35889894
  6.70820393  4.47213595  3.74165739  3.87298335  4.12310563  4.47213595
  4.35889894  4.47213595  4.47213595  4.35889894  4.47213595  4.35889894
  3.74165739  2.64575131  2.82842712  4.35889894  4.35889894  4.35889894
  4.58257569  3.87298335  7.87400787  4.58257569  4.79583152  4.35889894
  4.79583152  3.87298335  8.66025404  4.47213595  2.64575131  4.35889894
  4.58257569  4.35889894  4.35889894  3.          8.60232527  4.47213595
  3.74165739  4.47213595  4.35889894  4.35889894]
[ 2  6 37 52 38 57 11 18 36 60 26  9 22 20 27 43 49 28  3 35 39 33 40 62 30
 41 15 63  7 47 23 56 21 55 53 14 61 59 51 31 32 29 25  1 34 42 45  5 10 54
 16 17 19 48 46  8 12  0 13 24  4 44 58 50]
[ 6.08276253  4.69041576

In [9]:
def curva_aprendizado(X, Y, Xval, Yval):
   
    """
    Funcao usada gerar a curva de aprendizado.
  
    Parametros
    ----------
  
    X : matriz com os dados de treinamento
  
    Y : vetor com as classes dos dados de treinamento
  
    Xval : matriz com os dados de validação
  
    Yval : vetor com as classes dos dados de validação
  
    """
    
    k = 5

    # inicializa as listas que guardarao a performance no treinamento e na validacao
    perf_train = []
    perf_val = []

    classes = np.unique(Y)
    
    for i in range(10, len(Y)):
        
        Ypred, acuracia = predicao(k, X[:i], Y[:i], X[:i], Y[:i])
        perf_train.append(acuracia)

        Ypred, acuracia = predicao(k, X[:i], Y[:i], Xval, Yval)
        perf_val.append(acuracia)


    ##################################################################################
       
    # Define o tamanho da figura 
    plt.figure(figsize=(20,12))

    # Plota os dados
    plt.plot(perf_train, color='blue', linestyle='-', linewidth=1.5, label='Treino') 
    plt.plot(perf_val, color='red', linestyle='-', linewidth=1.5, label='Validação')

    # Define os nomes do eixo x e do eixo y
    plt.xlabel(r'# Qtd. de dados de treinamento',fontsize='x-large') 
    plt.ylabel(r'Acuracia',fontsize='x-large') 

    # Define o título do gráfico
    plt.title(r'Curva de aprendizado', fontsize='x-large')

    # Acrescenta um grid no gráfico
    plt.grid(axis='both')

    # Plota a legenda
    plt.legend()
    
    plt.show()

In [10]:
#curva_aprendizado(Xtrain, Ytrain, Xval, Yval)

In [11]:
#K-folds

import k_folds as kf

#Pega todos os tipos de classes 
classes = classes = np.unique(Y)

# semente usada na randomizacao dos dados.
randomSeed = 10 

# gera os indices aleatorios que irao definir a ordem dos dados
idx_perm = np.random.RandomState(randomSeed).permutation(range(len(Y)))

# ordena os dados de acordo com os indices gerados aleatoriamente
X3, Y3 = X[idx_perm, :], Y[idx_perm]

# separa os dados em k folds
nFolds = 5
folds = kf.stratified_kfolds(Y3, nFolds, classes)

k = 1
resultados=[] # cria uma lista vazia para guardar os resultados obtidos em cada fold
for train_index, test_index in folds:

    print('\n-----------\n%d-fold: \n-----------\n' % (k) )
    
    # se train_index ou test_index forem vazios, interrompe o laco de repeticao
    if len(train_index)==0 or len(test_index)==0: 
        print('\tErro: o vetor com os indices de treinamento ou o vetor com os indices de teste esta vazio')      
        break
        
    totalFold = len(train_index)+len(test_index)
    
    Xtrain, Xtest = X3[train_index, :], X3[test_index, :];
    Ytrain, Ytest = Y3[train_index], Y3[test_index];
    
    Xtrain, new_vocabulary, index = pp.chi2(Xtrain, Ytrain, vocabulary)
    Xtest = Xtest[:,index]

    Xtrain = Xtrain.toarray()
    Xtest = Xtest.toarray()
    
    for k in range(3, 12, 2):
        print("K = " + str(k))
        Ypred, acuracia = predicao(k, Xtrain, Ytrain, Xtest, Ytest)
        print(acuracia)
        auxResults = anl.relatorioDesempenho(Ytest, Ypred, classes, imprimeRelatorio=True)
    

    # adiciona os resultados do fold atual na lista de resultados
    resultados.append( auxResults ) 
        
    k+=1
    
kf.mediaFolds( resultados, classes )


-----------
1-fold: 
-----------

K = 3
[  2.23606798   1.73205081   1.73205081   2.44948974   1.41421356   2.
   2.44948974   2.23606798   3.           4.58257569   1.41421356
   6.08276253   2.           3.31662479   2.64575131   1.73205081
   4.69041576   2.23606798   2.           1.73205081   3.31662479   2.
   3.74165739   1.73205081  11.95826074   5.47722558   1.41421356   3.
   2.82842712   1.41421356   2.           1.73205081   3.           1.41421356
   1.41421356   6.4807407    1.41421356   3.16227766   7.61577311
   1.73205081   1.41421356   2.64575131   4.47213595   4.47213595
   1.73205081   1.73205081   3.           1.73205081   3.31662479
   1.41421356   1.41421356   9.64365076   1.41421356  25.84569597
   1.73205081   1.73205081   6.164414     2.           1.41421356
   2.44948974   2.           4.35889894   1.73205081]
[58 29 33  4 52 50 49 34 26 10 36 40 31 45 44 47 54 55 39 23 62 19  1  2 15
 21 60 57 18  5 30 12  7  0 17 59  3  6 41 14 28 27  8 46 32 37 48 20 13 22

  fmedida = 2 * ((precisao * revocacao)/(precisao + revocacao))


[  2.44948974   1.41421356   1.41421356   2.23606798   1.           1.73205081
   2.23606798   2.44948974   2.44948974   4.47213595   1.           6.
   1.73205081   3.16227766   2.           1.41421356   5.19615242
   1.41421356   1.73205081   1.41421356   3.16227766   1.73205081
   3.60555128   0.          11.91637529   5.38516481   1.           2.82842712
   2.64575131   1.           1.73205081   1.41421356   2.82842712   1.           1.
   6.55743852   1.           3.           7.81024968   1.41421356   1.
   2.44948974   4.35889894   4.35889894   1.41421356   1.41421356
   2.82842712   1.41421356   3.16227766   1.           1.           9.79795897
   1.73205081  25.78759392   1.41421356   1.41421356   6.08276253
   1.73205081   1.73205081   2.23606798   1.73205081   4.24264069
   1.41421356]
[23 40 26 29 50  4 49 33 34 36 10 31 44 45 47 54 55 39 19 62 15 17  2  1 12
  5 52 30 18 58 60 21 57 14 59  3  6  8  7 41  0 28 46 32 27 37 20 48 13 22
 61 42 43  9 16 25 11 56 35 38 51 24 53]

[  3.74165739   1.73205081   2.           2.           3.74165739
   7.14142843   2.23606798   3.87298335   2.64575131   3.46410162
   9.05538514   4.47213595   2.           7.28010989   2.64575131
   2.44948974   1.41421356   5.29150262   2.44948974   2.           2.
   3.74165739   2.           3.60555128   1.73205081  13.19090596
   5.19615242   1.41421356   2.64575131   2.82842712   1.41421356   2.
   2.82842712   1.73205081   1.41421356   1.41421356   6.78232998
   1.41421356   2.44948974   9.           1.73205081   1.41421356
   2.44948974   4.47213595   4.35889894   2.           1.41421356   2.
   1.73205081   3.           1.41421356   1.41421356   9.94987437
   2.64575131  26.36285265   1.73205081   2.           6.164414
   2.23606798   2.44948974   2.64575131   1.73205081   5.19615242
   1.41421356]
[63 34 35 37 41 46 16 30 50 51 27 24 55 40 61  1 48 33 45 56 47 22 31 20  2
 12 19  3 58  6 15 38 42 18 59  8 14 28 53 60 29 32 49  9 23  4  0 21  7 44
 43 11 62 26 17 57 36  5 13 

K = 3
[  3.60555128   2.44948974   3.16227766   3.16227766   3.60555128
   7.61577311   3.           4.79583152   3.31662479   3.16227766
   9.21954446   4.79583152   2.82842712   7.61577311   3.74165739
   3.31662479   3.           2.           2.64575131   3.           3.
   2.82842712   3.           2.82842712   2.23606798   3.60555128   5.
   2.64575131   5.83095189   2.82842712   4.           3.31662479
   2.82842712   3.31662479   2.82842712   3.           6.78232998   3.           4.
   8.           2.82842712   3.           3.60555128   5.09901951
   5.09901951   3.           2.64575131   3.16227766   3.16227766
   3.87298335   2.82842712   2.82842712   9.32737905   3.46410162
  25.82634314   2.64575131   2.82842712   6.70820393   3.           3.
   3.60555128   3.           4.89897949   3.        ]
[17 24  1 27 46 55 18 32 40 29 23 21 50 34 51 12 56 58 37 61 41 59 45  6 63
 20 19 35 16 22  9  3 47  2 48 31  8 33 15 53  0  4 60 42 25 14 49 38 30 11
  7 62 26 44 43 28 57 36  5 1

[  3.60555128   1.73205081   2.           2.82842712   3.60555128
   7.14142843   2.           3.87298335   2.           3.87298335
   8.36660027   3.46410162   1.73205081   6.55743852   2.23606798
   3.31662479   1.41421356   2.82842712   1.           1.41421356
   1.41421356   2.44948974   1.73205081   1.41421356   2.           2.44948974
   3.31662479   1.41421356   5.19615242   1.41421356   3.31662479   2.           2.
   5.19615242   2.23606798   1.           2.           3.           2.64575131
   1.41421356   2.          14.83239697   4.47213595   1.73205081
   1.41421356   2.           1.41421356   2.           1.73205081
   2.44948974   1.41421356   1.73205081   9.69535971   2.44948974
  25.70992026   1.41421356   2.44948974   5.09901951   1.73205081
   2.64575131   2.82842712   1.73205081   4.35889894   1.41421356]
[35 18 63 23 20 19 16 39 29 44 27 46 50 55 12 48 51 58 61 22  1 43 47 40 36
 32 45 31  6 24  2  8 34 14 49 25 53 21 56 59 38  3 17 60 37 26 30 15 11  0
  4  9  7 6

[  4.24264069   2.           2.           2.           4.24264069
   4.35889894   1.73205081   1.73205081   2.64575131   2.44948974
   8.77496439   4.69041576   1.41421356   7.34846923   2.           3.60555128
   2.           2.44948974   2.           1.73205081   1.73205081   2.           2.
   1.73205081   1.73205081   3.           4.69041576   2.           5.47722558
   1.73205081   3.16227766   2.23606798   2.           5.           2.           2.
   2.           2.23606798   2.23606798   2.           1.41421356
  12.52996409   4.89897949   1.41421356   3.31662479   2.82842712
   1.73205081   1.73205081   1.73205081   2.           1.73205081
   2.23606798   4.47213595   1.41421356   2.64575131   6.92820323
   1.73205081   2.           2.64575131   5.47722558   5.09901951
   1.41421356   2.23606798   3.           2.        ]
[12 61 53 40 43 29 20 19 23 46 24 48  7  6 50 56 47 49 39 36 35 34 57 27 32
 64 22  3 21  2 14  1 18 16 38 37 51 31 62  9 17  8 58 54 45 63 25 30 44 15
  4  0