# Pré processamento: 
Funções do arquivo pre_processing.py

# Maximum Entropy Classifier

In [30]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pre_processing as pp
import analysis as anl
import pca

category = 'dvd'

hNeg = True #if true, add negative bigrams for negative reviews
noun = False #if true, add nouns

X, Y, vocabulary = pp.bow(category, hNeg, noun)

print("Vocabulário possui " + str(len(vocabulary)) + " palavras!")

Vocabulário possui 10642 palavras!


In [31]:
# semente usada na randomizacao dos dados.
randomSeed = 10 

# gera os indices aleatorios que irao definir a ordem dos dados
idx_perm = np.random.RandomState(randomSeed).permutation(range(len(Y)))

# ordena os dados de acordo com os indices gerados aleatoriamente
X2, Y2 = X[idx_perm, :], Y[idx_perm]

#X2, Y2 = X[idx_perm, :], Y[idx_perm]

pTrain = 0.8

train_index, test_index = anl.stratified_holdOut(Y, pTrain)

Xtrain, Xval = X2[train_index, :], X2[test_index, :]
Ytrain, Yval = Y2[train_index], Y2[test_index]

In [32]:
Xtrain, new_vocabulary, index = pp.chi2(Xtrain, Ytrain, vocabulary)
Xval = Xval[:, index]

In [33]:
print("Número de features antes do chi-quadrado: " + str(len(vocabulary)))
print("----------------------------------------")
print("Número de features após chi-quadrado: " + str(len(new_vocabulary)))
#print(new_vocabulary)

Número de features antes do chi-quadrado: 10642
----------------------------------------
Número de features após chi-quadrado: 254


In [None]:
def gradienteDescente(X, Y, theta, alpha, m, num_iter):

    for it in range(num_iter):
        h_theta = (X * theta).sum(axis=1)
        theta = theta - alpha * (1/m) *(X.T * (h_theta - Y)).sum(axis=1)
      
    return theta

numIterations = 100
alpha = 0.55
m,n = np.shape(Xtrain)
theta = np.ones(n)
theta = gradienteDescente(Xtrain, Ytrain, theta, alpha, m, numIterations)
print(theta)
# acuracia = np.sum(classes==Yval)/len(Yval)
# print(acuracia)

# Random Forest

In [None]:
# Import the model we are using
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier()
clf.fit(Xtrain, Ytrain)
classes = clf.predict(Xval)
acuracia = np.sum(classes==Yval)/len(Yval)
print("Acurácia é " + str(acuracia))  

# Decision Tree Clssifier implementação

In [34]:
def dt_train( data, max_depth, min_size = 1):
    max_depth = int(max_depth)
    min_size = int(min_size)
    attr, split_val, left, right = split(data)
    tree = {"attribute": attr, "split": split_val, "left": left, "right": right, "current_mode": leaf(data)}
    decision(tree,max_depth,min_size)
    return tree

def gini(node):
    """Calculate the gini impurity for a node. Aim is to minimize gini impurity(gain function)."""
    ##Find the number of classifications in current node.
    classifications = node[:,-1]
    samples = classifications.size
    unique, counts = np.unique(classifications, return_counts = True)
    ##calculate gini based on number of classes
    gini = 1
    for i in range (0, unique.size):
        proportion =  counts[i] / samples
        ##
        gini = gini - proportion * proportion
    return gini
    
def gain(values, cur_gini, attribute, split):
    """Calculate information gain for an attribute split at each level.
    Inputs are the current subset of data, initial gini at parent node,
    attribute to be split and split number."""
    i = attribute
    samples = values[:,-1].size
    left = values[values[:,i] < split, :]
    right = values[values[:,i] >= split, :]
    left_samples = left[:,-1].size
    right_samples = right[1:,-1].size
    
    ##Calculate left and right side gini
    left_gini = gini(left)
    right_gini = gini(right)
    
    ##Calculate information gain at this split value.
    gain = cur_gini - (left_samples/samples)*left_gini - (right_samples/samples)*right_gini
    return gain, left, right
    
def split(node):
    """Find the ideal split point by searching for the best information gain
    of all attributes and their potential split values.
    If no gain improves, node is split for leaf node creation as right side left at 0 samples."""
    cur_gini = gini(node)
    best_gain = 0
    best_attr = 0
    best_split = 0
    ##Implement greedy, exhaustive search for best information gain
    variables = len(node[0])
    best_left = node
    best_right = np.empty([0,variables])
    
    ##Seach through each unique value to find best division
    for v in range(0, variables-1):
        uniques = np.unique(node[:, v])
        for row in uniques:
            new_gain, left, right  = gain(node, cur_gini, v, row)
            
            ##Select the best gain, and associated attributes
            if new_gain > best_gain:
                best_gain = new_gain
                best_attr = v
                best_split = row
                best_left = left
                best_right = right
    #return {"attribute": best_attr, "split": best_split, "left": best_left, "right": best_right}
    return best_attr, best_split, best_left, best_right

def leaf(node):
    """Return classification value for leaf node, 
    when either maximum depth of tree reached or node is suitably weighted to one class."""
    classes = node[:, -1].tolist()
    return max(set(node[:,-1]), key = classes.count)

def decision(tree, max_depth=10, min_size=0, depth=0):
    """Uses split and leaf functions to build a tree, using a root data set.
    Will assign leaf nodes if either maximum depth or minimum samples are reached.
    root node contains both current node data, as well as decision rules to that point.
    """
    left = tree["left"]
    right = tree["right"]
      
    ##If tree is at max depth, assign most common member.
    if depth >= max_depth:
        tree['left'] = leaf(left)
        tree['right'] = leaf(right)
    ##If continuing sampling
    else:
        
        ##Left side child
        ##If minimum samples exist in current node, make it a leaf with max occuring value in samples.
        if left[:, -1].size <= min_size:
            tree['left'] = leaf(left)
        ##Else continue building tree.
        else:
            left_attr, left_split, left_left, left_right = split(left)
            ##Check if node is terminal. Make it a leaf node if so.
            if left_left.size == 0 or left_right.size == 0:
                tree['left'] = leaf(np.vstack([left_left,left_right]))   
            ##Continue elsewise.
            else:
                tree['left'] = {"attribute": left_attr, "split": left_split, "left": left_left, "right": left_right, "current_mode": leaf(left)}
                decision(tree['left'], max_depth, min_size, depth+1)
                
        ##right side child. Same process as above.
        if right[:, -1].size <= min_size:
            tree['right'] = leaf(right)
        else:
            right_attr, right_split, right_left, right_right = split(right)
            if right_left.size == 0 or right_right.size == 0:
                tree['right'] = leaf(np.vstack([right_left,right_right]))
            else:
                tree['right'] = {"attribute": right_attr, "split": right_split, "left": right_left, "right": right_right, "current_mode": leaf(right)}
                decision(tree['right'], max_depth, min_size, depth+1)

def classify(tree,row):
    """classify new data based on current row.
    Involves searching through tree based on the attributes of validation data.
    Will return classification value once leaf of tree is reached."""
    ##Look at each sample to classify. append to list of output values.
    ##Recursively search through branches until an append can be made.
    if row[tree['attribute']] < tree['split']:
        if isinstance(tree['left'],dict):
            return classify(tree['left'], row)
        else:
            return tree['left']
    else:
        if isinstance(tree['right'],dict):
            return classify(tree['right'], row)
        else:
            return tree['right']

def dt_predict( tree, data):
    """For every row in the validation data,
    a call to the classify function is done,
    with results appended to prediction data."""
    predictions = []
    for row in data:
        pred = classify(tree, row)
        predictions.append(int(pred))
    return predictions

##functions for validation and pruning.
def dt_confusion_matrix( predicted, actual,classes):
    """Return a confusion matrix showing the difference between actual values,
    and model predicted values. Also returns total accuracy"""
    
    matrix = np.zeros((len(classes), len(classes)))
    for a, p in zip(actual, predicted):
        matrix[a][p] += 1
    accuracy = (actual == predicted).sum() / float(len(actual))*100
    return matrix, accuracy        
    
def print_dt(tree, depth = 0):
    """"Iterate through decision tree, printing out values."""
    print ((" " * depth) + "attribute " + str(tree['attribute']) + " > " + str(tree['split']))
    if isinstance(tree['left'], dict):
        print_dt(tree['left'], depth + 1)
    else:
        print ((" " *(depth + 1)) + str(tree['left']))
    if isinstance(tree['right'], dict):
        print_dt(tree['right'], depth + 1)
    else:
        print ((" " *(depth + 1)) + str(tree['right']))
        

"""Bagged decision trees contain a user-specified number of decision trees.
Classification of a sample is done by using the mode of each of these decision trees.
subsample is a fraction of the total dataset to be used.
trees refers to the number of trees to use in "forest" of trees.
By leaving default values for subsample and trees, a single decision tree classifier is created."""
def bt_train( data, max_depth, min_size = 1, subsample_ratio = 1,trees =1):
    
    ##Create a series of trees using sampling with replacement.
    size = data[:, -1].size
    division = int(size * subsample_ratio)        
    forest = []
    for i in range (0,trees):
        samples = data[np.random.choice(data.shape[0], division, replace = True)]
        forest.append([])
        forest[i] = dt_train(samples, max_depth, min_size)
    return forest

def bt_predict( forest, data):
    """"Classify validation data set based on built bagged trees.
    This is done by taking the mode of the classifications of each decision tree."""
    ##Use predict function from decision tree.
    ##Number of trees in forest, number of validation samples. Used to create empty array showing classifications.
    forest_size = len(forest)
    samples = len(data)
    tree_classification = np.zeros((samples, forest_size))
    ##With each tree, find the classification of each validation sample.
    for i in range (0, forest_size):
        tree_classification[:, i] = dt_predict(forest[i], data)
    ##Create list of modes for each sample, using tree_classification matrix.
    predictions = []
    for i in range(0, samples):
        tree_pred = tree_classification[i,:].tolist()
        predictions.append(int(max(set(tree_pred), key = tree_pred.count)))
    return predictions

def bt_confusion_matrix( predicted, actual,classes):
    """Create confusion matrix for bagged trees. Makes call to DT method."""
    matrix, accuracy = dt_confusion_matrix(predicted, actual, classes)
    return matrix, accuracy

In [35]:
classes = [0, 1]
#Xtrain = Xtrain.toarray()
train = np.column_stack((Xtrain.toarray() ,Ytrain))

test = np.column_stack((Xval.toarray() ,Yval))
print("start train")
tree = dt_train(train, 20,5)
print("finish train")
validation_dt = dt_predict(tree, test)
print(validation_dt)
confusion_dt,accuracy_dt = dt_confusion_matrix(validation_dt, Yval, classes)
print (accuracy_dt)
#print (print_dt(tree))


start train
finish train
[0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1,

In [None]:
print("start train")
forest = bt_train(train, 20, 5, 1, 300)
print("finish train")
validation_rf = bt_predict(forest, test)
confusion_rf, accuracy_rf = bt_confusion_matrix(validation_rf, test[:, -1].astype(int),classes)
print (accuracy_rf)

# INVENTAR +++

In [48]:
import scipy.stats as st

def entropy(Y) :
    entropia = st.entropy(Y)
    return entropia


In [50]:
def infoGain(X, Y, split_atribute) :
    totalEntropy = entropy(Y)
    valores, count = np.unique(X[:,split_atribute], return_counts = True)
    entropyPond = 0
    for i in range(len(valores)) : 
        
        myatt = entropy(np.where(X[:,split_atribute]==valores[i])[0])
        entropyPond += np.sum([(count[i]/np.sum(count)*myatt)])
      
    informationGain =  totalEntropy - entropyPond
    
    return informationGain

In [51]:
def ID3(X, originalX, Y, originalY, features, parentNodeClasse = None) :
    if (len(np.unique(Y)) <= 1) :
        return np.unique(Y)[0]
    if (len(X)==0) :
        return np.unique(originalY)[np.argmax(np.unique(originalY, return_counts = True))]
    if (len(features)==0) :
        return parentNodeClasse
    else :
        parentNodeClasse = np.unique(Y)[np.argmax(np.unique(Y,return_counts = True)[1])]
        bestGain = -330
        for f in features :
            gain_new = infoGain(X,Y, f)
            if(gain_new > bestGain) : 
                bestGain = gain_new
                bestFeature = f
                
        tree = {bestFeature:{}}
        
        #features.delete(bestFeature)
        features = [i for i in features if i != bestFeature]
        for value in np.unique(X[:,bestFeature]) :
            value = value
        
            subX_indes = np.where(X[:,bestFeature] == value)[0]
            subX = X[subX_indes]
            subY = Y[np.where(X[:,bestFeature] == value)]
            subTree = ID3(subX, X, subY, Y, features, parentNodeClasse)
            
            tree[bestFeature][value] = subTree
            
        return(tree) 
        

    

In [52]:
def predict(row, tree,default=1) :
    features = np.arange(Xtrain.shape[1])
    for f in features :
        if f in list(tree.keys()) :
            try :
                classe = tree[f][row[f]]
            except:
                print("manooo")
                return default    
            
            classe = tree[f][row[f]]
            if isinstance(classe,dict):
                return predict(row,classe)
            else:
                return classe
        

In [53]:
def test(test, tree) :
    
    for row in range(test.shape[0]) :
        t = test[row]
        result = predict(t, tree)
    return result    

In [54]:
print(Xtrain.shape[1])
from pprint import pprint
features_index = np.arange(Xtrain.shape[1])
#Xtrain = Xtrain.toarray()
tree = ID3(Xtrain.toarray(), Xtrain.toarray() ,Ytrain, Ytrain, features_index)

#pprint(tree)
result = test(Xval.toarray(), tree)
acuracia = np.sum(result==Yval)/len(Yval)
print(acuracia)

254
manooo
manooo
manooo
manooo
manooo
manooo
manooo
manooo
manooo
manooo
manooo
manooo
manooo
manooo
manooo
manooo
manooo
manooo
manooo
manooo
manooo
manooo
manooo
manooo
manooo
manooo
manooo
manooo
manooo
manooo
manooo
manooo
manooo
manooo
manooo
manooo
manooo
manooo
manooo
manooo
manooo
0.5125
