In [1]:
import collections
import nltk.classify.util, nltk.metrics
from nltk.classify import NaiveBayesClassifier, MaxentClassifier, SklearnClassifier, DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import cross_validation
from sklearn.svm import LinearSVC, SVC
import random
from nltk.corpus import stopwords
import itertools
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
from nltk import precision
from nltk.metrics import *
from nltk.metrics import spearman
from nltk.metrics import paice
from nltk.metrics import scores
import csv
 



In [2]:
"""OpLexicon"""

'OpLexicon'

In [2]:
# Texts used for the 'pos' class: the first tab-separated field of every row.
# newline='' lets the csv module do its own line-ending handling, as the csv
# documentation recommends for files passed to csv.reader.
with open('/home/bruno/Documents/artigo/R/Sentiment/positive_op.csv', 'rt', newline='') as myfile:
    reader = csv.reader(myfile, delimiter='\t')
    # A blank line is parsed as an empty row; skip it instead of failing on row[0].
    posdata = [row[0] for row in reader if row]
 
 

In [3]:
# Texts used for the 'neu' class: the first tab-separated field of every row.
# newline='' lets the csv module do its own line-ending handling, as the csv
# documentation recommends for files passed to csv.reader.
with open('/home/bruno/Documents/artigo/R/Sentiment/neutral_op.csv', 'rt', newline='') as myfile:
    reader = csv.reader(myfile, delimiter='\t')
    # A blank line is parsed as an empty row; skip it instead of failing on row[0].
    neudata = [row[0] for row in reader if row]

In [4]:
# Texts used for the 'neg' class: the first tab-separated field of every row.
# newline='' lets the csv module do its own line-ending handling, as the csv
# documentation recommends for files passed to csv.reader.
with open('/home/bruno/Documents/artigo/R/Sentiment/negative_op.csv', 'rt', newline='') as myfile:
    reader = csv.reader(myfile, delimiter='\t')
    # A blank line is parsed as an empty row; skip it instead of failing on row[0].
    negdata = [row[0] for row in reader if row]

In [5]:
def word_split(data):
    """Tokenize every text in ``data`` on whitespace and lowercase the tokens.

    Args:
        data: iterable of strings.

    Returns:
        A list with one list of lowercase tokens per input string, in the
        same order as ``data``.
    """
    return [[token.lower() for token in text.split()] for text in data]

In [6]:
def word_split_sentiment(data):
    """Tokenize labelled texts, keeping each label next to its tokens.

    Args:
        data: iterable of ``(text, sentiment)`` pairs.

    Returns:
        A list of ``(tokens, sentiment)`` pairs where ``tokens`` is the
        whitespace-split, lowercased text; the sentiment is passed through
        unchanged.
    """
    return [
        ([token.lower() for token in text.split()], sentiment)
        for text, sentiment in data
    ]

In [7]:
def word_feats(words):
    """Build a bag-of-words feature dict: every word in ``words`` maps to True.

    Repeated words collapse into a single key.
    """
    return {word: True for word in words}

In [8]:

# Portuguese stopwords from the NLTK corpus, held in a set because the feature
# extractors below test membership against it for every word.
stopset = set()
stopset.update(stopwords.words('portuguese'))

In [9]:
def stopword_filtered_word_feats(words):
    """Bag-of-words features that leave out anything found in ``stopset``.

    Returns a dict mapping every remaining word to True.
    """
    feats = {}
    for word in words:
        if word in stopset:
            continue
        feats[word] = True
    return feats

In [10]:
def bigram_word_feats(words, score_fn=BigramAssocMeasures.chi_sq, n=200):
    """Feature dict holding every word plus the best-scoring bigrams of ``words``.

    Args:
        words: token sequence for one document. It is traversed twice (once by
            the collocation finder, once when building the dict).
        score_fn: association measure handed to ``BigramCollocationFinder.nbest``.
        n: how many bigrams to request from ``nbest``.

    Returns:
        A dict mapping each word and each selected bigram to True.
    """
    finder = BigramCollocationFinder.from_words(words)
    top_bigrams = finder.nbest(score_fn, n)

    feats = {}
    for ngram in itertools.chain(words, top_bigrams):
        feats[ngram] = True
    return feats

In [11]:
def bigram_word_feats_stopwords(words, score_fn=BigramAssocMeasures.chi_sq, n=200):
    """Like a words-plus-bigrams feature dict, minus entries found in ``stopset``.

    Args:
        words: token sequence for one document. It is traversed twice (once by
            the collocation finder, once when building the dict).
        score_fn: association measure handed to ``BigramCollocationFinder.nbest``.
        n: how many bigrams to request from ``nbest``.

    Returns:
        A dict mapping each kept word and bigram to True.
    """
    finder = BigramCollocationFinder.from_words(words)
    top_bigrams = finder.nbest(score_fn, n)

    feats = {}
    for ngram in itertools.chain(words, top_bigrams):
        # Each bigram is tested against stopset as a whole item, so (assuming
        # stopset holds plain words) only single words are ever filtered here.
        if ngram in stopset:
            continue
        feats[ngram] = True
    return feats

In [None]:
def _train_classifier(cl, train_set):
    """Train the classifier selected by ``cl`` and return ``(display_name, classifier)``.

    ``cl`` may be 'maxent', 'svm' or 'decision_tree'; any other value
    (including 'nb') trains a Naive Bayes classifier.
    """
    if cl == 'maxent':
        algorithm = nltk.classify.MaxentClassifier.ALGORITHMS[0]
        classifier = nltk.MaxentClassifier.train(train_set, algorithm, max_iter=3)
        return 'Maximum Entropy', classifier
    if cl == 'svm':
        classifier = SklearnClassifier(LinearSVC(), sparse=False)
        classifier.train(train_set)
        return 'SVM', classifier
    if cl == 'decision_tree':
        classifier = DecisionTreeClassifier.train(
            train_set, binary=True, depth_cutoff=20, support_cutoff=20, entropy_cutoff=0.01)
        return 'Decision Tree', classifier
    return 'Naive Bayes', NaiveBayesClassifier.train(train_set)


def _score_classifier(classifier, test_set, labels):
    """Return ``(accuracy, per_label)`` for ``classifier`` on ``test_set``.

    ``per_label`` maps each label to a ``(precision, recall, f_measure)`` tuple.
    """
    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)
    for idx, (feats, label) in enumerate(test_set):
        refsets[label].add(idx)
        testsets[classifier.classify(feats)].add(idx)

    accuracy = nltk.classify.util.accuracy(classifier, test_set)

    per_label = {}
    for label in labels:
        reference, predicted = refsets[label], testsets[label]
        raw = (
            scores.precision(reference, predicted),
            scores.recall(reference, predicted),
            scores.f_measure(reference, predicted),
        )
        # The NLTK scorers return None when a label was never predicted / never
        # present; count that as 0.0 so the averages below do not raise TypeError.
        per_label[label] = tuple(0.0 if value is None else value for value in raw)
    return accuracy, per_label


def _macro_average(values):
    """Unweighted mean of ``values`` (one value per label)."""
    return sum(values) / len(values)


def evaluate_classifier(featx, n_folds=10, seed=None):
    """Train and evaluate four classifiers on the pos/neg/neu data and print the results.

    Two evaluations are printed for each of Naive Bayes, SVM (LinearSVC),
    Maximum Entropy and Decision Tree:

    1. A single hold-out split: the first 75% of each class (in file order,
       unshuffled) is used for training and the remaining 25% for testing.
    2. ``n_folds``-fold cross-validation over the shuffled union of all classes.
       Folds have ``len(data) // n_folds`` examples each, so up to
       ``n_folds - 1`` trailing examples are never used for testing.

    Precision, recall and f-measure are macro-averaged over the three labels.

    Args:
        featx: feature extractor mapping a list of tokens to a feature dict.
        n_folds: number of cross-validation folds (default 10).
        seed: optional seed for the cross-validation shuffle. With the default
            ``None`` the module-level ``random`` generator is used, so the fold
            assignment differs from run to run.

    Raises:
        ValueError: if ``n_folds`` is smaller than 1.
    """
    if n_folds < 1:
        raise ValueError('n_folds must be at least 1')

    labels = ('pos', 'neg', 'neu')
    metric_names = ('precision', 'recall', 'f-measure')
    # Evaluated in this order: Naive Bayes, SVM, Maximum Entropy, Decision Tree.
    classifier_list = ['nb', 'svm', 'maxent', 'decision_tree']

    negfeats = [(featx(f), 'neg') for f in word_split(negdata)]
    posfeats = [(featx(f), 'pos') for f in word_split(posdata)]
    neufeats = [(featx(f), 'neu') for f in word_split(neudata)]

    ## SINGLE HOLD-OUT SPLIT (75% train / 25% test per class)

    negcutoff = int(len(negfeats) * 3 / 4)
    poscutoff = int(len(posfeats) * 3 / 4)
    neucutoff = int(len(neufeats) * 3 / 4)

    trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff] + neufeats[:neucutoff]
    testfeats = negfeats[negcutoff:] + posfeats[poscutoff:] + neufeats[neucutoff:]

    for cl in classifier_list:
        classifier_name, classifier = _train_classifier(cl, trainfeats)
        accuracy, per_label = _score_classifier(classifier, testfeats, labels)

        print('')
        print('---------------------------------------')
        print('SINGLE FOLD RESULT ' + '(' + classifier_name + ')')
        print('---------------------------------------')
        print('accuracy:', accuracy)
        for position, metric in enumerate(metric_names):
            print(metric, _macro_average([per_label[label][position] for label in labels]))

    print('')

    ## CROSS VALIDATION

    all_feats = negfeats + posfeats + neufeats

    # Shuffle first: the data is grouped by class, so an unshuffled fold could
    # contain examples of a single class only.
    if seed is None:
        random.shuffle(all_feats)
    else:
        random.Random(seed).shuffle(all_feats)

    subset_size = int(len(all_feats) / n_folds)

    for cl in classifier_list:
        accuracies = []
        # Per label: one list of per-fold values for each of the three metrics.
        fold_scores = {label: ([], [], []) for label in labels}

        for fold in range(n_folds):
            start = fold * subset_size
            stop = start + subset_size
            testing_this_round = all_feats[start:stop]
            training_this_round = all_feats[:start] + all_feats[stop:]

            classifier_name, classifier = _train_classifier(cl, training_this_round)
            cv_accuracy, per_label = _score_classifier(classifier, testing_this_round, labels)

            accuracies.append(cv_accuracy)
            for label in labels:
                for position, value in enumerate(per_label[label]):
                    fold_scores[label][position].append(value)

        print('---------------------------------------')
        print('N-FOLD CROSS VALIDATION RESULT ' + '(' + classifier_name + ')')
        print('---------------------------------------')
        print('accuracy:', sum(accuracies) / n_folds)
        # Each metric is averaged over the folds per label, then over the labels.
        # Recall is built from the recall lists (it previously mixed in the
        # neutral-class precision).
        for position, metric in enumerate(metric_names):
            print(metric, _macro_average(
                [sum(fold_scores[label][position]) / n_folds for label in labels]))
        print('')
    
        
evaluate_classifier(word_feats)


---------------------------------------
SINGLE FOLD RESULT (Naive Bayes)
---------------------------------------
accuracy: 0.9310375432679506
precision 0.9289425407804587
recall 0.9208777864563791
f-measure 0.9240368281723924

---------------------------------------
SINGLE FOLD RESULT (SVM)
---------------------------------------
accuracy: 0.9928108635839177
precision 0.9928382723669756
recall 0.9916277160626336
f-measure 0.9922131115742525
  ==> Training (3 iterations)

      Iteration    Log Likelihood    Accuracy
      ---------------------------------------
             1          -1.09861        0.251
             2          -1.09825        0.678
         Final          -1.09788        0.678

---------------------------------------
SINGLE FOLD RESULT (Maximum Entropy)
---------------------------------------
accuracy: 0.6757788231117423
precision 0.8376024255831954
recall 0.5843311285349632
f-measure 0.605458196076152

---------------------------------------
SINGLE FOLD RESULT (De

In [15]:
def evaluate_classifier(featx):
    """Debug helper: print the held-out 25% of each class after feature extraction.

    NOTE: this has the same name as the full evaluation function defined in an
    earlier cell, so running this cell replaces that definition.

    Args:
        featx: feature extractor mapping a list of tokens to a feature dict.
    """
    neg_examples = [(featx(tokens), 'neg') for tokens in word_split(negdata)]
    pos_examples = [(featx(tokens), 'pos') for tokens in word_split(posdata)]
    neu_examples = [(featx(tokens), 'neu') for tokens in word_split(neudata)]

    def split_three_quarters(examples):
        # First 75% (rounded down) for training, the remainder for testing.
        cutoff = int(len(examples) * 3 / 4)
        return examples[:cutoff], examples[cutoff:]

    neg_train, neg_test = split_three_quarters(neg_examples)
    pos_train, pos_test = split_three_quarters(pos_examples)
    neu_train, neu_test = split_three_quarters(neu_examples)

    # The training part is assembled but not used by this debug version.
    trainfeats = neg_train + pos_train + neu_train
    testfeats = neg_test + pos_test + neu_test

    print(testfeats)
evaluate_classifier(word_feats)

[({'dilma': True, 'vai': True, 'ganhar': True, 'novo': True, 'povo': True, 'burro': True}, 'neg'), ({'contra': True, 'dilma': True, 'pobre': True}, 'neg'), ({'brasil': True, 'dilma': True, 'ja': True, 'ferrou': True, 'literalmente': True, 'fato': True, 'agora': True, 'tira': True, 'la': True, 'poder': True, 'afastar': True, 'fantasma': True, 'comunista': True, 'solo': True, 'brasileiro': True}, 'neg'), ({'dilma': True, 'ganhar': True, 'brasil': True, 'perder': True}, 'neg'), ({'turno': True, 'vou': True, 'nulo': True, 'aecio': True, 'dilma': True, 'nunca': True, 'c': True}, 'neg'), ({'pqp': True, 'agr': True, 'ferrou': True, 'tudo': True, 'dilma': True, 'aecio': True, 'acho': True, 'q': True, 'vou': True, 'julguem': True, 'ai': True, 'droga': True, 'mentira': True, 'nao': True, 'sei': True, 'votar': True}, 'neg'), ({'sinceramente': True, 'odeio': True, 'todos': True, 'vcs': True, 'q': True, 'votaram': True, 'dilma': True, 'sabe': True, 'pq': True, 'reclamam': True, 'jeito': True, 'ta':