# NLTK with Naive Bayes

In [1]:
# open python and nltk packages needed for processing
import os
import sys
import random
import nltk
from nltk.corpus import stopwords
from nltk.corpus import sentence_polarity
import re
from sklearn.feature_extraction.text import CountVectorizer 
from nltk import FreqDist
from nltk.collocations import *
from nltk.metrics import ConfusionMatrix
from nltk.corpus import stopwords
from cleantext import clean
from LIWCtools.LIWCtools import *
#from liwc import Liwc
from cleantext import clean
import cleantext
import sentiment_read_LIWC_pos_neg_words

bigram_measures = nltk.collocations.BigramAssocMeasures()

#### - Featureset funct

In [2]:
##HERE IS THE CODE TO ADD OR CHANGE FEATURES
##THIS PARTICULAR CODE CREATES DE BASELINE AS BAG OF WORDS (BOW) OR UNIGRAMS
# each feature is 'V_(keyword)' and is true or false depending
# on whether that keyword is in the document
def document_features(document, word_features):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['V_{}'.format(word)] = (word in document_words)
    return features

In [3]:
stopwords = nltk.corpus.stopwords.words('english')
stopwords.extend([',', '.', '-', 'movie', 'film', '``', '`', "'", "...", '--'])

In [4]:
# negation
def Not_features(document, word_features, negationwords):
    features = {}
    for word in word_features:
        features['V_{}'.format(word)] = False
        features['V_NOT{}'.format(word)] = False
    # go through document words in order
    for i in range(0, len(document)):
        word = document[i]
        if ((i + 1) < len(document)) and ((word in negationwords) or (word.endswith("n't"))):
            i += 1
            features['V_NOT{}'.format(document[i])] = (document[i] in word_features)
        else:
            features['V_{}'.format(word)] = (word in word_features)
    return features

# this list of negation words includes some "approximate negators" like hardly and rarely
negationwords = ['no', 'not', 'never', 'none', 'nowhere', 'nothing', 'noone', 'rather', 'hardly', 'scarcely', 
                 'rarely', 'seldom', 'neither', 'nor']

In [5]:
# define features that include words as before 
# add the most frequent significant bigrams
# this function takes the list of words in a document as an argument and returns a feature dictionary
# it depends on the variables word_features and bigram_features
def bigram_document_features(document, word_features, bigram_features):
    document_words = set(document)
    document_bigrams = nltk.bigrams(document)
    features = {}
    for word in word_features:
        features['V_{}'.format(word)] = (word in document_words)
    for bigram in bigram_features:
        features['B_{}_{}'.format(bigram[0], bigram[1])] = (bigram in document_bigrams)    
    return features


In [6]:
# bigram_featuresets

# this function takes the number of folds, the feature sets
# it iterates over the folds, using different sections for training and testing in turn
#   it prints the accuracy for each fold and the average accuracy at the end
def cross_validation_accuracy(num_folds, bigram_featuresets):
    subset_size = int(len(bigram_featuresets)/num_folds)
    print('Each fold size:', subset_size)
    accuracy_list = []
    # iterate over the folds
    for i in range(num_folds):
        test_this_round = bigram_featuresets[(i*subset_size):][:subset_size]
        train_this_round = bigram_featuresets[:(i*subset_size)] + bigram_featuresets[((i+1)*subset_size):]
        # train using train_this_round
        classifier = nltk.NaiveBayesClassifier.train(train_this_round)
        # evaluate against test_this_round and save accuracy
        accuracy_this_round = nltk.classify.accuracy(classifier, test_this_round)
        print (i, accuracy_this_round)
        accuracy_list.append(accuracy_this_round)
    # find mean accuracy over all rounds
    print ('mean accuracy', sum(accuracy_list) / num_folds)

In [7]:
def readSubjectivity(path): # pass to where the file is
    flexicon = open(path, 'r')
    # initialize an empty dictionary
    sldict = { } # starts a dictionary 
    for line in flexicon: # for each line
        fields = line.split()   # default is to split on whitespace
        # split each field on the '=' and keep the second part as the value
        strength = fields[0].split("=")[1]
        word = fields[2].split("=")[1]
        posTag = fields[3].split("=")[1]
        stemmed = fields[4].split("=")[1]
        polarity = fields[5].split("=")[1]
        if (stemmed == 'y'):
            isStemmed = True
        else:
            isStemmed = False
        # put a dictionary entry with the word as the keyword
        #     and a list of the other values
        sldict[word] = [strength, posTag, isStemmed, polarity]
    return sldict

In [8]:
SLpath = "./SentimentLexicons/subjclueslen1-HLTEMNLP05.tff"
SL = readSubjectivity(SLpath)
len(SL.keys())

6885

In [9]:
# define features that include word counts of subjectivity words
# negative feature will have number of weakly negative words +
#    2 * number of strongly negative words
# positive feature has similar definition
#    not counting neutral words
def SL_features(document, word_features, SL): #add polarity feature as well, takes doc, features and lexicon
    document_words = set(document) # 2000 feature 
    features = {}
    for word in word_features:
        features['V_{}'.format(word)] = (word in document_words)
    # count variables for the 4 classes of subjectivity
    weakPos = 0
    strongPos = 0
    weakNeg = 0
    strongNeg = 0
    for word in document_words:
        if word in SL:
            strength, posTag, isStemmed, polarity = SL[word]
            if strength == 'weaksubj' and polarity == 'positive':
                weakPos += 1
            if strength == 'strongsubj' and polarity == 'positive':
                strongPos += 1
            if strength == 'weaksubj' and polarity == 'negative':
                weakNeg += 1
            if strength == 'strongsubj' and polarity == 'negative':
                strongNeg += 1
            features['positivecount'] = weakPos + (2 * strongPos) # added features; count the numb of pos and neg
            features['negativecount'] = weakNeg + (2 * strongNeg)      
    return features

In [10]:
poslist,neglist = sentiment_read_LIWC_pos_neg_words.read_words()
#print(poslist,neglist)
poslist = poslist
neglist = neglist
#print(poslist)
#print(neglist)

In [11]:
def liwc_features(document, word_features, poslist, neglist):  
    document_words = set(document)
    features = {}  
    for word in word_features:
        features['contains({})'.format(word)] = (word in document_words)
        pos = 0
        neg = 0
    for word in document_words:     
        if sentiment_read_LIWC_pos_neg_words.isPresent(word,poslist):   
            pos += 1
        if sentiment_read_LIWC_pos_neg_words.isPresent(word,neglist):       
            neg += 1     
            features['positivecount'] = pos
            features['negativecount'] = neg
        if 'positivecount' not in features: 
            features['positivecount']=0   
        if 'negativecount' not in features:
            features['negativecount']=0
    return features 

In [12]:
def SL_liwc_features(document, word_features, SL,poslist,neglist):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains({})'.format(word)] = (word in document_words)
        # count variables for the 4 classes of subjectivity
        weakPos = 0
        strongPos = 0
        weakNeg = 0
        strongNeg = 0
    for word in document_words:
        if sentiment_read_LIWC_pos_neg_words.isPresent(word,poslist):
            strongPos += 1
        elif sentiment_read_LIWC_pos_neg_words.isPresent(word,neglist):
            strongNeg += 1
        elif word in SL:
            strength, posTag, isStemmed, polarity = SL[word]
            if strength == 'weaksubj' and polarity == 'positive':
                weakPos += 1
            if strength == 'strongsubj' and polarity == 'positive':
                strongPos += 1
            if strength == 'weaksubj' and polarity == 'negative':
                weakNeg += 1
            if strength == 'strongsubj' and polarity == 'negative':
                strongNeg += 1
    features['positivecount'] = weakPos + (2 * strongPos)
    features['negativecount'] = weakNeg + (2 * strongNeg)
    if 'positivecount' not in features:
        features['positivecount']=0
    if 'negativecount' not in features:
        features['negativecount']=0      
    return features

In [13]:
# All features combined
def combined_document_features(document, word_features, SL, bigram_features, negationwords):
    document_words = set(document)
    document_bigrams = nltk.bigrams(document)
    features = {}
    # Subjectivity
    weakPos = 0
    strongPos = 0
    weakNeg = 0
    strongNeg = 0
    for word in document_words:
        if word in SL:
            strength, posTag, isStemmed, polarity = SL[word]
            if strength == 'weaksubj' and polarity == 'positive':
                weakPos += 1
            if strength == 'strongsubj' and polarity =='positive':
                strongPos += 1
            if strength == 'weaksubj' and polarity == 'negative':
                weakNeg += 1
            if strength == 'strongsubj' and polarity == 'negative':
                strongNeg += 1
            features['positiveStrengthCount'] = (2 * weakPos) + (5 * strongPos)
            features['negativeStrengthCount'] = (2 * weakNeg) + (5 * strongNeg)
        for word in word_features:
            features['V_{}'.format(word)] = False
            features['V_NOT{}'.format(word)] = False
        for word in word_features:
            for i in range(0, len(document)):
                word = document[i]
                if ((i + 1) < len(document)) and ((word in negationwords) or (word.endswith("n't"))):
                    i += 1
                    features['V_NOT{}'.format(document[i])] = (document[i] in word_features)
                else:
                    features['V_{}'.format(word)] = (word in word_features)
        for bigram in bigram_features:
            features['B_{}_{}'.format(bigram[0], bigram[1])] = (bigram in document_bigrams)
            return features

In [14]:
def generate_SLneg_features(document, SL, negationwords):
    document_words = set(document)
    features = {}
    features['length'] = len(document_words)
    weakPos = 0
    strongPos = 0
    weakNeg = 0
    strongNeg = 0
    negationWords = 0
    psc = 0
    nsc = 0
    for word in document_words:
        if word in negationwords:
            negationWords +=1
        features['negationwords'] = negationWords
        if word in SL:
            strength, posTag, isStemmed, polarity = SL[word]
            if strength == 'weaksubj' and polarity == 'positive':
                weakPos += 1
            if strength == 'strongsubj' and polarity == 'positive':
                strongPos += 1
            if strength == 'weaksubj' and polarity == 'negative': 
                weakNeg += 1
            if strength == 'strongsubj' and polarity == 'negative':
                strongNeg += 1
            psc = (weakPos) + (strongPos)   
            nsc = (weakNeg) + (strongNeg) 
            features['positiveStrengthCount'] = (2 * weakPos) + (5 * strongPos)      
            features['negativeStrengthCount'] = (2 * weakNeg) + (5 * strongNeg)
        length = len(document_words)
        if length > 10:
            features['percpositive'] = round(psc/length*100,2)
            features['percnegative'] = round(nsc/length*100,2)
        print(features)
        return features
            

In [15]:
def not_features(document, word_features, bigram_features, negationwords):
    document_words = set(document)
    document_bigrams = nltk.bigrams(document)
    features = {}
    for word in word_features:
        features['V_{}'.format(word)] = (word in document_words)
    for bigram in bigram_features:
        features['B_{}_{}'.format(bigram[0], bigram[1])] = (bigram in document_bigrams)
        #go through document words in order
    for i in range(0, len(document)):
        word = document[i]
        if ((i + 1) < len(document)) and ((word in negationwords) or (word.endswith("n't"))):
            i += 1
            features['V_NOT{}'.format(document[i])] = (document[i] in word_features)
        else:
            features['V_{}'.format(word)] = (word in word_features)
    return features
        

In [16]:
## cross-validation ##
# this function takes the number of folds, the feature sets and the labels
# it iterates over the folds, using different sections for training and testing in turn
#   it prints the performance for each fold and the average performance at the end
def cross_validation_PRF(num_folds, featuresets, labels):
    subset_size = int(len(featuresets)/num_folds)
    print('Each fold size:', subset_size)
    # for the number of labels - start the totals lists with zeroes
    num_labels = len(labels)
    total_precision_list = [0] * num_labels
    total_recall_list = [0] * num_labels
    total_F1_list = [0] * num_labels

    # iterate over the folds
    for i in range(num_folds):
        test_this_round = featuresets[(i*subset_size):][:subset_size]
        train_this_round = featuresets[:(i*subset_size)] + featuresets[((i+1)*subset_size):]
        # train using train_this_round
        classifier = nltk.NaiveBayesClassifier.train(train_this_round)
        # evaluate against test_this_round to produce the gold and predicted labels
        goldlist = []
        predictedlist = []
        for (features, label) in test_this_round:
            goldlist.append(label)
            predictedlist.append(classifier.classify(features))


        # computes evaluation measures for this fold and
        #   returns list of measures for each label
        print('Fold', i)
        (precision_list, recall_list, F1_list) \
                  = eval_measures(goldlist, predictedlist, labels)
        # take off triple string to print precision, recall and F1 for each fold
        '''
        print('\tPrecision\tRecall\t\tF1')
        # print measures for each label
        for i, lab in enumerate(labels):
            print(lab, '\t', "{:10.3f}".format(precision_list[i]), \
              "{:10.3f}".format(recall_list[i]), "{:10.3f}".format(F1_list[i]))
        '''
        # for each label add to the sums in the total lists
        for i in range(num_labels):
            # for each label, add the 3 measures to the 3 lists of totals
            total_precision_list[i] += precision_list[i]
            total_recall_list[i] += recall_list[i]
            total_F1_list[i] += F1_list[i]

    # find precision, recall and F measure averaged over all rounds for all labels
    # compute averages from the totals lists
    precision_list = [tot/num_folds for tot in total_precision_list]
    recall_list = [tot/num_folds for tot in total_recall_list]
    F1_list = [tot/num_folds for tot in total_F1_list]
    # the evaluation measures in a table with one row per label
    print('\nAverage Precision\tRecall\t\tF1 \tPer Label')
    # print measures for each label
    for i, lab in enumerate(labels):
        print(lab, '\t', "{:10.3f}".format(precision_list[i]), \
          "{:10.3f}".format(recall_list[i]), "{:10.3f}".format(F1_list[i]))
    
    # print macro average over all labels - treats each label equally
    print('\nMacro Average Precision\tRecall\t\tF1 \tOver All Labels')
    print('\t', "{:10.3f}".format(sum(precision_list)/num_labels), \
          "{:10.3f}".format(sum(recall_list)/num_labels), \
          "{:10.3f}".format(sum(F1_list)/num_labels))

    # for micro averaging, weight the scores for each label by the number of items
    #    this is better for labels with imbalance
    # first intialize a dictionary for label counts and then count them
    label_counts = {}
    for lab in labels:
      label_counts[lab] = 0 
    # count the labels
    for (doc, lab) in featuresets:
      label_counts[lab] += 1
    # make weights compared to the number of documents in featuresets
    num_docs = len(featuresets)
    label_weights = [(label_counts[lab] / num_docs) for lab in labels]
    print('\nLabel Counts', label_counts)
    #print('Label weights', label_weights)
    # print macro average over all labels
    print('Micro Average Precision\tRecall\t\tF1 \tOver All Labels')
    precision = sum([a * b for a,b in zip(precision_list, label_weights)])
    recall = sum([a * b for a,b in zip(recall_list, label_weights)])
    F1 = sum([a * b for a,b in zip(F1_list, label_weights)])
    print( '\t', "{:10.3f}".format(precision), \
      "{:10.3f}".format(recall), "{:10.3f}".format(F1))

In [17]:
##THIS CODE GETS THE DATA REQUIRED FOR EVALUATING THE MODEL:
def eval_measures(gold, predicted, labels):    
    # these lists have values for each label 
    recall_list = []
    precision_list = []
    F1_list = []
    for lab in labels:
        # for each label, compare gold and predicted lists and compute values
        TP = FP = FN = TN = 0
        for i, val in enumerate(gold):
            if val == lab and predicted[i] == lab:  TP += 1
            if val == lab and predicted[i] != lab:  FN += 1
            if val != lab and predicted[i] == lab:  FP += 1
            if val != lab and predicted[i] != lab:  TN += 1
        # use these to compute recall, precision, F1
        # for small numbers, guard against dividing by zero in computing measures
        if (TP == 0) or (FP == 0) or (FN == 0):
          recall_list.append (0)
          precision_list.append (0)
          F1_list.append(0)
        else:
          recall = TP / (TP + FP)
          precision = TP / (TP + FN)
          recall_list.append(recall)
          precision_list.append(precision)
          F1_list.append( 2 * (recall * precision) / (recall + precision))

    # the evaluation measures in a table with one row per label
    return (precision_list, recall_list, F1_list)

__Started file: random sample__

In [18]:
## function to read kaggle training file, train and test a classifier 
def process(dirPath,limitStr):
    limit = int(limitStr)
    dirPath = '/Users/aivii/programsmm/FP736/corpus/'
    f = open('./corpus/train.csv','r', encoding='utf-8')
    phrasedata = []
    for line in f:
        if (not line.startswith(',label')): # ignore the first line 
            line = line.strip()
        #line = fp.readline()
            phrasedata.append(line.split(',')[1:3])
  
  # pick a random sample of length limit because of phrase overlapping sequences
    random.shuffle(phrasedata)
    phraselist = phrasedata[:limit]

    print('Read', len(phrasedata), 'phrases, using', len(phraselist), 'random phrases')
  
  # create list of phrase documents as (list of words, label)
    phrasedocs = []
  # add all the phrases

  # each phrase has a list of tokens and the sentiment label ( 0 or 1)
    for phrase in phraselist:
        tokens = nltk.word_tokenize(phrase[1])
        
        # phrasedocs.append((tokens, int(phrase[0])))
        sentiment = int(phrase[0])
        
        if (sentiment == 0):
            phrasedocs.append((tokens, 'negative'))
        if (sentiment == 1):
            phrasedocs.append((tokens, 'positive'))


  # possibly filter tokens
  # lowercase - each phrase is a pair consisting of a token list and a label
    docs = []
    for phrase in phrasedocs:
        lowerphrase = ([w.lower() for w in phrase[0]], phrase[1])
        docs.append (lowerphrase)
  # print a few
    for phrase in docs[:10]:
        print (phrase)

  # continue as usual to get all words and create word features
    all_words_list = [word for (sent,cat) in docs for word in sent]
    all_words = nltk.FreqDist(all_words_list)
    print(len(all_words))

  # get the 2000 most frequently appearing keywords in the corpus
    word_items = all_words.most_common(2000)
    word_features = [word for (word,count) in word_items]

  # feature sets from a feature definition function
    featuresets = [(document_features(d, word_features), c) for (d, c) in docs]

  # train classifier and show performance in cross-validation
  # make a list of labels
    label_list = [c for (d,c) in docs]
    labels = list(set(label_list))    # gets only unique labels
    num_folds = 5
    cross_validation_PRF(num_folds, featuresets, labels)
    
    train_set, test_set = featuresets[round(.1*int(limit)):], featuresets[:round(.1*int(limit))]
    classifier = nltk.NaiveBayesClassifier.train(train_set)
    print('Overall Accuracy', nltk.classify.accuracy(classifier, test_set))
    
    goldlist = [] # gold labels for the test set
    predictedlist = [] # test list, predicted 
    for (features, label) in test_set:
        goldlist.append(label)
        predictedlist.append(classifier.classify(features))
    cm = nltk.ConfusionMatrix(goldlist, predictedlist)
    print(cm.pretty_format(sort_by_count=True, truncate=9))
    
    print(cross_validation_accuracy(num_folds,featuresets))

In [19]:
dirPath0 = os.getcwd()
dirPath = dirPath0+'/corpus'
print(dirPath)
process(dirPath, 10000)

C:\Users\aivii\programsmm\FP736/corpus
Read 25000 phrases, using 10000 random phrases
(['``', 'i', 'am', 'not', 'sure', 'who', 'is', 'writing', 'these', 'glowing', 'reviews', 'for', 'this', 'movie', 'but', 'trust', 'me', 'it', 'stkinks', '.', 'i', 'have', 'seen', 'hundreds', 'of', 'horror', 'films', 'and', 'slasher', 'flicks', 'and', 'this', 'one', 'is', 'lame', 'it', 'is', 'only', 'about', '80min', 'long', 'and', 'believe', 'me', 'that', 'is', 'all', 'i', 'could', 'take', '.', 'plot', 'is', 'terrible'], 'negative')
(['``', 'this', 'is', 'a', 'so', 'called', "'feel-good", "'", 'movies'], 'positive')
(['``', 'as', 'a', 'gamer'], 'negative')
(['``', 'raising', 'victor', 'vargas', 'is', 'just', 'a', 'bad', 'film', '.', 'no', 'amount', 'of', 'denial', 'or', 'ad-dollar', 'supported', 'publicity', 'with', 'change', 'this', 'sad', 'fact', '.', 'maybe', 'peter', 'sollett', 'saw', 'he', 'did', "n't", 'have', 'the', 'money', 'to', 'do', 'the', 'movie', 'he', 'wanted', 'to', 'make', 'and', 'decid

__Starter file(wiith seed)__

In [20]:
## function to read kaggle training file, train and test a classifier 
def process(dirPath,limitStr):
    limit = int(limitStr)
    dirPath = '/Users/aivii/programsmm/FP736/corpus/'
    f = open('./corpus/train.csv','r', encoding='utf-8')
    phrasedata = []
    for line in f:
        if (not line.startswith(',label')): # ignore the first line 
            line = line.strip()
        #line = fp.readline()
            phrasedata.append(line.split(',')[1:3])
  
  # pick a random sample of length limit because of phrase overlapping sequences
    random.Random(1234).shuffle(phrasedata)
    phraselist = phrasedata[:limit]

    print('Read', len(phrasedata), 'phrases, using', len(phraselist), 'random phrases')
  
  # create list of phrase documents as (list of words, label)
    phrasedocs = []
  # add all the phrases

  # each phrase has a list of tokens and the sentiment label ( 0 or 1)
    for phrase in phraselist:
        tokens = nltk.word_tokenize(phrase[1])
        
        # phrasedocs.append((tokens, int(phrase[0])))
        sentiment = int(phrase[0])
        
        if (sentiment == 0):
            phrasedocs.append((tokens, 'negative'))
        if (sentiment == 1):
            phrasedocs.append((tokens, 'positive'))


  # possibly filter tokens
  # lowercase - each phrase is a pair consisting of a token list and a label
    docs = []
    for phrase in phrasedocs:
        lowerphrase = ([w.lower() for w in phrase[0]], phrase[1])
        docs.append (lowerphrase)
  # print a few
    for phrase in docs[:10]:
        print (phrase)

  # continue as usual to get all words and create word features
    all_words_list = [word for (sent,cat) in docs for word in sent]
    all_words = nltk.FreqDist(all_words_list)
    print(len(all_words))

  # get the 2000 most frequently appearing keywords in the corpus
    word_items = all_words.most_common(2000)
    word_features = [word for (word,count) in word_items]

  # feature sets from a feature definition function
    featuresets = [(document_features(d, word_features), c) for (d, c) in docs]

  # train classifier and show performance in cross-validation
  # make a list of labels
    label_list = [c for (d,c) in docs]
    labels = list(set(label_list))    # gets only unique labels
    num_folds = 5
    cross_validation_PRF(num_folds, featuresets, labels)
    
    train_set, test_set = featuresets[round(.1*int(limit)):], featuresets[:round(.1*int(limit))]
    classifier = nltk.NaiveBayesClassifier.train(train_set)
    print('Overall Accuracy', nltk.classify.accuracy(classifier, test_set))
    
    goldlist = [] # gold labels for the test set
    predictedlist = [] # test list, predicted 
    for (features, label) in test_set:
        goldlist.append(label)
        predictedlist.append(classifier.classify(features))
    cm = nltk.ConfusionMatrix(goldlist, predictedlist)
    print(cm.pretty_format(sort_by_count=True, truncate=9))
    
    print(cross_validation_accuracy(num_folds,featuresets))

In [21]:
process(dirPath, 10000)

Read 25000 phrases, using 10000 random phrases
(['``', 'hundstage', 'is', 'an', 'intentionally', 'ugly', 'and', 'unnerving', 'study', 'of', 'life', 'in', 'a', 'particularly', 'dreary', 'suburb', 'of', 'vienna', '.', 'it', 'comes', 'from', 'former', 'documentary', 'director', 'ulrich', 'seidl', 'who', 'adopts', 'a', 'very', 'documentary-like', 'approach', 'to', 'the', 'material', '.', 'however'], 'positive')
(['``', 'oh', 'dear', 'we', 'do', "n't", 'like', 'it', 'when', 'our', 'super-hero', 'love', 'interest', 'develops', 'a', 'brain', 'do', 'we', '?', 'something', 'has', 'happened', 'to', 'people'], 'positive')
(['``', 'been', 'lurking', 'for', 'a', 'couple', 'of', 'years', 'or', 'so', '.', 'i', 'have', 'never', 'been', 'moved', 'to', 'post', 'on', 'here', 'before'], 'negative')
(['``', 'any', 'story', 'comprises', 'a', 'premise'], 'positive')
(['``', 'in', 'my', 'humble', 'opinion'], 'negative')
(['``', 'there', 'are', 'movies', 'like', '``', "''", 'plan', '9', "''", "''", 'that', 'ar

__Stop words removed__

In [22]:
## function to read kaggle training file, train and test a classifier 
def process(dirPath,limitStr):
    limit = int(limitStr)
    dirPath = '/Users/aivii/programsmm/FP736/corpus/'
    f = open('./corpus/train.csv','r', encoding='utf-8')
    phrasedata = []
    for line in f:
        if (not line.startswith(',label')): # ignore the first line 
            line = line.strip()
        #line = fp.readline()
            phrasedata.append(line.split(',')[1:3])
  
  # pick a random sample of length limit because of phrase overlapping sequences
    random.Random(1234).shuffle(phrasedata)
    phraselist = phrasedata[:limit]

    print('Read', len(phrasedata), 'phrases, using', len(phraselist), 'random phrases')
  
  # create list of phrase documents as (list of words, label)
    phrasedocs = []
  # add all the phrases

  # each phrase has a list of tokens and the sentiment label ( 0 or 1)
    for phrase in phraselist:
        tokens = nltk.word_tokenize(phrase[1])
        
        # phrasedocs.append((tokens, int(phrase[0])))
        sentiment = int(phrase[0])
        
        if (sentiment == 0):
            phrasedocs.append((tokens, 'negative'))
        if (sentiment == 1):
            phrasedocs.append((tokens, 'positive'))


  # possibly filter tokens
  # lowercase - each phrase is a pair consisting of a token list and a label
    docs = []
    for phrase in phrasedocs:
        lowerphrase = ([w.lower() for w in phrase[0]], phrase[1])
        docs.append (lowerphrase)
  # print a few
    for phrase in docs[:10]:
        print (phrase)

  # continue as usual to get all words and create word features
    all_words_list = [word for (sent, cat) in docs for word in sent if word not in stopwords] # with stop words removed
    all_words = nltk.FreqDist(all_words_list)
    print(len(all_words))

  # get the 2000 most frequently appearing keywords in the corpus
    word_items = all_words.most_common(2000)
    word_features = [word for (word,count) in word_items]

  # feature sets from a feature definition function
    featuresets = [(document_features(d, word_features), c) for (d, c) in docs]

  # train classifier and show performance in cross-validation
  # make a list of labels
    label_list = [c for (d,c) in docs]
    labels = list(set(label_list))    # gets only unique labels
    num_folds = 5
    cross_validation_PRF(num_folds, featuresets, labels)
    
    train_set, test_set = featuresets[round(.1*int(limit)):], featuresets[:round(.1*int(limit))]
    classifier = nltk.NaiveBayesClassifier.train(train_set)
    print('Overall Accuracy', nltk.classify.accuracy(classifier, test_set))
    
    goldlist = [] # gold labels for the test set
    predictedlist = [] # test list, predicted 
    for (features, label) in test_set:
        goldlist.append(label)
        predictedlist.append(classifier.classify(features))
    cm = nltk.ConfusionMatrix(goldlist, predictedlist)
    print(cm.pretty_format(sort_by_count=True, truncate=9))
    
    print(cross_validation_accuracy(num_folds,featuresets))

In [23]:
process(dirPath, 10000)

Read 25000 phrases, using 10000 random phrases
(['``', 'hundstage', 'is', 'an', 'intentionally', 'ugly', 'and', 'unnerving', 'study', 'of', 'life', 'in', 'a', 'particularly', 'dreary', 'suburb', 'of', 'vienna', '.', 'it', 'comes', 'from', 'former', 'documentary', 'director', 'ulrich', 'seidl', 'who', 'adopts', 'a', 'very', 'documentary-like', 'approach', 'to', 'the', 'material', '.', 'however'], 'positive')
(['``', 'oh', 'dear', 'we', 'do', "n't", 'like', 'it', 'when', 'our', 'super-hero', 'love', 'interest', 'develops', 'a', 'brain', 'do', 'we', '?', 'something', 'has', 'happened', 'to', 'people'], 'positive')
(['``', 'been', 'lurking', 'for', 'a', 'couple', 'of', 'years', 'or', 'so', '.', 'i', 'have', 'never', 'been', 'moved', 'to', 'post', 'on', 'here', 'before'], 'negative')
(['``', 'any', 'story', 'comprises', 'a', 'premise'], 'positive')
(['``', 'in', 'my', 'humble', 'opinion'], 'negative')
(['``', 'there', 'are', 'movies', 'like', '``', "''", 'plan', '9', "''", "''", 'that', 'ar

__Negation features(with stop words)__

In [24]:
## function to read kaggle training file, train and test a classifier 
def process(dirPath,limitStr):
    limit = int(limitStr)
    dirPath = '/Users/aivii/programsmm/FP736/corpus/'
    f = open('./corpus/train.csv','r', encoding='utf-8')
    phrasedata = []
    for line in f:
        if (not line.startswith(',label')): # ignore the first line 
            line = line.strip()
        #line = fp.readline()
            phrasedata.append(line.split(',')[1:3])
  
  # pick a random sample of length limit because of phrase overlapping sequences
    random.Random(1234).shuffle(phrasedata)
    phraselist = phrasedata[:limit]

    print('Read', len(phrasedata), 'phrases, using', len(phraselist), 'random phrases')
  
  # create list of phrase documents as (list of words, label)
    phrasedocs = []
  # add all the phrases

  # each phrase has a list of tokens and the sentiment label ( 0 or 1)
    for phrase in phraselist:
        tokens = nltk.word_tokenize(phrase[1])
        
        # phrasedocs.append((tokens, int(phrase[0])))
        sentiment = int(phrase[0])
        
        if (sentiment == 0):
            phrasedocs.append((tokens, 'negative'))
        if (sentiment == 1):
            phrasedocs.append((tokens, 'positive'))


  # possibly filter tokens
  # lowercase - each phrase is a pair consisting of a token list and a label
    docs = []
    for phrase in phrasedocs:
        lowerphrase = ([w.lower() for w in phrase[0]], phrase[1])
        docs.append (lowerphrase)
  # print a few
    for phrase in docs[:10]:
        print (phrase)

  # continue as usual to get all words and create word features
    all_words_list = [word for (sent, cat) in docs for word in sent if word not in stopwords] # with stop words removed
    all_words = nltk.FreqDist(all_words_list)
    print(len(all_words))

  # get the 2000 most frequently appearing keywords in the corpus
    word_items = all_words.most_common(2000)
    word_features = [word for (word,count) in word_items]

   # this list of negation words includes some "approximate negators" like hardly and rarely
    #negationwords = ['no', 'not', 'never', 'none', 'nowhere', 'nothing', 'noone', 'rather', 'hardly', 'scarcely', 'rarely', 'seldom', 'neither', 'nor']
    

  # Not_feature sets from a feature definition function
    NOT_featuresets = [(Not_features(d, word_features, negationwords), c) for (d, c) in docs]
    print(len(NOT_featuresets))

  # train classifier and show performance in cross-validation
  # make a list of labels
    label_list = [c for (d,c) in docs]
    labels = list(set(label_list))    # gets only unique labels
    num_folds = 5
    cross_validation_PRF(num_folds, NOT_featuresets, labels)
        
    train_set, test_set = NOT_featuresets[round(.1*int(limit)):], NOT_featuresets[:round(.1*int(limit))]
    classifier = nltk.NaiveBayesClassifier.train(train_set)
    print('Overall Accuracy', nltk.classify.accuracy(classifier, test_set))
    
    goldlist = [] # gold labels for the test set
    predictedlist = [] # test list, predicted 
    for (features, label) in test_set:
        goldlist.append(label)
        predictedlist.append(classifier.classify(features))
    cm = nltk.ConfusionMatrix(goldlist, predictedlist)
    print(cm.pretty_format(sort_by_count=True, truncate=9))
    
    print(cross_validation_accuracy(num_folds,NOT_featuresets))

In [25]:
process(dirPath, 10000)

Read 25000 phrases, using 10000 random phrases
(['``', 'hundstage', 'is', 'an', 'intentionally', 'ugly', 'and', 'unnerving', 'study', 'of', 'life', 'in', 'a', 'particularly', 'dreary', 'suburb', 'of', 'vienna', '.', 'it', 'comes', 'from', 'former', 'documentary', 'director', 'ulrich', 'seidl', 'who', 'adopts', 'a', 'very', 'documentary-like', 'approach', 'to', 'the', 'material', '.', 'however'], 'positive')
(['``', 'oh', 'dear', 'we', 'do', "n't", 'like', 'it', 'when', 'our', 'super-hero', 'love', 'interest', 'develops', 'a', 'brain', 'do', 'we', '?', 'something', 'has', 'happened', 'to', 'people'], 'positive')
(['``', 'been', 'lurking', 'for', 'a', 'couple', 'of', 'years', 'or', 'so', '.', 'i', 'have', 'never', 'been', 'moved', 'to', 'post', 'on', 'here', 'before'], 'negative')
(['``', 'any', 'story', 'comprises', 'a', 'premise'], 'positive')
(['``', 'in', 'my', 'humble', 'opinion'], 'negative')
(['``', 'there', 'are', 'movies', 'like', '``', "''", 'plan', '9', "''", "''", 'that', 'ar

__Negation features(stop words removed)__

In [26]:
stopwords = nltk.corpus.stopwords.words('english')
#print(len(stopwords)) #179
#print(stopwords)

# remove some negation words 
negationwords.extend(['ain', 'aren', 'couldn', 'didn', 'doesn', 'hadn', 'hasn', 'haven', 'isn', 'ma', 'mightn', 'mustn', 'needn', 'shan', 'shouldn', 'wasn', 'weren', 'won', 'wouldn'])

newstopwords = [word for word in stopwords if word not in negationwords]
#print(len(newstopwords)) #157

In [27]:
## function to read kaggle training file, train and test a classifier 
def process(dirPath,limitStr):
    limit = int(limitStr)
    dirPath = '/Users/aivii/programsmm/FP736/corpus/'
    f = open('./corpus/train.csv','r', encoding='utf-8')
    phrasedata = []
    for line in f:
        if (not line.startswith(',label')): # ignore the first line 
            line = line.strip()
        #line = fp.readline()
            phrasedata.append(line.split(',')[1:3])
  
  # pick a random sample of length limit because of phrase overlapping sequences
    random.Random(1234).shuffle(phrasedata)
    phraselist = phrasedata[:limit]

    print('Read', len(phrasedata), 'phrases, using', len(phraselist), 'random phrases')
  
  # create list of phrase documents as (list of words, label)
    phrasedocs = []
  # add all the phrases

  # each phrase has a list of tokens and the sentiment label ( 0 or 1)
    for phrase in phraselist:
        tokens = nltk.word_tokenize(phrase[1])
        
        # phrasedocs.append((tokens, int(phrase[0])))
        sentiment = int(phrase[0])
        
        if (sentiment == 0):
            phrasedocs.append((tokens, 'negative'))
        if (sentiment == 1):
            phrasedocs.append((tokens, 'positive'))


  # possibly filter tokens
  # lowercase - each phrase is a pair consisting of a token list and a label
    docs = []
    for phrase in phrasedocs:
        lowerphrase = ([w.lower() for w in phrase[0]], phrase[1])
        docs.append (lowerphrase)
  # print a few
    for phrase in docs[:10]:
        print (phrase)

  # continue as usual to get all words and create word features
    # remove stop words from the all words list
    new_all_words_list = [word for (sent,cat) in docs for word in sent if word not in newstopwords]
    
  # continue to define a new all words dictionary, get the 2000 most common as new_word_features
    new_all_words = nltk.FreqDist(new_all_words_list)
    new_word_items = new_all_words.most_common(2000)

 
    new_word_features = [word for (word,count) in new_word_items]
    
    
    # this list of negation words includes some "approximate negators" like hardly and rarely
    #negationwords = ['no', 'not', 'never', 'none', 'nowhere', 'nothing', 'noone', 'rather', 'hardly', 'scarcely', 'rarely', 'seldom', 'neither', 'nor']
    

  # Not_feature sets from a feature definition function
    NOT_featuresets = [(Not_features(d, new_word_features, negationwords), c) for (d, c) in docs]
    print(len(NOT_featuresets)) # 100

  # train classifier and show performance in cross-validation
  # make a list of labels
    label_list = [c for (d,c) in docs]
    labels = list(set(label_list))    # gets only unique labels
    num_folds = 5
    cross_validation_PRF(num_folds, NOT_featuresets, labels)
        
    train_set, test_set = NOT_featuresets[round(.1*int(limit)):], NOT_featuresets[:round(.1*int(limit))]
    classifier = nltk.NaiveBayesClassifier.train(train_set)
    print('Overall Accuracy', nltk.classify.accuracy(classifier, test_set))
    
    goldlist = [] # gold labels for the test set
    predictedlist = [] # test list, predicted 
    for (features, label) in test_set:
        goldlist.append(label)
        predictedlist.append(classifier.classify(features))
    cm = nltk.ConfusionMatrix(goldlist, predictedlist)
    print(cm.pretty_format(sort_by_count=True, truncate=9))
    
    print(cross_validation_accuracy(num_folds, NOT_featuresets))

In [28]:
process(dirPath, 10000)

Read 25000 phrases, using 10000 random phrases
(['``', 'hundstage', 'is', 'an', 'intentionally', 'ugly', 'and', 'unnerving', 'study', 'of', 'life', 'in', 'a', 'particularly', 'dreary', 'suburb', 'of', 'vienna', '.', 'it', 'comes', 'from', 'former', 'documentary', 'director', 'ulrich', 'seidl', 'who', 'adopts', 'a', 'very', 'documentary-like', 'approach', 'to', 'the', 'material', '.', 'however'], 'positive')
(['``', 'oh', 'dear', 'we', 'do', "n't", 'like', 'it', 'when', 'our', 'super-hero', 'love', 'interest', 'develops', 'a', 'brain', 'do', 'we', '?', 'something', 'has', 'happened', 'to', 'people'], 'positive')
(['``', 'been', 'lurking', 'for', 'a', 'couple', 'of', 'years', 'or', 'so', '.', 'i', 'have', 'never', 'been', 'moved', 'to', 'post', 'on', 'here', 'before'], 'negative')
(['``', 'any', 'story', 'comprises', 'a', 'premise'], 'positive')
(['``', 'in', 'my', 'humble', 'opinion'], 'negative')
(['``', 'there', 'are', 'movies', 'like', '``', "''", 'plan', '9', "''", "''", 'that', 'ar

__Bigrams pmi__

In [29]:
## function to read kaggle training file, train and test a classifier 
def process(dirPath,limitStr):
    limit = int(limitStr)
    dirPath = '/Users/aivii/programsmm/FP736/corpus/'
    f = open('./corpus/train.csv','r', encoding='utf-8')
    phrasedata = []
    for line in f:
        if (not line.startswith(',label')): # ignore the first line 
            line = line.strip()
        #line = fp.readline()
            phrasedata.append(line.split(',')[1:3])
  
  # pick a random sample of length limit because of phrase overlapping sequences
    random.Random(1234).shuffle(phrasedata)
    phraselist = phrasedata[:limit]

    print('Read', len(phrasedata), 'phrases, using', len(phraselist), 'random phrases')
  
  # create list of phrase documents as (list of words, label)
    phrasedocs = []
  # add all the phrases

  # each phrase has a list of tokens and the sentiment label ( 0 or 1)
    for phrase in phraselist:
        tokens = nltk.word_tokenize(phrase[1])
        
        # phrasedocs.append((tokens, int(phrase[0])))
        sentiment = int(phrase[0])
        
        if (sentiment == 0):
            phrasedocs.append((tokens, 'negative'))
        if (sentiment == 1):
            phrasedocs.append((tokens, 'positive'))


  # possibly filter tokens
  # lowercase - each phrase is a pair consisting of a token list and a label
    docs = []
    for phrase in phrasedocs:
        lowerphrase = ([w.lower() for w in phrase[0]], phrase[1])
        docs.append (lowerphrase)
  # print a few
    for phrase in docs[:10]:
        print (phrase)

  # continue as usual to get all words and create word features
    all_words_list = [word for (sent, cat) in docs for word in sent if word not in stopwords] # with stop words removed
    all_words = nltk.FreqDist(all_words_list)
    # print(len(all_words))
    
    finder = BigramCollocationFinder.from_words(all_words_list)
    bigram_features = finder.nbest(bigram_measures.pmi, 500)

  # get the 2000 most frequently appearing keywords in the corpus
    word_items = all_words.most_common(2000)
    word_features = [word for (word,count) in word_items]

  # feature sets from a feature definition function
    bigram_featuresets = [(bigram_document_features(d, word_features, bigram_features), c) for (d, c) in docs]
    print(len(bigram_featuresets)) # 100

  # train classifier and show performance in cross-validation
  # make a list of labels
    label_list = [c for (d,c) in docs]
    labels = list(set(label_list))    # gets only unique labels
    num_folds = 5
    cross_validation_PRF(num_folds, bigram_featuresets, labels)
        
    train_set, test_set = bigram_featuresets[round(.1*int(limit)):], bigram_featuresets[:round(.1*int(limit))]
    classifier = nltk.NaiveBayesClassifier.train(train_set)
    print('Overall Accuracy', nltk.classify.accuracy(classifier, test_set))
    
    goldlist = [] # gold labels for the test set
    predictedlist = [] # test list, predicted 
    for (bigram_features, label) in test_set:
        goldlist.append(label)
        predictedlist.append(classifier.classify(bigram_features))
    cm = nltk.ConfusionMatrix(goldlist, predictedlist)
    print(cm.pretty_format(sort_by_count=True, truncate=9))
    
    print(cross_validation_accuracy(num_folds, bigram_featuresets))

In [30]:
process(dirPath, 10000)

Read 25000 phrases, using 10000 random phrases
(['``', 'hundstage', 'is', 'an', 'intentionally', 'ugly', 'and', 'unnerving', 'study', 'of', 'life', 'in', 'a', 'particularly', 'dreary', 'suburb', 'of', 'vienna', '.', 'it', 'comes', 'from', 'former', 'documentary', 'director', 'ulrich', 'seidl', 'who', 'adopts', 'a', 'very', 'documentary-like', 'approach', 'to', 'the', 'material', '.', 'however'], 'positive')
(['``', 'oh', 'dear', 'we', 'do', "n't", 'like', 'it', 'when', 'our', 'super-hero', 'love', 'interest', 'develops', 'a', 'brain', 'do', 'we', '?', 'something', 'has', 'happened', 'to', 'people'], 'positive')
(['``', 'been', 'lurking', 'for', 'a', 'couple', 'of', 'years', 'or', 'so', '.', 'i', 'have', 'never', 'been', 'moved', 'to', 'post', 'on', 'here', 'before'], 'negative')
(['``', 'any', 'story', 'comprises', 'a', 'premise'], 'positive')
(['``', 'in', 'my', 'humble', 'opinion'], 'negative')
(['``', 'there', 'are', 'movies', 'like', '``', "''", 'plan', '9', "''", "''", 'that', 'ar

__Bigrams chi_square__

In [31]:
## function to read kaggle training file, train and test a classifier 
def process(dirPath,limitStr):
    limit = int(limitStr)
    dirPath = '/Users/aivii/programsmm/FP736/corpus/'
    f = open('./corpus/train.csv','r', encoding='utf-8')
    phrasedata = []
    for line in f:
        if (not line.startswith(',label')): # ignore the first line 
            line = line.strip()
        #line = fp.readline()
            phrasedata.append(line.split(',')[1:3])
  
  # pick a random sample of length limit because of phrase overlapping sequences
    random.Random(1234).shuffle(phrasedata)
    phraselist = phrasedata[:limit]

    print('Read', len(phrasedata), 'phrases, using', len(phraselist), 'random phrases')
  
  # create list of phrase documents as (list of words, label)
    phrasedocs = []
  # add all the phrases

  # each phrase has a list of tokens and the sentiment label ( 0 or 1)
    for phrase in phraselist:
        tokens = nltk.word_tokenize(phrase[1])
        
        # phrasedocs.append((tokens, int(phrase[0])))
        sentiment = int(phrase[0])
        
        if (sentiment == 0):
            phrasedocs.append((tokens, 'negative'))
        if (sentiment == 1):
            phrasedocs.append((tokens, 'positive'))


  # possibly filter tokens
  # lowercase - each phrase is a pair consisting of a token list and a label
    docs = []
    for phrase in phrasedocs:
        lowerphrase = ([w.lower() for w in phrase[0]], phrase[1])
        docs.append (lowerphrase)
  # print a few
    for phrase in docs[:10]:
        print (phrase)

  # continue as usual to get all words and create word features
    all_words_list = [word for (sent, cat) in docs for word in sent if word not in stopwords] # with stop words removed
    all_words = nltk.FreqDist(all_words_list)
    # print(len(all_words))
    
    finder = BigramCollocationFinder.from_words(all_words_list)
    bigram_features = finder.nbest(bigram_measures.chi_sq, 500)

  # get the 2000 most frequently appearing keywords in the corpus
    word_items = all_words.most_common(2000)
    word_features = [word for (word,count) in word_items]

  # feature sets from a feature definition function
    bigram_featuresets = [(bigram_document_features(d, word_features, bigram_features), c) for (d, c) in docs]
    print(len(bigram_featuresets)) # 100

  # train classifier and show performance in cross-validation
  # make a list of labels
    label_list = [c for (d,c) in docs]
    labels = list(set(label_list))    # gets only unique labels
    num_folds = 5
    cross_validation_PRF(num_folds, bigram_featuresets, labels)
        
    train_set, test_set = bigram_featuresets[round(.1*int(limit)):], bigram_featuresets[:round(.1*int(limit))]
    classifier = nltk.NaiveBayesClassifier.train(train_set)
    print('Overall Accuracy', nltk.classify.accuracy(classifier, test_set))
    
    goldlist = [] # gold labels for the test set
    predictedlist = [] # test list, predicted 
    for (bigram_features, label) in test_set:
        goldlist.append(label)
        predictedlist.append(classifier.classify(bigram_features))
    cm = nltk.ConfusionMatrix(goldlist, predictedlist)
    print(cm.pretty_format(sort_by_count=True, truncate=9))
    
    print(cross_validation_accuracy(num_folds, bigram_featuresets))

In [32]:
process(dirPath, 10000)

Read 25000 phrases, using 10000 random phrases
(['``', 'hundstage', 'is', 'an', 'intentionally', 'ugly', 'and', 'unnerving', 'study', 'of', 'life', 'in', 'a', 'particularly', 'dreary', 'suburb', 'of', 'vienna', '.', 'it', 'comes', 'from', 'former', 'documentary', 'director', 'ulrich', 'seidl', 'who', 'adopts', 'a', 'very', 'documentary-like', 'approach', 'to', 'the', 'material', '.', 'however'], 'positive')
(['``', 'oh', 'dear', 'we', 'do', "n't", 'like', 'it', 'when', 'our', 'super-hero', 'love', 'interest', 'develops', 'a', 'brain', 'do', 'we', '?', 'something', 'has', 'happened', 'to', 'people'], 'positive')
(['``', 'been', 'lurking', 'for', 'a', 'couple', 'of', 'years', 'or', 'so', '.', 'i', 'have', 'never', 'been', 'moved', 'to', 'post', 'on', 'here', 'before'], 'negative')
(['``', 'any', 'story', 'comprises', 'a', 'premise'], 'positive')
(['``', 'in', 'my', 'humble', 'opinion'], 'negative')
(['``', 'there', 'are', 'movies', 'like', '``', "''", 'plan', '9', "''", "''", 'that', 'ar

__Bigrams raw_freq__

In [33]:
## function to read kaggle training file, train and test a classifier 
def process(dirPath,limitStr):
    limit = int(limitStr)
    dirPath = '/Users/aivii/programsmm/FP736/corpus/'
    f = open('./corpus/train.csv','r', encoding='utf-8')
    phrasedata = []
    for line in f:
        if (not line.startswith(',label')): # ignore the first line 
            line = line.strip()
        #line = fp.readline()
            phrasedata.append(line.split(',')[1:3])
  
  # pick a random sample of length limit because of phrase overlapping sequences
    random.Random(1234).shuffle(phrasedata)
    phraselist = phrasedata[:limit]

    print('Read', len(phrasedata), 'phrases, using', len(phraselist), 'random phrases')
  
  # create list of phrase documents as (list of words, label)
    phrasedocs = []
  # add all the phrases

  # each phrase has a list of tokens and the sentiment label ( 0 or 1)
    for phrase in phraselist:
        tokens = nltk.word_tokenize(phrase[1])
        
        # phrasedocs.append((tokens, int(phrase[0])))
        sentiment = int(phrase[0])
        
        if (sentiment == 0):
            phrasedocs.append((tokens, 'negative'))
        if (sentiment == 1):
            phrasedocs.append((tokens, 'positive'))


  # possibly filter tokens
  # lowercase - each phrase is a pair consisting of a token list and a label
    docs = []
    for phrase in phrasedocs:
        lowerphrase = ([w.lower() for w in phrase[0]], phrase[1])
        docs.append (lowerphrase)
  # print a few
    for phrase in docs[:10]:
        print (phrase)

  # continue as usual to get all words and create word features
    all_words_list = [word for (sent, cat) in docs for word in sent if word not in stopwords] # with stop words removed
    all_words = nltk.FreqDist(all_words_list)
    # print(len(all_words))
    
    finder = BigramCollocationFinder.from_words(all_words_list)
    bigram_features = finder.nbest(bigram_measures.raw_freq, 500)

  # get the 2000 most frequently appearing keywords in the corpus
    word_items = all_words.most_common(2000)
    word_features = [word for (word,count) in word_items]

  # feature sets from a feature definition function
    bigram_featuresets = [(bigram_document_features(d, word_features, bigram_features), c) for (d, c) in docs]
    print(len(bigram_featuresets)) # 100

  # train classifier and show performance in cross-validation
  # make a list of labels
    label_list = [c for (d,c) in docs]
    labels = list(set(label_list))    # gets only unique labels
    num_folds = 5
    cross_validation_PRF(num_folds, bigram_featuresets, labels)
        
    train_set, test_set = bigram_featuresets[round(.1*int(limit)):], bigram_featuresets[:round(.1*int(limit))]
    classifier = nltk.NaiveBayesClassifier.train(train_set)
    print('Overall Accuracy', nltk.classify.accuracy(classifier, test_set))
    
    goldlist = [] # gold labels for the test set
    predictedlist = [] # test list, predicted 
    for (bigram_features, label) in test_set:
        goldlist.append(label)
        predictedlist.append(classifier.classify(bigram_features))
    cm = nltk.ConfusionMatrix(goldlist, predictedlist)
    print(cm.pretty_format(sort_by_count=True, truncate=9))
    
    print(cross_validation_accuracy(num_folds, bigram_featuresets))

In [34]:
process(dirPath, 10000)

Read 25000 phrases, using 10000 random phrases
(['``', 'hundstage', 'is', 'an', 'intentionally', 'ugly', 'and', 'unnerving', 'study', 'of', 'life', 'in', 'a', 'particularly', 'dreary', 'suburb', 'of', 'vienna', '.', 'it', 'comes', 'from', 'former', 'documentary', 'director', 'ulrich', 'seidl', 'who', 'adopts', 'a', 'very', 'documentary-like', 'approach', 'to', 'the', 'material', '.', 'however'], 'positive')
(['``', 'oh', 'dear', 'we', 'do', "n't", 'like', 'it', 'when', 'our', 'super-hero', 'love', 'interest', 'develops', 'a', 'brain', 'do', 'we', '?', 'something', 'has', 'happened', 'to', 'people'], 'positive')
(['``', 'been', 'lurking', 'for', 'a', 'couple', 'of', 'years', 'or', 'so', '.', 'i', 'have', 'never', 'been', 'moved', 'to', 'post', 'on', 'here', 'before'], 'negative')
(['``', 'any', 'story', 'comprises', 'a', 'premise'], 'positive')
(['``', 'in', 'my', 'humble', 'opinion'], 'negative')
(['``', 'there', 'are', 'movies', 'like', '``', "''", 'plan', '9', "''", "''", 'that', 'ar

__SL__

In [35]:
## function to read kaggle training file, train and test a classifier 
def process(dirPath,limitStr):
    limit = int(limitStr)
    dirPath = '/Users/aivii/programsmm/FP736/corpus/'
    f = open('./corpus/train.csv','r', encoding='utf-8')
    phrasedata = []
    for line in f:
        if (not line.startswith(',label')): # ignore the first line 
            line = line.strip()
        #line = fp.readline()
            phrasedata.append(line.split(',')[1:3])
  
  # pick a random sample of length limit because of phrase overlapping sequences
    random.Random(1234).shuffle(phrasedata)
    phraselist = phrasedata[:limit]

    print('Read', len(phrasedata), 'phrases, using', len(phraselist), 'random phrases')
  
  # create list of phrase documents as (list of words, label)
    phrasedocs = []
  # add all the phrases

  # each phrase has a list of tokens and the sentiment label ( 0 or 1)
    for phrase in phraselist:
        tokens = nltk.word_tokenize(phrase[1])
        
        # phrasedocs.append((tokens, int(phrase[0])))
        sentiment = int(phrase[0])
        
        if (sentiment == 0):
            phrasedocs.append((tokens, 'negative'))
        if (sentiment == 1):
            phrasedocs.append((tokens, 'positive'))


  # possibly filter tokens
  # lowercase - each phrase is a pair consisting of a token list and a label
    docs = []
    for phrase in phrasedocs:
        lowerphrase = ([w.lower() for w in phrase[0]], phrase[1])
        docs.append (lowerphrase)
  # print a few
    for phrase in docs[:10]:
        print (phrase)

  # continue as usual to get all words and create word features
    all_words_list = [word for (sent,cat) in docs for word in sent]
    all_words = nltk.FreqDist(all_words_list)
    print(len(all_words))

  # get the 2000 most frequently appearing keywords in the corpus
    word_items = all_words.most_common(2000)
    word_features = [word for (word,count) in word_items]

  # SL_feature sets from a feature definition function
    #featuresets = [(document_features(d, word_features), c) for (d, c) in docs]
    SL_featuresets = [(SL_features(d, word_features, SL), c) for (d, c) in docs]
    print(len(SL_featuresets)) # 100

  # train classifier and show performance in cross-validation
  # make a list of labels
    label_list = [c for (d,c) in docs]
    labels = list(set(label_list))    # gets only unique labels
    num_folds = 5
    cross_validation_PRF(num_folds, SL_featuresets, labels)
        
    train_set, test_set = SL_featuresets[round(.1*int(limit)):], SL_featuresets[:round(.1*int(limit))]
    classifier = nltk.NaiveBayesClassifier.train(train_set)
    print('Overall Accuracy', nltk.classify.accuracy(classifier, test_set))
    
    goldlist = [] # gold labels for the test set
    predictedlist = [] # test list, predicted 
    for (features, label) in test_set:
        goldlist.append(label)
        predictedlist.append(classifier.classify(features))
    cm = nltk.ConfusionMatrix(goldlist, predictedlist)
    print(cm.pretty_format(sort_by_count=True, truncate=9))
    
    print(cross_validation_accuracy(num_folds, SL_featuresets))
        

In [36]:
process(dirPath, 10000)

Read 25000 phrases, using 10000 random phrases
(['``', 'hundstage', 'is', 'an', 'intentionally', 'ugly', 'and', 'unnerving', 'study', 'of', 'life', 'in', 'a', 'particularly', 'dreary', 'suburb', 'of', 'vienna', '.', 'it', 'comes', 'from', 'former', 'documentary', 'director', 'ulrich', 'seidl', 'who', 'adopts', 'a', 'very', 'documentary-like', 'approach', 'to', 'the', 'material', '.', 'however'], 'positive')
(['``', 'oh', 'dear', 'we', 'do', "n't", 'like', 'it', 'when', 'our', 'super-hero', 'love', 'interest', 'develops', 'a', 'brain', 'do', 'we', '?', 'something', 'has', 'happened', 'to', 'people'], 'positive')
(['``', 'been', 'lurking', 'for', 'a', 'couple', 'of', 'years', 'or', 'so', '.', 'i', 'have', 'never', 'been', 'moved', 'to', 'post', 'on', 'here', 'before'], 'negative')
(['``', 'any', 'story', 'comprises', 'a', 'premise'], 'positive')
(['``', 'in', 'my', 'humble', 'opinion'], 'negative')
(['``', 'there', 'are', 'movies', 'like', '``', "''", 'plan', '9', "''", "''", 'that', 'ar

__SL stop words removed__

In [37]:
## function to read kaggle training file, train and test a classifier 
def process(dirPath,limitStr):
    limit = int(limitStr)
    dirPath = '/Users/aivii/programsmm/FP736/corpus/'
    f = open('./corpus/train.csv','r', encoding='utf-8')
    phrasedata = []
    for line in f:
        if (not line.startswith(',label')): # ignore the first line 
            line = line.strip()
        #line = fp.readline()
            phrasedata.append(line.split(',')[1:3])
  
  # pick a random sample of length limit because of phrase overlapping sequences
    random.Random(1234).shuffle(phrasedata)
    phraselist = phrasedata[:limit]

    print('Read', len(phrasedata), 'phrases, using', len(phraselist), 'random phrases')
  
  # create list of phrase documents as (list of words, label)
    phrasedocs = []
  # add all the phrases

  # each phrase has a list of tokens and the sentiment label ( 0 or 1)
    for phrase in phraselist:
        tokens = nltk.word_tokenize(phrase[1])
        
        # phrasedocs.append((tokens, int(phrase[0])))
        sentiment = int(phrase[0])
        
        if (sentiment == 0):
            phrasedocs.append((tokens, 'negative'))
        if (sentiment == 1):
            phrasedocs.append((tokens, 'positive'))


  # possibly filter tokens
  # lowercase - each phrase is a pair consisting of a token list and a label
    docs = []
    for phrase in phrasedocs:
        lowerphrase = ([w.lower() for w in phrase[0]], phrase[1])
        docs.append (lowerphrase)
  # print a few
    for phrase in docs[:10]:
        print (phrase)

  # continue as usual to get all words and create word features
    # remove stop words from the all words list
    new_all_words_list = [word for (sent,cat) in docs for word in sent if word not in newstopwords]
    new_all_words = nltk.FreqDist(new_all_words_list)
    print(len(new_all_words))

  # get the 2000 most frequently appearing keywords in the corpus
    new_word_items = new_all_words.most_common(2000)
    new_word_features = [word for (word,count) in new_word_items]

  # SL_feature sets from a feature definition function
    #featuresets = [(document_features(d, word_features), c) for (d, c) in docs]
    SL_featuresets = [(SL_features(d, new_word_features, SL), c) for (d, c) in docs]
    print(len(SL_featuresets)) # 100

  # train classifier and show performance in cross-validation
  # make a list of labels
    label_list = [c for (d,c) in docs]
    labels = list(set(label_list))    # gets only unique labels
    num_folds = 5
    cross_validation_PRF(num_folds, SL_featuresets, labels)
        
    train_set, test_set = SL_featuresets[round(.1*int(limit)):], SL_featuresets[:round(.1*int(limit))]
    classifier = nltk.NaiveBayesClassifier.train(train_set)
    print('Overall Accuracy', nltk.classify.accuracy(classifier, test_set))
    
    goldlist = [] # gold labels for the test set
    predictedlist = [] # test list, predicted 
    for (features, label) in test_set:
        goldlist.append(label)
        predictedlist.append(classifier.classify(features))
    cm = nltk.ConfusionMatrix(goldlist, predictedlist)
    print(cm.pretty_format(sort_by_count=True, truncate=9))
    
    print(cross_validation_accuracy(num_folds, SL_featuresets))
        

In [38]:
process(dirPath, 10000)

Read 25000 phrases, using 10000 random phrases
(['``', 'hundstage', 'is', 'an', 'intentionally', 'ugly', 'and', 'unnerving', 'study', 'of', 'life', 'in', 'a', 'particularly', 'dreary', 'suburb', 'of', 'vienna', '.', 'it', 'comes', 'from', 'former', 'documentary', 'director', 'ulrich', 'seidl', 'who', 'adopts', 'a', 'very', 'documentary-like', 'approach', 'to', 'the', 'material', '.', 'however'], 'positive')
(['``', 'oh', 'dear', 'we', 'do', "n't", 'like', 'it', 'when', 'our', 'super-hero', 'love', 'interest', 'develops', 'a', 'brain', 'do', 'we', '?', 'something', 'has', 'happened', 'to', 'people'], 'positive')
(['``', 'been', 'lurking', 'for', 'a', 'couple', 'of', 'years', 'or', 'so', '.', 'i', 'have', 'never', 'been', 'moved', 'to', 'post', 'on', 'here', 'before'], 'negative')
(['``', 'any', 'story', 'comprises', 'a', 'premise'], 'positive')
(['``', 'in', 'my', 'humble', 'opinion'], 'negative')
(['``', 'there', 'are', 'movies', 'like', '``', "''", 'plan', '9', "''", "''", 'that', 'ar

__Negation and bigrams together__

In [39]:
## function to read kaggle training file, train and test a classifier 
def process(dirPath,limitStr):
    limit = int(limitStr)
    dirPath = '/Users/aivii/programsmm/FP736/corpus/'
    f = open('./corpus/train.csv','r', encoding='utf-8')
    phrasedata = []
    for line in f:
        if (not line.startswith(',label')): # ignore the first line 
            line = line.strip()
        #line = fp.readline()
            phrasedata.append(line.split(',')[1:3])
  
  # pick a random sample of length limit because of phrase overlapping sequences
    random.shuffle(phrasedata)
    phraselist = phrasedata[:limit]

    print('Read', len(phrasedata), 'phrases, using', len(phraselist), 'random phrases')
  
  # create list of phrase documents as (list of words, label)
    phrasedocs = []
  # add all the phrases

  # each phrase has a list of tokens and the sentiment label ( 0 or 1)
    for phrase in phraselist:
        tokens = nltk.word_tokenize(phrase[1])
        
        # phrasedocs.append((tokens, int(phrase[0])))
        sentiment = int(phrase[0])
        
        if (sentiment == 0):
            phrasedocs.append((tokens, 'negative'))
        if (sentiment == 1):
            phrasedocs.append((tokens, 'positive'))


  # possibly filter tokens
  # lowercase - each phrase is a pair consisting of a token list and a label
    docs = []
    for phrase in phrasedocs:
        lowerphrase = ([w.lower() for w in phrase[0]], phrase[1])
        docs.append (lowerphrase)
  # print a few
    for phrase in docs[:10]:
        print (phrase)

  # continue as usual to get all words and create word features
    all_words_list = [word for (sent,cat) in docs for word in sent]
    all_words = nltk.FreqDist(all_words_list)
    print(len(all_words))

  # get the 2000 most frequently appearing keywords in the corpus
    word_items = all_words.most_common(2000)
    word_features = [word for (word,count) in word_items]
    
    finder = BigramCollocationFinder.from_words(all_words_list)
    bigram_features = finder.nbest(bigram_measures.raw_freq, 500)

  # not_feature sets from a feature definition function
    not_featuresets = [(not_features(d, word_features, bigram_features, negationwords), c) for (d, c) in docs]
    print(len(not_featuresets)) # 100

  # train classifier and show performance in cross-validation
  # make a list of labels
    label_list = [c for (d,c) in docs]
    labels = list(set(label_list))    # gets only unique labels
    num_folds = 5
    cross_validation_PRF(num_folds, not_featuresets, labels)
        
    train_set, test_set = not_featuresets[round(.1*int(limit)):], not_featuresets[:round(.1*int(limit))]
    classifier = nltk.NaiveBayesClassifier.train(train_set)
    print('Overall Accuracy', nltk.classify.accuracy(classifier, test_set))
    
    goldlist = [] # gold labels for the test set
    predictedlist = [] # test list, predicted 
    for (features, label) in test_set:
        goldlist.append(label)
        predictedlist.append(classifier.classify(features))
    cm = nltk.ConfusionMatrix(goldlist, predictedlist)
    print(cm.pretty_format(sort_by_count=True, truncate=9))
    
    print(cross_validation_accuracy(num_folds, not_featuresets))

In [40]:
process(dirPath, 10000)

Read 25000 phrases, using 10000 random phrases
(['``', 'this', 'third', 'pokemon', 'movie', 'is', 'too', 'abstract', 'for', 'younger', 'kids', 'to', 'follow', 'and', 'too', 'repetitious', 'to', 'entertain', 'older', 'kids', '.', 'the', 'message', 'of', 'the', 'film', '--', 'about', 'dealing', 'with', 'loss', '--', 'is', 'subverted', 'by', 'the', 'return', 'of', 'the', 'young', 'girl', "'s", 'father', 'during', 'the', 'film', "'s", 'credits', '.', 'team', 'rocket', 'provide', 'some', 'amusement'], 'negative')
(['``', 'this', 'movie', 'has', 'a', 'special', 'way', 'of', 'telling', 'the', 'story'], 'positive')
(['``', 'watching', 'a', 'videotaped', 'replay', 'of', 'about', '8', 'various', '1994-1997', 'spider-man', 'cartoons', 'made', 'me', 'realize', 'why', 'i', 'could', "n't", 'stomach', 'it', 'when', 'it', 'first', 'came', 'out', '.', 'i', "'m", 'from', 'the', 'old', 'school'], 'negative')
(['``', 'i', 'bought', 'this', 'dvd', 'without', 'any', 'previous', 'reference', 'but', 'the', 'n

__All combined__

In [41]:
## function to read kaggle training file, train and test a classifier 
def process(dirPath,limitStr):
    limit = int(limitStr)
    dirPath = '/Users/aivii/programsmm/FP736/corpus/'
    f = open('./corpus/train.csv','r', encoding='utf-8')
    phrasedata = []
    for line in f:
        if (not line.startswith(',label')): # ignore the first line 
            line = line.strip()
        #line = fp.readline()
            phrasedata.append(line.split(',')[1:3])
  
  # pick a random sample of length limit because of phrase overlapping sequences
    random.Random(1234).shuffle(phrasedata)
    phraselist = phrasedata[:limit]

    print('Read', len(phrasedata), 'phrases, using', len(phraselist), 'random phrases')
  
  # create list of phrase documents as (list of words, label)
    phrasedocs = []
  # add all the phrases

  # each phrase has a list of tokens and the sentiment label ( 0 or 1)
    for phrase in phraselist:
        tokens = nltk.word_tokenize(phrase[1])
        
        # phrasedocs.append((tokens, int(phrase[0])))
        sentiment = int(phrase[0])
        
        if (sentiment == 0):
            phrasedocs.append((tokens, 'negative'))
        if (sentiment == 1):
            phrasedocs.append((tokens, 'positive'))


  # possibly filter tokens
  # lowercase - each phrase is a pair consisting of a token list and a label
    docs = []
    for phrase in phrasedocs:
        lowerphrase = ([w.lower() for w in phrase[0]], phrase[1])
        docs.append (lowerphrase)
  # print a few
    for phrase in docs[:10]:
        print (phrase)

  # continue as usual to get all words and create word features
    all_words_list = [word for (sent,cat) in docs for word in sent]
    all_words = nltk.FreqDist(all_words_list)
    print(len(all_words))

  # get the 2000 most frequently appearing keywords in the corpus
    word_items = all_words.most_common(2000)
    word_features = [word for (word,count) in word_items]
    
    finder = BigramCollocationFinder.from_words(all_words_list)
    bigram_features = finder.nbest(bigram_measures.chi_sq, 500)

  # not_feature sets from a feature definition function
    combined_featuresets = [(combined_document_features(d, word_features,SL, bigram_features, negationwords), c) for (d, c) in docs]
    print(len(combined_featuresets)) # 100

  # train classifier and show performance in cross-validation
  # make a list of labels
    label_list = [c for (d,c) in docs]
    labels = list(set(label_list))    # gets only unique labels
    num_folds = 5
    cross_validation_PRF(num_folds, combined_featuresets, labels)
        
    train_set, test_set = combined_featuresets[round(.1*int(limit)):], combined_featuresets[:round(.1*int(limit))]
    classifier = nltk.NaiveBayesClassifier.train(train_set)
    print('Overall Accuracy', nltk.classify.accuracy(classifier, test_set))
    
    goldlist = [] # gold labels for the test set
    predictedlist = [] # test list, predicted 
    for (features, label) in test_set:
        goldlist.append(label)
        predictedlist.append(classifier.classify(features))
    cm = nltk.ConfusionMatrix(goldlist, predictedlist)
    print(cm.pretty_format(sort_by_count=True, truncate=9))
    
    print(cross_validation_accuracy(num_folds, combined_featuresets))
        

In [42]:
process(dirPath, 10000)

Read 25000 phrases, using 10000 random phrases
(['``', 'hundstage', 'is', 'an', 'intentionally', 'ugly', 'and', 'unnerving', 'study', 'of', 'life', 'in', 'a', 'particularly', 'dreary', 'suburb', 'of', 'vienna', '.', 'it', 'comes', 'from', 'former', 'documentary', 'director', 'ulrich', 'seidl', 'who', 'adopts', 'a', 'very', 'documentary-like', 'approach', 'to', 'the', 'material', '.', 'however'], 'positive')
(['``', 'oh', 'dear', 'we', 'do', "n't", 'like', 'it', 'when', 'our', 'super-hero', 'love', 'interest', 'develops', 'a', 'brain', 'do', 'we', '?', 'something', 'has', 'happened', 'to', 'people'], 'positive')
(['``', 'been', 'lurking', 'for', 'a', 'couple', 'of', 'years', 'or', 'so', '.', 'i', 'have', 'never', 'been', 'moved', 'to', 'post', 'on', 'here', 'before'], 'negative')
(['``', 'any', 'story', 'comprises', 'a', 'premise'], 'positive')
(['``', 'in', 'my', 'humble', 'opinion'], 'negative')
(['``', 'there', 'are', 'movies', 'like', '``', "''", 'plan', '9', "''", "''", 'that', 'ar

In [43]:
process(dirPath, 5000)

Read 25000 phrases, using 5000 random phrases
(['``', 'hundstage', 'is', 'an', 'intentionally', 'ugly', 'and', 'unnerving', 'study', 'of', 'life', 'in', 'a', 'particularly', 'dreary', 'suburb', 'of', 'vienna', '.', 'it', 'comes', 'from', 'former', 'documentary', 'director', 'ulrich', 'seidl', 'who', 'adopts', 'a', 'very', 'documentary-like', 'approach', 'to', 'the', 'material', '.', 'however'], 'positive')
(['``', 'oh', 'dear', 'we', 'do', "n't", 'like', 'it', 'when', 'our', 'super-hero', 'love', 'interest', 'develops', 'a', 'brain', 'do', 'we', '?', 'something', 'has', 'happened', 'to', 'people'], 'positive')
(['``', 'been', 'lurking', 'for', 'a', 'couple', 'of', 'years', 'or', 'so', '.', 'i', 'have', 'never', 'been', 'moved', 'to', 'post', 'on', 'here', 'before'], 'negative')
(['``', 'any', 'story', 'comprises', 'a', 'premise'], 'positive')
(['``', 'in', 'my', 'humble', 'opinion'], 'negative')
(['``', 'there', 'are', 'movies', 'like', '``', "''", 'plan', '9', "''", "''", 'that', 'are

__Negation and SL__

In [46]:
## function to read kaggle training file, train and test a classifier 
def process(dirPath,limitStr):
    limit = int(limitStr)
    dirPath = '/Users/aivii/programsmm/FP736/corpus/'
    f = open('./corpus/train.csv','r', encoding='utf-8')
    phrasedata = []
    for line in f:
        if (not line.startswith(',label')): # ignore the first line 
            line = line.strip()
        #line = fp.readline()
            phrasedata.append(line.split(',')[1:3])
  
  # pick a random sample of length limit because of phrase overlapping sequences
    random.shuffle(phrasedata)
    phraselist = phrasedata[:limit]

    print('Read', len(phrasedata), 'phrases, using', len(phraselist), 'random phrases')
  
  # create list of phrase documents as (list of words, label)
    phrasedocs = []
  # add all the phrases

  # each phrase has a list of tokens and the sentiment label ( 0 or 1)
    for phrase in phraselist:
        tokens = nltk.word_tokenize(phrase[1])
        
        # phrasedocs.append((tokens, int(phrase[0])))
        sentiment = int(phrase[0])
        
        if (sentiment == 0):
            phrasedocs.append((tokens, 'negative'))
        if (sentiment == 1):
            phrasedocs.append((tokens, 'positive'))


  # possibly filter tokens
  # lowercase - each phrase is a pair consisting of a token list and a label
    docs = []
    for phrase in phrasedocs:
        lowerphrase = ([w.lower() for w in phrase[0]], phrase[1])
        docs.append (lowerphrase)
  # print a few
    for phrase in docs[:10]:
        print (phrase)

  # continue as usual to get all words and create word features
    all_words_list = [word for (sent,cat) in docs for word in sent]
    all_words = nltk.FreqDist(all_words_list)
    print(len(all_words))

  # get the 2000 most frequently appearing keywords in the corpus
    word_items = all_words.most_common(2000)
    word_features = [word for (word,count) in word_items]

  # feature sets from a feature definition function
    #featuresets = [(document_features(d, word_features), c) for (d, c) in docs]
    c_featuresets = [(generate_SLneg_features(d, SL, word_features), c) for (d, c) in docs]
    print(len(c_featuresets)) # 100

  # train classifier and show performance in cross-validation
  # make a list of labels
    label_list = [c for (d,c) in docs]
    labels = list(set(label_list))    # gets only unique labels
    num_folds = 5
    cross_validation_PRF(num_folds, c_featuresets, labels)
        
    train_set, test_set = c_featuresets[round(.1*int(limit)):], c_featuresets[:round(.1*int(limit))]
    classifier = nltk.NaiveBayesClassifier.train(train_set)
    print('Overall Accuracy', nltk.classify.accuracy(classifier, test_set))
    
    goldlist = [] # gold labels for the test set
    predictedlist = [] # test list, predicted 
    for (features, label) in test_set:
        goldlist.append(label)
        predictedlist.append(classifier.classify(features))
    cm = nltk.ConfusionMatrix(goldlist, predictedlist)
    print(cm.pretty_format(sort_by_count=True, truncate=9))
    
    #print(cross_validation_accuracy(num_folds, combined_featuresets))
    

In [47]:
process(dirPath, 10000)

Read 25000 phrases, using 10000 random phrases
(['``', 'probably', 'the', 'best', 'film', 'of', 'the', 'year', 'for', 'me', '.', 'this', 'small', 'french', 'film', 'centers', 'on', 'put', 'upon', 'office', 'secretary', 'carla', '(', 'emmanuelle', 'devos', ')', 'who', 'spends', 'her', 'days', 'doing', 'other', 'men', "'s", 'jobs'], 'positive')
(['``', 'i', 'came', 'away', 'from', 'this', 'movie', 'with', 'the', 'feeling', 'that', 'it', 'could', 'have', 'been', 'so', 'much', 'better', '.', 'instead', 'of', 'what', 'should', 'be', 'a', 'gripping'], 'negative')
(['i', 'have', 'a', 'two', 'year', 'old', 'son', 'who', 'suffers', 'from', 'the', 'same', 'condition', 'as', 'jonny', 'kennedy', '.', 'i', 'never', 'got', 'the', 'chance', 'to', 'meet', 'him', 'but', 'i', 'have', 'never', 'heard', 'anybody', 'say', 'a', 'bad', 'word', 'about', 'him', '.', 'i', 'hope', 'he', 'knows', 'how', 'much', 'the', 'making', 'of', 'this', 'programme', 'has', 'helped', 'his', 'fellow', 'sufferers', 'by', 'raisi

{'length': 85, 'negationwords': 1, 'percpositive': 0.0, 'percnegative': 0.0}
{'length': 48, 'negationwords': 1, 'percpositive': 0.0, 'percnegative': 0.0}
{'length': 25, 'negationwords': 1, 'percpositive': 0.0, 'percnegative': 0.0}
{'length': 40, 'negationwords': 1, 'percpositive': 0.0, 'percnegative': 0.0}
{'length': 3, 'negationwords': 1}
{'length': 4, 'negationwords': 1}
{'length': 9, 'negationwords': 0, 'positiveStrengthCount': 5, 'negativeStrengthCount': 0}
{'length': 16, 'negationwords': 1, 'percpositive': 0.0, 'percnegative': 0.0}
{'length': 7, 'negationwords': 1, 'positiveStrengthCount': 5, 'negativeStrengthCount': 0}
{'length': 35, 'negationwords': 1, 'percpositive': 0.0, 'percnegative': 0.0}
{'length': 36, 'negationwords': 1, 'percpositive': 0.0, 'percnegative': 0.0}
{'length': 88, 'negationwords': 1, 'percpositive': 0.0, 'percnegative': 0.0}
{'length': 91, 'negationwords': 1, 'percpositive': 0.0, 'percnegative': 0.0}
{'length': 6, 'negationwords': 1}
{'length': 71, 'negationw

{'length': 33, 'negationwords': 1, 'percpositive': 0.0, 'percnegative': 0.0}
{'length': 24, 'negationwords': 1, 'percpositive': 0.0, 'percnegative': 0.0}
{'length': 4, 'negationwords': 1}
{'length': 12, 'negationwords': 1, 'percpositive': 0.0, 'percnegative': 0.0}
{'length': 48, 'negationwords': 1, 'percpositive': 0.0, 'percnegative': 0.0}
{'length': 35, 'negationwords': 1, 'percpositive': 0.0, 'percnegative': 0.0}
{'length': 8, 'negationwords': 1}
{'length': 6, 'negationwords': 1, 'positiveStrengthCount': 0, 'negativeStrengthCount': 0}
{'length': 80, 'negationwords': 1, 'percpositive': 0.0, 'percnegative': 0.0}
{'length': 26, 'negationwords': 1, 'percpositive': 0.0, 'percnegative': 0.0}
{'length': 27, 'negationwords': 0, 'percpositive': 0.0, 'percnegative': 0.0}
{'length': 21, 'negationwords': 1, 'percpositive': 0.0, 'percnegative': 0.0}
{'length': 9, 'negationwords': 0, 'positiveStrengthCount': 5, 'negativeStrengthCount': 0}
{'length': 136, 'negationwords': 0, 'percpositive': 0.0, 'p

{'length': 20, 'negationwords': 0, 'percpositive': 0.0, 'percnegative': 0.0}
{'length': 18, 'negationwords': 1, 'percpositive': 0.0, 'percnegative': 0.0}
{'length': 51, 'negationwords': 1, 'percpositive': 0.0, 'percnegative': 0.0}
{'length': 50, 'negationwords': 0, 'positiveStrengthCount': 0, 'negativeStrengthCount': 5, 'percpositive': 0.0, 'percnegative': 2.0}
{'length': 34, 'negationwords': 1, 'percpositive': 0.0, 'percnegative': 0.0}
{'length': 19, 'negationwords': 1, 'percpositive': 0.0, 'percnegative': 0.0}
{'length': 23, 'negationwords': 1, 'percpositive': 0.0, 'percnegative': 0.0}
{'length': 2, 'negationwords': 1}
{'length': 20, 'negationwords': 1, 'percpositive': 0.0, 'percnegative': 0.0}
{'length': 14, 'negationwords': 1, 'percpositive': 0.0, 'percnegative': 0.0}
{'length': 13, 'negationwords': 1, 'percpositive': 0.0, 'percnegative': 0.0}
{'length': 2, 'negationwords': 1, 'positiveStrengthCount': 0, 'negativeStrengthCount': 0}
{'length': 6, 'negationwords': 0}
{'length': 50, '

{'length': 20, 'negationwords': 1, 'percpositive': 0.0, 'percnegative': 0.0}
{'length': 12, 'negationwords': 1, 'percpositive': 0.0, 'percnegative': 0.0}
{'length': 6, 'negationwords': 1, 'positiveStrengthCount': 0, 'negativeStrengthCount': 5}
{'length': 10, 'negationwords': 1}
{'length': 57, 'negationwords': 1, 'percpositive': 0.0, 'percnegative': 0.0}
{'length': 27, 'negationwords': 1, 'percpositive': 0.0, 'percnegative': 0.0}
{'length': 24, 'negationwords': 1, 'percpositive': 0.0, 'percnegative': 0.0}
{'length': 70, 'negationwords': 1, 'percpositive': 0.0, 'percnegative': 0.0}
{'length': 10, 'negationwords': 1}
{'length': 6, 'negationwords': 1}
{'length': 23, 'negationwords': 1, 'percpositive': 0.0, 'percnegative': 0.0}
{'length': 9, 'negationwords': 0}
{'length': 115, 'negationwords': 1, 'positiveStrengthCount': 2, 'negativeStrengthCount': 0, 'percpositive': 0.87, 'percnegative': 0.0}
{'length': 17, 'negationwords': 1, 'percpositive': 0.0, 'percnegative': 0.0}
{'length': 12, 'negat

{'length': 25, 'negationwords': 0, 'positiveStrengthCount': 0, 'negativeStrengthCount': 5, 'percpositive': 0.0, 'percnegative': 4.0}
{'length': 73, 'negationwords': 1, 'percpositive': 0.0, 'percnegative': 0.0}
{'length': 33, 'negationwords': 1, 'percpositive': 0.0, 'percnegative': 0.0}
{'length': 16, 'negationwords': 1, 'percpositive': 0.0, 'percnegative': 0.0}
{'length': 38, 'negationwords': 1, 'percpositive': 0.0, 'percnegative': 0.0}
{'length': 53, 'negationwords': 0, 'percpositive': 0.0, 'percnegative': 0.0}
{'length': 10, 'negationwords': 1}
{'length': 20, 'negationwords': 1, 'percpositive': 0.0, 'percnegative': 0.0}
{'length': 17, 'negationwords': 1, 'percpositive': 0.0, 'percnegative': 0.0}
{'length': 3, 'negationwords': 1}
{'length': 19, 'negationwords': 1, 'percpositive': 0.0, 'percnegative': 0.0}
{'length': 6, 'negationwords': 1}
{'length': 56, 'negationwords': 1, 'percpositive': 0.0, 'percnegative': 0.0}
{'length': 24, 'negationwords': 1, 'positiveStrengthCount': 0, 'negativ

{'length': 108, 'negationwords': 1, 'percpositive': 0.0, 'percnegative': 0.0}
{'length': 10, 'negationwords': 1, 'positiveStrengthCount': 0, 'negativeStrengthCount': 0}
{'length': 31, 'negationwords': 1, 'percpositive': 0.0, 'percnegative': 0.0}
{'length': 19, 'negationwords': 1, 'positiveStrengthCount': 0, 'negativeStrengthCount': 5, 'percpositive': 0.0, 'percnegative': 5.26}
{'length': 25, 'negationwords': 1, 'percpositive': 0.0, 'percnegative': 0.0}
{'length': 35, 'negationwords': 1, 'percpositive': 0.0, 'percnegative': 0.0}
{'length': 47, 'negationwords': 1, 'percpositive': 0.0, 'percnegative': 0.0}
{'length': 8, 'negationwords': 0}
{'length': 10, 'negationwords': 1}
{'length': 23, 'negationwords': 1, 'positiveStrengthCount': 5, 'negativeStrengthCount': 0, 'percpositive': 4.35, 'percnegative': 0.0}
{'length': 18, 'negationwords': 1, 'percpositive': 0.0, 'percnegative': 0.0}
{'length': 24, 'negationwords': 1, 'percpositive': 0.0, 'percnegative': 0.0}
{'length': 83, 'negationwords': 

{'length': 19, 'negationwords': 1, 'positiveStrengthCount': 0, 'negativeStrengthCount': 5, 'percpositive': 0.0, 'percnegative': 5.26}
{'length': 14, 'negationwords': 1, 'positiveStrengthCount': 5, 'negativeStrengthCount': 0, 'percpositive': 7.14, 'percnegative': 0.0}
{'length': 23, 'negationwords': 1, 'percpositive': 0.0, 'percnegative': 0.0}
{'length': 31, 'negationwords': 1, 'percpositive': 0.0, 'percnegative': 0.0}
{'length': 9, 'negationwords': 1}
{'length': 4, 'negationwords': 1}
{'length': 41, 'negationwords': 1, 'percpositive': 0.0, 'percnegative': 0.0}
{'length': 11, 'negationwords': 1, 'percpositive': 0.0, 'percnegative': 0.0}
{'length': 2, 'negationwords': 1}
{'length': 6, 'negationwords': 1}
{'length': 9, 'negationwords': 1}
{'length': 7, 'negationwords': 1}
{'length': 36, 'negationwords': 1, 'percpositive': 0.0, 'percnegative': 0.0}
{'length': 43, 'negationwords': 1, 'positiveStrengthCount': 2, 'negativeStrengthCount': 0, 'percpositive': 2.33, 'percnegative': 0.0}
{'length'

{'length': 17, 'negationwords': 1, 'positiveStrengthCount': 0, 'negativeStrengthCount': 0, 'percpositive': 0.0, 'percnegative': 0.0}
{'length': 4, 'negationwords': 1}
{'length': 8, 'negationwords': 1}
{'length': 2, 'negationwords': 1}
{'length': 21, 'negationwords': 1, 'percpositive': 0.0, 'percnegative': 0.0}
{'length': 6, 'negationwords': 1}
{'length': 87, 'negationwords': 0, 'percpositive': 0.0, 'percnegative': 0.0}
{'length': 88, 'negationwords': 1, 'percpositive': 0.0, 'percnegative': 0.0}
{'length': 31, 'negationwords': 1, 'percpositive': 0.0, 'percnegative': 0.0}
{'length': 75, 'negationwords': 1, 'percpositive': 0.0, 'percnegative': 0.0}
{'length': 204, 'negationwords': 1, 'percpositive': 0.0, 'percnegative': 0.0}
{'length': 34, 'negationwords': 1, 'percpositive': 0.0, 'percnegative': 0.0}
{'length': 11, 'negationwords': 1, 'percpositive': 0.0, 'percnegative': 0.0}
{'length': 6, 'negationwords': 1, 'positiveStrengthCount': 0, 'negativeStrengthCount': 2}
{'length': 48, 'negation

{'length': 42, 'negationwords': 0, 'percpositive': 0.0, 'percnegative': 0.0}
{'length': 72, 'negationwords': 1, 'percpositive': 0.0, 'percnegative': 0.0}
{'length': 84, 'negationwords': 1, 'percpositive': 0.0, 'percnegative': 0.0}
{'length': 12, 'negationwords': 1, 'percpositive': 0.0, 'percnegative': 0.0}
{'length': 31, 'negationwords': 1, 'percpositive': 0.0, 'percnegative': 0.0}
{'length': 7, 'negationwords': 1}
{'length': 40, 'negationwords': 1, 'percpositive': 0.0, 'percnegative': 0.0}
{'length': 23, 'negationwords': 1, 'percpositive': 0.0, 'percnegative': 0.0}
{'length': 19, 'negationwords': 1, 'percpositive': 0.0, 'percnegative': 0.0}
{'length': 24, 'negationwords': 1, 'percpositive': 0.0, 'percnegative': 0.0}
{'length': 15, 'negationwords': 0, 'percpositive': 0.0, 'percnegative': 0.0}
{'length': 6, 'negationwords': 1}
{'length': 7, 'negationwords': 1}
{'length': 8, 'negationwords': 1}
{'length': 55, 'negationwords': 0, 'percpositive': 0.0, 'percnegative': 0.0}
{'length': 40, 'n

{'length': 41, 'negationwords': 0, 'percpositive': 0.0, 'percnegative': 0.0}
{'length': 37, 'negationwords': 0, 'percpositive': 0.0, 'percnegative': 0.0}
{'length': 5, 'negationwords': 1}
{'length': 18, 'negationwords': 1, 'positiveStrengthCount': 0, 'negativeStrengthCount': 2, 'percpositive': 0.0, 'percnegative': 5.56}
{'length': 31, 'negationwords': 0, 'percpositive': 0.0, 'percnegative': 0.0}
{'length': 9, 'negationwords': 1}
{'length': 16, 'negationwords': 1, 'percpositive': 0.0, 'percnegative': 0.0}
{'length': 4, 'negationwords': 1}
{'length': 3, 'negationwords': 1}
{'length': 8, 'negationwords': 1}
{'length': 11, 'negationwords': 1, 'percpositive': 0.0, 'percnegative': 0.0}
{'length': 2, 'negationwords': 1}
{'length': 21, 'negationwords': 1, 'percpositive': 0.0, 'percnegative': 0.0}
{'length': 8, 'negationwords': 0}
{'length': 25, 'negationwords': 1, 'positiveStrengthCount': 0, 'negativeStrengthCount': 5, 'percpositive': 0.0, 'percnegative': 4.0}
{'length': 57, 'negationwords': 1

{'length': 26, 'negationwords': 1, 'percpositive': 0.0, 'percnegative': 0.0}
{'length': 8, 'negationwords': 1}
{'length': 19, 'negationwords': 1, 'percpositive': 0.0, 'percnegative': 0.0}
{'length': 82, 'negationwords': 1, 'percpositive': 0.0, 'percnegative': 0.0}
{'length': 9, 'negationwords': 0, 'positiveStrengthCount': 5, 'negativeStrengthCount': 0}
{'length': 43, 'negationwords': 1, 'positiveStrengthCount': 2, 'negativeStrengthCount': 0, 'percpositive': 2.33, 'percnegative': 0.0}
{'length': 6, 'negationwords': 1}
{'length': 2, 'negationwords': 1, 'positiveStrengthCount': 0, 'negativeStrengthCount': 5}
{'length': 3, 'negationwords': 1}
{'length': 15, 'negationwords': 1, 'percpositive': 0.0, 'percnegative': 0.0}
{'length': 15, 'negationwords': 1, 'percpositive': 0.0, 'percnegative': 0.0}
{'length': 6, 'negationwords': 1}
{'length': 26, 'negationwords': 1, 'percpositive': 0.0, 'percnegative': 0.0}
{'length': 16, 'negationwords': 1, 'positiveStrengthCount': 5, 'negativeStrengthCount': 

{'length': 24, 'negationwords': 0, 'percpositive': 0.0, 'percnegative': 0.0}
{'length': 2, 'negationwords': 1}
{'length': 6, 'negationwords': 1}
{'length': 16, 'negationwords': 1, 'percpositive': 0.0, 'percnegative': 0.0}
{'length': 22, 'negationwords': 1, 'percpositive': 0.0, 'percnegative': 0.0}
{'length': 12, 'negationwords': 1, 'percpositive': 0.0, 'percnegative': 0.0}
{'length': 13, 'negationwords': 0, 'positiveStrengthCount': 5, 'negativeStrengthCount': 0, 'percpositive': 7.69, 'percnegative': 0.0}
{'length': 11, 'negationwords': 1, 'percpositive': 0.0, 'percnegative': 0.0}
{'length': 27, 'negationwords': 1, 'percpositive': 0.0, 'percnegative': 0.0}
{'length': 33, 'negationwords': 0, 'percpositive': 0.0, 'percnegative': 0.0}
{'length': 5, 'negationwords': 1}
{'length': 63, 'negationwords': 1, 'positiveStrengthCount': 2, 'negativeStrengthCount': 0, 'percpositive': 1.59, 'percnegative': 0.0}
{'length': 44, 'negationwords': 1, 'positiveStrengthCount': 5, 'negativeStrengthCount': 0, 

Fold 1
Fold 2
Fold 3
Fold 4

Average Precision	Recall		F1 	Per Label
negative 	      0.479      0.503      0.490
positive 	      0.541      0.517      0.528

Macro Average Precision	Recall		F1 	Over All Labels
	      0.510      0.510      0.509

Label Counts {'negative': 4923, 'positive': 5077}
Micro Average Precision	Recall		F1 	Over All Labels
	      0.510      0.510      0.509
Overall Accuracy 0.522
         |   p   n |
         |   o   e |
         |   s   g |
         |   i   a |
         |   t   t |
         |   i   i |
         |   v   v |
         |   e   e |
---------+---------+
positive |<312>198 |
negative | 280<210>|
---------+---------+
(row = reference; col = test)



__LIWC__

In [48]:
## function to read kaggle training file, train and test a classifier 
def process(dirPath,limitStr):
    limit = int(limitStr)
    dirPath = '/Users/aivii/programsmm/FP736/corpus/'
    f = open('./corpus/train.csv','r', encoding='utf-8')
    phrasedata = []
    for line in f:
        if (not line.startswith(',label')): # ignore the first line 
            line = line.strip()
        #line = fp.readline()
            phrasedata.append(line.split(',')[1:3])
  
  # pick a random sample of length limit because of phrase overlapping sequences
    random.shuffle(phrasedata)
    phraselist = phrasedata[:limit]

    print('Read', len(phrasedata), 'phrases, using', len(phraselist), 'random phrases')
  
  # create list of phrase documents as (list of words, label)
    phrasedocs = []
  # add all the phrases

  # each phrase has a list of tokens and the sentiment label ( 0 or 1)
    for phrase in phraselist:
        tokens = nltk.word_tokenize(phrase[1])
        
        # phrasedocs.append((tokens, int(phrase[0])))
        sentiment = int(phrase[0])
        
        if (sentiment == 0):
            phrasedocs.append((tokens, 'negative'))
        if (sentiment == 1):
            phrasedocs.append((tokens, 'positive'))


  # possibly filter tokens
  # lowercase - each phrase is a pair consisting of a token list and a label
    docs = []
    for phrase in phrasedocs:
        lowerphrase = ([w.lower() for w in phrase[0]], phrase[1])
        docs.append (lowerphrase)
  # print a few
    for phrase in docs[:10]:
        print (phrase)

  # continue as usual to get all words and create word features
    all_words_list = [word for (sent,cat) in docs for word in sent]
    all_words = nltk.FreqDist(all_words_list)
    print(len(all_words))

  # get the 2000 most frequently appearing keywords in the corpus
    word_items = all_words.most_common(2000)
    word_features = [word for (word,count) in word_items]

  # feature sets from a feature definition function
    liwcfeaturesets = [(liwc_features(d, word_features, poslist, neglist), c) for (d, c) in docs]
    print(len(liwcfeaturesets)) # 100

  # train classifier and show performance in cross-validation
  # make a list of labels
    label_list = [c for (d,c) in docs]
    labels = list(set(label_list))    # gets only unique labels
    num_folds = 5
    cross_validation_PRF(num_folds, liwcfeaturesets, labels)
        
    train_set, test_set = liwcfeaturesets[round(.1*int(limit)):], liwcfeaturesets[:round(.1*int(limit))]
    classifier = nltk.NaiveBayesClassifier.train(train_set)
    print('Overall Accuracy', nltk.classify.accuracy(classifier, test_set))
    
    goldlist = [] # gold labels for the test set
    predictedlist = [] # test list, predicted 
    for (features, label) in test_set:
        goldlist.append(label)
        predictedlist.append(classifier.classify(features))
    cm = nltk.ConfusionMatrix(goldlist, predictedlist)
    print(cm.pretty_format(sort_by_count=True, truncate=9))
    
    print(cross_validation_accuracy(num_folds,liwcfeaturesets))
    

In [49]:
process(dirPath, 10000)

Read 25000 phrases, using 10000 random phrases
(['``', 'spoiler', '!', '!', '!', 'this', 'movie', 'is', 'based', 'on', 'the', 'concept', 'of', 'what', 'if', '?', 'of', 'course', 'mr', 'destiny', 'will', 'be', 'able', 'to', 'answer', 'this', 'question', '.', 'the', 'main', 'character', 'goes', 'through', 'a', 'bad', 'day'], 'positive')
(['``', 'the', 'actors', 'did', 'a', 'really', 'good', 'job', 'playing', 'their', 'roles', '--', 'particularly', 'the', 'mom', '.', 'however'], 'negative')
(['``', 'i', 'saw', 'not', 'so', 'fabulous', 'rating', 'on', 'imdb'], 'negative')
(['``', 'the', 'title', 'overstates', 'the', 'content', 'of', 'this', 'movie', 'somewhat'], 'positive')
(['``', 'broadway', 'and', 'film', 'actor-turned-director', 'john', 'cassavetes', '(', 'from', 'rosemary', "'s", 'baby', ')', 'creates', 'a', 'masterpiece', 'with', 'this', '1977', 'film', '.', 'it', 'stars', 'gena', 'rowlands'], 'positive')
(['``', 'a', 'classy', 'film', 'pulled', 'in', '2', 'directions', '.', 'to', 'i

__SL + LIWC__

In [50]:
## function to read kaggle training file, train and test a classifier 
def process(dirPath,limitStr):
    limit = int(limitStr)
    dirPath = '/Users/aivii/programsmm/FP736/corpus/'
    f = open('./corpus/train.csv','r', encoding='utf-8')
    phrasedata = []
    for line in f:
        if (not line.startswith(',label')): # ignore the first line 
            line = line.strip()
        #line = fp.readline()
            phrasedata.append(line.split(',')[1:3])
  
  # pick a random sample of length limit because of phrase overlapping sequences
    random.Random(1234).shuffle(phrasedata)
    phraselist = phrasedata[:limit]

    print('Read', len(phrasedata), 'phrases, using', len(phraselist), 'random phrases')
  
  # create list of phrase documents as (list of words, label)
    phrasedocs = []
  # add all the phrases

  # each phrase has a list of tokens and the sentiment label ( 0 or 1)
    for phrase in phraselist:
        tokens = nltk.word_tokenize(phrase[1])
        
        # phrasedocs.append((tokens, int(phrase[0])))
        sentiment = int(phrase[0])
        
        if (sentiment == 0):
            phrasedocs.append((tokens, 'negative'))
        if (sentiment == 1):
            phrasedocs.append((tokens, 'positive'))


  # possibly filter tokens
  # lowercase - each phrase is a pair consisting of a token list and a label
    docs = []
    for phrase in phrasedocs:
        lowerphrase = ([w.lower() for w in phrase[0]], phrase[1])
        docs.append (lowerphrase)
  # print a few
    for phrase in docs[:10]:
        print (phrase)

  # continue as usual to get all words and create word features
    all_words_list = [word for (sent,cat) in docs for word in sent]
    all_words = nltk.FreqDist(all_words_list)
    print(len(all_words))

  # get the 2000 most frequently appearing keywords in the corpus
    word_items = all_words.most_common(2000)
    word_features = [word for (word,count) in word_items]

  # feature sets from a feature definition function
    SLliwcfeaturesets = [(SL_liwc_features(d, word_features, SL,poslist, neglist), c) for (d, c) in docs]
    print(len(SLliwcfeaturesets)) # 100

  # train classifier and show performance in cross-validation
  # make a list of labels
    label_list = [c for (d,c) in docs]
    labels = list(set(label_list))    # gets only unique labels
    num_folds = 5
    cross_validation_PRF(num_folds, SLliwcfeaturesets, labels)
        
    train_set, test_set = SLliwcfeaturesets[round(.1*int(limit)):], SLliwcfeaturesets[:round(.1*int(limit))]
    classifier = nltk.NaiveBayesClassifier.train(train_set)
    print('Overall Accuracy', nltk.classify.accuracy(classifier, test_set))
    
    goldlist = [] # gold labels for the test set
    predictedlist = [] # test list, predicted 
    for (features, label) in test_set:
        goldlist.append(label)
        predictedlist.append(classifier.classify(features))
    cm = nltk.ConfusionMatrix(goldlist, predictedlist)
    print(cm.pretty_format(sort_by_count=True, truncate=9))
    
    print(cross_validation_accuracy(num_folds,SLliwcfeaturesets))
        

In [51]:
process(dirPath, 10000)

Read 25000 phrases, using 10000 random phrases
(['``', 'hundstage', 'is', 'an', 'intentionally', 'ugly', 'and', 'unnerving', 'study', 'of', 'life', 'in', 'a', 'particularly', 'dreary', 'suburb', 'of', 'vienna', '.', 'it', 'comes', 'from', 'former', 'documentary', 'director', 'ulrich', 'seidl', 'who', 'adopts', 'a', 'very', 'documentary-like', 'approach', 'to', 'the', 'material', '.', 'however'], 'positive')
(['``', 'oh', 'dear', 'we', 'do', "n't", 'like', 'it', 'when', 'our', 'super-hero', 'love', 'interest', 'develops', 'a', 'brain', 'do', 'we', '?', 'something', 'has', 'happened', 'to', 'people'], 'positive')
(['``', 'been', 'lurking', 'for', 'a', 'couple', 'of', 'years', 'or', 'so', '.', 'i', 'have', 'never', 'been', 'moved', 'to', 'post', 'on', 'here', 'before'], 'negative')
(['``', 'any', 'story', 'comprises', 'a', 'premise'], 'positive')
(['``', 'in', 'my', 'humble', 'opinion'], 'negative')
(['``', 'there', 'are', 'movies', 'like', '``', "''", 'plan', '9', "''", "''", 'that', 'ar