In [25]:
import re
import os
import sys
import collections
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nltk
from nltk.tokenize import TweetTokenizer
from nltk.stem import WordNetLemmatizer
import string
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.classify import NaiveBayesClassifier
from nltk.metrics import accuracy, precision, recall, f_measure

In [34]:
import re
from nltk.tokenize import TweetTokenizer
from nltk.tokenize.util import align_tokens

def represent_negation(tokens):
    negation_words = ["not", "n't", "no", "never", "none", "nobody", "nothing", "nowhere", "neither", "nor"]
    negation_pattern = r"\b(?:{})\b".format("|".join(negation_words))
    twtokenizer = TweetTokenizer()
    negation_tokens = []
    is_negated = False
    for token in tokens:
        if re.match(negation_pattern, token, flags=re.IGNORECASE):
            is_negated = not is_negated
        else:
            if is_negated:
                negation_tokens.append("NOT_" + token)
            else:
                negation_tokens.append(token)
    return negation_tokens

def processtweets(dirPath, limitStr):
    # convert the limit argument from a string to an int
    limit = int(limitStr)
    # initialize NLTK built-in tweet tokenizer
    twtokenizer = TweetTokenizer()

    os.chdir(dirPath)

    f = open('./corpus/downloaded-tweeti-b-dist.tsv', 'r')
    # loop over lines in the file and use the first limit of them
    # assuming that the tweets are sufficiently randomized
    tweetdata = []
    for line in f:
        if len(tweetdata) < limit:
            # remove final end of line character
            line = line.strip()
            # each line has 4 items separated by tabs
            # ignore the tweet and user ids, and keep the sentiment and tweet text
            tweetdata.append(line.split('\t')[2:4])

    tweetdocs = []
    # add all the tweets except the ones whose text is Not Available
    for tweet in tweetdata:
        if tweet[1] != 'Not Available':
            # run the tweet tokenizer on the text string - returns unicode tokens, so convert to utf8
            tokens = twtokenizer.tokenize(tweet[1])

            # Represent negation
            tokens = represent_negation(tokens)

            # Convert to lowercase
            tokens = [token.lower() for token in tokens]

            # Remove punctuation
            tokens = [token for token in tokens if token not in string.punctuation]

            # Remove words with numbers (eg 11th)
            tokens = [re.sub(r'\S*\d\S*', '', token).strip() for token in tokens]
            tokens = [token for token in tokens if token]

            def decontracted(phrase):
                # specific
                phrase = re.sub(r"won\'t", "will not", phrase)
                phrase = re.sub(r"can\'t", "can not", phrase)

                # general
                phrase = re.sub(r"n\'t", " not", phrase)
                phrase = re.sub(r"\'re", " are", phrase)
                phrase = re.sub(r"\'s", " is", phrase)
                phrase = re.sub(r"\'d", " would", phrase)
                phrase = re.sub(r"\'ll", " will", phrase)
                phrase = re.sub(r"\'t", " not", phrase)
                phrase = re.sub(r"\'ve", " have", phrase)
                phrase = re.sub(r"\'m", " am", phrase)
                return phrase

            tokens = [decontracted(token) for token in tokens]

            # Remove stopwords
            stop_words = set(stopwords.words('english'))
            tokens = [token for token in tokens if token not in stop_words]

            # Lemmatization
            lemmatizer = WordNetLemmatizer()
            tokens = [lemmatizer.lemmatize(token) for token in tokens]
            tokens = [token.strip() for token in tokens]

            if tweet[0] == '"positive"':
                label = 'pos'
            elif tweet[0] == '"negative"':
                label = 'neg'
            elif tweet[0] in ('"neutral"', '"objective"', '"objective-OR-neutral"'):
                label = 'neu'
            else:
                label = ''

            tweetdocs.append((tokens, label))

    df = pd.DataFrame(tweetdocs, columns=['tokens', 'labels'])
    df.to_csv('tweetdocs1.csv', index=False)

    def bag_of_words_features(tokens):
        freq_dist = FreqDist(tokens)
        most_common_words = freq_dist.most_common(20)
        word_features = {word: True for word, _ in most_common_words}
        return word_features

    feature_sets = []
    for entry in tweetdocs:
        tokens = entry[0]
        label = entry[1]
        features = bag_of_words_features(tokens)
        feature_set = (features, label)
        feature_sets.append(feature_set)

    train_set = feature_sets[:7000]
    test_set = feature_sets[7000:]

    nb_classifier = NaiveBayesClassifier.train(train_set)
    print('Accuracy:', nltk.classify.accuracy(nb_classifier, test_set))

    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)

    for i, (features, label) in enumerate(test_set):
        refsets[label].add(i)
        observed = nb_classifier.classify(features)
        testsets[observed].add(i)

    prec = precision(refsets['pos'], testsets['pos'])
    rec = recall(refsets['pos'], testsets['pos'])
    f1 = f_measure(refsets['pos'], testsets['pos'])

    print("Precision:", prec)
    print("Recall:", rec)
    print("F1 Score:", f1)




In [35]:
tweetdoc = processtweets('.',10000)

Accuracy: 0.5844370860927153
Precision: 0.6789667896678967
Recall: 0.6865671641791045
F1 Score: 0.6827458256029685
