In [1]:
import sys
import os

module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

import tweet_preprocess_mod as pre
import time
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.corpus import twitter_samples
from random import shuffle
from sklearn.model_selection import train_test_split

from sklearn.naive_bayes import (
    BernoulliNB,
    ComplementNB,
    MultinomialNB
)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier

In [2]:
pos_tweets = [" ".join(pre.process_tweet(t)) for t in twitter_samples.strings("positive_tweets.json")]
neg_tweets = [" ".join(pre.process_tweet(t)) for t in twitter_samples.strings("negative_tweets.json")]
all_tweets = pos_tweets + neg_tweets
shuffle(all_tweets)

Extract the words from positive and negative tweets:

In [3]:
pos_words = []
neg_words = []
for p_t in pos_tweets:
    words = nltk.word_tokenize(p_t)
    for word in words:
        pos_words.append(word)
for n_t in neg_tweets:
    words = nltk.word_tokenize(n_t)
    for word in words:
        neg_words.append(word)

Find frequency and remove common words:

In [4]:
pos_fd = nltk.FreqDist(pos_words)
neg_fd = nltk.FreqDist(neg_words)
common_Set = set(pos_fd).intersection(neg_fd)

for word in common_Set:
    del pos_fd[word]
    del neg_fd[word]

In [6]:
top_pos = {word for word, count in pos_fd.most_common(10)}
top_neg = {word for word, count in neg_fd.most_common(10)}

top_pos

{'fantastic',
 'fback',
 'flipkartfashionfriday',
 'followfriday',
 'happiness',
 'here',
 'stats',
 'unfollowers',
 'warsaw',
 'youth'}

In [7]:
top_neg

{'beli̇eve',
 'justi̇n',
 'pray',
 'wi̇ll',
 'zayniscomingbackonjuly',
 '♛',
 '》',
 'ｍｅ',
 'ｓｅｅ',
 '😩'}

-----------------------
Feature Extraction:

In [10]:
vader = SentimentIntensityAnalyzer()

In [11]:
def extract_features(tweet):
    features = dict()
    wordcount = 0
    compound_scores = list()
    positive_scores = list()
    words = nltk.word_tokenize(tweet)

    for word in words:
        if word.lower() in top_pos:
            wordcount += 1

    compound_scores.append(vader.polarity_scores(tweet)["compound"])
    positive_scores.append(vader.polarity_scores(tweet)["pos"])

    features["compound"] = sum(compound_scores) + 1
    features["positive"] = sum(positive_scores)
    features["wordcount"] = wordcount

    return features

In [15]:
features = [
    (extract_features(tweet), "pos")
    for tweet in pos_tweets
]
features.extend([
    (extract_features(tweet), "neg")
    for tweet in neg_tweets
])

shuffle(features)
train, test = train_test_split(features, test_size=0.5)

In [16]:
features

[({'compound': 1.3612, 'positive': 0.477, 'wordcount': 0}, 'pos'),
 ({'compound': 1.3252, 'positive': 0.368, 'wordcount': 0}, 'pos'),
 ({'compound': 1.7717, 'positive': 0.691, 'wordcount': 0}, 'pos'),
 ({'compound': 1.6166, 'positive': 0.801, 'wordcount': 0}, 'pos'),
 ({'compound': 1.4588, 'positive': 0.6, 'wordcount': 0}, 'pos'),
 ({'compound': 1.6486, 'positive': 0.431, 'wordcount': 0}, 'pos'),
 ({'compound': 1.7506, 'positive': 0.649, 'wordcount': 0}, 'pos'),
 ({'compound': 1.4588, 'positive': 0.5, 'wordcount': 0}, 'pos'),
 ({'compound': 0.9484, 'positive': 0.355, 'wordcount': 0}, 'neg'),
 ({'compound': 1.802, 'positive': 0.455, 'wordcount': 0}, 'pos'),
 ({'compound': 0.872, 'positive': 0.316, 'wordcount': 0}, 'pos'),
 ({'compound': 1.5106000000000002, 'positive': 1.0, 'wordcount': 0}, 'pos'),
 ({'compound': 1.7351, 'positive': 0.554, 'wordcount': 0}, 'pos'),
 ({'compound': 0.4577, 'positive': 0.0, 'wordcount': 0}, 'neg'),
 ({'compound': 1.7096, 'positive': 0.425, 'wordcount': 0}, '

-----------------------
Classifiers:

In [17]:
classifiers = {
    "BernoulliNB": BernoulliNB(),
    "ComplementNB": ComplementNB(),
    "MultinomialNB": MultinomialNB(),
    "KNeighborsClassifier": KNeighborsClassifier(),
    "DecisionTreeClassifier": DecisionTreeClassifier(),
    "RandomForestClassifier": RandomForestClassifier(),
    "LogisticRegression": LogisticRegression(),
    "MLPClassifier": MLPClassifier(max_iter=1000),
    "AdaBoostClassifier": AdaBoostClassifier(),
}

In [18]:
for name, sklearn_classifier in classifiers.items():
    classifier = nltk.classify.SklearnClassifier(sklearn_classifier)
    classifier.train(train)

    acc = nltk.classify.accuracy(classifier, test)
    print(f"{acc:.2%} - {name}")

76.24% - BernoulliNB
68.24% - ComplementNB
65.92% - MultinomialNB
91.90% - KNeighborsClassifier
90.78% - DecisionTreeClassifier
92.52% - RandomForestClassifier
89.50% - LogisticRegression
89.52% - MLPClassifier
89.92% - AdaBoostClassifier
