In [2]:
import nltk
nltk.download('stopwords')
nltk.download('twitter_samples')
nltk.download('punkt')
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords, twitter_samples
from nltk.tag import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer
import string
import re
import numpy as np

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/buming/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package twitter_samples to
[nltk_data]     /Users/buming/nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!
[nltk_data] Downloading package punkt to /Users/buming/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
def Tokenization(sentence):
    token_method = TweetTokenizer()
    token_list = token_method.tokenize(sentence)
    return token_list

def Cleaner(token_list, stop_words=(), english_punctuations=()):
    #pos_tag && Lemmatisation
    wordnet = WordNetLemmatizer()
    lemmatized_sentence = []
    for word, tag in pos_tag(token_list):
        word = re.sub(r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+","",word)
        word = re.sub(r"@[a-zA-Z0-9_]+","",word)
        if tag.startswith('NN'):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'
        lemmatized_sentence.append(wordnet.lemmatize(word.lower(),pos=pos))

    #remove stop_words and the punctuations    
    cleaned_token_list = []
    for word in lemmatized_sentence:
        if word.lower() not in stop_words and len(word) > 0 and word not in english_punctuations:
            cleaned_token_list.append(word)
    return cleaned_token_list

In [4]:
stop_words = stopwords.words('english')
english_punctuations = []
punc = string.punctuation
for pun in punc:
    english_punctuations.append(pun)

In [5]:
positive_tweets = twitter_samples.strings('positive_tweets.json')
negative_tweets = twitter_samples.strings('negative_tweets.json')

In [6]:
positive_cleaned_tokens_list = []
negative_cleaned_tokens_list = []
for positive_sentence in positive_tweets:
    positive_cleaned_tokens_list.append(Cleaner(Tokenization(positive_sentence), stop_words, english_punctuations))

for negative_sentence in negative_tweets:
    negative_cleaned_tokens_list.append(Cleaner(Tokenization(negative_sentence), stop_words, english_punctuations))

In [7]:
pos_model = []
for pos_sen in positive_cleaned_tokens_list:
    pos_model.append(dict([word, True] for word in pos_sen))
pos_dataset = [(pos_dict,"Positive") for pos_dict in pos_model]

neg_model = []
for neg_sen in negative_cleaned_tokens_list:
    neg_model.append(dict([word, True] for word in neg_sen))
neg_dataset = [(neg_dict,"Negative") for neg_dict in neg_model]

In [8]:
dataset = pos_dataset + neg_dataset

In [9]:
import random
import math

In [10]:
random.shuffle(dataset)

In [11]:
train_set = dataset[0:7500]; test_set = dataset[7500:10000] 

In [12]:
all_pos_words = {}
all_neg_words = {}

for temp in train_set:
    if temp[1] == "Positive":
        for word in temp[0]:
            if word in all_pos_words.keys():
                all_pos_words[word] = all_pos_words[word] + 1
            else:
                all_pos_words.setdefault(word, 1)
    elif temp[1] == 'Negative':
        for word in temp[0]:
            if word in all_neg_words.keys():
                all_neg_words[word] = all_neg_words[word] + 1
            else:
                all_neg_words.setdefault(word, 1)

In [13]:
pos_words = list(all_pos_words.keys())
neg_words = list(all_neg_words.keys())

In [14]:
def predict(sentence, stop_words, english_punctuations, pos_words, neg_words):
#     cleaned_sentence = Cleaner(Tokenization(sentence), stop_words, english_punctuations)
    pos_pro_ln = 0.0 ; neg_pro_ln = 0.0
    for word in sentence:
        pos_count = 1.0 ; neg_count = 1.0
        pos_denom = 2.0 ; neg_denom = 2.0
        if word in pos_words:
            pos_count += 1.0 ; pos_denom += 1.0
        if word in neg_words:
            neg_count += 1.0 ; neg_denom += 1.0
        pos_pro_ln = pos_pro_ln + math.log(pos_count/pos_denom) + math.log(1.0/len(pos_words))
        neg_pro_ln = neg_pro_ln + math.log(neg_count/neg_denom) + math.log(1.0/len(neg_words))
#     print(pos_pro_ln, neg_pro_ln)
    pos_pro = math.exp(pos_pro_ln)
    neg_pro = math.exp(neg_pro_ln)
#     print('pos_pro: ', pos_pro)
#     print('neg_pro: ', neg_pro)
    if pos_pro > neg_pro:
#         print('Positive')
        return pos_pro, 'Positive'
    else:
#         print('Negative')
        return neg_pro, 'Negative'

In [15]:
def get_eva(test_set, stop_words, english_punctuations, pos_words, neg_words):
    tp, fp, fn, tn = 0, 0, 0, 0
    for i in range(len(test_set)):
        score, pred = predict(test_set[i][0], stop_words, english_punctuations, pos_words, neg_words)
        if pred == 'Positive' and test_set[i][1] == 'Positive':
            tp += 1
        elif pred == 'Positive' and test_set[i][1] == 'Negative':
            fn += 1
        elif pred == 'Negative' and test_set[i][1] == 'Positive':
            fp += 1
        elif pred == 'Negative' and test_set[i][1] == 'Negative':
            tn += 1
    print(tp, fp, fn, tn)
    precision = tp/(tp+fp)
    recall = tp/(tp+fn)
    f1 = 2 * precision * recall /(precision + recall)
    accuracy = (tp+tn)/len(test_set)
    return precision, recall, f1, accuracy

In [16]:
precision, recall, f1, accuracy = get_eva(test_set, stop_words, english_punctuations, pos_words, neg_words)

339 890 59 1212


In [17]:
precision, recall, f1, accuracy 

(0.2758340113913751, 0.8517587939698492, 0.4167178856791641, 0.6204)