import sys
import numpy as np
import csv
import nltk
nltk.download('stopwords')
nltk.download('twitter_samples')
nltk.download('punkt')
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords, twitter_samples
from nltk.tag import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.classify import NaiveBayesClassifier
import string
import re
import pickle

In [2]:
def Tokenization(sentence):
    token_method = TweetTokenizer()
    token_list = token_method.tokenize(sentence)
    return token_list


def Cleaner(token_list, stop_words=(), english_punctuations=()):
    # pos_tag && Lemmatisation
    wordnet = WordNetLemmatizer()
    lemmatized_sentence = []
    for word, tag in pos_tag(token_list):
        word = re.sub(r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+", "", word)
        word = re.sub(r"@[a-zA-Z0-9_]+", "", word)
        if tag.startswith('NN'):
            lemmatized_sentence.append(wordnet.lemmatize(word.lower(), pos='n'))
        elif tag.startswith('VB'):
            lemmatized_sentence.append(wordnet.lemmatize(word.lower(), pos='v'))
        elif tag.startswith('JJ'):
            lemmatized_sentence.append(wordnet.lemmatize(word.lower(),pos='a'))
        elif tag.startswith('R'):
            lemmatized_sentence.append(wordnet.lemmatize(word.lower(),pos='r'))
        else:
            lemmatized_sentence.append(word.lower())

    # remove stop_words and the punctuations
    cleaned_token_list = []
    for word in lemmatized_sentence:
        if word.lower() not in stop_words and len(word) > 0 and word not in english_punctuations:
            cleaned_token_list.append(word)
        # else:
        #     print(word)
    return cleaned_token_list

In [3]:
def deal_trainset(stop_words, english_punctuations):
    positive_tweets = twitter_samples.strings('positive_tweets.json')
    negative_tweets = twitter_samples.strings('negative_tweets.json')

    positive_cleaned_tokens_list = []
    negative_cleaned_tokens_list = []

    for pos_sen in positive_tweets:
        positive_cleaned_tokens_list.append(Cleaner(Tokenization(pos_sen), stop_words, english_punctuations))
    for neg_sen in negative_tweets:
        negative_cleaned_tokens_list.append(Cleaner(Tokenization(neg_sen), stop_words, english_punctuations))

    print(positive_cleaned_tokens_list)

    pos_model = []
    for pos_sen in positive_cleaned_tokens_list:
        pos_model.append(dict([word, True] for word in pos_sen))
    # print(pos_model)

    pos_dataset = [(pos_dict,"Positive") for pos_dict in pos_model]

    # print(pos_dataset)

    neg_model = []
    for neg_sen in negative_cleaned_tokens_list:
        neg_model.append(dict([word, True] for word in neg_sen))
    neg_dataset = [(neg_dict,"Negative") for neg_dict in neg_model]
    return pos_dataset, neg_dataset

In [4]:
    stop_words = stopwords.words('english')
    english_punctuations = []
    punc = string.punctuation
    for pun in punc:
        english_punctuations.append(pun)

    pos_dataset,neg_dataset = deal_trainset(stop_words, english_punctuations)

[['#followfriday', 'top', 'engage', 'member', 'community', 'week', ':)'], ['hey', 'james', 'odd', ':/', 'please', 'call', 'contact', 'centre', '02392441234', 'able', 'assist', ':)', 'many', 'thanks'], ['listen', 'last', 'night', ':)', 'bleed', 'amazing', 'track', 'scotland'], ['congrats', ':)'], ['yeaaaah', 'yippppy', 'accnt', 'verify', 'rqst', 'succeed', 'get', 'blue', 'tick', 'mark', 'fb', 'profile', ':)', '15', 'day'], ['one', 'irresistible', ':)', '#flipkartfashionfriday'], ['like', 'keep', 'lovely', 'customer', 'wait', 'long', 'hope', 'enjoy', 'happy', 'friday', 'lwwf', ':)'], ['second', 'thought', '’', 'enough', 'time', 'dd', ':)', 'new', 'short', 'enter', 'system', 'sheep', 'must', 'buy'], ['jgh', 'go', 'bayan', ':d', 'bye'], ['act', 'mischievousness', 'call', 'etl', 'layer', 'in-house', 'warehouse', 'app', 'katamari', 'well', '…', 'name', 'imply', ':p'], ['#followfriday', 'top', 'influencers', 'community', 'week', ':)'], ['love', 'big', '...', 'juicy', '...', 'selfies', ':)'], 

In [5]:
import random

In [6]:
dataset = pos_dataset + neg_dataset

In [8]:
len(dataset), len(pos_dataset), len(neg_dataset)

(10000, 5000, 5000)

In [9]:
random.shuffle(dataset)

In [10]:
len(dataset)

10000

In [12]:
train_set = dataset[0:7500] ; test_set = dataset[7500:10000]

In [19]:
test_set[0], test_set[2]

(({'youre': True,
   'gonna': True,
   'guess': True,
   'whats': True,
   'inside': True,
   'box': True,
   ':(': True},
  'Negative'),
 ({'thank': True, 'amazing': True, 'day': True, ':-)': True}, 'Positive'))

In [13]:
classifer = NaiveBayesClassifier.train(train_set)

In [23]:
def get_eva(test_set, classifer):
    tp, fp, fn, tn = 0, 0, 0, 0
    for i in range(len(test_set)):
        pred = classifer.classify(test_set[i][0])
        if pred == 'Positive' and test_set[i][1] == 'Positive':
            tp += 1
        elif pred == 'Positive' and test_set[i][1] == 'Negative':
            fn += 1
        elif pred == 'Negative' and test_set[i][1] == 'Positive':
            fp += 1
        elif pred == 'Negative' and test_set[i][1] == 'Negative':
            tn += 1
    print(tp, fp, fn, tn)
    precision = tp/(tp+fp)
    recall = tp/(tp+fn)
    f1 = 2 * precision * recall /(precision + recall)
    accuracy = (tp+tn)/len(test_set)
    return precision, recall, f1, accuracy

        

In [24]:
precision, recall, f1, accuracy = get_eva(test_set, classifer)

1264 3 6 1227


In [25]:
precision, recall, f1, accuracy


(0.9976322020520916, 0.9952755905511811, 0.9964525029562477, 0.9964)

In [26]:
print('Precision: ', precision,' Recall: ', recall,'\nF1: ',f1,' Accuracy: ', accuracy)

Precision:  0.9976322020520916  Recall:  0.9952755905511811 
F1:  0.9964525029562477  Accuracy:  0.9964
