In [1]:
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import twitter_samples, stopwords
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize
import matplotlib.pyplot as plt
from nltk.metrics import ConfusionMatrix
import seaborn
from nltk import FreqDist, classify, NaiveBayesClassifier
import pickle
import joblib
import re, string, random

def normalize_data(tweet_tokens, stop_words = ()):
    """Clean, lemmatize and lowercase the tokens of a single tweet.

    Every token is part-of-speech tagged, has URL and @mention substrings
    removed, and is lemmatized as a noun, verb or adjective depending on its
    tag. A token is kept only if, after cleaning, it is non-empty, is not a
    substring of ``string.punctuation``, and its lowercase form is not a
    stop word.

    Args:
        tweet_tokens: sequence of string tokens belonging to one tweet.
        stop_words: collection of lowercase words to discard. Defaults to an
            empty tuple, i.e. nothing is discarded as a stop word.

    Returns:
        A list with the surviving tokens, lowercased, in their original order.
    """
    # Raw strings keep the regex text identical while avoiding the invalid
    # escape sequences ("\(" and "\)") that a plain string literal triggers.
    url_pattern = (r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'
                   r'(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    mention_pattern = r"(@[A-Za-z0-9_]+)"

    # Membership tests against a list/tuple are linear; hash the stop words
    # once per call instead. Other container types are left untouched.
    if isinstance(stop_words, (list, tuple)):
        stop_words = frozenset(stop_words)

    # One lemmatizer per call is enough; it does not depend on the token.
    lemmatizer = WordNetLemmatizer()

    cleaned_tokens = []

    for token, tag in pos_tag(tweet_tokens):
        token = re.sub(url_pattern, '', token)
        token = re.sub(mention_pattern, '', token)

        # Map the tagger's tag prefix onto the WordNet part-of-speech codes.
        if tag.startswith("NN"):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'

        token = lemmatizer.lemmatize(token, pos)
        lowered = token.lower()

        if len(token) > 0 and token not in string.punctuation and lowered not in stop_words:
            cleaned_tokens.append(lowered)
    return cleaned_tokens

def get_all_words(cleaned_tokens_list):
    """Lazily flatten a list of token lists into one stream of tokens.

    Args:
        cleaned_tokens_list: iterable whose items are iterables of tokens.

    Yields:
        Each token of each inner iterable, in order.
    """
    for tweet in cleaned_tokens_list:
        yield from tweet

def data_for_model(cleaned_tokens_list):
    """Turn each token list into a ``{token: True}`` feature dictionary.

    Args:
        cleaned_tokens_list: iterable whose items are iterables of tokens.

    Yields:
        One dict per inner iterable, mapping every distinct token to True.
    """
    for words in cleaned_tokens_list:
        features = dict.fromkeys(words, True)
        yield features

if __name__ == "__main__":
    # Experiment settings, named so they can be tuned in one place.
    SPLIT_SEED = 42     # fixes the shuffle so the train/test split is repeatable
    TRAIN_SIZE = 7000   # number of shuffled examples used for training

    # Raw tweet strings from the NLTK twitter_samples corpus.
    positive_tweets = twitter_samples.strings('positive_tweets.json')
    negative_tweets = twitter_samples.strings('negative_tweets.json')
    # NOTE(review): `text` and `tweet_tokens` are not used by the code visible
    # here; they are kept in case other cells rely on them.
    text = twitter_samples.strings('tweets.20150430-223406.json')
    tweet_tokens = twitter_samples.tokenized('positive_tweets.json')[0]

    stop_words = stopwords.words('english')

    positive_tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
    negative_tweet_tokens = twitter_samples.tokenized('negative_tweets.json')

    # Clean every tweet with the same stop-word list.
    positive_cleaned_tokens_list = [
        normalize_data(tokens, stop_words) for tokens in positive_tweet_tokens
    ]
    negative_cleaned_tokens_list = [
        normalize_data(tokens, stop_words) for tokens in negative_tweet_tokens
    ]

    # Quick sanity check: the most frequent tokens among positive tweets.
    all_pos_words = get_all_words(positive_cleaned_tokens_list)

    freq_dist_pos = FreqDist(all_pos_words)
    print(freq_dist_pos.most_common(10))

    # Convert token lists into {token: True} feature dicts and attach labels.
    positive_tokens_for_model = data_for_model(positive_cleaned_tokens_list)
    negative_tokens_for_model = data_for_model(negative_cleaned_tokens_list)

    positive_dataset = [(tweet_dict, "Positive")
                         for tweet_dict in positive_tokens_for_model]

    negative_dataset = [(tweet_dict, "Negative")
                         for tweet_dict in negative_tokens_for_model]

    dataset = positive_dataset + negative_dataset

    # A dedicated, seeded generator makes the split reproducible without
    # touching the state of the module-level random generator.
    random.Random(SPLIT_SEED).shuffle(dataset)

    train_data = dataset[:TRAIN_SIZE]
    test_data = dataset[TRAIN_SIZE:]

    classifier = NaiveBayesClassifier.train(train_data)
    # Persist the trained model so later cells can reload it.
    joblib.dump(classifier,'NaiveBayesClassifier_trained_model')

[(':)', 3691), (':-)', 701), (':d', 658), ('thanks', 388), ('follow', 357), ('love', 333), ('...', 290), ('good', 283), ('get', 263), ('thank', 253)]


In [31]:
from nltk.metrics import ConfusionMatrix
import numpy as np

# Reload the persisted model so the evaluation below exercises the saved
# artifact rather than the in-memory classifier.
# NOTE: joblib files are pickle-based; only load files from a trusted source.
trained_model = joblib.load('NaiveBayesClassifier_trained_model')

print("Accuracy is:", classify.accuracy(trained_model, test_data))

# show_most_informative_features() prints its table itself and returns None,
# so wrapping it in print() would emit a stray "None" line.
trained_model.show_most_informative_features(10)

# Gold labels and model predictions, in the same order, for the confusion matrix.
labels = [label for _, label in test_data]
tests = [trained_model.classify(feats) for feats, _ in test_data]

confusion_matrix = ConfusionMatrix(labels, tests)
print("CONFUSION MATRIX")
print(confusion_matrix)

custom_tweet = "Very great service loved it"

# Normalize the custom tweet with the same stop-word list used for training.
custom_tokens = normalize_data(word_tokenize(custom_tweet), stop_words)

print(custom_tweet+"   ================>>  "+trained_model.classify(dict.fromkeys(custom_tokens, True)))


Accuracy is: 0.996
Most Informative Features
                      :( = True           Negati : Positi =   2064.5 : 1.0
                      :) = True           Positi : Negati =   1647.3 : 1.0
                follower = True           Positi : Negati =     33.7 : 1.0
                     sad = True           Negati : Positi =     22.4 : 1.0
                 welcome = True           Positi : Negati =     21.4 : 1.0
                    sick = True           Negati : Positi =     19.7 : 1.0
                     x15 = True           Negati : Positi =     17.0 : 1.0
                     bam = True           Positi : Negati =     16.3 : 1.0
                      aw = True           Negati : Positi =     15.0 : 1.0
                followed = True           Negati : Positi =     14.2 : 1.0
None
CONFUSION MATRIX
         |    N    P |
         |    e    o |
         |    g    s |
         |    a    i |
         |    t    t |
         |    i    i |
         |    v    v |
         |    e    e |