In [13]:
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import twitter_samples, stopwords
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize
from nltk import FreqDist, classify, NaiveBayesClassifier

import re, string, random
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /Users/nick/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/nick/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [14]:
# The strings() method of twitter_samples will print all of the tweets within a dataset as strings. 
# Setting the different tweet collections as a variable will make processing and testing easier.

positive_tweets = twitter_samples.strings("positive_tweets.json")
negative_tweets = twitter_samples.strings("negative_tweets.json")
text = twitter_samples.strings('tweets.20150430-223406.json')

# Using NLTK’s tokenizers such as punkt
# The punkt module is a pre-trained model that helps tokenize words and sentences. 
tweet_tokens = twitter_samples.tokenized("positive_tweets.json")

In [78]:
print("Positive tweet example:", positive_tweets[0])

Positive tweet example: #FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)


In [79]:
print("Negative tweet example:", negative_tweets[0])

Negative tweet example: hopeless for tmr :(


In [106]:
# Dataset details

print("The number of positive tweets is:", len(positive_tweets))
print("The number of negative tweets is:", len(negative_tweets))
print("The dataset is consisting of:", len(text))

The number of positive tweets is: 5000
The number of negative tweets is: 5000
The dataset is consisting of: 20000


In [15]:
def remove_noise(tweet_tokens, stop_words = ()):
    """
    This function removes noise and incorporates the normalization and lemmatization functions. 
    The code takes two arguments: the tweet tokens and the tuple of stop words.
    
    Normalization Part of the function
    The code then uses a loop to remove the noise from the dataset. 
    To remove hyperlinks, the code first searches for a substring that matches a URL starting with 
    http:// or https://, followed by letters, numbers, or special characters. 
    Once a pattern is matched, the .sub() method replaces it with an empty string, or ''.
    
    Similar approach applied to @ signs.
    Punctuation using the library string.
    
    Lemmatization part of the function
    The function lemmatize_sentence first gets the position tag of each token of a tweet. 
    Within the if statement, if the tag starts with NN, the token is assigned as a noun. 
    Similarly, if the tag starts with VB, the token is assigned as a verb.
    """
    
    cleaned_tokens = []

    for token, tag in pos_tag(tweet_tokens):
        token = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'\
                       '(?:%[0-9a-fA-F][0-9a-fA-F]))+','', token)
        token = re.sub("(@[A-Za-z0-9_]+)","", token)

        if tag.startswith("NN"):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'

        lemmatizer = WordNetLemmatizer()
        token = lemmatizer.lemmatize(token, pos)

        if len(token) > 0 and token not in string.punctuation and token.lower() not in stop_words:
            cleaned_tokens.append(token.lower())
    return cleaned_tokens

In [16]:
# testing .words() method
stop_words = stopwords.words('english')

print(remove_noise(tweet_tokens[0], stop_words)) # because we have deployed remove_noise function above
                                                   # all @ mentions, stop words, and converts the words to lowercase.

['#followfriday', 'top', 'engage', 'member', 'community', 'week', ':)']


In [17]:
# Implementing code to clean the sample tweets

positive_tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
negative_tweet_tokens = twitter_samples.tokenized('negative_tweets.json')

positive_cleaned_tokens_list = []
negative_cleaned_tokens_list = []

for tokens in positive_tweet_tokens:
    positive_cleaned_tokens_list.append(remove_noise(tokens, stop_words))

for tokens in negative_tweet_tokens:
    negative_cleaned_tokens_list.append(remove_noise(tokens, stop_words))

In [18]:
# Testing the implementaion of the code which cleaning the sample positive tweets
print(positive_tweet_tokens[0])
print(positive_cleaned_tokens_list[0])

['#FollowFriday', '@France_Inte', '@PKuchly57', '@Milipol_Paris', 'for', 'being', 'top', 'engaged', 'members', 'in', 'my', 'community', 'this', 'week', ':)']
['#followfriday', 'top', 'engage', 'member', 'community', 'week', ':)']


In [148]:
# Testing the implementaion of the code which cleaning the sample negative tweets
print(negative_tweet_tokens[0])
print(negative_cleaned_tokens_list[0])

['hopeless', 'for', 'tmr', ':(']
['hopeless', 'tmr', ':(']


In [149]:
def get_all_words(cleaned_tokens_list):
    """
    This function takes a list of tweets as an argument to provide a list of words 
    in all of the tweet tokens joined.
    """
    for tokens in cleaned_tokens_list:
        for token in tokens:
            yield token

all_pos_words = get_all_words(positive_cleaned_tokens_list)
all_neg_words = get_all_words(negative_cleaned_tokens_list)

In [20]:
# Finding out the most common words in the postive cleaned sample tweets, 
# using the FreqDist class of NLTK
# The .most_common() method lists the words which occur most frequently in the data.

freq_dist_pos = FreqDist(all_pos_words)
print(freq_dist_pos.most_common(10))

[(':)', 3691), (':-)', 701), (':d', 658), ('thanks', 388), ('follow', 357), ('love', 333), ('...', 290), ('good', 283), ('get', 263), ('thank', 253)]


In [150]:
# Finding out the most common words in the negative cleaned sample tweets, 
# using the FreqDist class of NLTK
# The .most_common() method lists the words which occur most frequently in the data.

freq_dist_net = FreqDist(all_neg_words)
print(freq_dist_net.most_common(10))

[(':(', 4585), (':-(', 501), ("i'm", 343), ('...', 332), ('get', 325), ('miss', 291), ('go', 275), ('please', 275), ('want', 246), ('like', 218)]


In [21]:
# Converting Tokens to a Dictionary

def get_tweets_for_model(cleaned_tokens_list):
    """
    This functions converts the tweets from a list of cleaned tokens 
    to dictionaries with keys as the tokens and True as values. 
    """
    for tweet_tokens in cleaned_tokens_list:
        yield dict([token, True] for token in tweet_tokens)

positive_tokens_for_model = get_tweets_for_model(positive_cleaned_tokens_list)
negative_tokens_for_model = get_tweets_for_model(negative_cleaned_tokens_list)

In [22]:
# Splitting the Dataset for Training and Testing the Model
# This code attaches a Positive or Negative label to each tweet. 
# It then creates a dataset by joining the positive and negative tweets.

import random

positive_dataset = [(tweet_dict, "Positive") for tweet_dict in positive_tokens_for_model]
negative_dataset = [(tweet_dict, "Negative") for tweet_dict in negative_tokens_for_model]

dataset = positive_dataset + negative_dataset

# To avoid bias, I’ve added code to randomly arrange the data using 
# the .shuffle() method of random.
random.shuffle(dataset)

# The number of tweets is 10000
# The code splits the shuffled data into a ratio of 70:30 for training and testing
train_data = dataset[:7000]
test_data  = dataset[7000:]

In [23]:
# Building and Testing the Model

In [24]:
from nltk import classify
from nltk import NaiveBayesClassifier

classifier = NaiveBayesClassifier.train(train_data)

print("Accuracy is:", classify.accuracy(classifier, test_data))

print(classifier.show_most_informative_features(10))

Accuracy is: 0.9946666666666667
Most Informative Features
                      :) = True           Positi : Negati =   1002.1 : 1.0
                     sad = True           Negati : Positi =     55.8 : 1.0
                follower = True           Positi : Negati =     33.6 : 1.0
                     bam = True           Positi : Negati =     22.3 : 1.0
                    glad = True           Positi : Negati =     18.3 : 1.0
                followed = True           Negati : Positi =     15.8 : 1.0
                 welcome = True           Positi : Negati =     15.8 : 1.0
                 forward = True           Positi : Negati =     14.3 : 1.0
                    blog = True           Positi : Negati =     14.3 : 1.0
                     ugh = True           Negati : Positi =     13.7 : 1.0
None


#### Summary
Accuracy is defined as the percentage of tweets in the testing dataset for which the model was correctly able to predict the sentiment. A 99.46% accuracy on the test set is quite good indicator.

In the table that shows the most informative features, every row in the output shows the ratio of occurrence of a token in positive and negative tagged tweets in the training dataset. The first row in the data signifies that in all tweets containing the token :(, the ratio of negative to positives tweets was 1002.1 to 1. Interestingly, it seems that there was one token with :( in the positive datasets. You can see that the top two discriminating items in the text are the emoticons. Further, words such as sad lead to negative sentiments, whereas follower and bam are associated with positive sentiments.

In [163]:
# Checking how the model performs on random tweets.
# custom_tweet variable could be edited to check the model performance.
# This code is updating the string associated with the custom_tweet variable.

from nltk.tokenize import word_tokenize

custom_tweet = "I ordered from AliExpress last week. They delayed my order for a week! I will never this service in the future."
custom_tokens = remove_noise(word_tokenize(custom_tweet))

print(classifier.classify(dict([token, True] for token in custom_tokens)))

Negative
