In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords

In [None]:
nltk.download("twitter_samples")                          #downloading and storing the tweets (datasets) from the nltk library
nltk.download("punkt")                                    #punkt is a pre-trained model that is used in tokenization of text into sentences, words or characters (as needed)
nltk.download("wordnet")                                  #wordnet is used to dtermine the base form of a given word
nltk.download("averaged_perceptron_tagger")               #used to find the context of the base word that has been used in the sentence
nltk.download("stopwords")                                #a predefined set of stop words in NLTK that is used to remove the stop words present in text

[nltk_data] Downloading package twitter_samples to /root/nltk_data...
[nltk_data]   Unzipping corpora/twitter_samples.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
from nltk.corpus import twitter_samples
pos_text = twitter_samples.strings("positive_tweets.json")
print(pos_text)

['#FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)', '@Lamb2ja Hey James! How odd :/ Please call our Contact Centre on 02392441234 and we will be able to assist you :) Many thanks!', '@DespiteOfficial we had a listen last night :) As You Bleed is an amazing track. When are you in Scotland?!', '@97sides CONGRATS :)', 'yeaaaah yippppy!!!  my accnt verified rqst has succeed got a blue tick mark on my fb profile :) in 15 days', '@BhaktisBanter @PallaviRuhail This one is irresistible :)\n#FlipkartFashionFriday http://t.co/EbZ0L2VENM', "We don't like to keep our lovely customers waiting for long! We hope you enjoy! Happy Friday! - LWWF :) https://t.co/smyYriipxI", '@Impatientraider On second thought, there’s just not enough time for a DD :) But new shorts entering system. Sheep must be buying.', 'Jgh , but we have to go to Bayan :D bye', 'As an act of mischievousness, am calling the ETL layer of our in-house warehousing app Katam

In [None]:
neg_text = twitter_samples.strings("negative_tweets.json")
print(neg_text)



In [None]:
test_text = twitter_samples.strings("tweets.20150430-223406.json")
print(test_text)



In [None]:
#using the punkt module upon the tweets for tokenization
tokenized_pos_tweets = twitter_samples.tokenized("positive_tweets.json")
tokenized_neg_tweets = twitter_samples.tokenized('negative_tweets.json')
print(tokenized_pos_tweets)
print(tokenized_neg_tweets)

[['#FollowFriday', '@France_Inte', '@PKuchly57', '@Milipol_Paris', 'for', 'being', 'top', 'engaged', 'members', 'in', 'my', 'community', 'this', 'week', ':)'], ['@Lamb2ja', 'Hey', 'James', '!', 'How', 'odd', ':/', 'Please', 'call', 'our', 'Contact', 'Centre', 'on', '02392441234', 'and', 'we', 'will', 'be', 'able', 'to', 'assist', 'you', ':)', 'Many', 'thanks', '!'], ['@DespiteOfficial', 'we', 'had', 'a', 'listen', 'last', 'night', ':)', 'As', 'You', 'Bleed', 'is', 'an', 'amazing', 'track', '.', 'When', 'are', 'you', 'in', 'Scotland', '?', '!'], ['@97sides', 'CONGRATS', ':)'], ['yeaaaah', 'yippppy', '!', '!', '!', 'my', 'accnt', 'verified', 'rqst', 'has', 'succeed', 'got', 'a', 'blue', 'tick', 'mark', 'on', 'my', 'fb', 'profile', ':)', 'in', '15', 'days'], ['@BhaktisBanter', '@PallaviRuhail', 'This', 'one', 'is', 'irresistible', ':)', '#FlipkartFashionFriday', 'http://t.co/EbZ0L2VENM'], ['We', "don't", 'like', 'to', 'keep', 'our', 'lovely', 'customers', 'waiting', 'for', 'long', '!', 'W

In [None]:
#to see a single sample of the tokenized tweet
print(tokenized_pos_tweets[0])
print(tokenized_neg_tweets[0])

['#FollowFriday', '@France_Inte', '@PKuchly57', '@Milipol_Paris', 'for', 'being', 'top', 'engaged', 'members', 'in', 'my', 'community', 'this', 'week', ':)']
['hopeless', 'for', 'tmr', ':(']


In [None]:
#finding the context of each tag in the tweet (eg: checking for nouns, verbs, adjectives etc, and tagging them correspondingly)
print(pos_tag(tokenized_pos_tweets[0]))

[('#FollowFriday', 'JJ'), ('@France_Inte', 'NNP'), ('@PKuchly57', 'NNP'), ('@Milipol_Paris', 'NNP'), ('for', 'IN'), ('being', 'VBG'), ('top', 'JJ'), ('engaged', 'VBN'), ('members', 'NNS'), ('in', 'IN'), ('my', 'PRP$'), ('community', 'NN'), ('this', 'DT'), ('week', 'NN'), (':)', 'NN')]


In [None]:
#NORMALIZATION OF THE WORDS (Stemming and Lemmatization)

#function to lemmatize the most used words in a sentence (i.e. to take the
#words and tags associated with them and then converting them into their base form)

def lemmatize_tweet(tokens):
  lemmatizer = WordNetLemmatizer()
  res_sentence = []
  for word, tag in pos_tag(tokens):
    if tag.startswith("NN"):
      pos = 'n'
    elif tag.startswith("VB"):
      pos = 'v'
    else:
      pos = 'a'
    res_sentence.append(lemmatizer.lemmatize(word, pos))
  return res_sentence

print(lemmatize_tweet(tokenized_pos_tweets[0]))

['#FollowFriday', '@France_Inte', '@PKuchly57', '@Milipol_Paris', 'for', 'be', 'top', 'engage', 'member', 'in', 'my', 'community', 'this', 'week', ':)']


In [None]:
#removing stop words from text

import re, string
def remove_noise(tokenized_pos_tweets, stop_words = ()):
  noiseless_tokens = []
  for token, tag in pos_tag(tokenized_pos_tweets):
     #removing all the hyperlinks from the text and putting an empty string in place of the same 
    token = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'\
                   '(?:%[0-9a-fA-F][0-9a-fA-F]))+','', token)
    
    token = re.sub("(@[A-Za-z0-9_]+)","", token)                                   #removing all the twitter handles of the users from the text and putting an empty string in place of the same
    if tag.startswith("NN"):
      pos = 'n'
    elif tag.startswith("VB"):
      pos = 'v'
    else:
      pos = 'a'

    lemmatizer = WordNetLemmatizer()
    token = lemmatizer.lemmatize(token, pos)

    #string library is used to remove punctuations from the text
    if len(token) > 0 and token not in string.punctuation and token.lower() not in stop_words:
      noiseless_tokens.append(token.lower())

  return noiseless_tokens

In [None]:
#these are the stopwords available in the ccorpus module of the nltk library
eng_stop_words = stopwords.words("english")
print(eng_stop_words)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [None]:
#print(remove_noise(tokenized_pos_tweets[0], eng_stop_words))

#to clean the sample tweets (completely stemmed, lemmatized and stop words removed)
cleaned_pos_tweets = []
cleaned_neg_tweets = []

for unit in tokenized_pos_tweets:
  cleaned_pos_tweets.append(remove_noise(unit, eng_stop_words))

for unit in tokenized_neg_tweets:
  cleaned_neg_tweets.append(remove_noise(unit, eng_stop_words))

In [None]:
def word_frequency(cleaned_tokens_list):
  for tokens in cleaned_tokens_list:
    for unit in tokens:
      yield unit

all_pos_words = word_frequency(cleaned_pos_tweets)
all_neg_words = word_frequency(cleaned_neg_tweets)

print(all_pos_words)
print(all_neg_words)

<generator object word_frequency at 0x7f6b848a7888>
<generator object word_frequency at 0x7f6b848a7830>


In [None]:
#FreqDist is nltk module to find the frequency disrtribution

from nltk import FreqDist
freq_dist_pos = FreqDist(all_pos_words)
print(freq_dist_pos.most_common(10))

[(':)', 3691), (':-)', 701), (':d', 658), ('thanks', 388), ('follow', 357), ('love', 333), ('...', 290), ('good', 283), ('get', 263), ('thank', 253)]


In [None]:
#the data we are having currently is a list of cleaned tokens, but we will be needing a labelled dataset in the form of a dictionary in which the
#tweet tokens are keys and some boolean flag set as true or false as the corresponding values

def generate_finalised_data(cleaned_tokens_list):
  for unit in cleaned_tokens_list:
    yield dict([subunit, True] for subunit in unit)

POSITIVE_TWEETS = generate_finalised_data(cleaned_pos_tweets)
NEGATIVE_TWEETS = generate_finalised_data(cleaned_neg_tweets)

In [None]:
#combining the data

import random
pos_dataset = [(pos_tweet, "Positive") for pos_tweet in POSITIVE_TWEETS]              #assigning Positive value for all positive tweets
neg_dataset = [(neg_tweet, "Negative") for neg_tweet in NEGATIVE_TWEETS]              #assigning Negative value for all negative tweets
complete_dataset = pos_dataset + neg_dataset                                          #combining pos and neg labelled data to obtain a complete dataset

random.shuffle(complete_dataset)
train_split = complete_dataset[:7000]
test_split = complete_dataset[3000:]

In [None]:
#apply the Naive Bayes clsssifier upon the model generated above

from nltk import classify                                                        #classify module is used to get the accuracy of the model upon the test datset
from nltk import NaiveBayesClassifier                                            #NBC module to create a classifier model
classifier = NaiveBayesClassifier.train(train_split)


In [None]:
#using the accuracy method of classify module to compute the accuracy of the model

accuracy_score = classify.accuracy(classifier, test_split)
print("Accuracy of the model: ", accuracy_score)

Accuracy of the model:  0.9977142857142857


Accuracy of the model = 99.7%

In [None]:
print(classifier.show_most_informative_features(15))

Most Informative Features
                      :( = True           Negati : Positi =   2052.0 : 1.0
                      :) = True           Positi : Negati =   1632.9 : 1.0
                     sad = True           Negati : Positi =     38.2 : 1.0
                follower = True           Positi : Negati =     37.7 : 1.0
                followed = True           Negati : Positi =     25.6 : 1.0
                     bam = True           Positi : Negati =     18.4 : 1.0
                   enjoy = True           Positi : Negati =     16.2 : 1.0
                    blog = True           Positi : Negati =     15.0 : 1.0
                     x15 = True           Negati : Positi =     15.0 : 1.0
                     ugh = True           Negati : Positi =     14.3 : 1.0
                 welcome = True           Positi : Negati =     13.6 : 1.0
                    glad = True           Positi : Negati =     13.4 : 1.0
                  arrive = True           Positi : Negati =     11.2 : 1.0

show_most_informative_features method of the model is used to obtain a glimpse at the summary of performance of the model wrt the testing datset.
It basically contains the ratio of number of times a specific token appears in the a class (Positive or Negative), contained in the testing data

In [None]:
#sample tweet for testing the classificaton model
sample_tweet1 = "I wish there was as much interest in extremely cheap instant testing as there is in covid vaccine"

#extracting the tokens after stemming and lemmatization and then removing the noise
tokens_of_sample_tweet1 = remove_noise(word_tokenize(sample_tweet1))

print(classifier.classify(dict([unit, True] for unit in tokens_of_sample_tweet1)))

Negative


In [None]:
sample_tweet2 = "Does Bengal have the right to protest: BJP's jibe after violence in Kolkata"
tokens_of_sample_tweet2 = remove_noise(word_tokenize(sample_tweet2))
print(classifier.classify(dict([unit, True] for unit in tokens_of_sample_tweet2)))

Negative


In [None]:
sample_tweet3 = "Congratulations SpaceX on a successful launch! It was great to see the fire in the sky and hear the rumble"
tokens_of_sample_tweet3 = remove_noise(word_tokenize(sample_tweet3))
print(classifier.classify(dict([unit, True] for unit in tokens_of_sample_tweet3)))

Positive
