#Twitter sentiment analysis using a Naive Bayes Classifier

**Description:** Simple tweeter sentiment analysis using a Naive Bayes Classifier <br>
Accuracy is 99% test dataset<br>
**Dataset:** Twitter examples from nltk <br>

In [1]:
import nltk
from nltk.corpus import twitter_samples
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re
import string

import numpy as np

In [2]:
nltk.download('twitter_samples')
nltk.download('stopwords')

[nltk_data] Downloading package twitter_samples to
[nltk_data]     /home/manero/nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!
[nltk_data] Downloading package stopwords to /home/manero/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
positive_tweets = twitter_samples.strings('positive_tweets.json')
negative_tweets = twitter_samples.strings('negative_tweets.json')

In [4]:
# See some example tweets
np.random.seed(123)
n_rand = np.random.randint(0,5000,7)
# print positive and negative tweets
print('* example positive tweets *')
for i in n_rand:
    print (i,'->', positive_tweets[i])
print()
print('* example negative tweets *')
for i in n_rand:
    print (i,'->', negative_tweets[i])

* example positive tweets *
3582 -> @laurentzwalker @BoykinsD1 I love being mixed :-)
3454 -> @Saskia_TeamLH44 Thanks baby girl. You too!! :) xx
1346 -> @UrbanKarizma alright sleep well!:)
4060 -> @ElliotHorwoodF1 @LewisHamilton His headrest was loose, that's the reason he pitted :-)
1593 -> @amy14_x @AstonMerrygold will do :D it's mainly my mother who wants it right now haha the others can wait they said 😂 x
96 -> waiting for nudes :-)
4143 -> @lovingjeonboram Ah, I see. So which song do you prefer? Take or Maman? :)

* example negative tweets *
3582 -> I just want to get paid already :(
3454 -> wait where the fuck is my ffvi &gt;:( ugh
1346 -> Dad says he wanna eat Nando's for dinner tmr :(
4060 -> @lucyanne_l Thank you for sending premium writing instrument, however some dastardly swine stole it from envelope! :( http://t.co/xNe3cD6dvk
1593 -> @RiotQuickshot Good man! Shame he no longer has a Cigar though :(
96 -> Why is my mum playing music out loud :(
4143 -> @DasCarrot no EZOO fo

In [5]:
# Tokenization
def transform_tweet(tweet):
  stemmer = PorterStemmer() 
  stopwords_english = stopwords.words('english')

  # remove the stock market tickers
  tweet = re.sub(r'\$\w*', '', tweet)

  # remove the old styles retweet text 'RT'
  tweet = re.sub(r'^RT[\s]+', '', tweet)

  # remove the hyperlinks
  tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)

  # remove the # symbol
  tweet = re.sub(r'#', '', tweet)

  # Tokenize the tweet
  tokenizer = TweetTokenizer(preserve_case=False, reduce_len=True, strip_handles=True)
  tweet_tokens = tokenizer.tokenize(tweet)

  tweet_clean = []

# Stop word Filtering & Stemming 
  # removing stopwords and punctuation
  for word in tweet_tokens:
     if (word not in stopwords_english and word not in string.punctuation):
       stem_word = stemmer.stem(word)    #stemming
       tweet_clean.append(stem_word)

  return tweet_clean

In [6]:
# test a couple of tweets one positive and one negative

tweet = negative_tweets[231]
tweet_cleaned = transform_tweet(tweet)
print('original    : ',tweet)
print('tokenized : ', tweet_cleaned)

tweet = positive_tweets[231]
tweet_cleaned = transform_tweet(tweet)
print('original    : ',tweet)
print('tokenized : ', tweet_cleaned)

original    :  My after effects not spanish :(
tokenized :  ['effect', 'spanish', ':(']
original    :  Hi BAM ! @BarsAndMelody 
Can you follow my bestfriend @969Horan696 ? 
She loves you a lot :) 
See you in Warsaw &lt;3 
Love you &lt;3 x40
tokenized :  ['hi', 'bam', 'follow', 'bestfriend', 'love', 'lot', ':)', 'see', 'warsaw', '<3', 'love', '<3', 'x40']


In [7]:
# Support routines
# Creation of a word count dictionary
def count_tweets(tweets, ys):
  ys_list = np.squeeze(ys).tolist()
  freqs ={}

  for y, tweet in zip(ys_list, tweets):
    for word in transform_tweet(tweet):
      pair = (word, y)
      if pair in freqs:
        freqs[pair] +=1
      else:
        freqs[pair] = 1
  
  return freqs

#returns negative and positive freqs of a work

def lookup(freqs, word, label):
  n = 0
  pair = (word, label)
  if pair in freqs:
    n = freqs[pair]
  return n 

In [8]:
# splitting the data for training and testing 
train_pos = positive_tweets[:4000]
test_pos = positive_tweets[4000:]

train_neg = negative_tweets[:4000]
test_neg = negative_tweets[4000:]

train_x = train_pos + train_neg
test_x = test_pos + test_neg

# numpy array for the labels in the training set
train_y = np.append(np.ones((len(train_pos))), np.zeros((len(train_neg))))
test_y = np.append(np.ones((len(test_neg))), np.zeros((len(test_neg))))

In [9]:
# Build a frequency dictionary
freqs = count_tweets(train_x, train_y)

def train_naive_bayes(freqs, train_x, train_y):
  logliklihood = {}
  logprior = 0

  # calculate V, number of unique words in the vocabulary
  vocab = set([pair[0] for pair in freqs.keys()])
  V = len(vocab)

  ## Calculate N_pos, N_neg, V_pos, V_neg
  # N_pos : total number of positive words
  # N_neg : total number of negative words
  # V_pos : total number of unique positive words
  # V_neg : total number of unique negative words

  N_pos = N_neg = V_pos = V_neg = 0
  for pair in freqs.keys():
    if pair[1]>0:
      V_pos +=1
      N_pos += freqs[pair]
    else:
      V_neg +=1
      N_neg += freqs[pair]

  # Number of Documents (tweets)
  D = len(train_y)

  # D_pos, number of positive documnets
  D_pos = len(list(filter(lambda x: x>0, train_y)))

  # D_pos, number of negative documnets
  D_neg = len(list(filter(lambda x: x<=0, train_y)))

  # calculate the logprior
  logprior = np.log(D_pos) - np.log(D_neg)

  for word in vocab:
    freqs_pos = lookup(freqs, word, 1)
    freqs_neg = lookup(freqs, word, 0)

    # calculte the probability of each word being positive and negative
    p_w_pos = (freqs_pos+1)/(N_pos+V)
    p_w_neg = (freqs_neg+1)/(N_neg+V)

    logliklihood[word] = np.log(p_w_pos/p_w_neg)
  
  return logprior, logliklihood

In [10]:
logprior, loglikelihood = train_naive_bayes(freqs, train_x, train_y)
print(logprior)
print(len(loglikelihood))
loglikelihood

0.0
9086


{'choroo': 0.6982277728402158,
 'sunnah': 0.6982277728402158,
 'torrentialrain': -0.6880665882796749,
 '319': -0.6880665882796749,
 "you'": -1.0935316963878392,
 "ledger'": -0.6880665882796749,
 'panga': -0.6880665882796749,
 'harsh': -0.6880665882796749,
 'giddi': -0.6880665882796749,
 'aameen': 0.6982277728402158,
 'debt': 1.1036928809483801,
 'saddest': -1.0935316963878392,
 'pleasur': 2.402975865078641,
 '18.99': 0.6982277728402158,
 'kathryn': 0.6982277728402158,
 'someon': -0.28260148017151027,
 'g': 0.005080592280270417,
 'sauc': 1.1036928809483801,
 'stream': -0.5210125036165085,
 'swore': 0.6982277728402158,
 'attempt': 0.005080592280270417,
 'openfollow': 0.6982277728402158,
 'sho': 0.6982277728402158,
 'milk': -0.4003845158278938,
 'hulkamania': -0.6880665882796749,
 'goal': -0.50574503148572,
 'bruis': 0.005080592280270417,
 'sc': -0.6880665882796749,
 'skulker': -0.6880665882796749,
 'wesen': -0.6880665882796749,
 'schoolwork': -0.6880665882796749,
 'masaantoday': 1.614518

In [11]:
def naive_bayes_predict(tweet, logprior, loglikelihood):
  word_l = transform_tweet(tweet)
  p = 0
  p+=logprior

  for word in word_l:
    if word in loglikelihood:
      p+=loglikelihood[word]

  return p

In [12]:
def test_naive_bayes(test_x, test_y, logprior, loglikelihood):
  accuracy = 0
  y_hats = []
  for tweet in test_x:
    if naive_bayes_predict(tweet, logprior, loglikelihood) > 0:
      y_hat_i = 1
    else:
      y_hat_i = 0
    y_hats.append(y_hat_i)
  error = np.mean(np.absolute(test_y - y_hats))
  accuracy = 1-error

  return accuracy
  
print("Naive Bayes accuracy = %0.4f" %
      (test_naive_bayes(test_x, test_y, logprior, loglikelihood)))

Naive Bayes accuracy = 0.9940


In [24]:
# one small example
my_tweet = "bad rainy day"
prob = naive_bayes_predict(my_tweet, logprior, loglikelihood)
print(prob)

-1.1979964809309143
