<a name='0'></a>
## Importing libraries and defining cleaning and other useful functions

In [1]:
import re
import string

In [2]:
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer

In [3]:
from matplotlib.patches import Ellipse
import matplotlib.transforms as transforms
import numpy as np 

In [4]:
def process_tweet(tweet):
    '''
    Clean and tokenize a single tweet.

    Stock tickers, a leading "RT" marker, hyperlinks and '#' signs are removed
    with regular expressions, then the text is split with NLTK's
    TweetTokenizer. English stopwords and punctuation tokens are dropped.
    No stemming is applied: the remaining tokens are returned unchanged.

    Input:
        tweet: a string containing a tweet
    Output:
        tweets_clean: a list of words containing the processed tweet

    '''
    # a set gives constant-time membership tests inside the token filter below
    stopwords_english = set(stopwords.words('english'))
    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', tweet)
    # remove old style retweet text "RT" (only when it starts the tweet)
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    # remove hyperlinks (everything from http(s):// up to the next whitespace)
    tweet = re.sub(r'https?://[^\s\n\r]+', '', tweet)
    # remove only the hash sign; the hashtag word itself is kept
    tweet = re.sub(r'#', '', tweet)
    # tokenize tweets
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,
                               reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)

    # `word not in string.punctuation` is a substring test against the
    # punctuation string, so single punctuation characters are filtered out
    tweets_clean = [
        word for word in tweet_tokens
        if word not in stopwords_english and word not in string.punctuation
    ]

    return tweets_clean

In [5]:
process_tweet("newcamp.iitb.ac.in")

['newcamp.iitb.ac.in']

In [6]:
def lookup(freqs, word, label):
    '''
    Fetch the count stored for a (word, label) pair.

    Input:
        freqs: a dictionary keyed by (word, label) tuples, valued by frequency
        word: the word to look up
        label: the label paired with the word
    Output:
        n: the stored count for (word, label), or 0 when the pair is absent.
    '''
    # an absent pair counts as zero occurrences
    return freqs.get((word, label), 0)

In [7]:
def confidence_ellipse(x, y, ax, n_std=3.0, facecolor='none', **kwargs):
    """
    Create a plot of the covariance confidence ellipse of `x` and `y`
    Parameters
    ----------
    x, y : array_like, shape (n, )
        Input data. Lists and tuples are accepted as well as arrays.
    ax : matplotlib.axes.Axes
        The axes object to draw the ellipse into.
    n_std : float
        The number of standard deviations to determine the ellipse's radiuses.
    facecolor : color
        Passed to the Ellipse patch; the default 'none' leaves it unfilled.
    Returns
    -------
    matplotlib.patches.Ellipse
    Raises
    ------
    ValueError
        If `x` and `y` do not contain the same number of elements.
    Other parameters
    ----------------
    kwargs : `~matplotlib.patches.Patch` properties
    """
    # Accept any array_like input: plain sequences have no `.size` attribute.
    x = np.asarray(x)
    y = np.asarray(y)
    if x.size != y.size:
        raise ValueError("x and y must be the same size")

    cov = np.cov(x, y)
    pearson = cov[0, 1] / np.sqrt(cov[0, 0] * cov[1, 1])
    # Using a special case to obtain the eigenvalues of this
    # two-dimensional dataset.
    ell_radius_x = np.sqrt(1 + pearson)
    ell_radius_y = np.sqrt(1 - pearson)
    ellipse = Ellipse((0, 0),
                      width=ell_radius_x * 2,
                      height=ell_radius_y * 2,
                      facecolor=facecolor,
                      **kwargs)

    # Calculating the standard deviation of x from
    # the square root of the variance and multiplying
    # with the given number of standard deviations.
    scale_x = np.sqrt(cov[0, 0]) * n_std
    mean_x = np.mean(x)

    # Same for y: standard deviation times n_std, and the mean.
    scale_y = np.sqrt(cov[1, 1]) * n_std
    mean_y = np.mean(y)

    # The ellipse is built at the origin, then rotated by 45 degrees,
    # scaled per axis and moved to the mean of the data.
    transf = transforms.Affine2D() \
        .rotate_deg(45) \
        .scale(scale_x, scale_y) \
        .translate(mean_x, mean_y)

    ellipse.set_transform(transf + ax.transData)
    return ax.add_patch(ellipse)

In [8]:
import pdb
import numpy as np
import pandas as pd
import nltk
import string
from nltk.tokenize import TweetTokenizer
from os import getcwd

In [9]:
import csv
# Read the tab-separated corpus and split it by annotation: column 0 is taken
# as the tweet text and column 1 as the label. Rows labelled 'yes' go to
# all_hate_tweets; every other row goes to all_nonhate_tweets.
# NOTE(review): csv.reader's default dialect treats '"' as a quote character;
# confirm the TSV is quoted accordingly, otherwise tweets containing a stray
# double quote may be merged across lines.
all_hate_tweets = []
all_nonhate_tweets = []
with open('hate_speech.tsv', newline='', encoding="utf8") as tsvfile:
    reader = csv.reader(tsvfile, delimiter='\t')
    for row in reader:
        # a blank line is returned as [] (and a malformed line may lack the
        # label column); skip it instead of failing on row[1]
        if len(row) < 2:
            continue
        if row[1] == 'yes':
            all_hate_tweets.append(row[0])
        else:
            all_nonhate_tweets.append(row[0])

In [10]:
len(all_hate_tweets)

1661

In [11]:
len(all_nonhate_tweets)

2918

In [12]:
all_hate_tweets[15]

'Aayega jab take alia ki shaadi age pass aayegi kyuki alia said 30ki hoote hi won shaadi kargi varun bhi kitna kutta hai Natasha ka shaadi ka spna this alia ke saath shaadi ki baat paaki kar ki hate varun alia sid please career pe focus karo aue alia ke banner se film mat karo'

In [13]:
all_nonhate_tweets[15]

'aj arshi ne jo laat eske pichwade mei mari thi wo photo twit kro bhai juldi se.'

In [14]:
# Split the data into two pieces, one for training and one for testing
# (validation set). Label convention used from here on:
#   1 ("pos") = non-hate tweet, 0 ("neg") = hate tweet.
N_TRAIN_NONHATE = 2400  # leading non-hate tweets used for training; the rest are held out
N_TRAIN_HATE = 1328     # leading hate tweets used for training; the rest are held out

test_pos = all_nonhate_tweets[N_TRAIN_NONHATE:]
train_pos = all_nonhate_tweets[:N_TRAIN_NONHATE]
test_neg = all_hate_tweets[N_TRAIN_HATE:]
train_neg = all_hate_tweets[:N_TRAIN_HATE]

train_x = train_pos + train_neg
test_x = test_pos + test_neg

# Build the label vectors from the actual list lengths so they always line up
# with train_x / test_x (ones for the "pos" part, zeros for the "neg" part).
train_y = np.append(np.ones(len(train_pos)), np.zeros(len(train_neg)))
test_y = np.append(np.ones(len(test_pos)), np.zeros(len(test_neg)))

<a name='1'></a>
## 1 - Process the Data

For any machine learning project, once you've gathered the data, the first step is to process it to make useful inputs to your model.
- **Remove noise**: You will first want to remove noise from your data -- that is, remove words that don't tell you much about the content. These include all common words like 'I, you, are, is, etc...' that would not give us enough information on the sentiment.
- We'll also remove stock market tickers, retweet symbols, hyperlinks, and the `#` sign of hashtags because they do not carry much information about the sentiment.
- You also want to remove all the punctuation from a tweet. The reason for doing this is because we want to treat words with or without the punctuation as the same word, instead of treating "happy", "happy?", "happy!", "happy," and "happy." as different words.
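- Finally you want to use stemming to only keep track of one variation of each word. In other words, we'll treat "motivation", "motivated", and "motivate" similarly by grouping them within the same stem of "motiv-". **Note:** the `process_tweet` function defined at the top of this notebook does not apply stemming; it keeps each token unchanged.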

We have given you the function `process_tweet` that does this for you.

In [15]:
custom_tweet = "Aayega jab take alia ki shaadi age pass aayegi kyuki alia said 30ki hoote hi won shaadi kargi varun bhi kitna kutta hai Natasha ka shaadi ka spna this alia ke saath shaadi ki baat paaki kar ki hate varun alia sid please career pe focus karo aue alia ke banner se film mat karo"
# print cleaned tweet
print(process_tweet(custom_tweet))

['aayega', 'jab', 'take', 'alia', 'ki', 'shaadi', 'age', 'pass', 'aayegi', 'kyuki', 'alia', 'said', '30ki', 'hoote', 'hi', 'shaadi', 'kargi', 'varun', 'bhi', 'kitna', 'kutta', 'hai', 'natasha', 'ka', 'shaadi', 'ka', 'spna', 'alia', 'ke', 'saath', 'shaadi', 'ki', 'baat', 'paaki', 'kar', 'ki', 'hate', 'varun', 'alia', 'sid', 'please', 'career', 'pe', 'focus', 'karo', 'aue', 'alia', 'ke', 'banner', 'se', 'film', 'mat', 'karo']


<a name='1-1'></a>
### 1.1 - Implementing your Helper Functions

To help you train your naive bayes model, you will need to compute a dictionary where the keys are a tuple (word, label) and the values are the corresponding frequency.  Note that the labels we'll use here are 1 for positive and 0 for negative. In this notebook, "positive" (1) is assigned to the non-hate tweets and "negative" (0) to the hate tweets.

You will also implement a lookup helper function that takes in the `freqs` dictionary, a word, and a label (1 or 0) and returns the number of times that word and label tuple appears in the collection of tweets.

For example: given a list of tweets `["i am rather excited", "you are rather happy"]` and the label 1, the function will return a dictionary that contains the following key-value pairs:

{
    ("rather", 1): 2,
    ("happy", 1) : 1, 
    ("excited", 1) : 1
}

- Notice how for each word in the given string, the same label 1 is assigned to each word.
- Notice how the words "i" and "am" are not saved, since they were removed by `process_tweet` because they are stopwords.
- Notice how the word "rather" appears twice in the list of tweets, and so its count value is 2.

In [16]:
# UNQ_C1 GRADED FUNCTION: count_tweets

def count_tweets(result, tweets, ys):
    '''
    Tally how often each (word, label) pair occurs across the tweets.

    Input:
        result: a dictionary that will be used to map each pair to its frequency;
                it is updated in place, so existing counts are extended
        tweets: a list of tweets
        ys: a list corresponding to the sentiment of each tweet (either 0 or 1)
    Output:
        result: the same dictionary, mapping each pair to its frequency
    '''
    for label, tweet in zip(ys, tweets):
        for token in process_tweet(tweet):
            # every cleaned token of the tweet inherits the tweet's label
            key = (token, label)
            # start from 0 for an unseen pair, otherwise extend the running count
            result[key] = result.get(key, 0) + 1

    return result

In [17]:
# Testing your function

result = {}
tweets = ['i am happy', 'i am tricked', 'i am sad', 'i am tired', 'i am tired']
ys = [1, 0, 0, 0, 0]
count_tweets(result, tweets, ys)

{('happy', 1): 1, ('tricked', 0): 1, ('sad', 0): 1, ('tired', 0): 2}

<a name='2'></a>
## 2 - Train your Model using Naive Bayes

Naive bayes is an algorithm that could be used for sentiment analysis. It takes a short time to train and also has a short prediction time.

#### So how do you train a Naive Bayes classifier?
- The first part of training a naive bayes classifier is to identify the number of classes that you have.
- You will create a probability for each class.
$P(D_{pos})$ is the probability that the document is positive.
$P(D_{neg})$ is the probability that the document is negative.
Use the formulas as follows and store the values in a dictionary:

$$P(D_{pos}) = \frac{D_{pos}}{D}\tag{1}$$

$$P(D_{neg}) = \frac{D_{neg}}{D}\tag{2}$$

Where $D$ is the total number of documents, or tweets in this case, $D_{pos}$ is the total number of positive tweets and $D_{neg}$ is the total number of negative tweets.

#### Prior and Logprior

The prior probability represents the underlying probability in the target population that a tweet is positive versus negative.  In other words, if we had no specific information and blindly picked a tweet out of the population set, what is the probability that it will be positive versus that it will be negative? That is the "prior".

The prior is the ratio of the probabilities $\frac{P(D_{pos})}{P(D_{neg})}$.
We can take the log of the prior to rescale it, and we'll call this the logprior

$$\text{logprior} = \log \left( \frac{P(D_{pos})}{P(D_{neg})} \right) = \log \left( \frac{D_{pos}}{D_{neg}} \right)$$

Note that $log(\frac{A}{B})$ is the same as $log(A) - log(B)$.  So the logprior can also be calculated as the difference between two logs:

$$\text{logprior} = \log (P(D_{pos})) - \log (P(D_{neg})) = \log (D_{pos}) - \log (D_{neg})\tag{3}$$

#### Positive and Negative Probability of a Word
To compute the positive probability and the negative probability for a specific word in the vocabulary, we'll use the following inputs:

- $freq_{pos}$ and $freq_{neg}$ are the frequencies of that specific word in the positive or negative class. In other words, the positive frequency of a word is the number of times the word is counted with the label of 1.
- $N_{pos}$ and $N_{neg}$ are the total number of positive and negative words for all documents (for all tweets), respectively.
- $V$ is the number of unique words in the entire set of documents, for all classes, whether positive or negative.

We'll use these to compute the positive and negative probability for a specific word using this formula:

$$ P(W_{pos}) = \frac{freq_{pos} + 1}{N_{pos} + V}\tag{4} $$
$$ P(W_{neg}) = \frac{freq_{neg} + 1}{N_{neg} + V}\tag{5} $$

Notice that we add the "+1" in the numerator for additive smoothing.  This [wiki article](https://en.wikipedia.org/wiki/Additive_smoothing) explains more about additive smoothing.

#### Log likelihood
To compute the loglikelihood of that very same word, we can implement the following equations:

$$\text{loglikelihood} = \log \left(\frac{P(W_{pos})}{P(W_{neg})} \right)\tag{6}$$

##### Create `freqs` dictionary
- Given your `count_tweets` function, you can compute a dictionary called `freqs` that contains all the frequencies.
- In this `freqs` dictionary, the key is the tuple (word, label)
- The value is the number of times it has appeared.

We will use this dictionary in several parts of this assignment.

In [18]:
# Build the freqs dictionary for later uses
freqs = count_tweets({}, train_x, train_y)

<a name='ex-2'></a>
### Exercise 2 - train_naive_bayes
Given a freqs dictionary, `train_x` (a list of tweets) and a `train_y` (a list of labels for each tweet), implement a naive bayes classifier.

##### Calculate $V$
- You can then compute the number of unique words that appear in the `freqs` dictionary to get your $V$ (you can use the `set` function).

##### Calculate $freq_{pos}$ and $freq_{neg}$
- Using your `freqs` dictionary, you can compute the positive and negative frequency of each word $freq_{pos}$ and $freq_{neg}$.

##### Calculate $N_{pos}$, and $N_{neg}$
- Using `freqs` dictionary, you can also compute the total number of positive words and total number of negative words $N_{pos}$ and $N_{neg}$.

##### Calculate $D$, $D_{pos}$, $D_{neg}$
- Using the `train_y` input list of labels, calculate the number of documents (tweets) $D$, as well as the number of positive documents (tweets) $D_{pos}$ and number of negative documents (tweets) $D_{neg}$.
- Calculate the probability that a document (tweet) is positive $P(D_{pos})$, and the probability that a document (tweet) is negative $P(D_{neg})$

##### Calculate the logprior
- the logprior is $log(D_{pos}) - log(D_{neg})$

##### Calculate log likelihood
- Finally, you can iterate over each word in the vocabulary, use your `lookup` function to get the positive frequencies, $freq_{pos}$, and the negative frequencies, $freq_{neg}$, for that specific word.
- Compute the positive probability of each word $P(W_{pos})$, negative probability of each word $P(W_{neg})$ using equations 4 & 5.

$$ P(W_{pos}) = \frac{freq_{pos} + 1}{N_{pos} + V}\tag{4} $$
$$ P(W_{neg}) = \frac{freq_{neg} + 1}{N_{neg} + V}\tag{5} $$

**Note:** We'll use a dictionary to store the log likelihoods for each word (the key is the word, the value is the log likelihood of that word).

- You can then compute the loglikelihood: $log \left( \frac{P(W_{pos})}{P(W_{neg})} \right)$.

In [19]:
def train_naive_bayes(freqs, train_x, train_y):
    '''
    Fit the Naive Bayes parameters from pre-computed word counts.

    Input:
        freqs: dictionary from (word, label) to how often the word appears
        train_x: a list of tweets (not read here; the counts come from freqs)
        train_y: a list of labels corresponding to the tweets (0,1)
    Output:
        logprior: the log prior. (equation 3 above)
        loglikelihood: dictionary from word to the log likelihood of your
            Naive Bayes equation. (equation 6 above)
    Raises:
        ZeroDivisionError: if train_y contains no label other than 1, so the
            number of negative documents is zero.
    '''
    # Vocabulary in first-seen order. dict.fromkeys de-duplicates in linear
    # time; a list with `not in` checks would be quadratic in the vocabulary size.
    vocab = list(dict.fromkeys(pair[0] for pair in freqs))

    # V, the number of unique words in the vocabulary
    V = len(vocab)

    # N_pos / N_neg: total word occurrences counted under each label
    N_pos = N_neg = 0
    for (_, label), count in freqs.items():
        # a label greater than zero is the positive class, anything else negative
        if label > 0:
            N_pos += count
        else:
            N_neg += count

    # D_pos: documents labelled 1; D_neg: all remaining documents
    D_pos = sum(1 for label in train_y if label == 1)
    D_neg = len(train_y) - D_pos

    # logprior = log(D_pos) - log(D_neg)
    logprior = np.log(D_pos / D_neg)

    loglikelihood = {}
    for word in vocab:
        # get the positive and negative frequency of the word
        freq_pos = lookup(freqs, word, 1)
        freq_neg = lookup(freqs, word, 0)

        # smoothed probability of the word in each class (equations 4 and 5)
        p_w_pos = (freq_pos + 1) / (N_pos + V)
        p_w_neg = (freq_neg + 1) / (N_neg + V)

        # log likelihood of the word (equation 6)
        loglikelihood[word] = np.log(p_w_pos / p_w_neg)

    return logprior, loglikelihood

In [20]:
# UNQ_C3 (UNIQUE CELL IDENTIFIER, DO NOT EDIT)
logprior, loglikelihood = train_naive_bayes(freqs, train_x, train_y)
print(logprior)
print(len(loglikelihood))

0.5917946862996578
12544


<a name='3'></a>
## 3 - Test your Naive Bayes

Now that we have the `logprior` and `loglikelihood`, we can test the naive bayes function by making predictions on some tweets!

<a name='ex-3'></a>
### Exercise 3 - naive_bayes_predict
Implement `naive_bayes_predict`.

**Instructions**:
Implement the `naive_bayes_predict` function to make predictions on tweets.
* The function takes in the `tweet`, `logprior`, `loglikelihood`.
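* It returns a score for the tweet: the logprior plus the summed log likelihoods of its words. A score above 0 is read as the positive class; otherwise the tweet is assigned to the negative class.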
* For each tweet, sum up loglikelihoods of each word in the tweet.
* Also add the logprior to this sum to get the predicted sentiment of that tweet.

$$ p = logprior + \sum_i^N (loglikelihood_i)$$

#### Note
Note we calculate the prior from the training data. When the training data is evenly split between positive and negative labels, the ratio of positive to negative is 1 and the logprior is 0. Here the split is not even (2400 positive and 1328 negative training tweets), so the logprior is $\log(2400/1328) \approx 0.59$, the value printed above.

A logprior of 0.0 would mean that we are just adding zero to the log likelihood. Because this data is not perfectly balanced, the logprior is a non-zero value, so please remember to include it.

In [21]:
# UNQ_C4 GRADED FUNCTION: naive_bayes_predict

def naive_bayes_predict(tweet, logprior, loglikelihood):
    '''
    Score a tweet with the trained Naive Bayes parameters.

    Input:
        tweet: a string
        logprior: a number
        loglikelihood: a dictionary of words mapping to numbers
    Output:
        p: logprior plus the log likelihood of every word of the processed
           tweet that is present in the dictionary (unknown words add nothing)

    '''
    # log likelihoods of the cleaned tokens the model knows, in tweet order
    known_scores = [loglikelihood[token]
                    for token in process_tweet(tweet)
                    if token in loglikelihood]

    # accumulate left to right, starting from the logprior
    return sum(known_scores, logprior)

In [22]:
# UNQ_C5 (UNIQUE CELL IDENTIFIER, DO NOT EDIT)
my_tweet = 'She smiled.'
p = naive_bayes_predict(my_tweet, logprior, loglikelihood)
print('The expected output is', p)

The expected output is 0.5917946862996578


**Expected Output**:
- The expected output is around 1.55
- The sentiment is positive.

In [23]:
# UNQ_C5 (UNIQUE CELL IDENTIFIER, DO NOT EDIT)
my_tweet = 'bjp party ghatiya soch rakhti hai'
p = naive_bayes_predict(my_tweet, logprior, loglikelihood)
print('The expected output is', p)

The expected output is -1.5218028993768866


In [24]:
# Experiment with your own tweet.
my_tweet = 'bjp accha kam kr rhi hai'
p = naive_bayes_predict(my_tweet, logprior, loglikelihood)
print('The expected output is', p)

The expected output is 0.13144902260093805


<a name='ex-4'></a>
### Exercise 4 - test_naive_bayes
Implement test_naive_bayes.

**Instructions**:
* Implement `test_naive_bayes` to check the accuracy of your predictions.
* The function takes in your `test_x`, `test_y`, log_prior, and loglikelihood
* It returns the accuracy of your model.
* First, use `naive_bayes_predict` function to make predictions for each tweet in test_x.

In [25]:
# UNQ_C6 GRADED FUNCTION: test_naive_bayes

def test_naive_bayes(test_x, test_y, logprior, loglikelihood, naive_bayes_predict=naive_bayes_predict):
    """
    Measure the accuracy of the classifier on a labelled set of tweets.

    Input:
        test_x: A list of tweets
        test_y: the corresponding labels for the list of tweets
        logprior: the logprior
        loglikelihood: a dictionary with the loglikelihoods for each word
        naive_bayes_predict: the scoring function, called as
            naive_bayes_predict(tweet, logprior, loglikelihood)
    Output:
        accuracy: (# of tweets classified correctly)/(total # of tweets)
    Raises:
        ValueError: if test_x and test_y do not hold the same number of items
    """
    # Without this check a size mismatch could broadcast silently in the
    # comparison below and yield a meaningless accuracy.
    if len(test_x) != np.size(test_y):
        raise ValueError("test_x and test_y must contain the same number of items")

    # a score above zero is predicted as class 1, anything else as class 0
    y_hats = [1 if naive_bayes_predict(tweet, logprior, loglikelihood) > 0 else 0
              for tweet in test_x]

    # error is the fraction of predictions that differ from the true labels
    error = np.sum(np.squeeze(np.asarray(y_hats)) != np.squeeze(test_y))/len(test_y)

    # Accuracy is 1 minus the error
    accuracy = 1 - error

    return accuracy

In [26]:
print("Naive Bayes accuracy = %0.4f" %
      (test_naive_bayes(test_x, test_y, logprior, loglikelihood)))

Naive Bayes accuracy = 0.6251


In [27]:
# UNQ_C7 (UNIQUE CELL IDENTIFIER, DO NOT EDIT)
# Run this cell to test your function
for tweet in ['I am happy', 'I am bad', 'this movie should have been great.', 'great', 'great great', 'great great great', 'great great great great']:
    # print( '%s -> %f' % (tweet, naive_bayes_predict(tweet, logprior, loglikelihood)))
    p = naive_bayes_predict(tweet, logprior, loglikelihood)
#     print(f'{tweet} -> {p:.2f} ({p_category})')
    print(f'{tweet} -> {p:.2f}')

I am happy -> 0.98
I am bad -> 0.63
this movie should have been great. -> 0.60
great -> 0.98
great great -> 1.37
great great great -> 1.76
great great great great -> 2.15


In [28]:
for tweet in test_pos[60:70]: 
    p = naive_bayes_predict(tweet, logprior, loglikelihood)
    print(f'{tweet} -> {p:.2f}')

India mai baap se phele beta ajata hai Rape se jabi 14th August or 15th August ka difference pata nai, Rape jo bhut huta hai -> 3.30
Sir ap raw ko samjha de wo pak mein kalbhoshan jese pithu bhejna band krde terrorism khud khatam hojae gi.umar pachpan ki harkate bachon si https://twitter.com/chintskap/status/875447907922567170 … -> 4.78
ya allah hitler jesa hukmaran de sdo thatta ko sharm nai roz jamah masjid ki light kharab hoti he or ye log ac room se nikulte tuk nai inko -> -1.92
@rssurjewala jinke khoon mein hai desh or deshwasiyo se gadaari wo hai congress saari. -> -0.68
Aapki soch ko aap k followers ne bhi reject kar diya. Sirf 1400 kuch logo'n ne pasand ki aur taqreeban 1500 RTs, nafrat chore'n Insan bane'n https://twitter.com/chintskap/status/875447907922567170 … -> 3.46
jinke khoon mein hai desh or deshwasiyo se gadaari wo hai congress saari. -> -0.68
aik baat tay hai, india ka media jitni hatred dikhata hai Pakistani media nahin dikhata. Game hai, game ki tarah lo -> 0.24
Be

**Expected Output**:
- I am happy -> 2.14
- I am bad -> -1.31
- this movie should have been great. -> 2.12
- great -> 2.13
- great great -> 4.26
- great great great -> 6.39
- great great great great -> 8.52

In [29]:
for tweet in test_neg[1:5]: 
    p = naive_bayes_predict(tweet, logprior, loglikelihood)
    print(f'{tweet} -> {p:.2f}')

Madarchod Saale gang rape ki paidayish -> 0.86
bs kr de taao hate speech ki -> 0.74
Kashmir mai Jo ho raha hai Kiya wo terrorism nahi -> 3.54
Toilet  nahi Isliye Tu Bahot rape hotay wahan  Rishi kapoor Ranbir Kis ka Beata Hain Haha shakti kapoor yeh Gulshan yeh Amitab -> 3.32


In [30]:
# Feel free to check the sentiment of your own tweet below
my_tweet = 'you are bad :('
naive_bayes_predict(my_tweet, logprior, loglikelihood)

-0.18328037283401155

<a name='4'></a>
## 4 - Filter words by Ratio of Positive to Negative Counts

- Some words have more positive counts than others, and can be considered "more positive".  Likewise, some words can be considered more negative than others.
- One way for us to define the level of positiveness or negativeness, without calculating the log likelihood, is to compare the positive to negative frequency of the word.
    - Note that we can also use the log likelihood calculations to compare relative positivity or negativity of words.
- We can calculate the ratio of positive to negative frequencies of a word.
- Once we're able to calculate these ratios, we can also filter a subset of words that have a minimum ratio of positivity / negativity or higher.
- Similarly, we can also filter a subset of words that have a maximum ratio of positivity / negativity or lower (words that are at least as negative, or even more negative than a given threshold).

<a name='ex-5'></a>
### Exercise 5 - get_ratio
Implement get_ratio.

- Given the freqs dictionary of words and a particular word, use `lookup(freqs,word,1)` to get the positive count of the word.
- Similarly, use the `lookup` function to get the negative count of that word.
- Calculate the ratio of positive divided by negative counts

$$ ratio = \frac{\text{pos_words} + 1}{\text{neg_words} + 1} $$

Where pos_words and neg_words correspond to the frequency of the words in their respective classes. 
<table>
    <tr>
        <td>
            <b>Words</b>
        </td>
        <td>
        Positive word count
        </td>
         <td>
        Negative Word Count
        </td>
  </tr>
    <tr>
        <td>
        glad
        </td>
         <td>
        41
        </td>
    <td>
        2
        </td>
  </tr>
    <tr>
        <td>
        arriv
        </td>
         <td>
        57
        </td>
    <td>
        4
        </td>
  </tr>
    <tr>
        <td>
        :(
        </td>
         <td>
        1
        </td>
    <td>
        3663
        </td>
  </tr>
    <tr>
        <td>
        :-(
        </td>
         <td>
        0
        </td>
    <td>
        378
        </td>
  </tr>
</table>

In [31]:
# UNQ_C8 GRADED FUNCTION: get_ratio

def get_ratio(freqs, word):
    '''
    Report the positive count, negative count and smoothed ratio of a word.

    Input:
        freqs: dictionary containing the words, keyed by (word, label)
        word: the word whose counts are looked up

    Output: a dictionary with keys 'positive', 'negative', and 'ratio'.
        Example: {'positive': 10, 'negative': 20, 'ratio': 0.5}
    '''
    # counts under label 1 (positive) and label 0 (negative)
    positive_count = lookup(freqs, word, 1)
    negative_count = lookup(freqs, word, 0)

    return {
        'positive': positive_count,
        'negative': negative_count,
        # adding 1 to both counts keeps the ratio defined when a count is zero
        'ratio': (positive_count + 1) / (negative_count + 1),
    }


In [32]:
get_ratio(freqs, 'happi')

{'positive': 0, 'negative': 0, 'ratio': 1.0}

<a name='ex-6'></a>
### Exercise 6 - get_words_by_threshold
Implement get_words_by_threshold(freqs,label,threshold)

* If we set the label to 1, then we'll look for all words whose ratio of positive/negative counts is at least as high as the threshold.
* If we set the label to 0, then we'll look for all words whose ratio of positive/negative counts is at most as low as the threshold.
* Use the `get_ratio` function to get a dictionary containing the positive count, negative count, and the ratio of positive to negative counts.
* Store the `get_ratio` dictionary inside another dictionary, where the key is the word, and the value is the dictionary `pos_neg_ratio` that is returned by the `get_ratio` function.
An example key-value pair would have this structure:
```
{'happi':
    {'positive': 10, 'negative': 20, 'ratio': 0.524}
}
```

In [33]:
# UNQ_C9 GRADED FUNCTION: get_words_by_threshold

def get_words_by_threshold(freqs, label, threshold, get_ratio=get_ratio):
    '''
    Select the words whose positive/negative ratio passes a cutoff.

    Input:
        freqs: dictionary of words, keyed by (word, label)
        label: 1 for positive, 0 for negative
        threshold: ratio that will be used as the cutoff for including a word in the returned dictionary
        get_ratio: function called as get_ratio(freqs, word) that returns a
            dictionary with a 'ratio' entry
    Output:
        word_list: dictionary containing the word and information on its positive count, negative count, and ratio of positive to negative counts.
        example of a key value pair:
        {'happi':
            {'positive': 10, 'negative': 20, 'ratio': 0.5}
        }
        With label 1 a word is kept when its ratio is >= threshold, with
        label 0 when its ratio is <= threshold; any other label keeps nothing.
    '''
    word_list = {}

    # A word can occur in freqs under both labels; evaluate each word once.
    seen_words = set()
    for key in freqs.keys():
        word = key[0]
        if word in seen_words:
            continue
        seen_words.add(word)

        # get the positive/negative ratio for a word
        pos_neg_ratio = get_ratio(freqs, word)
        ratio = pos_neg_ratio['ratio']

        # label 1: at least as positive as the threshold
        # label 0: at most as positive as the threshold
        if (label == 1 and ratio >= threshold) or (label == 0 and ratio <= threshold):
            word_list[word] = pos_neg_ratio

    return word_list


In [34]:
# Test your function: find negative words at or below a threshold
get_words_by_threshold(freqs, label=0, threshold=5)

{'knowing': {'positive': 1, 'negative': 0, 'ratio': 2.0},
 'ki': {'positive': 737, 'negative': 590, 'ratio': 1.248730964467005},
 'vikas': {'positive': 27, 'negative': 27, 'ratio': 1.0},
 'kitna': {'positive': 21, 'negative': 15, 'ratio': 1.375},
 'samjhata': {'positive': 1, 'negative': 0, 'ratio': 2.0},
 'hai': {'positive': 1556, 'negative': 984, 'ratio': 1.5807106598984773},
 'priyanka': {'positive': 1, 'negative': 0, 'ratio': 2.0},
 'aur': {'positive': 512, 'negative': 319, 'ratio': 1.603125},
 'itch': {'positive': 1, 'negative': 0, 'ratio': 2.0},
 'guard': {'positive': 1, 'negative': 0, 'ratio': 2.0},
 'luv': {'positive': 8, 'negative': 7, 'ratio': 1.125},
 'ko': {'positive': 742, 'negative': 480, 'ratio': 1.5446985446985446},
 'usne': {'positive': 12, 'negative': 17, 'ratio': 0.7222222222222222},
 'bola': {'positive': 34, 'negative': 17, 'ratio': 1.9444444444444444},
 'tha': {'positive': 175, 'negative': 124, 'ratio': 1.408},
 'ben': {'positive': 2, 'negative': 0, 'ratio': 3.0},
 

In [35]:
# Test your function; find positive words at or above a threshold
get_words_by_threshold(freqs, label=1, threshold=10)

{'bakri': {'positive': 10, 'negative': 0, 'ratio': 11.0},
 'punjab': {'positive': 13, 'negative': 0, 'ratio': 14.0},
 'khatm': {'positive': 15, 'negative': 0, 'ratio': 16.0},
 'episode': {'positive': 9, 'negative': 0, 'ratio': 10.0},
 'stop': {'positive': 11, 'negative': 0, 'ratio': 12.0},
 'himachal': {'positive': 9, 'negative': 0, 'ratio': 10.0},
 'jeeto': {'positive': 10, 'negative': 0, 'ratio': 11.0}}

Notice the difference between the positive and negative ratios. Emojis like :( and words like 'me' tend to have a negative connotation. Other words like glad, community, arrives, tend to be found in the positive tweets.

<a name='5'></a>
## 5 - Error Analysis

In this part you will see some tweets that your model misclassified. Why do you think the misclassifications happened? Were there any assumptions made by your naive bayes model?

In [36]:
# Error analysis: list every test tweet whose predicted class differs from
# its true label, one line per tweet (truth, prediction, cleaned text).
print('Truth Predicted Tweet')
for x, y in zip(test_x, test_y):
    y_hat = naive_bayes_predict(x, logprior, loglikelihood)
    # a positive score means predicted class 1, otherwise class 0
    predicted = np.sign(y_hat) > 0
    if y != predicted:
        # show the cleaned tokens; non-ASCII characters are dropped by the encode
        cleaned = ' '.join(process_tweet(x)).encode('ascii', 'ignore')
        print('%d\t%0.2f\t%s' % (y, predicted, cleaned))

Truth Predicted Tweet
1	0.00	b'salo rape k ilawah aur kiya ate hai porkies'
1	0.00	b'use ghar ki safayi karni hogi .. wish sanya kabhi mere ghar ayi hoti.pkka mera murder kar deti mera room dekh kar proud fatimasanashaikh'
1	0.00	b'india waly nafrat ko badhawa denay ka koi mauka ni choorty un gov apni jaga un fazool channel waly bhi un ki ...'
1	0.00	b'ye sahe kaha par matter solve karo ki tumhare bestie kyun hate karte'
1	0.00	b'kulbhushan bhi katega phansi pe charha ke bhej dein ge independence day pe'
1	0.00	b'wo gareeb awam ja k army ki god kyu beithi hai.kyu waha k terror perpetrators ka sath deti hai'
1	0.00	b'phir badiya hai ... voh kya hai ki hindustan ke hain na sirf pyar aur ekta samajh aati hai ... yeh nafrat ki soch se confuse hote hain'
1	0.00	b'ya allah hitler jesa hukmaran de sdo thatta ko sharm nai roz jamah masjid ki light kharab hoti ye log ac room se nikulte tuk nai inko'
1	0.00	b'jinke khoon mein hai desh deshwasiyo se gadaari wo hai congress saari'
1	0.00	b'jinke k

<a name='6'></a>
## 6 - Predict 

In [40]:
my_tweet = 'ab janata ke praakrtik otomaitik #raiphal ka baarood #nasabandee karake band karoge,,to phir khilauna #bandook to chalaenge hee....  ib kutte karm karoge to pher #kutte kee maut hee maroge    #tera bee tem aagya #kaijriwalassassinationchlaim'
p = naive_bayes_predict(my_tweet, logprior, loglikelihood)
print(p)

-0.5728726547946349


In [38]:
my_tweet = 'kitana achchha samudaay hai ye hamesha gareeb logo ke lie bhandaara arenj karata hai'
p = naive_bayes_predict(my_tweet, logprior, loglikelihood)
print(p)

1.6383123562330117
