In [None]:
# Run this every time you open the notebook
%load_ext autoreload
%autoreload 2
from collections import Counter
import lib
import nltk


# Load and inspect the data

In [None]:
# Load the data.
# lib.read_data() returns two lists of tweets:
#   tweets      - used below to estimate the model's probabilities (the training set)
#   test_tweets - used below only to evaluate the finished classifier
tweets, test_tweets = lib.read_data()

# Learn a Naive Bayes classifier

To construct our Naive Bayes classifier, we first need to calculate two things:

### Prior probabilities of categories
We need to calculate $P(C_i)$ for each category $C_i \in \{\text{Energy}, \text{Food}, \text{Medical}, \text{Water}, \text{None}\}$. 

We estimate $P(C_i)$ by $\frac{\text{# tweets about }C_i}{\text{# tweets}}$

### Conditional probabilities of tokens
For each token (i.e. word) $x_j$ and each category $C_i$, we need to calculate $P(x_j|C_i)$.

We estimate $P(x_j|C_i) = \frac{P(x_j \text{ and } C_i)}{P(C_i)}$ by $\frac{\text{# tweets about }C_i \text{ containing }x_j}{\text{# tweets about }C_i}$

In [None]:
# Exercise 1, step-by-step version

# The function below has two arguments: a list of tweets, and a category c
# which is a string equal to one of "Energy", "Food", "Medical", "Water", "None".
# The function should calculate the two things described above.
# Fill in the blanks.


def calc_probs(tweets, c):
    """Estimate the Naive Bayes parameters for a single category from labelled tweets.

    EXERCISE SCAFFOLD: every `__________` and every `...` below is a placeholder
    that you must replace with your own code. Until the blanks are filled in,
    calling this function fails with a NameError.

    Input:
        tweets: a list of tweets. Each tweet exposes `tweet.category` (a string)
            and `tweet.tokenSet` (the tokens of the tweet).
        c: a string representing a category; one of "Energy", "Food", "Medical", "Water", "None".
    Returns:
        prob_c: the prior probability of category c, estimated as
            (# tweets about c) / (# tweets)
        token_probs: a Counter mapping each token to P(token|category c), estimated as
            (# tweets about c containing the token) / (# tweets about c)
    Side effects:
        Prints the prior probability of c, formatted to two decimal places.
    """

    # Step 1: Calculate the total number of tweets
    num_tweets = __________


    # Step 2: Calculate the number of tweets that are about category c.
    # Save the answer to a variable called num_tweets_about_c.
    # Remember c is a string, and you can get the category of a tweet via tweet.category
    ...
    ...
    ...



    # Step 3: Calculate the probability of category c using the answers from Steps 1 and 2.
    # Hint: be careful when you divide two integers!
    # (In Python 2, `/` between two ints discards the fractional part; in Python 3 it does not.)
    prob_c = __________


    # Step 4: Create an empty Counter called token_counts.
    # (We will use it to map each token to the number of category-c tweets containing that token.)
    token_counts = __________


    # Step 5 (tricky): Use a for-loop to iterate over the list of tweets.
    # Use an if-statement to check whether the tweet is in category c.
    # If it is, iterate over the tokens of the tweet (which you can access via tweet.tokenSet) using a for-loop.
    # For each token, increment its count in token_counts.
    # NOTE(review): tokenSet is presumably a set, so a token would be counted at most once
    # per tweet, matching "number of tweets containing the token" - confirm against lib.Tweet.
    ...
    ...
    ...



    # Step 6: Create an empty Counter called token_probs.
    # (We will use it to map each token to P(token | category c),
    # i.e. the fraction of all category-c tweets that contain the token)
    token_probs = __________


    # Step 7: Now fill token_probs.
    # For each token->count in token_counts, you want to add token->fraction to token_probs.
    # Use a for-loop over token_counts.
    # Remember that when you iterate over a dictionary/Counter, you access the keys.
    # You'll need to use the variable num_tweets_about_c (created in Step 2).
    # Be careful when you divide integers!
    ...
    ...
    ...



    # Report the prior so you can sanity-check it as each category is processed.
    print("Class %s has prior probability %.2f" % (c, prob_c))
    return prob_c, token_probs


# Estimate the prior and the token probabilities for each of the five categories.
# Each call also prints that category's prior probability.
# The prob_* and token_probs_* variables defined here are used by the cells below.
prob_food, token_probs_food = calc_probs(tweets, "Food")
prob_water, token_probs_water = calc_probs(tweets, "Water")
prob_energy, token_probs_energy = calc_probs(tweets, "Energy")
prob_medical, token_probs_medical = calc_probs(tweets, "Medical")
prob_none, token_probs_none = calc_probs(tweets, "None")

### See what your model has learnt

In [None]:
# For each category c, print out the tokens that maximize P(c|token)

# Per-category Counters of P(token|c), keyed by category name
token_probs = {
    'Food': token_probs_food,
    'Water': token_probs_water,
    'Energy': token_probs_energy,
    'Medical': token_probs_medical,
    'None': token_probs_none,
}

# Per-category priors P(c), keyed by the same category names
prior_probs = {
    'Food': prob_food,
    'Water': prob_water,
    'Energy': prob_energy,
    'Medical': prob_medical,
    'None': prob_none,
}

lib.most_discriminative(tweets, token_probs, prior_probs)

# Build a Naive Bayes classifier

Now that we've calculated $P(C_i)$ and $P(x_j|C_i)$, we can classify any tweet!

Given a tweet which is a set of tokens $\{x_1,...,x_n\}$, the posterior probability of each category $C_i$ is

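$P(C_i | x_1,...,x_n) \propto P(C_i) \times P(x_1|C_i) \times P(x_2|C_i) \times \dots \times P(x_n|C_i)$

(This product form relies on the "naive" assumption that the tokens are conditionally independent of each other given the category.)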

We just need to calculate this for each category then determine which is largest.

In [None]:
# Exercise 2. 

# Complete this function, which calculates the posterior probability P(c|tweet).

def get_posterior_prob(tweet, prob_c, token_probs):
    """Calculate the posterior P(c|tweet), up to a constant factor.

    The intended result is
        P(c) * P(x_1|c) * P(x_2|c) * ... * P(x_n|c)
    where x_1, ..., x_n are the tokens of the tweet. This is proportional to
    the true posterior, not equal to it.

    EXERCISE SCAFFOLD: the body is left for you to write. Until you assign a
    variable called `posterior`, the final `return posterior` fails with a
    NameError.

    Inputs:
        tweet: a tweet; its tokens are available via tweet.tokenSet
        prob_c: the prior probability of category c
        token_probs: a Counter mapping each token to P(token|c)
    Return:
        A number proportional to the posterior P(c|tweet).
    """

    ##### YOUR CODE STARTS HERE #####

    # Hint: first set posterior to prob_c, then use a for-loop over tweet.tokenSet
    # to repeatedly multiply posterior by P(token|c)





    ##### YOUR CODE ENDS HERE #####

    # `posterior` must have been assigned by your code above.
    return posterior



# Now you've written the function, look at the output for P(Energy|"No power in Riverdale").
# What's gone wrong? 
# Try editing your function above to print out each token and token_probs[token].
# Can you see what went wrong? How might you fix it?

# Build one hand-written tweet and score it against the Energy model only.
# NOTE(review): the 2nd and 3rd lib.Tweet arguments look like the category label and a
# second tag for the tweet - confirm their meaning against lib.Tweet.
riverdale_tweet = lib.Tweet("No power in Riverdale", "Energy", "need")
print("P(Energy|'No power in Riverdale') = ", get_posterior_prob(riverdale_tweet, prob_energy, token_probs_energy))

In [None]:
# This cell defines the classification function, that takes a tweet 
# and decides which category is most likely using the posteriors you just calculated.


# OPTIONAL EXERCISE (come back to it once you've reached the end of the notebook).
# Rewrite this function to be less repetitive i.e. don't repeat things 5 times.
# There are several possible solutions; you might want to use lists or dictionaries.
# You might also want to rewrite the earlier code that computed prob_food, token_probs_food etc.


def classify_nb(tweet):
    """Pick the most likely category for a tweet.

    Scores the tweet against each of the five category models with
    get_posterior_prob, using the notebook-level prob_* and token_probs_*
    variables, and returns the name of the category with the highest score.
    If several categories share the highest score, the first one in the order
    Food, Water, Energy, Medical, None is returned.

    Input:
        tweet
    Output:
        string equal to most-likely category for this tweet
    """
    # Score the tweet under every category (same order as the tie-breaking order).
    food_score = get_posterior_prob(tweet, prob_food, token_probs_food)
    water_score = get_posterior_prob(tweet, prob_water, token_probs_water)
    energy_score = get_posterior_prob(tweet, prob_energy, token_probs_energy)
    medical_score = get_posterior_prob(tweet, prob_medical, token_probs_medical)
    none_score = get_posterior_prob(tweet, prob_none, token_probs_none)

    best_score = max(food_score, water_score, energy_score, medical_score, none_score)

    # Return as soon as a category matches the best score; 'None' is the fallback.
    if food_score == best_score:
        return 'Food'
    if water_score == best_score:
        return 'Water'
    if energy_score == best_score:
        return 'Energy'
    if medical_score == best_score:
        return 'Medical'
    return 'None'

## Evaluate the Naive Bayes classifier

In [None]:
# Compare true labels and predicted labels in a table

# Classify every test tweet; `predictions` is a list of (tweet, predicted category) pairs.
predictions = [(tweet, classify_nb(tweet)) for tweet in test_tweets]
lib.show_predictions(predictions)

In [None]:
# Get average F1 score for the test set

# Same computation as in the previous cell, repeated so that this cell also works
# on its own: a list of (test tweet, predicted category) pairs.
predictions = [(tweet, classify_nb(tweet)) for tweet in test_tweets]
lib.evaluate(predictions)

In [None]:
# Get average F1 score for the TRAINING set.
# Compare with average F1 for test set above. What's the reason for the difference?

# A list of (training tweet, predicted category) pairs, built from the same tweets
# that were used to estimate the probabilities.
trainset_predictions = [(tweet, classify_nb(tweet)) for tweet in tweets]
lib.evaluate(trainset_predictions)

In [None]:
# Show the confusion matrix for `predictions`, i.e. the TEST-set predictions
# computed above (not `trainset_predictions`).
lib.show_confusion_matrix(predictions)