# COMP 472 - Assignment 3

### Bernard Claveau - 40065756 / Nicolas Eliopoulos - 40059378

<hr>

### Imports

In [1]:
import pandas as pd
import csv
import math

## Naive Bayes Bag-of-Words Classifier [(documentation)](#NB_BOW)

In [2]:
class NB_BOW:
    """Multinomial Naive Bayes bag-of-words classifier for 'yes'/'no' tweets.

    The training DataFrame must provide the columns ``text`` and
    ``q1_label``.  Tweets are lower-cased and split on whitespace.
    Conditional probabilities use additive smoothing and scores are sums of
    base-10 logarithms.  When both scores are equal the tweet is classified
    as 'no'.
    """

    # NOTE: ``filter`` shadows the builtin, but it is part of the public
    # keyword interface and is therefore kept as is.
    def __init__(self, *, filter=False, delta=0.01):
        """Create an untrained classifier.

        Args:
            filter: when true, words that occur only once in the training
                set are removed from the vocabulary.
            delta: additive smoothing constant.  It must be positive so that
                every conditional probability has a finite logarithm.

        Raises:
            ValueError: if ``delta`` is not positive.
        """
        if delta <= 0:
            raise ValueError('delta must be positive')

        self.ts = None
        self.filter = filter
        self.delta = delta
        self.v = {}
        self.p_yes = None
        self.p_no = None
        self.yes_prob = {}
        self.no_prob = {}

    def train(self, ts):
        """Fit the vocabulary, priors and conditionals on ``ts``.

        Args:
            ts: DataFrame with the columns ``text`` and ``q1_label``.

        Raises:
            ValueError: if ``ts`` has no rows.
        """
        if len(ts) == 0:
            raise ValueError('training set is empty')

        # Lower-case a copy so that the caller's DataFrame is left untouched.
        self.ts = ts.assign(text=ts.text.str.lower())
        self.v = self.__build_vocab()
        self.p_yes, self.p_no = self.__calculate_priors()
        self.yes_prob, self.no_prob = self.__calculate_conditionals()

    def predict(self, test):
        """Classify every row of ``test``.

        Args:
            test: DataFrame whose first three columns are, by position, the
                tweet id, the tweet text and the actual label.

        Returns:
            List of tuples of the form
            (tweet_id, predicted_label, score_of_predicted_label,
            actual_label), in row order.

        Raises:
            RuntimeError: if the classifier has not been trained.
        """
        if self.p_yes is None or self.p_no is None:
            raise RuntimeError('train() must be called before predict()')

        ids = test.iloc[:, 0]
        tweets = test.iloc[:, 1].str.lower()
        labels = test.iloc[:, 2]

        # Iterate by position: the index of ``test`` need not be 0..n-1.
        return [
            (tweet_id,) + self.__classify(tweet) + (label,)
            for tweet_id, tweet, label in zip(ids, tweets, labels)
        ]

    def __count_words(self, tweets, vocab=None):
        """Count whitespace-separated words in ``tweets``.

        When ``vocab`` is given, words that are not in it are ignored.
        """
        counts = {}
        for tweet in tweets:
            for w in tweet.split():
                if vocab is None or w in vocab:
                    counts[w] = counts.get(w, 0) + 1
        return counts

    def __build_vocab(self):
        """Return a dict mapping each vocabulary word to its frequency."""
        v = self.__count_words(self.ts.text)

        if self.filter:
            return {word: freq for word, freq in v.items() if freq > 1}
        return v

    def __calculate_priors(self):
        """Return the priors (P(yes), P(no)) as fractions of training rows."""
        counts = self.ts.q1_label.value_counts()
        total = len(self.ts)
        # .get() yields a prior of 0 for a class that never occurs instead of
        # raising KeyError.
        return counts.get('yes', 0) / total, counts.get('no', 0) / total

    def __calculate_conditionals(self):
        """Return the smoothed conditionals (given yes, given no) as dicts."""
        labels = self.ts.q1_label

        # Only vocabulary words are counted, so that the per-class totals
        # match the vocabulary and each distribution sums to 1 even when the
        # vocabulary has been filtered.
        yes_counts = self.__count_words(self.ts.text[labels == 'yes'], self.v)
        no_counts = self.__count_words(self.ts.text[labels == 'no'], self.v)

        tw_yes = sum(yes_counts.values())
        tw_no = sum(no_counts.values())
        smoothing_mass = self.delta * len(self.v)

        yes_prob = {}
        no_prob = {}
        for w in self.v:
            yes_prob[w] = (yes_counts.get(w, 0) + self.delta) / (tw_yes + smoothing_mass)
            no_prob[w] = (no_counts.get(w, 0) + self.delta) / (tw_no + smoothing_mass)

        return yes_prob, no_prob

    def __classify(self, tweet):
        """Return (predicted_label, score_of_predicted_label) for a tweet."""
        yes, no = self.__score_yes(tweet), self.__score_no(tweet)
        return ('yes', yes) if yes > no else ('no', no)

    def __score(self, tweet, prior, conditionals):
        """Return log10(prior) plus log10 of each known word's conditional."""
        # A class that never occurred in training can never be predicted.
        score = math.log10(prior) if prior > 0 else float('-inf')

        for w in tweet.split():
            # Words outside the vocabulary do not contribute to the score.
            if w in self.v:
                score += math.log10(conditionals[w])
        return score

    def __score_yes(self, tweet):
        """Return the log10 score of ``tweet`` for class 'yes'."""
        return self.__score(tweet, self.p_yes, self.yes_prob)

    def __score_no(self, tweet):
        """Return the log10 score of ``tweet`` for class 'no'."""
        return self.__score(tweet, self.p_no, self.no_prob)


## Output Functions

In [3]:
def generate_output_files(prediction, name):
    """Write the trace file and then the evaluation file for one model run.

    Args:
        prediction: list of prediction tuples, passed unchanged to both
            writers.
        name: file-name suffix passed unchanged to both writers.
    """
    for writer in (write_trace, write_eval):
        writer(prediction, name)

In [4]:
def write_trace(prediction, name):
    """Write one trace record per prediction to ``output/trace_<name>``.

    Each record holds the tweet id, the predicted label, the score in
    scientific notation with two decimals, the actual label and the word
    'correct' or 'wrong', separated by two spaces.

    Args:
        prediction: iterable of tuples of the form
            (tweet_id, predicted_label, score, actual_label).
        name: file-name suffix appended to ``output/trace_``.
    """
    import os

    # Create the target directory on first use instead of failing with
    # FileNotFoundError.
    os.makedirs('output', exist_ok=True)

    with open('output/trace_' + name, 'w', encoding='utf-8') as trace_file:
        for tweet_id, predicted, score, actual in prediction:
            result = 'correct' if predicted == actual else 'wrong'
            # NOTE(review): records end with a bare carriage return, as in
            # the original output format - confirm this is what consumers
            # of the file expect.
            trace_file.write(
                f"{tweet_id}  {predicted}  {score:.2e}  {actual}  {result}\r"
            )

In [5]:
def write_eval(prediction, name):
    """Write the evaluation metrics to ``output/eval_<name>``.

    The file holds four records: accuracy, then precision, recall and F1,
    each of the last three as the 'yes' value followed by the 'no' value
    separated by two spaces.  All values are written with four decimals.

    Args:
        prediction: list of prediction tuples, passed to the metric
            functions ``acc``, ``precision``, ``recall`` and ``f1``.
        name: file-name suffix appended to ``output/eval_``.
    """
    import os

    # Compute every metric once instead of once per printed value.
    accuracy = acc(prediction)
    yes_p, no_p = precision(prediction)
    yes_r, no_r = recall(prediction)
    yes_f, no_f = f1(prediction)

    # Create the target directory on first use instead of failing with
    # FileNotFoundError.
    os.makedirs('output', exist_ok=True)

    with open('output/eval_' + name, 'w', encoding='utf-8') as eval_file:
        # NOTE(review): records end with a bare carriage return, as in the
        # original output format - confirm this is what consumers expect.
        eval_file.write(
            f"{accuracy:.4f}\r"
            f"{yes_p:.4f}  {no_p:.4f}\r"
            f"{yes_r:.4f}  {no_r:.4f}\r"
            f"{yes_f:.4f}  {no_f:.4f}\r"
        )

Performance metrics

In [6]:
def acc(prediction):
    """Return the fraction of predictions whose label matches the actual one.

    Args:
        prediction: list of tuples of the form
            (tweet_id, predicted_label, score, actual_label).

    Returns:
        Accuracy as a float in [0, 1]; 0.0 when ``prediction`` is empty.
    """
    if not prediction:
        return 0.0

    correct = sum(1 for p in prediction if p[1] == p[3])
    return correct / len(prediction)

def precision(prediction):
    """Return the per-class precision as (yes_precision, no_precision).

    Precision of a class is the share of rows predicted as that class whose
    actual label is also that class.

    Args:
        prediction: list of tuples of the form
            (tweet_id, predicted_label, score, actual_label).

    Returns:
        Tuple of two floats; a class that was never predicted gets 0.0.
    """
    def class_precision(label):
        actual_of_predicted = [p[3] for p in prediction if p[1] == label]
        if not actual_of_predicted:
            return 0.0
        hits = sum(1 for actual in actual_of_predicted if actual == label)
        return hits / len(actual_of_predicted)

    return class_precision('yes'), class_precision('no')

def recall(prediction):
    """Return the per-class recall as (yes_recall, no_recall).

    Recall of a class is the share of rows whose actual label is that class
    that were also predicted as that class.

    Args:
        prediction: list of tuples of the form
            (tweet_id, predicted_label, score, actual_label).

    Returns:
        Tuple of two floats; a class with no actual rows gets 0.0.
    """
    def class_recall(label):
        predicted_of_actual = [p[1] for p in prediction if p[3] == label]
        if not predicted_of_actual:
            return 0.0
        hits = sum(1 for predicted in predicted_of_actual if predicted == label)
        return hits / len(predicted_of_actual)

    return class_recall('yes'), class_recall('no')

def f1(prediction):
    """Return the per-class F1 measure as (yes_f1, no_f1).

    F1 is computed directly from the confusion counts as
    2*TP / (2*TP + FP + FN), which equals the harmonic mean of precision
    and recall.

    Args:
        prediction: list of tuples of the form
            (tweet_id, predicted_label, score, actual_label).

    Returns:
        Tuple of two floats; a class with no true positives, false
        positives or false negatives gets 0.0.
    """
    def class_f1(label):
        tp = sum(1 for p in prediction if p[1] == label and p[3] == label)
        fp = sum(1 for p in prediction if p[1] == label and p[3] != label)
        fn = sum(1 for p in prediction if p[1] != label and p[3] == label)
        denominator = 2 * tp + fp + fn
        return 2 * tp / denominator if denominator else 0.0

    return class_f1('yes'), class_f1('no')

## Generate Output

Get training set

In [7]:
# Load the training data from a tab-separated file and keep only its first
# three columns.
training_set = None

with open('covid_training.tsv', 'r', encoding='utf-8') as f:
    # read_csv is called without header=None, so the first row of the file
    # supplies the column names.
    training_set = pd.read_csv(f, sep='\t', encoding='utf-8').iloc[:, :3]

Get test set

In [8]:
# Load the test data from a tab-separated file.
test_set = None

with open('covid_test_public.tsv', 'r', encoding='utf-8') as f:
    # header=None: the first row is read as data, so the columns are
    # numbered rather than named.
    test_set = pd.read_csv(f, sep='\t', encoding='utf-8', header=None)

### Original Vocabulary - NB-BOW-OV

In [9]:
# Build a classifier with the default settings (no vocabulary filtering).
nb_bow_ov = NB_BOW()

# Train on the training set, then classify the test set.
nb_bow_ov.train(training_set)
pred_ov = nb_bow_ov.predict(test_set)

# Write the trace and evaluation files for this model.
generate_output_files(pred_ov, 'NB-BOW-OV.txt')

### Filtered Vocabulary - NB-BOW-FV

In [10]:
# Build a classifier with vocabulary filtering enabled.
nb_bow_fv = NB_BOW(filter=True)

# Train on the training set, then classify the test set.
nb_bow_fv.train(training_set)
pred_fv = nb_bow_fv.predict(test_set)

# Write the trace and evaluation files for this model.
generate_output_files(pred_fv, 'NB-BOW-FV.txt')

<hr>

## NB_BOW

### Fields

- **ts** - the DataFrame of the training set with columns *tweet_id*, *text*, and *q1_label*
- **filter** - whether or not to filter the vocabulary
- **v** - the vocabulary of the model
- **p_yes** - the prior of class **yes**
- **p_no** - the prior of class **no**
- **yes_prob** - a dict of the conditionals for class **yes**
- **no_prob** - a dict of the conditionals for class **no**

<hr>

### Public methods

#### train

- Trains the classifier with the given training set.

#### predict

- Predicts the classes of the given test set.
- **Return:** list of tuples of the form (tweet_id, predicted_label, score_of_predicted_label, actual_label)

<hr>

### Private methods

#### build_vocab

- Converts all tweets to lowercase.
- Loops through all tweets and all words in each tweet (separated by whitespace) and adds each word to the dictionary or updates its frequency.
- If **filter** is true, then words that appear only once are filtered out of the vocabulary.
- **Return:** vocabulary (dict)

#### calculate_priors

- Calculates the prior probabilities of the classes **yes** and **no**.
- **Return:** prior of **yes** (float), prior of **no** (float)

#### calculate_conditionals

- Calculates the conditional probabilities of each word in the vocabulary for both classes, using δ = 0.01 smoothing.
- **Return:** conditionals given **yes** (dict), conditionals given **no** (dict)

#### classify

- Classifies a tweet as either **yes** or **no**.
- **Return:** tuple of the form (predicted_label, score_of_predicted_label)

#### score_yes

- Calculates the score for **yes** of a tweet using log<sub>10</sub>.
- **Return:** score for **yes** (float)

#### score_no

- Calculates the score for **no** of a tweet using log<sub>10</sub>.
- **Return:** score for **no** (float)