**Multinomial Naive
Bayes learning algorithm in Python to classify tweets**

In [69]:
import re
from collections import defaultdict
import os
import pandas as pd
import numpy as np
import math

**Data Loading**

In [70]:
train_pos = '/Users/arjun/Downloads/Lab_Week2-3/dataFiles/train/trainPos.txt'
train_neg = '/Users/arjun/Downloads/Lab_Week2-3/dataFiles/train/trainNeg.txt'
test_pos = '/Users/arjun/Downloads/Lab_Week2-3/dataFiles/test/testPos.txt'
test_neg = '/Users/arjun/Downloads/Lab_Week2-3/dataFiles/test/testNeg.txt'


In [71]:
train_pos_df = pd.read_csv(train_pos, header=None, names=['tweet'], encoding='latin-1', engine='python')

train_neg_df = pd.read_csv(train_neg, header=None, names=['tweet'], encoding='latin-1', engine='python')

test_pos_df = pd.read_csv(test_pos, header=None, names=['tweet'], encoding='latin-1', engine='python')

test_neg_df = pd.read_csv(test_neg, header=None, names=['tweet'], encoding='latin-1', engine='python')

**Stage 1: Vocabulary and Word Frequency**

In [72]:
def preprocess(tweet):
    tweet = tweet.lower()
    tweet = re.sub(r'[^a-z\s]', '', tweet)
    words = tweet.split()
    return words

In [73]:
def build_vocabulary_and_frequencies(train_pos_df, train_neg_df):

    vocab = set()
    pos_dict = {}
    neg_dict = {}

    # positive tweets
    for tweet in train_pos_df['tweet']:
        words = preprocess(tweet)
        for word in words:
            vocab.add(word)
            pos_dict[word] = pos_dict.get(word, 0) + 1

    # negative tweets
    for tweet in train_neg_df['tweet']:
        words = preprocess(tweet)
        for word in words:
            vocab.add(word)
            neg_dict[word] = neg_dict.get(word, 0) + 1

    
    for word in vocab:
        if word not in pos_dict:
            pos_dict[word] = 0
        if word not in neg_dict:
            neg_dict[word] = 0

    print("Vocab size:", len(vocab))
    print("Total positive words:", sum(pos_dict.values()))
    print("Total negative words:", sum(neg_dict.values()))

    return vocab, pos_dict, neg_dict

**Stage 2: Word Probabilities (Multinomial NB with Laplace smoothing)**

In [74]:
def calculate_probabilities(vocab, pos_dict, neg_dict,
                            num_pos_docs, num_neg_docs):

    vocab_size = len(vocab)

    total_pos_words = sum(pos_dict.values())
    total_neg_words = sum(neg_dict.values())

    pos_prob = {}
    neg_prob = {}

    for word in vocab:
        pos_prob[word] = (pos_dict[word] + 1) / (total_pos_words + vocab_size)
        neg_prob[word] = (neg_dict[word] + 1) / (total_neg_words + vocab_size)

    total_docs = num_pos_docs + num_neg_docs
    prior_pos = num_pos_docs / total_docs
    prior_neg = num_neg_docs / total_docs

    return pos_prob, neg_prob, prior_pos, prior_neg, total_pos_words, total_neg_words

**Stage 3: Classify single tweet**

In [80]:
def classify_tweet(tweet, vocab,
                   pos_prob, neg_prob,
                   prior_pos, prior_neg,
                   total_pos_words, total_neg_words):

    words = preprocess(tweet)

    log_pos = math.log(prior_pos)
    log_neg = math.log(prior_neg)

    vocab_size = len(vocab)

    for word in words:
        if word in vocab:
            log_pos += math.log(pos_prob[word])
            log_neg += math.log(neg_prob[word])
        else:
            log_pos += math.log(1 / (total_pos_words + vocab_size))
            log_neg += math.log(1 / (total_neg_words + vocab_size))

    return "pos" if log_pos > log_neg else "neg"

**Evaluation**

In [81]:
def evaluate(test_pos_df, test_neg_df,
             vocab, pos_prob, neg_prob,
             prior_pos, prior_neg,
             total_pos_words, total_neg_words):

    correct_pos = 0
    correct_neg = 0

    # Positive test tweets
    for tweet in test_pos_df['tweet']:
        pred = classify_tweet(tweet, vocab,
                              pos_prob, neg_prob,
                              prior_pos, prior_neg,
                              total_pos_words, total_neg_words)
        if pred == "pos":
            correct_pos += 1

    # Negative test tweets
    for tweet in test_neg_df['tweet']:
        pred = classify_tweet(tweet, vocab,
                              pos_prob, neg_prob,
                              prior_pos, prior_neg,
                              total_pos_words, total_neg_words)
        if pred == "neg":
            correct_neg += 1

    pos_accuracy = correct_pos / len(test_pos_df)
    neg_accuracy = correct_neg / len(test_neg_df)
    avg_accuracy = (pos_accuracy + neg_accuracy) / 2

    print("Positive Accuracy:", round(pos_accuracy, 4))
    print("Negative Accuracy:", round(neg_accuracy, 4))
    print("Average Accuracy:", round(avg_accuracy, 4))

    return pos_accuracy, neg_accuracy, avg_accuracy

**Execution**

In [84]:

vocab, pos_dict, neg_dict = build_vocabulary_and_frequencies(train_pos_df, train_neg_df)


pos_prob, neg_prob, prior_pos, prior_neg, total_pos_words, total_neg_words = calculate_probabilities(vocab, pos_dict, neg_dict,len(train_pos_df), len(train_neg_df))


evaluate(test_pos_df, test_neg_df,vocab, pos_prob, neg_prob,prior_pos, prior_neg,total_pos_words, total_neg_words)

Vocab size: 490317
Total positive words: 5012555
Total negative words: 5368482
Positive Accuracy: 0.739
Negative Accuracy: 0.82
Average Accuracy: 0.7795


(0.739, 0.82, 0.7795)