# Programming Machine Learning Lab
# Exercise 08

**General Instructions:**

1. You need to submit the PDF as well as the filled notebook file.
1. Name your submissions by prefixing your matriculation number to the filename. Example, if your MR is 12345 then rename the files as **"12345_Exercise_11.xxx"**
1. Complete all your tasks and then do a clean run before generating the final PDF. (_Clear All Ouputs_ and _Run All_ commands in Jupyter notebook)

**Exercise Specific instructions::**

1. You are allowed to use only NumPy and Pandas (unless stated otherwise). You can use any library for visualizations.

### Part 1

**TF-IDF and BOW**

In this part, you will be working with the IMBD movie review dataset to perform various natural language processing tasks. You need to get the dataset from https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

1. Download and read the dataset (subset the data to only use 10,000 rows).
1. Perform tokenization on the review text.
1. Remove stop words from the tokenized text.
1. Use regular expressions to clean the text, removing any HTML tags, emails, and other unnecessary information.
1. Convert the cleaned data into a TF-IDF and BOW representation from scratch.

*Note: you can use NLTK for all sub-parts except the last*

**Main task**:
Using the BOW and Tf-Idf representation, implement a Naive-Bayes classifier for the data from scratch. Use Laplace smoothing for the implementation **Do not use sklearn for this part** 

[Reference Slide](https://www.ismll.uni-hildesheim.de/lehre/ml-16w/script/ml-09-A8-bayesian-networks.pdf)

In [1]:
import pandas as pd
import numpy as np
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import defaultdict

In [16]:

# Download the dataset from https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews
# Load the dataset and subset it to 10,000 rows
df = pd.read_csv('IMDB Dataset.csv', header=0, index_col=None)
df = df.sample(n=10000, random_state=42)

# Clean text using regular expressions
def clean_text(text):
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    # Remove emails
    text = re.sub(r'\S*@\S*\s?', '', text)
    # Remove other unnecessary information
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    return text

df['cleaned_text'] = df['review'].apply(clean_text)

# Tokenization
df['tokenized_text'] = df['cleaned_text'].apply(lambda x: word_tokenize(x.lower()))

# Remove stop words
stop_words = set(stopwords.words('english'))
df['filtered_text'] = df['tokenized_text'].apply(lambda x: [word for word in x if word.isalpha() and word not in stop_words])

# Prepare data for classification
train_data = df['filtered_text'].to_numpy()[:8000]
test_data = df['filtered_text'].to_numpy()[8000:]
train_target = df['sentiment'].to_numpy()[:8000]
test_target = df['sentiment'].to_numpy()[8000:]


In [21]:
# Naive-Bayes classifier using BOW representation with Laplace smoothing
def train_naive_bayes_bow(data, target):
    vocabulary = set([word for sublist in data for word in sublist])
    word_counts = defaultdict(int)
    class_counts = defaultdict(int)
    total_docs = len(data)
    
    for i in range(total_docs):
        current_class = target[i]
        class_counts[current_class] += 1
        
        for word in data[i]:
            word_counts[(word, current_class)] += 1
    
    return vocabulary, word_counts, class_counts, total_docs

def predict_naive_bayes_bow(vocabulary, word_counts, class_counts, total_docs, number_of_unique_words_in_each_class, document, alpha=1):
    scores = defaultdict(float)
    
    # c is the class label in this iteration
    for c in class_counts:
        scores[c] = (class_counts[c] / total_docs)
        for word in document:
            scores[c] *= ((word_counts[(word, c)] + alpha) / (number_of_unique_words_in_each_class[c] + alpha * len(vocabulary))) * 10000
    # # Normalize scores array
    total_score = sum(scores.values())
    normalized_scores = {label: score / total_score for label, score in scores.items()}
    
    # Get the class with the highest probability
    predicted_class = max(normalized_scores, key=normalized_scores.get)
    
    return predicted_class

# Train the Naive-Bayes classifier with BOW representation
vocabulary, word_counts_per_class, class_counts, total_docs = train_naive_bayes_bow(train_data, train_target)
number_of_unique_words_in_each_class = defaultdict(int)
for t, count in word_counts_per_class.items():
    word = t[0]
    class_label = t[1]
    number_of_unique_words_in_each_class[class_label] += count

# Test the Naive-Bayes classifier on the test set
correct_predictions = 0
for i in range(len(test_data)):
    prediction = predict_naive_bayes_bow(vocabulary, word_counts_per_class, class_counts, total_docs, number_of_unique_words_in_each_class, test_data[i])
    if prediction == test_target[i]:
        correct_predictions += 1

accuracy = correct_predictions / len(test_data)
print(f"Accuracy: {accuracy}")

Accuracy: 0.8395


In [34]:
import math

def calculate_tf_idf(data):
    # Calculate TF (Term Frequency)
    tf = defaultdict(lambda: defaultdict(float))
    for i in range(len(data)):
        for word in data[i]:
            tf[i][word] += 1 / len(data[i])
    
    # Calculate IDF (Inverse Document Frequency)
    idf = defaultdict(float)
    for doc in tf:
        for word in tf[doc]:
            idf[word] += 1
    
    for word in idf:
        idf[word] = math.log(len(data) / (idf[word] + 1))
    
    # Calculate TF-IDF
    tf_idf = defaultdict(lambda: defaultdict(float))
    for doc in tf:
        for word in tf[doc]:
            tf_idf[doc][word] = tf[doc][word] * idf[word]
    
    return tf_idf

def train_naive_bayes_tfidf(data, target):
    tf_idf = calculate_tf_idf(data)
    class_counts = defaultdict(int)
    total_docs = len(data)
    
    for i in range(total_docs):
        current_class = target[i]
        class_counts[current_class] += 1
    
    return tf_idf, class_counts, total_docs

def predict_naive_bayes_tfidf(tf_idf, target, class_counts, total_docs, document, alpha=1):
    scores = defaultdict(float)
    
    # c is the class label in this iteration
    for c in class_counts:
        scores[c] = (class_counts[c] / total_docs)
        for word in document:
            scores[c] *= ((tf_idf.get(word, 0) + alpha) / (sum(tf_idf[i][word] for i in tf_idf) + alpha)) * 1000
    
    # Normalize scores array
    total_score = sum(scores.values())
    normalized_scores = {label: score / total_score for label, score in scores.items()}
    
    # Get the class with the highest probability
    predicted_class = max(normalized_scores, key=normalized_scores.get)
    
    return predicted_class

# Train the Naive-Bayes classifier with TF-IDF representation
tf_idf, class_counts, total_docs = train_naive_bayes_tfidf(train_data, train_target)

# Test the Naive-Bayes classifier on the test set
correct_predictions = 0
for i in range(len(test_data)):
    prediction = predict_naive_bayes_tfidf(tf_idf, train_target, class_counts, total_docs, test_data[i])
    if prediction == test_target[i]:
        correct_predictions += 1

accuracy = correct_predictions / len(test_data)
print(f"Accuracy: {accuracy}")


Accuracy: 1.0


**Evaluation**

Use sklearn implementation of Naive-Bayes classifier and compare the results with your implementation.

In [13]:
### Your code here
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn import metrics

df = pd.read_csv('IMDB Dataset.csv', header=0, index_col=None)
df = df.sample(n=10000, random_state=42)

# Clean text using regular expressions
def clean_text(text):
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    # Remove emails
    text = re.sub(r'\S*@\S*\s?', '', text)
    # Remove other unnecessary information
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    return text

df['cleaned_text'] = df['review'].apply(clean_text)
stop_words = list(stopwords.words('english'))
vectorizer = CountVectorizer(stop_words=stop_words)
X = vectorizer.fit_transform(df['cleaned_text'])
X_train, X_test, y_train, y_test = train_test_split(X, df['sentiment'], test_size=0.2, random_state=42)
clf = MultinomialNB()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
accuracy_sklearn = metrics.accuracy_score(y_test, y_pred)
print(f"Sklearn Naive-Bayes Classifier (with BOW representation) Accuracy: {accuracy_sklearn}")

vectorizer = TfidfVectorizer(stop_words=stop_words)
X = vectorizer.fit_transform(df['cleaned_text'])

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, df['sentiment'], test_size=0.2, random_state=42)

# Train the Multinomial Naive Bayes classifier
clf = MultinomialNB()
clf.fit(X_train, y_train)

# Predictions on the test set
y_pred = clf.predict(X_test)

# Evaluate the sklearn implementation
accuracy_sklearn = metrics.accuracy_score(y_test, y_pred)
print(f"Sklearn Naive-Bayes Classifier Accuracy (with TF-IDF representation): {accuracy_sklearn}")


Sklearn Naive-Bayes Classifier (with BOW representation) Accuracy: 0.8415
Sklearn Naive-Bayes Classifier Accuracy (with TF-IDF representation): 0.853


### Part 2

**N-gram Language Model**


You won't believe what happened ??? !

Is the word "next" on the tip of your tongue? Although there are other possibilities, that is undoubtedly the most likely one. Other options are "after", "after that", and "to them". Our intuition tells us that some sentence endings are more plausible than others, especially when we take into account the previous information, the location of the phrase, and the speaker or author.

N-gram language models simply formalize that intuition. An n-gram model gives each possibility a probability score by solely taking into account the words that came before it. The probability of the word "next" in our example may be 80\%, whereas the probabilities of the words "after" and "then" might be 10\%, 5\%, and 5\%, respectively.

By leveraging these statistics, n-grams fuel the development of language models, which in turn contribute to an overall speech recognition system.

**Main task**:

In this part you are tasked with coding a N-gram language model on the dataset (https://www.kaggle.com/datasets/nltkdata/europarl). Use the english language for the task.


Evaluate your model based on perplexity and generate sentences using n-grams with n={2,3,4,5}. 

*Reading Material: https://web.stanford.edu/~jurafsky/slp3/3.pdf*

In [40]:
### Your code here
import nltk
from nltk import ngrams
from nltk.lm import MLE
from nltk.lm.preprocessing import padded_everygram_pipeline
from nltk.tokenize import word_tokenize
import os

# Download the dataset if not already downloaded
data_folder_path = 'archive_2/europarl_raw/english/'

# Function to load data from .en files in a folder
def load_data_from_folder(folder_path):
    data = []
    for filename in os.listdir(folder_path):
        if filename.endswith(".en"):
            file_path = os.path.join(folder_path, filename)
            with open(file_path, 'r', encoding='utf-8') as file:
                text = file.read()
                tokens = word_tokenize(text)
                tokens = [token for token in tokens if token is not None]
                data.append(tokens)
    return data

# Load data from the local folder
tokenized_text = load_data_from_folder(data_folder_path)

# Split the dataset into training and testing sets
train_size = int(len(tokenized_text) * 0.9)
train_data, test_data = tokenized_text[:train_size], tokenized_text[train_size:]

# Function to train N-gram model and evaluate perplexity
def train_and_evaluate_ngram_model(n, train_data, test_data):
    # Create N-grams
    ngrams_train, vocab = padded_everygram_pipeline(n, train_data)

    # Train the N-gram model
    model = MLE(order=n)
    model.fit(ngrams_train, vocab)

    # Test the model on the test dataset
    ngrams_test = list(ngrams(test_data, n, pad_left=True, pad_right=True))
    perplexity = model.perplexity(ngrams_test)

    return model, perplexity

# Train and evaluate models for N={2, 3, 4, 5}
for n in range(2, 6):
    model, perplexity = train_and_evaluate_ngram_model(n, train_data, test_data)
    print(f"N={n} - Perplexity: {perplexity}")

    # Generate sentences using the trained model
    generated_sentence = model.generate(num_words=10, random_seed=42)
    print(f"Generated sentence for N={n}: {' '.join(generated_sentence)}\n")


[nltk_data] Downloading package punkt to C:\Users\Amir
[nltk_data]     Hossein\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


TypeError: Unsupported type for looking up in vocabulary: <class 'NoneType'>