In [35]:
# Install the NLTK library.
# Uncomment and run this line only once to install NLTK.

# !pip install nltk

In [36]:
# Download NLTK resources (tokenizer and stopwords).
# Uncomment and run these lines once after installing NLTK.

# import nltk
# nltk.download('punkt')
# nltk.download('stopwords')

In [37]:
# Useful functions

# Function to print the results
def print_results(conf_matrix):
    """Print a 2x2 confusion matrix with its accuracy, precision and recall.

    Parameters
    ----------
    conf_matrix : 2x2 indexable (nested list or array)
        Read as ``conf_matrix[actual][predicted]`` with index 1 treated as the
        positive class, i.e. ``[1][1]`` is TP, ``[0][0]`` is TN, ``[0][1]`` is
        FP and ``[1][0]`` is FN.

    Returns
    -------
    None
        The matrix and the three metrics (4 decimal places) are printed.
        A metric whose denominator is zero is printed as ``nan``.
    """
    def _ratio(numerator, denominator):
        # An undefined metric (empty denominator) is reported as NaN rather than
        # raising ZeroDivisionError, e.g. when no sample was predicted positive.
        return numerator / denominator if denominator else float('nan')

    # Extract values from the confusion matrix
    (TN, FP), (FN, TP) = (conf_matrix[0][0], conf_matrix[0][1]), (conf_matrix[1][0], conf_matrix[1][1])

    # Calculate metrics
    accuracy = _ratio(TP + TN, TP + TN + FP + FN)
    precision = _ratio(TP, TP + FP)
    recall = _ratio(TP, TP + FN)

    print('Confusion Matrix:')
    print(conf_matrix)
    # Using f-strings to display metrics with 4 decimal places
    print(f'Accuracy: {accuracy:.4f}')
    print(f'Precision: {precision:.4f}')
    print(f'Recall: {recall:.4f}')

# Function to replace the contraction "n't" inside each token with "not"
def handle_negations(tokens):
    """Rewrite the contraction ``n't`` as ``not`` in every token.

    Parameters
    ----------
    tokens : iterable of str
        Tokens to process. The input is not modified.

    Returns
    -------
    list of str
        A new list of the same length. A stand-alone ``"n't"`` token becomes
        ``"not"``; the substitution is a plain substring replacement, so an
        unsplit token such as ``"don't"`` becomes ``"donot"``.
    """
    # Imported here so the helper does not depend on another notebook cell
    # having imported `re` first.
    import re

    negation_pattern = [
        (re.compile(r"n't"), 'not'),
    ]
    # Apply replacements
    for pattern, replacement in negation_pattern:
        tokens = [pattern.sub(replacement, token) for token in tokens]

    return tokens

# Function to generate n-grams based on terms (tokens)
def generate_ngrams(text, n):
    """Return every run of ``n`` consecutive items of ``text`` as one space-joined string.

    ``text`` is sliced with a sliding window of width ``n``; each window is
    joined with a single space. When ``text`` holds fewer than ``n`` items the
    result is an empty list.
    """
    window_count = len(text) - n + 1
    joined_grams = []
    for start in range(window_count):
        joined_grams.append(' '.join(text[start:start + n]))
    return joined_grams

In [38]:
# Load Data

import os
import pandas as pd

# Root folder of the extracted review dataset; it holds the 'pos' and 'neg'
# sub-folders. Set the TXT_SENTOKEN_DIR environment variable to point the
# notebook at another location; otherwise the original local path is used.
DATA_DIR = os.environ.get(
    'TXT_SENTOKEN_DIR',
    'C:\\Users\\user\\Desktop\\pamak\\current_b\\IR\\HW\\PP2\\txt_sentoken',
)

# Paths to the positive and negative review folders
positive_folder_path = os.path.join(DATA_DIR, 'pos')
negative_folder_path = os.path.join(DATA_DIR, 'neg')

# Function to load reviews from a folder
def load_reviews(folder_path, sentiment):
    """Read every file in ``folder_path`` into a list of review records.

    Parameters
    ----------
    folder_path : str
        Directory whose files are read as UTF-8 text.
    sentiment : str
        Label stored with every record loaded from this folder.

    Returns
    -------
    list of dict
        One ``{'filename', 'review', 'sentiment'}`` dict per file, ordered by
        file name.
    """
    reviews = []
    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order (and any seeded shuffle/split applied later) reproducible.
    for file_name in sorted(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, file_name)
        # Skip sub-directories and other non-file entries that open() cannot read.
        if not os.path.isfile(file_path):
            continue
        with open(file_path, 'r', encoding='utf-8') as file:
            review = file.read()
        reviews.append({'filename': file_name, 'review': review, 'sentiment': sentiment})
    return reviews

# Read both classes from disk; every record carries its sentiment label
positive_reviews = load_reviews(folder_path=positive_folder_path, sentiment='pos')
negative_reviews = load_reviews(folder_path=negative_folder_path, sentiment='neg')

# One dataframe per class
df_positive, df_negative = (
    pd.DataFrame(records) for records in (positive_reviews, negative_reviews)
)

# Stack positives on top of negatives and renumber the rows from 0
df = pd.concat((df_positive, df_negative), ignore_index=True)

In [39]:
# Multinomial Naive Bayes

import re
from nltk.tokenize import word_tokenize
from nltk import ngrams
from sklearn.metrics import confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import KFold, cross_val_score, cross_val_predict

# Function to preprocess a text review for Multinomial Naive Bayes
def preprocess_review_mnb(review):
    """Turn a raw review into a space-separated string of alphabetic tokens.

    The review is tokenized with ``word_tokenize``, ``n't`` is rewritten as
    ``not`` via ``handle_negations``, and only tokens for which
    ``str.isalpha`` is true are kept.
    """
    # Tokenize, then rewrite "n't" as "not" before filtering: "n't" contains an
    # apostrophe and would otherwise be discarded by the isalpha filter below.
    negated_tokens = handle_negations(word_tokenize(review))

    # Keep purely alphabetic tokens; this drops punctuation and also any token
    # containing digits or other non-letter characters.
    return ' '.join(filter(str.isalpha, negated_tokens))

# Preprocess every raw review and store the result next to it
df['preprocessed_review'] = df['review'].apply(preprocess_review_mnb)

# Feature (preprocessed text) and label (sentiment)
X, y = df['preprocessed_review'], df['sentiment']

# 5-fold splitter, shuffled with a fixed seed, and the classifier under test
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
mnb_classifier = MultinomialNB()

# Term-occurrence counts; terms are pruned by document frequency
# (kept when they appear in at least 3.9% and at most 30% of the documents).
# NOTE(review): the vectorizer is fitted on the full corpus before the
# cross-validation split, so vocabulary pruning sees the held-out folds too.
# Confirm this is acceptable, or fit it per fold.
vectorizer = CountVectorizer(min_df=0.039, max_df=0.3)
X_vectorized = vectorizer.fit_transform(X)

# Out-of-fold prediction for every review
y_pred_cv = cross_val_predict(estimator=mnb_classifier, X=X_vectorized, y=y, cv=kfold)

# Confusion matrix of true labels against out-of-fold predictions
conf_matrix = confusion_matrix(y_true=y, y_pred=y_pred_cv)

# Report
print('----- Multinomial Naive Bayes Performance -----')
print_results(conf_matrix)

----- Multinomial Naive Bayes Performance -----
Confusion Matrix:
[[809 191]
 [193 807]]
Accuracy: 0.8080
Precision: 0.8086
Recall: 0.8070


In [41]:
from sklearn.naive_bayes import BernoulliNB

# Bernoulli Naive Bayes

# Function to preprocess a text review for Bernoulli Naive Bayes
def preprocess_review_bnb(review):
    """Turn a raw review into one string of alphabetic tokens followed by their trigrams.

    The review is tokenized with ``word_tokenize`` and filtered with
    ``str.isalpha``. The surviving tokens and the space-joined 3-grams built
    from them by ``generate_ngrams`` are concatenated into a single
    space-separated string. Negation rewriting (``handle_negations``) is not
    applied in this variant.
    """
    # Tokenize and keep purely alphabetic tokens (drops punctuation and any
    # token containing digits or other non-letter characters)
    words = [token for token in word_tokenize(review) if token.isalpha()]

    # 3-grams over the filtered tokens, each joined with a single space.
    # NOTE(review): because the trigrams are space-joined and appended to the
    # same string, a word-level tokenizer applied afterwards will split them
    # back into single words - confirm this is the intended feature set.
    word_trigrams = generate_ngrams(words, 3)

    return ' '.join([*words, *word_trigrams])

# Preprocess every raw review; this overwrites the 'preprocessed_review' column
df['preprocessed_review'] = df['review'].apply(preprocess_review_bnb)

# Feature (preprocessed text) and label (sentiment)
X, y = df['preprocessed_review'], df['sentiment']

# Bernoulli Naive Bayes classifier
bnb_classifier = BernoulliNB()

# Binary term occurrences over 1- to 3-grams, pruned by document frequency
# (kept when present in at least 3.9% and at most 32% of the documents).
# NOTE(review): the vectorizer is fitted on the full corpus before the
# cross-validation split, so vocabulary pruning sees the held-out folds too.
# Confirm this is acceptable, or fit it per fold.
b_vectorizer = CountVectorizer(binary=True, min_df=0.039, max_df=0.32, ngram_range=(1, 3))
X_vectorized = b_vectorizer.fit_transform(X)

# Out-of-fold prediction for every review, reusing the existing `kfold` splitter
y_pred_cv = cross_val_predict(estimator=bnb_classifier, X=X_vectorized, y=y, cv=kfold)

# Confusion matrix of true labels against out-of-fold predictions
conf_matrix = confusion_matrix(y_true=y, y_pred=y_pred_cv)

# Report
print('----- Bernulli Naive Bayes Performance -----')
print_results(conf_matrix)

----- Bernulli Naive Bayes Performance -----
Confusion Matrix:
[[825 175]
 [290 710]]
Accuracy: 0.7675
Precision: 0.8023
Recall: 0.7100
