In [7]:
import os

def load_reviews(path):
    """Read a sentiment-labelled review corpus from disk.

    ``path`` must contain a ``pos`` and a ``neg`` sub-folder. Every regular
    file inside them is read as UTF-8 text; files under ``pos`` get label 1
    and files under ``neg`` get label 0.

    Parameters
    ----------
    path : str
        Directory holding the ``pos`` and ``neg`` sub-folders.

    Returns
    -------
    tuple[list[str], list[int]]
        ``(texts, labels)`` as parallel lists: all ``pos`` documents first,
        then all ``neg`` documents, each group ordered by file name.

    Raises
    ------
    FileNotFoundError
        If either sub-folder does not exist (raised by ``os.listdir``).
    """
    texts, labels = [], []
    for sentiment, label in (('pos', 1), ('neg', 0)):
        folder = os.path.join(path, sentiment)
        # os.listdir returns entries in arbitrary order; sorting makes the
        # corpus order (and everything derived from it) reproducible.
        for file_name in sorted(os.listdir(folder)):
            file_path = os.path.join(folder, file_name)
            # Skip sub-directories and other non-file entries instead of
            # crashing when trying to open them.
            if not os.path.isfile(file_path):
                continue
            with open(file_path, 'r', encoding='utf-8') as f:
                texts.append(f.read())
            labels.append(label)
    return texts, labels

# Read the training and test splits; each call yields a (texts, labels) pair.
(train_texts, train_labels), (test_texts, test_labels) = (
    load_reviews(split_dir) for split_dir in ('./train', './test')
)

In [8]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Two text representations with default settings:
# Bag-of-Words (raw term counts) and TF-IDF weights.
bow_vectorizer, tfidf_vectorizer = CountVectorizer(), TfidfVectorizer()

# Vocabulary (and IDF weights) are learned from the training texts only.
X_train_bow = bow_vectorizer.fit_transform(train_texts)
X_train_tfidf = tfidf_vectorizer.fit_transform(train_texts)

# The test texts are projected onto the vocabulary fitted above.
X_test_bow = bow_vectorizer.transform(test_texts)
X_test_tfidf = tfidf_vectorizer.transform(test_texts)


In [9]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


def evaluate(y_true, y_pred):
    """Score predictions against the reference labels.

    Both arguments are forwarded unchanged to the scikit-learn metric
    functions.

    Returns a dict with the keys 'Accuracy', 'Precision', 'Recall' and
    'F1 Score', in that order.
    """
    scorers = (
        ('Accuracy', accuracy_score),
        ('Precision', precision_score),
        ('Recall', recall_score),
        ('F1 Score', f1_score),
    )
    return {metric: scorer(y_true, y_pred) for metric, scorer in scorers}
# Baseline experiments: each classifier is fitted on a training matrix and
# scored on the matching test matrix. Scores are collected in `results`,
# keyed by experiment name.
results = dict()

# Multinomial Naive Bayes, Bag-of-Words features
nb_bow = MultinomialNB().fit(X_train_bow, train_labels)
y_pred_nb_bow = nb_bow.predict(X_test_bow)
results['NaiveBayes_BoW'] = evaluate(test_labels, y_pred_nb_bow)

# L2 logistic regression, Bag-of-Words features
lr_bow = LogisticRegression(
    max_iter=1000, solver='liblinear', penalty='l2'
).fit(X_train_bow, train_labels)
y_pred_lr_bow = lr_bow.predict(X_test_bow)
results['LogReg_BoW'] = evaluate(test_labels, y_pred_lr_bow)

# Multinomial Naive Bayes, TF-IDF features
nb_tfidf = MultinomialNB().fit(X_train_tfidf, train_labels)
y_pred_nb_tfidf = nb_tfidf.predict(X_test_tfidf)
results['NaiveBayes_TFIDF'] = evaluate(test_labels, y_pred_nb_tfidf)

# L2 logistic regression, TF-IDF features
lr_tfidf = LogisticRegression(
    max_iter=1000, solver='liblinear', penalty='l2'
).fit(X_train_tfidf, train_labels)
y_pred_lr_tfidf = lr_tfidf.predict(X_test_tfidf)
results['LogReg_TFIDF'] = evaluate(test_labels, y_pred_lr_tfidf)


In [10]:
import pandas as pd

# Tabulate the scores: one row per experiment, one column per metric.
df_results = pd.DataFrame(results).transpose()
print(df_results)

# Persist the table as tab-separated text with four decimal places.
df_results.to_csv("results.txt", float_format="%.4f", sep="\t")

                  Accuracy  Precision   Recall  F1 Score
NaiveBayes_BoW     0.81356   0.860546  0.74840  0.800565
LogReg_BoW         0.86672   0.872622  0.85880  0.865656
NaiveBayes_TFIDF   0.82956   0.874126  0.77000  0.818766
LogReg_TFIDF       0.88316   0.884113  0.88192  0.883015


In [11]:

# Show the baseline rows grouped by classifier, so each BoW score sits
# directly above its TF-IDF counterpart.
print(
    "BoW vs TF-IDF comparison:",
    df_results.loc[
        ["NaiveBayes_BoW", "NaiveBayes_TFIDF", "LogReg_BoW", "LogReg_TFIDF"]
    ],
    sep="\n",
)

BoW vs TF-IDF comparison:
                  Accuracy  Precision   Recall  F1 Score
NaiveBayes_BoW     0.81356   0.860546  0.74840  0.800565
NaiveBayes_TFIDF   0.82956   0.874126  0.77000  0.818766
LogReg_BoW         0.86672   0.872622  0.85880  0.865656
LogReg_TFIDF       0.88316   0.884113  0.88192  0.883015


In [12]:
# Same two representations as before, but with scikit-learn's built-in
# English stop-word list removed from the vocabulary.
bow_sw = CountVectorizer(stop_words='english')
tfidf_sw = TfidfVectorizer(stop_words='english')

# Fit on the training texts only.
X_train_bow_sw = bow_sw.fit_transform(train_texts)
X_train_tfidf_sw = tfidf_sw.fit_transform(train_texts)

# Project the test texts onto the fitted vocabularies.
X_test_bow_sw = bow_sw.transform(test_texts)
X_test_tfidf_sw = tfidf_sw.transform(test_texts)

In [13]:
# Stop-word-filtered variants of the four baseline experiments. The logistic
# regressions use the same settings as the baseline models (liblinear, L2,
# max_iter=1000) so that stop-word removal is the only thing that differs
# between each pair of rows being compared.

# Naive Bayes on BoW with stop words removed
nb_bow_sw = MultinomialNB().fit(X_train_bow_sw, train_labels)
results['NaiveBayes_BoW_Stopwords'] = evaluate(test_labels, nb_bow_sw.predict(X_test_bow_sw))

# Logistic Regression on BoW with stop words removed
lr_bow_sw = LogisticRegression(
    penalty='l2', solver='liblinear', max_iter=1000
).fit(X_train_bow_sw, train_labels)
results['LogReg_BoW_Stopwords'] = evaluate(test_labels, lr_bow_sw.predict(X_test_bow_sw))

# Naive Bayes on TF-IDF with stop words removed
nb_tfidf_sw = MultinomialNB().fit(X_train_tfidf_sw, train_labels)
results['NaiveBayes_TFIDF_Stopwords'] = evaluate(test_labels, nb_tfidf_sw.predict(X_test_tfidf_sw))

# Logistic Regression on TF-IDF with stop words removed
lr_tfidf_sw = LogisticRegression(
    penalty='l2', solver='liblinear', max_iter=1000
).fit(X_train_tfidf_sw, train_labels)
results['LogReg_TFIDF_Stopwords'] = evaluate(test_labels, lr_tfidf_sw.predict(X_test_tfidf_sw))


In [15]:
# Regularisation comparison on the stop-word-filtered TF-IDF features.
# Both solvers used here shuffle the data internally, so random_state is
# pinned to make the reported scores reproducible across runs.

# L2 (already done above)
# L1 -- max_iter matches the baseline logistic regressions.
lr_l1 = LogisticRegression(
    penalty='l1', solver='liblinear', max_iter=1000, random_state=42
)
lr_l1.fit(X_train_tfidf_sw, train_labels)
results['LogReg_TFIDF_Stopwords_L1'] = evaluate(test_labels, lr_l1.predict(X_test_tfidf_sw))

# No regularization with more iterations
# penalty=None replaces the string 'none', which scikit-learn deprecated in
# 1.2 and removed in 1.4 (requires scikit-learn >= 1.2).
lr_none = LogisticRegression(
    penalty=None, solver='saga', max_iter=5000, random_state=42
)
lr_none.fit(X_train_tfidf_sw, train_labels)
results['LogReg_TFIDF_Stopwords_None'] = evaluate(test_labels, lr_none.predict(X_test_tfidf_sw))


In [16]:
# Lift pandas' row limit so the full table is printed without truncation.
pd.set_option("display.max_rows", None)

# One row per experiment recorded so far, one column per metric.
df_all = pd.DataFrame(results).transpose()
print(df_all)

# Optional: save the complete table to results.txt (tab-separated, 4 decimals).
df_all.to_csv("results.txt", float_format="%.4f", sep="\t")


                             Accuracy  Precision   Recall  F1 Score
NaiveBayes_BoW                0.81356   0.860546  0.74840  0.800565
LogReg_BoW                    0.86672   0.872622  0.85880  0.865656
NaiveBayes_TFIDF              0.82956   0.874126  0.77000  0.818766
LogReg_TFIDF                  0.88316   0.884113  0.88192  0.883015
NaiveBayes_BoW_Stopwords      0.81968   0.863207  0.75976  0.808187
LogReg_BoW_Stopwords          0.85908   0.865602  0.85016  0.857812
NaiveBayes_TFIDF_Stopwords    0.82992   0.865279  0.78152  0.821269
LogReg_TFIDF_Stopwords        0.87900   0.877881  0.88048  0.879179
LogReg_TFIDF_Stopwords_L1     0.87336   0.863417  0.88704  0.875069
LogReg_TFIDF_Stopwords_None   0.84768   0.859530  0.83120  0.845128
