# Data & Preprocessing

Requirements:
- Unigrams and bigrams
- Folds 1-4 (training and tuning) <--- flexible
- Fold 5 (validation) <-- flexible
! Use cross-validation or (for random forests) out-of-bag evaluation to select the values of the hyperparameters of the algorithms on the training set.


Deceptive = 1
Truthful = 0

### Loading Data

In [12]:
import os
import glob

def load_reviews_simple(base_path):
    """Load every review under ``base_path`` together with its label and fold.

    The expected layout is ``base_path/<class_dir>/<fold_dir>/*.txt`` where
    ``<class_dir>`` is ``deceptive_from_MTurk`` (label 1) or
    ``truthful_from_Web`` (label 0). Entries directly under a class directory
    that are not directories are ignored.

    Args:
        base_path: directory containing the two class directories.

    Returns:
        A ``(texts, labels, folds)`` tuple of three parallel lists: the
        stripped file contents, the integer label (1 = deceptive,
        0 = truthful) and the name of the fold directory the file was in.
        All deceptive reviews come first, then all truthful ones; within a
        class the folds and the files inside each fold are in sorted order.

    Raises:
        OSError: if a class directory is missing or a file cannot be read.
    """
    texts, labels, folds = [], [], []

    for label_name, label_val in [("deceptive_from_MTurk", 1),
                                  ("truthful_from_Web", 0)]:
        label_dir = os.path.join(base_path, label_name)

        for fold_name in sorted(os.listdir(label_dir)):
            fold_dir = os.path.join(label_dir, fold_name)
            if not os.path.isdir(fold_dir):
                continue

            # Escape the directory part so characters such as "[" in the path
            # are not interpreted as glob wildcards.
            pattern = os.path.join(glob.escape(fold_dir), "*.txt")

            # glob returns matches in filesystem-dependent order; sorting makes
            # the dataset order (and any seeded shuffle built on top of it)
            # reproducible across machines.
            for review_path in sorted(glob.glob(pattern)):
                with open(review_path, "r", encoding="utf-8") as f:
                    texts.append(f.read().strip())
                labels.append(label_val)
                folds.append(fold_name)

    return texts, labels, folds


In [13]:
# Machine-specific location of the negative_polarity data; change it (or use
# one of the alternatives kept in the string below) before running elsewhere.
base_path = r"C:\Users\irene\Documents\irene\Università\UU\DM\DM Assignment\negative_polarity"

'''
#base_path = "/content/drive/MyDrive/Data Mining G25/Data/negative_polarity" # Jagoda's path
base_path = "/content/drive/MyDrive/_AI_Master/DataMining/Data Mining G25/Data/negative_polarity" # Sara's path
'''
texts, labels, folds = load_reviews_simple(base_path)

print(len(texts))       # number of reviews loaded (800 in the recorded run)
print(set(folds))       # the distinct fold names: 'fold1' ... 'fold5'
print(labels[:10])      # all 1's is expected: the deceptive class (label 1) is loaded before the truthful class


800
{'fold1', 'fold4', 'fold5', 'fold2', 'fold3'}
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [14]:
# Dump every label to eyeball the class ordering (the loader appends all
# deceptive reviews, label 1, before the truthful ones, label 0).
print(labels)

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

### Caching data

In [15]:
# Disabled one-off step: the caching code is wrapped in a string literal so
# running this cell does nothing except echo the string.
'''
from joblib import dump
dump((texts, labels, folds), "/content/drive/MyDrive/Data Mining G25/reviews.joblib")
'''

'\nfrom joblib import dump\ndump((texts, labels, folds), "/content/drive/MyDrive/Data Mining G25/reviews.joblib")\n'

In [16]:
from joblib import load

# Machine-specific location of the cached dataset; pick the line that matches
# the machine this runs on.
#path = "/content/drive/MyDrive/Data Mining G25/reviews.joblib" # Jagoda's path
#path = "/content/drive/MyDrive/_AI_Master/DataMining/Data Mining G25/reviews.joblib" # Sara's path
path = r"C:\Users\irene\Documents\irene\Università\UU\DM\DM Assignment\reviews.joblib"


# Replaces the texts/labels/folds produced by the loading cell with the cached copy.
# NOTE: joblib.load unpickles the file, so only load caches you created yourself.
texts, labels, folds = load(path)

### Data Preprocessing: Vectorization, Uni- & Bigrams, Stratified Folds

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import StratifiedKFold

def prepare_data(texts, labels, folds, ngram_range=(1,1), min_df=2, use_tfidf=True, cv_folds=5, random_state=42):
    """
    Split the reviews into train / held-out parts, vectorize them and build a CV splitter.

    Every item whose fold name is "fold5" goes to the held-out set; all other
    items form the training set.

    Args:
        texts, labels, folds: parallel dataset lists (one entry per review)
        ngram_range (tuple): (1,1)=unigrams, (1,2)=unigrams+bigrams
        min_df: minimum document frequency passed to the vectorizer; with 2,
            only terms occurring in at least 2 training reviews are kept
        use_tfidf: True -> TfidfVectorizer, False -> CountVectorizer (raw counts)
        cv_folds (int): number of splits of the returned StratifiedKFold
        random_state: seed for the shuffling done by the StratifiedKFold

    Returns:
        X_train, y_train, X_test, y_test, vectorizer, cv_splitter
    """
    # One flag per review: True when the review belongs to the held-out fold.
    held_out = [fold_name == "fold5" for fold_name in folds]

    train_texts = [text for text, is_test in zip(texts, held_out) if not is_test]
    test_texts = [text for text, is_test in zip(texts, held_out) if is_test]

    y_train = [label for label, is_test in zip(labels, held_out) if not is_test]
    y_test = [label for label, is_test in zip(labels, held_out) if is_test]

    # TF-IDF weights or plain bag-of-words counts, same n-gram / sparsity settings.
    if use_tfidf:
        vectorizer = TfidfVectorizer(ngram_range=ngram_range, min_df=min_df)
    else:
        vectorizer = CountVectorizer(ngram_range=ngram_range, min_df=min_df)

    # The vocabulary is learned from the training reviews only and then
    # re-used, unchanged, to encode the held-out reviews.
    X_train = vectorizer.fit_transform(train_texts)
    X_test = vectorizer.transform(test_texts)

    # Shuffled, seeded stratified splitter, meant to be handed to GridSearchCV later.
    cv_splitter = StratifiedKFold(
        n_splits=cv_folds,
        shuffle=True,
        random_state=random_state,
    )

    return X_train, y_train, X_test, y_test, vectorizer, cv_splitter


# Training Pipeline (Naive Bayes, decision tree, gradient boosting; others TO ADD)

In [None]:
import pandas as pd
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, roc_auc_score
)


# Training each model
def train_model(model_name, X_train, y_train, cv, param_grid, scoring="f1", n_jobs=-1):
    """Tune one classifier with GridSearchCV and return the fitted search object.

    Args:
        model_name: one of "naive_bayes", "decision_tree" or "gradient_boosting".
        X_train: training feature matrix (as produced by ``prepare_data``).
        y_train: training labels.
        cv: cross-validation splitter (or fold count) forwarded to GridSearchCV.
        param_grid: hyperparameter grid for the chosen model.
        scoring: metric GridSearchCV optimises.
        n_jobs: number of parallel jobs forwarded to GridSearchCV.

    Returns:
        The fitted ``GridSearchCV`` instance.

    Raises:
        NotImplementedError: if ``model_name`` is not one of the supported names.
    """
    if model_name == "naive_bayes":
        model = MultinomialNB()
    elif model_name == "decision_tree":
        model = DecisionTreeClassifier(random_state=42)
    elif model_name == "gradient_boosting":
        model = GradientBoostingClassifier(random_state=42)
    else:
        # Fail with a clear message instead of hitting an unbound `model` below.
        # To support another model (e.g. "svm" -> LinearSVC()), add a branch above.
        raise NotImplementedError(f"Model '{model_name}' not implemented yet.")

    grid = GridSearchCV(model, param_grid=param_grid, cv=cv, scoring=scoring, n_jobs=n_jobs)
    grid.fit(X_train, y_train)
    return grid


# Experiment runner
def run_experiments(
    texts, labels, folds,
    ngram_ranges=((1, 1), (1, 2)),
    min_dfs=(2, 4, 6),
    use_tfidf=True,
    cv_folds_list=(3, 5, 10),
    random_state=42,
    models_and_params=None,  # dict: {"model": param_grid}
    scoring="f1",
    n_jobs=-1,
    save_prefix="experiment"
):
    """Grid over preprocessing settings for each model and report held-out scores.

    For every model and every combination of n-gram range, ``min_df`` and
    number of CV folds, the data is prepared with ``prepare_data``, the model
    is tuned with ``train_model`` (GridSearchCV on the training part) and the
    best estimator is scored on the held-out "fold5" reviews.

    Side effects: for each model two CSV files are written,
    ``{save_prefix}_{model}_overview.csv`` (one row per combination, sorted by
    held-out F1) and ``{save_prefix}_{model}_misclassified.csv`` (one row per
    misclassified held-out review), and progress is printed.

    Args:
        texts, labels, folds: parallel dataset lists.
        ngram_ranges: n-gram ranges to try.
        min_dfs: ``min_df`` values to try.
        use_tfidf: forwarded to ``prepare_data``.
        cv_folds_list: numbers of CV folds to try.
        random_state: forwarded to ``prepare_data``.
        models_and_params: ``{model_name: param_grid}``; ``None`` selects the
            built-in grids for naive Bayes, decision tree and gradient boosting.
        scoring: metric optimised by the grid search.
        n_jobs: parallel jobs for the grid search.
        save_prefix: path prefix of the CSV files.

    Returns:
        ``(results_df, misclassified_df)`` covering all models, in run order.
    """
    from itertools import product

    # default models
    if models_and_params is None:
        models_and_params = {
            "naive_bayes": {"alpha": [0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 1]},

            "decision_tree": {
                "criterion": ["gini", "entropy"],
                "max_depth": [None, 5, 10, 20],
                "ccp_alpha": [0.0, 0.001, 0.01, 0.1],
                "min_samples_split": [2, 5, 10],
                "min_samples_leaf": [1, 2, 5],
            },

            "gradient_boosting": {
                "n_estimators": [50, 100, 200],      # B: number of trees
                "learning_rate": [0.01, 0.05, 0.1],  # λ: shrinkage
                "max_depth": [2, 3, 5],              # maximum depth of each tree
            },
            # "logreg": {"C": [0.1, 1, 10]},
            # "svm": {"C": [0.1, 1, 10]}
        }

    # Original held-out texts, needed to report misclassified reviews. They do
    # not depend on any loop variable, so build the list once.
    test_texts = [t for t, f in zip(texts, folds) if f == "fold5"]

    all_results_global = []
    all_misclassified_global = []

    # loop through models
    for model_name, param_grid in models_and_params.items():
        print(f"MODEL: {model_name.upper()}")

        all_results = []
        all_misclassified = []

        # product() iterates in the same order as three nested for-loops.
        for ngram_range, min_df, cv_folds in product(ngram_ranges, min_dfs, cv_folds_list):
            print(f"\n>>> ngram={ngram_range}, min_df={min_df}, folds={cv_folds}")

            # prepare data
            X_train, y_train, X_test, y_test, vec, cv = prepare_data(
                texts, labels, folds,
                ngram_range=ngram_range,
                min_df=min_df,
                use_tfidf=use_tfidf,
                cv_folds=cv_folds,
                random_state=random_state
            )

            # train (hyperparameters are chosen by CV on the training part only)
            grid = train_model(model_name, X_train, y_train, cv, param_grid, scoring=scoring, n_jobs=n_jobs)
            best_model = grid.best_estimator_
            y_pred = best_model.predict(X_test)

            # metrics on the held-out fold
            acc = accuracy_score(y_test, y_pred)
            prec = precision_score(y_test, y_pred)
            rec = recall_score(y_test, y_pred)
            f1 = f1_score(y_test, y_pred)
            f1_macro = f1_score(y_test, y_pred, average="macro")
            f1_weighted = f1_score(y_test, y_pred, average="weighted")

            # Fix the label set so the matrix is always 2x2, even when only one
            # class shows up in y_test / y_pred.
            tn, fp, fn, tp = confusion_matrix(y_test, y_pred, labels=[0, 1]).ravel()

            # AUC needs class probabilities; models without predict_proba get NaN.
            auc = np.nan
            if hasattr(best_model, "predict_proba"):
                try:
                    y_proba = best_model.predict_proba(X_test)[:, 1]
                    auc = roc_auc_score(y_test, y_proba)
                except (ValueError, IndexError):
                    # e.g. AUC is undefined when only one class is present
                    auc = np.nan

            std_cv_f1 = grid.cv_results_["std_test_score"][grid.best_index_]
            vocab_size = len(vec.get_feature_names_out())

            # misclassified reviews
            for text, true, pred in zip(test_texts, y_test, y_pred):
                if true != pred:
                    all_misclassified.append({
                        "model": model_name,
                        "ngram_range": str(ngram_range),
                        "min_df": min_df,
                        "cv_folds": cv_folds,
                        "true_label": true,
                        "pred_label": pred,
                        "text": text
                    })

            # collect results
            all_results.append({
                "model": model_name,
                "ngram_range": str(ngram_range),
                "min_df": min_df,
                "cv_folds": cv_folds,
                "vocab_size": vocab_size,
                "best_params": grid.best_params_,
                "best_cv_f1_mean": grid.best_score_,
                "best_cv_f1_std": std_cv_f1,
                "test_acc": acc,
                "test_prec": prec,
                "test_rec": rec,
                "test_f1": f1,
                "test_f1_macro": f1_macro,
                "test_f1_weighted": f1_weighted,
                "TN": tn, "FP": fp, "FN": fn, "TP": tp,
                "test_auc": auc
            })

        # Save files
        df_results = pd.DataFrame(all_results)
        if not df_results.empty:
            # An empty grid yields a frame without a "test_f1" column to sort by.
            df_results = df_results.sort_values(by="test_f1", ascending=False).reset_index(drop=True)
        df_misclassified = pd.DataFrame(all_misclassified)

        res_file = f"{save_prefix}_{model_name}_overview.csv"
        mis_file = f"{save_prefix}_{model_name}_misclassified.csv"

        df_results.to_csv(res_file, index=False)
        df_misclassified.to_csv(mis_file, index=False)

        print(f"Saved {model_name.upper()} overview to {res_file}")
        print(f"Saved misclassified samples to {mis_file}")

        all_results_global.extend(all_results)
        all_misclassified_global.extend(all_misclassified)

        # short summary
        print("\nTop 3 results for", model_name)
        if not df_results.empty:
            print(df_results.head(3)[["ngram_range", "min_df", "cv_folds", "test_f1", "test_acc"]])

    return pd.DataFrame(all_results_global), pd.DataFrame(all_misclassified_global)


# Run the whole grid with the built-in default models (models_and_params=None).
# Per-model CSV files are written using the default "experiment" prefix.
df_all, df_all_mis = run_experiments(texts, labels, folds, models_and_params=None)

MODEL: NAIVE_BAYES

>>> ngram=(1, 1), min_df=2, folds=3

>>> ngram=(1, 1), min_df=2, folds=5

>>> ngram=(1, 1), min_df=2, folds=10

>>> ngram=(1, 1), min_df=4, folds=3

>>> ngram=(1, 1), min_df=4, folds=5

>>> ngram=(1, 1), min_df=4, folds=10

>>> ngram=(1, 1), min_df=6, folds=3

>>> ngram=(1, 1), min_df=6, folds=5

>>> ngram=(1, 1), min_df=6, folds=10

>>> ngram=(1, 2), min_df=2, folds=3

>>> ngram=(1, 2), min_df=2, folds=5

>>> ngram=(1, 2), min_df=2, folds=10

>>> ngram=(1, 2), min_df=4, folds=3

>>> ngram=(1, 2), min_df=4, folds=5

>>> ngram=(1, 2), min_df=4, folds=10

>>> ngram=(1, 2), min_df=6, folds=3

>>> ngram=(1, 2), min_df=6, folds=5

>>> ngram=(1, 2), min_df=6, folds=10
Saved NAIVE_BAYES overview to experiment_naive_bayes_overview.csv
Saved misclassified samples to experiment_naive_bayes_misclassified.csv

Top 3 results for naive_bayes
  ngram_range  min_df  cv_folds   test_f1  test_acc
0      (1, 2)       6        10  0.891720   0.89375
1      (1, 2)       4         5  0.8

In [None]:
pip install -U seaborn

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Held-out metrics (columns of df_all) to visualise, one figure per metric
metrics = ["test_acc", "test_prec", "test_rec", "test_f1", "test_f1_macro", "test_f1_weighted"]

# One grouped bar chart per metric: bars are grouped by model and coloured by
# n-gram range; each bar averages over the remaining settings in df_all.
for metric in metrics:
    plt.figure(figsize=(8, 5))
    sns.barplot(
        data=df_all,
        x="model",
        y=metric,
        hue="ngram_range",  # "(1, 1)" = unigrams, "(1, 2)" = unigrams + bigrams
        errorbar="sd",      # standard deviation as error bars (replaces the deprecated ci="sd")
        palette="Set2"
    )
    plt.title(f"{metric} by Model and N-gram Range")
    plt.ylabel(metric)
    plt.xlabel("Model")
    plt.xticks(rotation=45)
    plt.legend(title="N-gram Range")
    plt.tight_layout()
    plt.show()
