# 1. Install and Import Libraries

In [None]:
# Use the %pip magic (not !pip) so the package is installed into the
# environment of the running kernel.
%pip install optuna

In [None]:
import os
import json
from pathlib import Path
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, f1_score
import seaborn as sns
import matplotlib.pyplot as plt
import joblib
from google.colab import drive

import optuna
from pathlib import Path
import warnings
warnings.filterwarnings("ignore")

# 2. Environment Settings and Initialization

## 2.1. Set Seed for Reproducibility

In [None]:
# Single seed for the whole notebook: it seeds NumPy's global RNG here and is
# passed as `seed=` to the tuning / training helpers called in section 4.
SEED = 42
np.random.seed(SEED)

## 2.3. Manage Google Drive

### 2.3.1. Mount Google Drive

In [None]:
# Mount Google Drive; the dataset and model folders used below live under
# /content/drive/MyDrive.
drive.mount('/content/drive')

### 2.3.2. Set Folder Path

In [None]:
# Datasets: folder containing the train / validation / test CSV files.
# Keep the trailing slash: file names are appended to this string directly
# when the CSVs are read.
READ_PATH = '/content/drive/MyDrive/Bach_Thesis/Dataset/'

# Models: root folder under which the Optuna studies and the final model
# artefacts are written.
SAVE_ROOT = "/content/drive/MyDrive/Bach_Thesis/Models/SA_Optuna"

## 2.4. Initialize Global Parameters

In [None]:
# Class ids and their display names. The two lists are passed together to
# classification_report / used as confusion-matrix tick labels, so they must
# stay in the same order (LABELS[i] is shown as TARGET_NAMES[i]).
# NOTE(review): presumably these are the values of the `sentiment` column - confirm.
LABELS = [0,1]
TARGET_NAMES = ['negative','positive']

## 2.5. Initialize Datasets

In [None]:
# Three variants of the training split, all stored as CSV under READ_PATH.
df_train_ori = pd.read_csv(READ_PATH + "Train2lab.csv")
df_train_ros = pd.read_csv(READ_PATH + "Train_ROS2lab.csv")
df_train_ros_ncl = pd.read_csv(READ_PATH + "Train_ROS_NCL2lab.csv")

In [None]:
# Validation and test splits, shared by every training variant.
df_val = pd.read_csv(READ_PATH + "Validation2lab.csv")
df_test = pd.read_csv(READ_PATH + "Test2lab.csv")

# 3. Helper Functions

In [None]:
def create_classification_report(y_true, y_pred):
    """Return scikit-learn's classification report as a flat DataFrame.

    The report is computed for the classes in LABELS (named by TARGET_NAMES)
    with ``zero_division=0``. Each report entry becomes one row whose name is
    stored in a ``label`` column; float columns are rounded to 4 decimals.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.

    Returns:
        pandas.DataFrame with a ``label`` column followed by the metric columns.
    """
    report_dict = classification_report(
        y_true,
        y_pred,
        labels=LABELS,
        target_names=TARGET_NAMES,
        zero_division=0,
        output_dict=True,
    )

    # Entries are the dict keys -> make them rows, then expose them as `label`.
    report_df = pd.DataFrame(report_dict).T.reset_index()
    report_df = report_df.rename(columns={'index': 'label'})

    # Round only the float columns; everything else is left untouched.
    float_columns = report_df.select_dtypes(include=['float']).columns
    return report_df.round({column: 4 for column in float_columns})

In [None]:
def create_confusion_matrix(y_true, y_pred, save_path_png):
    """Plot the confusion matrix as a heatmap and save it as a PNG.

    The matrix is computed for the classes in LABELS and annotated with
    TARGET_NAMES on both axes. Nothing is displayed inline and nothing is
    returned; the figure is written to ``save_path_png`` at 150 dpi.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.
        save_path_png: Destination path of the image file.
    """
    cm = confusion_matrix(y_true, y_pred, labels=LABELS)

    # Work on an explicit figure/axes instead of pyplot's implicit "current
    # figure", and always close it - even if plotting or saving fails - so
    # repeated calls do not accumulate open figures.
    fig, ax = plt.subplots(figsize=(6, 5))
    try:
        sns.heatmap(
            cm,
            annot=True,
            fmt='d',
            cmap='Blues',
            xticklabels=TARGET_NAMES,
            yticklabels=TARGET_NAMES,
            ax=ax,
        )
        ax.set_xlabel('Predicted')
        ax.set_ylabel('True')
        ax.set_title('Confusion Matrix')
        fig.tight_layout()
        fig.savefig(save_path_png, dpi=150)
    finally:
        plt.close(fig)

In [None]:
def objective_svm(trial, df_train, df_val, seed):
    """Optuna objective: fit one TF-IDF + SVC model and score it on validation.

    Search space:
        * ``C``      - log-uniform in [1e-3, 1e2]
        * ``kernel`` - one of linear / rbf / poly / sigmoid
        * ``gamma``  - 'scale' or 'auto'; only sampled for non-linear kernels,
          because the linear kernel does not use it. Trials with a linear
          kernel therefore carry no ``gamma`` entry in their params.

    Args:
        trial: The ``optuna.trial.Trial`` supplying hyper-parameter values.
        df_train: Training frame with ``cleaned_content`` (text) and
            ``sentiment`` (label) columns.
        df_val: Validation frame with the same two columns.
        seed: Forwarded to ``SVC(random_state=...)``.

    Returns:
        Weighted F1 score of the fitted model on ``df_val``.
    """
    C = trial.suggest_float('C', 1e-3, 1e2, log=True)
    kernel = trial.suggest_categorical('kernel', ['linear', 'rbf', 'poly', 'sigmoid'])

    if kernel == 'linear':
        # gamma has no effect here; sampling it would only spend trials on
        # configurations that produce the same model.
        gamma = 'scale'
    else:
        gamma = trial.suggest_categorical('gamma', ['scale', 'auto'])

    # The vocabulary / IDF weights are learned from the training text only;
    # the validation text is merely transformed.
    vectorizer = TfidfVectorizer(
        ngram_range=(1,2),
        sublinear_tf=True
    )
    X_train = vectorizer.fit_transform(df_train["cleaned_content"])
    y_train = df_train["sentiment"]
    X_val = vectorizer.transform(df_val["cleaned_content"])
    y_val = df_val["sentiment"]

    # probability=True is intentionally NOT set: only hard predictions are
    # scored, and probability calibration adds an internal cross-validation
    # to every fit without changing what predict() returns.
    model = SVC(
        C=C,
        kernel=kernel,
        gamma=gamma,
        random_state=seed
    )
    model.fit(X_train, y_train)
    preds = model.predict(X_val)

    return f1_score(y_val, preds, average="weighted", zero_division=0)

In [None]:
def run_optuna_svm(df_train, df_val, seed, n_trials, save_root, dataset_name="dataset"):
    """Run (or resume) the Optuna hyper-parameter search for the SVM.

    The study is persisted in ``<save_root>/SVM_2/<dataset_name>/optuna_study``
    (SQLite file ``study.db`` plus ``study_name.txt``), so an interrupted
    session can be continued by calling this function again.

    Args:
        df_train: Training frame passed to ``objective_svm``.
        df_val: Validation frame passed to ``objective_svm``.
        seed: Seed for the SVC and, when it is an integer, for the sampler.
        n_trials: Total number of *completed* trials the study should contain.
            Trials already stored in the study count towards this budget, so
            re-running the cell finishes the remaining trials instead of
            adding ``n_trials`` more. ``None`` keeps Optuna's "no limit"
            behaviour.
        save_root: Root output folder.
        dataset_name: Sub-folder / study-name suffix identifying the dataset.

    Returns:
        dict with the best hyper-parameters found so far.

    Raises:
        ValueError: From Optuna, if the study has no completed trial.
    """
    experiment_dir = Path(save_root) / "SVM_2" / dataset_name
    study_dir = experiment_dir / "optuna_study"
    study_dir.mkdir(parents=True, exist_ok=True)

    storage_path = study_dir / "study.db"
    study_name_path = study_dir / "study_name.txt"
    storage_uri = f"sqlite:///{storage_path}"

    if storage_path.exists() and study_name_path.exists():
        study_name = study_name_path.read_text().strip()
        study = optuna.load_study(study_name=study_name, storage=storage_uri)
        print(f"Resuming Optuna study: {study_name}")
    else:
        study_name = "svm_opt_" + dataset_name
        study_name_path.write_text(study_name)
        print(f"Creating new Optuna study: {study_name}")
        study = optuna.create_study(
            study_name=study_name,
            direction="maximize",
            storage=storage_uri,
            load_if_exists=True
        )

    recorded_trials = study.get_trials(deepcopy=False)
    n_completed = sum(
        t.state == optuna.trial.TrialState.COMPLETE for t in recorded_trials
    )

    # Seed the sampler so a fresh search is reproducible. The seed is offset by
    # the number of trials already stored: re-using the identical seed on
    # resume would replay the same random stream and could re-propose the
    # configurations of the earlier randomly sampled start-up trials.
    if isinstance(seed, (int, np.integer)):
        sampler_seed = int(seed) + len(recorded_trials)
    else:
        sampler_seed = None
    study = optuna.load_study(
        study_name=study_name,
        storage=storage_uri,
        sampler=optuna.samplers.TPESampler(seed=sampler_seed)
    )

    # Only run what is missing from the budget.
    remaining = None if n_trials is None else max(n_trials - n_completed, 0)
    if remaining is None or remaining > 0:
        study.optimize(
            lambda trial: objective_svm(trial, df_train, df_val, seed),
            n_trials=remaining
        )
    else:
        print(f"Study already has {n_completed} completed trials; skipping optimization.")

    print("Best F1:", study.best_value)
    print("Best Params:", study.best_params)
    return study.best_params

In [None]:
def svm_pipeline(df_train, df_val, df_test, best_hp, seed, save_root, dataset_name="dataset"):
    """Train the final TF-IDF + SVC model and persist all evaluation artefacts.

    Everything is written to ``<save_root>/SVM_2/<dataset_name>/final_model``:
    per-split classification reports (CSV), per-split confusion matrices
    (PNG), ``summary.json`` / ``summary_metrics.csv``, the fitted model
    (``svm_model.pkl``) and the fitted vectorizer (``tfidf.pkl``).

    Args:
        df_train: Training frame with ``cleaned_content`` and ``sentiment``.
        df_val: Validation frame with the same columns.
        df_test: Test frame with the same columns.
        best_hp: Hyper-parameters; must contain ``C`` and ``kernel``,
            ``gamma`` is optional and defaults to ``"scale"``.
        seed: Forwarded to ``SVC(random_state=...)``.
        save_root: Root output folder.
        dataset_name: Sub-folder name identifying the training variant.

    Returns:
        dict with keys ``summary``, ``train_report``, ``validation_report``,
        ``test_report`` and ``model_dir``.
    """
    experiment_dir = Path(save_root) / "SVM_2" / dataset_name
    final_dir = experiment_dir / "final_model"
    final_dir.mkdir(parents=True, exist_ok=True)
    print("Saving outputs to:", final_dir)

    # Insertion order (train, val, test) drives the order of every artefact
    # and of the keys in the summary below.
    frames = {"train": df_train, "val": df_val, "test": df_test}

    # Fit the vectorizer on the training text only; val/test are transformed.
    vectorizer = TfidfVectorizer(
        ngram_range=(1,2),
        sublinear_tf=True
    )
    features = {
        "train": vectorizer.fit_transform(df_train["cleaned_content"]),
        "val": vectorizer.transform(df_val["cleaned_content"]),
        "test": vectorizer.transform(df_test["cleaned_content"]),
    }
    targets = {split: frame["sentiment"] for split, frame in frames.items()}

    # NOTE(review): probability=True is kept for the persisted model,
    # presumably so downstream code can call predict_proba - confirm.
    model = SVC(
        C=best_hp["C"],
        kernel=best_hp["kernel"],
        gamma=best_hp.get("gamma", "scale"),
        random_state=seed,
        probability=True
    )
    model.fit(features["train"], targets["train"])

    preds = {split: model.predict(X) for split, X in features.items()}

    reports = {}
    for split in frames:
        reports[split] = create_classification_report(targets[split], preds[split])
        reports[split].to_csv(final_dir / f"classification_report_{split}.csv", index=False)
        create_confusion_matrix(targets[split], preds[split], final_dir / f"cm_{split}.png")

    summary = {"model": "SVM", "dataset": dataset_name}
    summary.update({f"n_{split}": len(frame) for split, frame in frames.items()})
    summary.update({
        # zero_division=0 matches the tuning objective and the reports.
        f"{split}_weighted_f1": round(
            float(f1_score(targets[split], preds[split], average='weighted', zero_division=0)), 4
        )
        for split in frames
    })
    summary.update({
        f"{split}_accuracy": round(float((preds[split] == targets[split]).mean()), 4)
        for split in frames
    })
    summary["best_hp"] = best_hp

    with open(final_dir / "summary.json", "w", encoding="utf-8") as f:
        json.dump(summary, f, indent=2)

    # Flattens the nested best_hp dict into `best_hp.<name>` columns.
    pd.json_normalize(summary).to_csv(final_dir / "summary_metrics.csv", index=False)

    joblib.dump(model, final_dir / "svm_model.pkl")
    joblib.dump(vectorizer, final_dir / "tfidf.pkl")

    print("✅ SVM pipeline completed.")
    print("Model saved to:", final_dir)

    return {
        "summary": summary,
        "train_report": reports["train"],
        "validation_report": reports["val"],
        "test_report": reports["test"],
        "model_dir": str(final_dir)
    }

# 4. Run Models

In [None]:
def clean_text_column(df, text_col="cleaned_content"):
    """Return a copy of ``df`` whose text column is free of missing values.

    Missing entries in ``text_col`` are replaced by the empty string and every
    value is cast to ``str``. The input frame is not modified, so re-running
    the cell (or passing a slice of another frame) has no side effects on the
    caller's data.

    Args:
        df: Frame containing ``text_col``.
        text_col: Name of the text column to normalise.

    Returns:
        A new pandas.DataFrame with the cleaned column.

    Raises:
        KeyError: If ``text_col`` is not a column of ``df``.
    """
    cleaned_text = df[text_col].fillna("").astype(str)
    return df.assign(**{text_col: cleaned_text})

# Normalise the text column of every split before vectorisation.
df_train_ori, df_train_ros, df_train_ros_ncl, df_val, df_test = [
    clean_text_column(frame, "cleaned_content")
    for frame in (df_train_ori, df_train_ros, df_train_ros_ncl, df_val, df_test)
]

## Original training set

In [None]:
# Hyper-parameter search on the original training set, scored on the validation set.
best_hp_ori = run_optuna_svm(
    df_train_ori, df_val,
    seed=SEED, n_trials=20,
    save_root=SAVE_ROOT, dataset_name="original",
)


In [None]:
# Final model for the original training set, using the tuned hyper-parameters.
res_ori = svm_pipeline(
    df_train_ori, df_val, df_test,
    best_hp=best_hp_ori, seed=SEED,
    save_root=SAVE_ROOT, dataset_name="original",
)

## ROS training set

In [None]:
# Hyper-parameter search on the ROS training set, scored on the validation set.
best_hp_ros = run_optuna_svm(
    df_train_ros, df_val,
    seed=SEED, n_trials=20,
    save_root=SAVE_ROOT, dataset_name="ROS",
)


In [None]:
# Final model for the ROS training set, using the tuned hyper-parameters.
res_ros = svm_pipeline(
    df_train_ros, df_val, df_test,
    best_hp=best_hp_ros, seed=SEED,
    save_root=SAVE_ROOT, dataset_name="ROS",
)

## ROS-NCL training set

In [None]:
# Hyper-parameter search on the ROS-NCL training set, scored on the validation set.
best_hp_ros_ncl = run_optuna_svm(
    df_train_ros_ncl, df_val,
    seed=SEED, n_trials=20,
    save_root=SAVE_ROOT, dataset_name="ROS-NCL",
)

In [None]:
# Final model for the ROS-NCL training set, using the tuned hyper-parameters.
res_ros_ncl = svm_pipeline(
    df_train_ros_ncl, df_val, df_test,
    best_hp=best_hp_ros_ncl, seed=SEED,
    save_root=SAVE_ROOT, dataset_name="ROS-NCL",
)