# 3. Required Experiments & Ablation Studies

In [1]:
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import RandomizedSearchCV
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from scipy.stats import loguniform
from nltk.corpus import stopwords

# Add src to path
sys.path.append('..')
from src.data_loader import load_dataset
from src.preprocessing import create_text_features, clean_text, stop_words

# 1. Load Data
# load_dataset returns a pair; only its first element is used in this notebook.
df_train_full, _ = load_dataset("../data")
df_train_full = create_text_features(df_train_full)

# 2. Prepare X and y
# Every numeric column except the identifier is treated as a target label.
# NOTE(review): if create_text_features adds numeric feature columns they would
# be picked up as labels here as well -- confirm against src.preprocessing.
numeric_columns = df_train_full.select_dtypes(include=['number']).columns
label_cols = numeric_columns.tolist()
if 'Argument ID' in label_cols:
    label_cols.remove('Argument ID')

# --- CRITICAL FIX: APPLY EXACT WINNING COMBO (Lower + Stopwords + Lemma) ---
print("üßπ Applying winning preprocessing: Lowercase + Stopwords + Lemmatization...")

def apply_winning_strategy(text):
    """Apply the preprocessing combo: lowercase, stopword removal, lemmatization.

    Args:
        text: Raw document. Missing values (None / NaN / pd.NA) are treated as
            an empty document; any other non-string value is converted with
            ``str()`` before processing.

    Returns:
        Whatever ``clean_text(..., strategy='lemmatized')`` returns for the
        lowercased, stopword-filtered text.
    """
    # A single missing or non-string cell would otherwise abort the whole
    # preprocessing pass with an AttributeError on .lower().
    if not isinstance(text, str):
        text = "" if pd.isna(text) else str(text)

    # 1. Lowercase explicitly before filtering.
    #    (assumes stop_words entries are lowercase -- TODO confirm)
    lowered = text.lower()

    # 2. Remove stopwords. Tokens come from a plain whitespace split, so
    #    punctuation attached to a word is kept as part of that token.
    kept_words = [w for w in lowered.split() if w not in stop_words]

    # 3. Lemmatize: delegate to clean_text with the 'lemmatized' strategy
    #    (per the original note, that function handles the POS tagging logic).
    return clean_text(" ".join(kept_words), strategy='lemmatized')

# Run every raw document through apply_winning_strategy; the result is a plain
# Python list with one preprocessed entry per row of df_train_full.
X_train = list(map(apply_winning_strategy, df_train_full['text'].values))
# ---------------------------------------------------------------------------

# Label matrix: one row per example, one column per entry in label_cols.
y_train = df_train_full[label_cols].values

# 3. Setup CV
# Fixed seed so the fold assignment is the same on every run.
cv_settings = {"n_splits": 5, "shuffle": True, "random_state": 42}
stratified_cv = MultilabelStratifiedKFold(**cv_settings)

n_examples = len(X_train)
print(f"‚úÖ Ready to tune on {n_examples} fully preprocessed examples.")

[nltk_data] Downloading package stopwords to /home/alumno/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/alumno/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/alumno/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/alumno/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


üßπ Applying winning preprocessing: Lowercase + Stopwords + Lemmatization...
‚úÖ Ready to tune on 7289 fully preprocessed examples.


**Define search space**

In [4]:
from src.models import create_advanced_pipeline  # <--- ADD THIS IMPORT

# --- HYPERPARAMETER TUNING CONFIGURATION ---

# 1. Based on Notebook 02 Winner
best_model_type = "logreg"

# 2. Initialize Pipeline
# The ngram_range given here is only the starting value; every search space
# below also samples 'vec__ngram_range'.
pipeline = create_advanced_pipeline(model_type=best_model_type, vec_type="tfidf", ngram_range=(1, 2))

# 3. Define Parameter Distributions
# Vectorizer choices shared by every model family.
min_df_choices = [2, 3, 5]
ngram_choices = [(1, 1), (1, 2), (1, 3)]

# Both Naive Bayes variants share one space (smoothing parameter only).
naive_bayes_space = {
    'vec__min_df': min_df_choices,
    'vec__max_features': [10000, 20000, None],
    'vec__ngram_range': ngram_choices,
    'clf__estimator__alpha': loguniform(0.001, 10)  # Smoothing parameter
}

# One search space per supported model type.
param_spaces = {
    "logreg": {
        'vec__min_df': min_df_choices,
        'vec__max_features': [10000, 20000, 30000, None],
        'vec__ngram_range': ngram_choices,
        'clf__estimator__C': loguniform(0.01, 100),   # Regularization
        'clf__estimator__class_weight': ['balanced', None]
    },
    "svm": {
        'vec__min_df': min_df_choices,
        'vec__max_features': [10000, 20000, 40000],
        'vec__ngram_range': ngram_choices,
        'clf__estimator__C': loguniform(0.01, 100),   # SVM Regularization
        'clf__estimator__class_weight': ['balanced', None]
    },
    "nb": naive_bayes_space,
    "complement_nb": naive_bayes_space,
}

# Fail with a clear message instead of leaving param_dist undefined (which
# would surface later as an unrelated NameError) when the model type is unknown.
if best_model_type not in param_spaces:
    raise ValueError(
        f"No search space defined for model type {best_model_type!r}; "
        f"expected one of {sorted(param_spaces)}."
    )
param_dist = param_spaces[best_model_type]

print(f"‚úÖ Configuration set for: {best_model_type.upper()}")
print(f"üîç Search Space Size: {len(param_dist)} parameter types.")

‚úÖ Configuration set for: LOGREG
üîç Search Space Size: 5 parameter types.


**Run randomized search**

In [5]:
# 3. Initialize Randomized Search
# n_iter=20 -> 20 randomly sampled configurations, each scored with the
# stratified_cv splitter on macro-averaged F1.
search_options = {
    "param_distributions": param_dist,
    "n_iter": 20,
    "cv": stratified_cv,
    "scoring": 'f1_macro',
    "n_jobs": -1,        # Use all CPU cores
    "verbose": 1,
    "random_state": 42,  # Reproducibility
}
random_search = RandomizedSearchCV(pipeline, **search_options)

# 4. Run the Search
print(f"üöÄ Starting Hyperparameter Optimization for {best_model_type}...")
random_search.fit(X_train, y_train)

# Report the best cross-validated score and the configuration that produced it.
best_f1 = random_search.best_score_
print("\n‚úÖ Optimization Complete.")
print(f"üèÜ Best Score (F1-Macro): {best_f1:.4f}")
print("Best Parameters:")
for name in random_search.best_params_:
    print(f"  - {name}: {random_search.best_params_[name]}")

üöÄ Starting Hyperparameter Optimization for logreg...
Fitting 5 folds for each of 20 candidates, totalling 100 fits

‚úÖ Optimization Complete.
üèÜ Best Score (F1-Macro): 0.4826
Best Parameters:
  - clf__estimator__C: 0.6870614282613301
  - clf__estimator__class_weight: balanced
  - vec__max_features: None
  - vec__min_df: 5
  - vec__ngram_range: (1, 2)


**Visualization**

**Export winner**

In [6]:
# --- EXPORT THE WINNER ---

best_params = random_search.best_params_
separator = "-" * 50

# Collect the report line by line, then emit it with a single print call.
report_lines = [
    "üìã FINAL CONFIGURATION FOR NOTEBOOK 04:",
    separator,
    f"Model Type:        {best_model_type}",
    f"Min DF:            {best_params.get('vec__min_df')}",
    f"Max Features:      {best_params.get('vec__max_features')}",
    f"N-Grams:           {best_params.get('vec__ngram_range')}",
]

# Classifier-specific entries appear only when the search actually tuned them.
if 'clf__estimator__C' in best_params:
    report_lines.append(f"C (Regularization): {best_params['clf__estimator__C']:.4f}")
if 'clf__estimator__alpha' in best_params:
    report_lines.append(f"Alpha (Smoothing):  {best_params['clf__estimator__alpha']:.4f}")
if 'clf__estimator__class_weight' in best_params:
    report_lines.append(f"Class Weight:      {best_params['clf__estimator__class_weight']}")

report_lines.append(separator)
print("\n".join(report_lines))

üìã FINAL CONFIGURATION FOR NOTEBOOK 04:
--------------------------------------------------
Model Type:        logreg
Min DF:            5
Max Features:      None
N-Grams:           (1, 2)
C (Regularization): 0.6871
Class Weight:      balanced
--------------------------------------------------
