In [1]:
import sys
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from sklearn.feature_extraction.text import CountVectorizer

# Add src to path
sys.path.append('..')
from src.data_loader import load_dataset
from src.preprocessing import create_text_features
from src.models import create_baseline_pipeline

# 1. Load and Prepare Data
# The dataset lives in the parent directory. load_dataset returns two values;
# only the first (used as the training frame) is kept here.
df_train_full, _ = load_dataset("../data")
df_train_full = create_text_features(df_train_full)

# 2. Define X and y (Features and Labels)
# Every numeric column except the identifier is treated as a label.
# NOTE(review): if create_text_features adds numeric columns, they would be
# picked up as labels here as well — confirm against src.preprocessing.
label_cols = [
    col
    for col in df_train_full.select_dtypes(include=['number']).columns
    if col != 'Argument ID'
]
# Fail early with a clear message instead of deep inside the CV splitter.
assert label_cols, "No numeric label columns found in the training data."

X_train = df_train_full['text'].values
y_train = df_train_full[label_cols].values

# 3. Setup Evaluation Strategy
# Defined once so that every experiment below is scored on the exact same
# folds (fixed random_state).
stratified_cv = MultilabelStratifiedKFold(n_splits=5, shuffle=True, random_state=42)

print(f"✅ Data Loaded. Training on {len(X_train)} examples.")

[nltk_data] Downloading package stopwords to /home/alumno/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/alumno/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/alumno/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/alumno/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


✅ Data Loaded. Training on 7289 examples.


**SPARSE REPRESENTATION**

We compare TF-IDF approach with CountVectors

In [2]:
# --- DIAGNOSTIC: Vocabulary Size Check ---
# Measure how large the (1, 3) word n-gram vocabulary is for several min_df
# thresholds, and what share of it survives a 20,000-feature cap. The numbers
# back up the max_features / min_df settings used in the experiments below.

table_header = f"{'Min DF':<10} | {'N-grams':<10} | {'Vocabulary Size':<15} | {'Coverage (vs 20k)'}"
print(table_header)
print("-" * 65)

for min_df in (2, 3, 5):
    # (1, 3) is the widest n-gram range we test, hence the largest vocabulary.
    fitted_vectorizer = CountVectorizer(ngram_range=(1, 3), min_df=min_df).fit(X_train)
    vocab_size = len(fitted_vectorizer.vocabulary_)

    # Share of the vocabulary retained when truncating to 20,000 features.
    coverage = (min(vocab_size, 20000) / vocab_size) * 100

    print(f"{min_df:<10} | {'(1, 3)':<10} | {vocab_size:<15} | {coverage:.1f}% kept")

Min DF     | N-grams    | Vocabulary Size | Coverage (vs 20k)
-----------------------------------------------------------------
2          | (1, 3)     | 40560           | 49.3% kept
3          | (1, 3)     | 20708           | 96.6% kept
5          | (1, 3)     | 11174           | 100.0% kept


**Experiment with whole words (fixed length)**

In [3]:
# 3. Define Experiments
# Each entry is (label, vectorizer type, word n-gram range). The vectorizer
# type string is passed verbatim to create_baseline_pipeline.
experiments = [
    # Label       Type      N-gram Range
    ("Baseline",  "Count",  (1, 1)),
    ("Baseline",  "tfidf",  (1, 1)),
    ("Bigrams",   "tfidf",  (1, 2)),
    ("Trigrams",  "tfidf",  (1, 3)),
    ("Bi-Only",   "tfidf",  (2, 2)),
]

print(f"{'Method':<10} | {'Type':<8} | {'N-Grams':<10} | {'Mean F1-Macro':<15} | {'Std Dev':<10}")
print("-" * 65)

# One record per experiment; `best_result` (picked below) is reused by the
# dense-embedding comparison later in the notebook.
results = []

for name, vec_type, ngram in experiments:
    # Build Pipeline using modular function
    pipeline = create_baseline_pipeline(vec_type=vec_type, ngram_range=ngram)

    # Run CV on the shared folds so scores are comparable across experiments
    scores = cross_val_score(pipeline, X_train, y_train, cv=stratified_cv, scoring='f1_macro', n_jobs=-1)

    # Record and Print (mean / std computed once and reused)
    mean_score = scores.mean()
    std_score = scores.std()
    results.append({
        "Method": name,
        "Type": vec_type,
        "N-grams": str(ngram),
        "F1": mean_score,
        "Std": std_score
    })

    print(f"{name:<10} | {vec_type:<8} | {str(ngram):<10} | {mean_score:.4f}          | {std_score:.4f}")

print("-" * 65)

# Find Winner: highest mean F1-macro across folds
best_result = max(results, key=lambda x: x['F1'])
print(f"🏆 BEST CONFIGURATION: {best_result['Type']} {best_result['N-grams']} ({best_result['F1']:.4f})")

Method     | Type     | N-Grams    | Mean F1-Macro   | Std Dev   
-----------------------------------------------------------------
Baseline   | Count    | (1, 1)     | 0.4569          | 0.0031
Baseline   | tfidf    | (1, 1)     | 0.4720          | 0.0018
Bigrams    | tfidf    | (1, 2)     | 0.4947          | 0.0019
Trigrams   | tfidf    | (1, 3)     | 0.5002          | 0.0020
Bi-Only    | tfidf    | (2, 2)     | 0.4834          | 0.0021
-----------------------------------------------------------------
🏆 BEST CONFIGURATION: tfidf (1, 3) (0.5002)


In [4]:
# --- EXPERIMENT 2: Character N-Grams (Robustness Check) ---
# Character n-grams ignore word boundaries, helping with typos and morphology.

# Each entry is (label, vectorizer type, character n-gram range).
experiments_char = [
    # Label         Type      N-gram Range   Why?
    ("Char (3)",    "tfidf",  (3, 3)),       # Captures syllables/stems
    ("Char (4)",    "tfidf",  (4, 4)),       # Captures short words
    ("Char (3-5)",  "tfidf",  (3, 5)),       # The "Sweet Spot" (captures all of the above)
]

print(f"{'Method':<12} | {'Type':<8} | {'N-Grams':<10} | {'Mean F1-Macro':<15} | {'Std Dev':<10}")
print("-" * 65)

results_char = []

for name, vec_type, ngram in experiments_char:
    # analyzer='char' switches the vectorizer from word to character n-grams.
    # vec_type must be "tfidf" (lowercase, no hyphen).
    pipeline = create_baseline_pipeline(vec_type=vec_type, ngram_range=ngram, analyzer='char')

    # Run CV on the same folds as the word-level experiments
    scores = cross_val_score(pipeline, X_train, y_train, cv=stratified_cv, scoring='f1_macro', n_jobs=-1)

    # Record (same schema as the word n-gram `results` records, including "Std")
    mean_score = scores.mean()
    std_score = scores.std()
    results_char.append({
        "Method": name,
        "Type": vec_type,
        "N-grams": str(ngram),
        "F1": mean_score,
        "Std": std_score
    })

    print(f"{name:<12} | {vec_type:<8} | {str(ngram):<10} | {mean_score:.4f}          | {std_score:.4f}")

print("-" * 65)
best_char = max(results_char, key=lambda x: x['F1'])
print(f"🏆 BEST CHAR CONFIG: {best_char['Method']} {best_char['N-grams']} ({best_char['F1']:.4f})")

Method       | Type     | N-Grams    | Mean F1-Macro   | Std Dev   
-----------------------------------------------------------------
Char (3)     | tfidf    | (3, 3)     | 0.4682          | 0.0021
Char (4)     | tfidf    | (4, 4)     | 0.4880          | 0.0015
Char (3-5)   | tfidf    | (3, 5)     | 0.4853          | 0.0016
-----------------------------------------------------------------
🏆 BEST CHAR CONFIG: Char (4) (4, 4) (0.4880)


*Sparse feature representation methods analysis:*

- Our experiments revealed that TF-IDF outperformed CountVectors (Raw Frequency): 0.4720 vs. 0.4569 F1-macro on unigrams. Extending TF-IDF to word n-grams improved it further, up to 0.5002 for the (1, 3) range. This suggests that for short argumentation texts, specific value-laden keywords and phrases (e.g., 'freedom', 'security') are the most predictive features, and that they benefit from being weighted rather than merely counted.

- TF-IDF down-weights terms that occur in many documents, and in this domain that re-weighting helps rather than hurts. BM25 is mathematically an extension of TF-IDF (designed to further penalize term saturation and normalize length), so it builds on the same weighting idea; it was not evaluated in these experiments.

- Consequently, because the TF-IDF model with word n-grams already gives the best sparse result (character n-grams peaked at 0.4880), we did not pursue more complex frequency dampening (like that in BM25) for this specific task. We therefore selected TF-IDF (N-gram 1,3) as our optimal Sparse baseline.

**Dense methods**

We now test if pre-trained **Dense Embeddings** can beat our N-gram baseline.
Since we are using a simple classifier (Logistic Regression), we cannot feed it a variable-length sequence of words. Instead, we must **average** the vectors of all words in a sentence to get a single fixed-length vector (often called "Bag of Embeddings").

* **Hypothesis:** This should capture "meaning" (semantics) better than just counting words.
* **Risk:** Averaging destroys word order (syntax) and might dilute specific keyword triggers.

In [5]:
from sklearn.pipeline import Pipeline
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
import gensim.downloader as api
from src.models import MeanEmbeddingVectorizer

# --- EXPERIMENT 3: Dense Embeddings (GloVe vs Word2Vec) ---

# 1. Load Models once (Saves massive time/RAM)
print("⏳ Loading GloVe-100 (Small)...")
glove_model = api.load("glove-wiki-gigaword-100")

print("⏳ Loading Word2Vec-300 (Large - ~1.6GB)... this may take a minute...")
w2v_model = api.load("word2vec-google-news-300")

# 2. Define Experiments: (display name, loaded embedding model)
dense_experiments = [
    ("GloVe (100d)",    glove_model),
    ("Word2Vec (300d)", w2v_model)
]

print("-" * 65)
print(f"{'Model':<20} | {'Dim':<10} | {'Mean F1-Macro':<15} | {'Std Dev':<10}")
print("-" * 65)

# Maps display name -> mean F1-macro across folds
results_dense = {}

for name, model in dense_experiments:
    # 3. Build Pipeline: averaged word vectors -> one-vs-rest logistic regression
    pipeline = Pipeline([
        ('vec', MeanEmbeddingVectorizer(word2vec=model)),
        ('clf', OneVsRestClassifier(LogisticRegression(solver='liblinear',
                                                       class_weight='balanced',
                                                       random_state=42)))
    ])

    # 4. Run Cross-Validation
    # n_jobs=None keeps CV in this process — presumably to avoid copying the
    # large embedding model into every worker; confirm before parallelising.
    scores = cross_val_score(pipeline, X_train, y_train, cv=stratified_cv, scoring='f1_macro', n_jobs=None)

    # 5. Store Results
    mean_score = scores.mean()
    results_dense[name] = mean_score

    print(f"{name:<20} | {model.vector_size:<10} | {mean_score:.4f}          | {scores.std():.4f}")

print("-" * 65)

# 6. Final Comparison (Sparse vs Dense)
best_dense_name = max(results_dense, key=results_dense.get)
best_dense_score = results_dense[best_dense_name]

print(f"🏆 Best Dense Model: {best_dense_name} ({best_dense_score:.4f})")

# Compare with previous best (`best_result` comes from the word n-gram cell)
if 'best_result' in locals():
    best_sparse_score = best_result['F1']
    # Name the sparse winner from its recorded type instead of assuming TF-IDF.
    sparse_type = str(best_result['Type'])
    sparse_label = {"tfidf": "TF-IDF", "count": "CountVectors"}.get(sparse_type.lower(), sparse_type)
    print(f"🆚 Sparse Baseline: {best_sparse_score:.4f}")

    if best_dense_score > best_sparse_score:
        print("🚀 RESULT: Dense Embeddings outperformed Sparse Features!")
    else:
        print(f"📉 RESULT: Sparse Features ({sparse_label}) are SUPERIOR.")
else:
    # Make the skipped comparison visible instead of silently printing nothing.
    print("Sparse baseline not available: run the word n-gram experiment cell first to compare.")

⏳ Loading GloVe-100 (Small)...
⏳ Loading Word2Vec-300 (Large - ~1.6GB)... this may take a minute...
-----------------------------------------------------------------
Model                | Dim        | Mean F1-Macro   | Std Dev   
-----------------------------------------------------------------
GloVe (100d)         | 100        | 0.4246          | 0.0019
Word2Vec (300d)      | 300        | 0.4508          | 0.0024
-----------------------------------------------------------------
🏆 Best Dense Model: Word2Vec (300d) (0.4508)
🆚 Sparse Baseline: 0.5002
📉 RESULT: Sparse Features (TF-IDF) are SUPERIOR.


The experiment confirms that **Sparse Features (TF-IDF/N-grams)** (F1: ~0.50) are significantly superior to **Averaged Embeddings** (F1: ~0.45) for this specific task.

**Why did this happen?**
1.  **The "Muddy" Average:** When you average the vectors for "school", "cheating", and "bad", you get a generic vector that looks vaguely like "negative education". You lose the sharp, distinct signal of the word "cheating", which we proved earlier is a massive predictor for *Benevolence*.
2.  **Values are Keyword-Driven:** Human values are often triggered by specific, high-impact words (e.g., "God" $\rightarrow$ *Tradition*, "Freedom" $\rightarrow$ *Self-direction*). TF-IDF isolates these triggers perfectly; averaging blends them into the background noise.

**Conclusion:** To beat the N-gram baseline, "averaging" is not enough. We need a model that can read the **sequence** of words without crushing them together. This justifies the move to **Transformers (BERT)**.

In [6]:
from src.preprocessing import clean_text
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from src.preprocessing import clean_text


# --- EXPERIMENT 4: Preprocessing Ablation (Section 4.2) ---
# We test if cleaning helps. We use the BEST model type from previous steps
# This takes time because we preprocess the whole dataset 5 times
# (once per strategy).

# Strategy names are passed verbatim to clean_text(strategy=...).
prep_strategies = [
    "raw",
    "lower",
    "no_punct",
    "no_stopwords",
    "lemmatized"
]

print(f"{'Strategy':<15} | {'Mean F1-Macro':<15} | {'Std Dev':<10}")
print("-" * 50)

# Maps strategy name -> mean F1-macro across folds
results_prep = {}

for strategy in prep_strategies:
    # 1. Apply Preprocessing to Training Data
    # Done explicitly here so the vectorizer sees the cleaned text.
    # The "\r" lets the result line below overwrite this progress message.
    print(f"Processing: {strategy}...", end="\r")
    X_train_clean = [clean_text(t, strategy=strategy) for t in X_train]

    # 2. Build Pipeline (Using standard TF-IDF (1,3) as baseline)
    # The vectorizer's own lowercasing is disabled only for 'raw', so that
    # strategy really sees the original casing.
    do_lower = strategy != 'raw'

    pipeline = Pipeline([
        ('vec', TfidfVectorizer(ngram_range=(1, 3), min_df=3, max_features=20000, lowercase=do_lower)),
        ('clf', OneVsRestClassifier(LogisticRegression(solver='liblinear', class_weight='balanced', random_state=42)))
    ])

    # 3. Run CV on the shared folds
    scores = cross_val_score(pipeline, X_train_clean, y_train, cv=stratified_cv, scoring='f1_macro', n_jobs=-1)

    # 4. Store
    mean_score = scores.mean()
    results_prep[strategy] = mean_score
    print(f"{strategy:<15} | {mean_score:.4f}          | {scores.std():.4f}")

print("-" * 50)
best_prep = max(results_prep, key=results_prep.get)
print(f"🏆 BEST PREPROCESSING: {best_prep} ({results_prep[best_prep]:.4f})")

Strategy        | Mean F1-Macro   | Std Dev   
--------------------------------------------------
raw             | 0.4998          | 0.0029
lower           | 0.5002          | 0.0020
no_punct        | 0.4998          | 0.0023
no_stopwords    | 0.4945          | 0.0029
lemmatized      | 0.5023          | 0.0035
--------------------------------------------------
🏆 BEST PREPROCESSING: lemmatized (0.5023)


In [7]:
from src.models import create_advanced_pipeline

# --- EXPERIMENT 5: Model Selection (Section 3.3) ---
# We compare classical text classifiers:
# 1. Logistic Regression (Baseline)
# 2. SVM (LinearSVC) - Often best for high-dimensional sparse text
# 3. Naive Bayes - Fast, good baseline (Multinomial and Complement variants)

# We use the best feature settings found so far (TF-IDF + Trigrams)
# Each entry is (display name, model_type code for create_advanced_pipeline).
models_to_test = [
    ("Logistic Regression", "logreg"),
    ("Linear SVM",          "svm"),
    ("Multinomial NB",      "nb"),
    ("Complement NB",       "complement_nb")
]

print(f"{'Model':<20} | {'Mean F1-Macro':<15} | {'Std Dev':<10}")
print("-" * 50)

# Maps display name -> mean F1-macro across folds
results_models = {}

for display_name, model_type in models_to_test:
    # Create pipeline with our best features (TF-IDF + Trigrams)
    pipeline = create_advanced_pipeline(model_type=model_type, vec_type="tfidf", ngram_range=(1, 3))

    # Run CV
    # SVM might be slower than NB
    scores = cross_val_score(pipeline, X_train, y_train, cv=stratified_cv, scoring='f1_macro', n_jobs=-1)

    # Store Results
    mean_score = scores.mean()
    results_models[display_name] = mean_score

    print(f"{display_name:<20} | {mean_score:.4f}          | {scores.std():.4f}")

print("-" * 50)

# Identify the Winner and look up its model_type code by display name
best_model_name = max(results_models, key=results_models.get)
best_model_code = dict(models_to_test)[best_model_name]

print(f"🏆 BEST MODEL ARCHITECTURE: {best_model_name} ({results_models[best_model_name]:.4f})")
print(f"👉 Use '{best_model_code}' in Notebook 03 for tuning.")

Model                | Mean F1-Macro   | Std Dev   
--------------------------------------------------
Logistic Regression  | 0.5002          | 0.0020
Linear SVM           | 0.4968          | 0.0025
Multinomial NB       | 0.4647          | 0.0036
Complement NB        | 0.4758          | 0.0023
--------------------------------------------------
🏆 BEST MODEL ARCHITECTURE: Logistic Regression (0.5002)
👉 Use 'logreg' in Notebook 03 for tuning.
