# In [None]:
# 04_baseline_models

# 1. Setup & Imports
import sys
from pathlib import Path

# Treat the parent of the current working directory as the project root
# (assumes this is run from a sub-folder of the project - TODO confirm) and
# make it importable so that `src.*` modules can be resolved.
PROJECT_ROOT = Path.cwd().parent
if str(PROJECT_ROOT) not in sys.path:
    # Guarded so that re-running this cell does not stack duplicate entries.
    sys.path.append(str(PROJECT_ROOT))
print("Project root:", PROJECT_ROOT)

import pandas as pd
import numpy as np

from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier

from src.config import PROCESSED_DIR



# Load BIO-POL data (Notebook 03 output)
# Both splits are read from parquet files located directly under PROCESSED_DIR.
train_parquet = PROCESSED_DIR / "bio_pol_train.parquet"
val_parquet = PROCESSED_DIR / "bio_pol_val.parquet"

train_df = pd.read_parquet(train_parquet)
val_df = pd.read_parquet(val_parquet)

# Report the number of rows in each split.
print("Train size:", len(train_df))
print("Validation size:", len(val_df))



# Flatten token-level data
def flatten_token_data(df):
    """Flatten sentence-level rows into parallel token and label lists.

    Each row's ``sentence`` is split on whitespace and paired positionally
    with that row's ``labels``. Pairs whose label equals ``-100`` are
    dropped. Pairing stops at the shorter of the two sequences, so surplus
    tokens or surplus labels in a row are ignored.

    Args:
        df: DataFrame with a ``sentence`` column of strings and a ``labels``
            column holding one sequence of labels per row.

    Returns:
        A ``(X, y)`` tuple of equal-length lists: the kept tokens and their
        labels, in row order.
    """
    X, y = [], []

    # Walk the two columns directly instead of ``df.iterrows()``, which
    # builds a full Series for every row and is needlessly slow here.
    for sentence, labels in zip(df["sentence"], df["labels"]):
        for token, label in zip(sentence.split(), labels):
            if label == -100:   # ignore special tokens
                continue
            X.append(token)
            y.append(label)

    return X, y


# Flatten the training split first, then the validation split, each into a
# token list and a matching label list.
(X_train, y_train), (X_val, y_val) = (
    flatten_token_data(train_df),
    flatten_token_data(val_df),
)

# Report how many tokens each split kept.
print("Train tokens:", len(X_train))
print("Val tokens:", len(X_val))

# Feature Engineering
def token_features(token):
    """Describe a single token as a flat feature dictionary.

    Args:
        token: The token string to describe.

    Returns:
        A dict with the keys ``word`` (the token itself), ``lower`` (its
        lower-cased form), the boolean flags ``is_upper``, ``is_title`` and
        ``is_digit``, and ``suffix2`` / ``suffix3`` (the last two / three
        characters).
    """
    # Lexical identity: the surface form and its lower-cased variant.
    features = {"word": token, "lower": token.lower()}

    # Orthographic shape flags.
    features["is_upper"] = token.isupper()
    features["is_title"] = token.istitle()
    features["is_digit"] = token.isdigit()

    # Trailing characters; slicing returns the whole token when it is
    # shorter than the requested suffix length.
    features["suffix2"] = token[-2:]
    features["suffix3"] = token[-3:]

    return features


# Map every token of each split to its feature dictionary.
X_train_feats = list(map(token_features, X_train))
X_val_feats = list(map(token_features, X_val))

# Vectorization
# The vectorizer is fitted on the training features only; the validation
# features are transformed with that same fitted vectorizer.
vectorizer = DictVectorizer(sparse=True)

X_train_vec = vectorizer.fit_transform(X_train_feats)
X_val_vec = vectorizer.transform(X_val_feats)

# Number of columns in the vectorized training matrix.
print("Feature space size:", X_train_vec.shape[1])


# BASELINE 1: LOGISTIC REGRESSION
# Build the estimator and fit it on the training matrix in one expression
# (scikit-learn's `fit` returns the fitted estimator itself).
lr = LogisticRegression(
    class_weight="balanced",
    max_iter=1000,
    n_jobs=-1,
).fit(X_train_vec, y_train)

# Predict labels for the validation tokens.
lr_preds = lr.predict(X_val_vec)

# Per-class precision / recall / F1 on the validation split.
print("\n===== Logistic Regression Results =====")
print(classification_report(y_val, lr_preds))

# BASELINE 2: SUPPORT VECTOR MACHINE
# Linear SVM trained on the same vectorized token features.
svm = LinearSVC(
    class_weight="balanced",
    # Seeded so repeated runs give the same model; 42 matches the seed used
    # by the random-forest baseline in this file.
    random_state=42,
)

svm.fit(X_train_vec, y_train)
svm_preds = svm.predict(X_val_vec)

# Per-class precision / recall / F1 on the validation split.
print("\n===== SVM Results =====")
print(classification_report(y_val, svm_preds))

# BASELINE 3: RANDOM FOREST
# Build the seeded forest and fit it on the training matrix in one
# expression (scikit-learn's `fit` returns the fitted estimator itself).
rf = RandomForestClassifier(
    random_state=42,
    class_weight="balanced_subsample",
    n_estimators=100,
    max_depth=20,
    n_jobs=-1,
).fit(X_train_vec, y_train)

# Predict labels for the validation tokens.
rf_preds = rf.predict(X_val_vec)

# Per-class precision / recall / F1 on the validation split.
print("\n===== Random Forest Results =====")
print(classification_report(y_val, rf_preds))
