# Chef Classifier

## 1. Imports and Setup

In [42]:
import pandas as pd
import ast
from sentence_transformers import SentenceTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np
import re
import os
import nltk
from nltk.corpus import wordnet  # for synonyms

# Fetch the WordNet corpus; the synonym lookup in get_synonym() below reads it
# through nltk.corpus.wordnet.
nltk.download('wordnet')
# NOTE(review): presumably disables Hugging Face tokenizers parallelism to avoid
# fork-related warnings when encoding — confirm against the tokenizers docs.
os.environ["TOKENIZERS_PARALLELISM"] = "False"


[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/gargoyle/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## 2. Load and Prepare Data

In [43]:
# Location of the training CSV. Set the CHEF_TRAIN_CSV environment variable to
# run the notebook on another machine; the fallback is the path the notebook
# was originally developed against, so existing runs behave exactly as before.
train_csv_path = os.environ.get(
    'CHEF_TRAIN_CSV', '/Users/gargoyle/Downloads/FINAL TESTING/train.csv'
)

# The file is semicolon-delimited.
df = pd.read_csv(train_csv_path, sep=';')
# 'tags' holds the text of a Python literal per row; parse it into a real object
# (literal_eval only evaluates literals, it does not execute arbitrary code).
df['tags'] = df['tags'].apply(ast.literal_eval)
# Missing descriptions become empty strings so the text helpers never see NaN.
df['description'] = df['description'].fillna('')

# aug functions
def get_synonym(word):
    """Return a random single-word WordNet synonym of ``word``.

    Every lemma of every synset of ``word`` is a candidate. Underscores in
    lemma names are turned into spaces, and candidates that are multi-word or
    equal to ``word`` (case-insensitively) are discarded.

    Args:
        word: The token to look up.

    Returns:
        One candidate chosen uniformly with NumPy's global RNG, as a plain
        ``str``; ``word`` itself when no candidate exists.
    """
    word_lower = word.lower()
    synonyms = set()
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            candidate = lemma.name().replace('_', ' ')
            if candidate.lower() != word_lower and len(candidate.split()) == 1:
                synonyms.add(candidate)
    if not synonyms:
        return word
    # Sort before drawing: the iteration order of a set of strings changes from
    # one interpreter run to the next (hash randomisation), so choosing from
    # list(set) would not be reproducible even with a seeded RNG.
    return str(np.random.choice(sorted(synonyms)))


def synonym_replace(text, replacement_prob=0.15):
    """Swap whitespace-separated tokens of ``text`` for synonyms at random.

    Texts with fewer than five tokens are returned untouched. Otherwise each
    token is independently replaced via ``get_synonym`` with probability
    ``replacement_prob`` and the tokens are re-joined with single spaces.
    """
    tokens = text.split()
    if len(tokens) >= 5:
        swapped = []
        for token in tokens:
            # One uniform draw per token decides whether it is replaced.
            if np.random.rand() < replacement_prob:
                token = get_synonym(token)
            swapped.append(token)
        return ' '.join(swapped)
    return text


def random_delete(text, deletion_prob=0.05):
    """Randomly drop whitespace-separated tokens from ``text``.

    Texts with fewer than five tokens are returned untouched. Each token is
    kept when a uniform draw exceeds ``deletion_prob``. If three or fewer
    tokens survive, the original text is returned instead of the shortened one.
    """
    tokens = text.split()
    if len(tokens) < 5:
        return text

    survivors = []
    for token in tokens:
        if np.random.rand() > deletion_prob:
            survivors.append(token)

    # Refuse to return a text that was cut down too aggressively.
    if len(survivors) <= 3:
        return text
    return ' '.join(survivors)


def sentence_shuffle(text):
    """Return ``text`` with its sentences in a random order.

    Sentences are split on whitespace that follows '.', '!' or '?'. Texts with
    fewer than three such pieces are returned untouched; otherwise the pieces
    are shuffled in place with NumPy's global RNG and joined by single spaces.
    """
    parts = re.split(r'(?<=[.!?])\s+', text)
    if len(parts) >= 3:
        np.random.shuffle(parts)
        return ' '.join(parts)
    return text


def augment_text(text):
    """Apply exactly one randomly chosen text augmentation to ``text``.

    A single uniform draw selects the strategy: synonym replacement below 0.5,
    random word deletion from 0.5 up to (but excluding) 0.75, and sentence
    shuffling otherwise.
    """
    draw = np.random.rand()
    if draw >= 0.75:
        return sentence_shuffle(text)
    if draw >= 0.5:
        return random_delete(text)
    return synonym_replace(text)


def augment_tags(tag_list):
    """Return a lightly perturbed, de-duplicated copy of ``tag_list``.

    With probability 0.1 one randomly chosen tag is dropped, provided more
    than one tag is present. Duplicates are then removed while keeping the
    first occurrence of each tag in its original position, so the result is
    deterministic for a given RNG state.

    An earlier version also appended a copy of an existing tag before
    de-duplicating; de-duplication always removed that copy again, so the step
    had no effect on the result and is omitted here.

    Args:
        tag_list: List of tags. Any non-list or empty value is returned as is.

    Returns:
        A new list of unique tags; the input list is never mutated.
    """
    if not isinstance(tag_list, list) or len(tag_list) == 0:
        return tag_list
    new_tags = tag_list.copy()
    # 10% chance to remove one tag (never the only remaining one)
    if np.random.rand() < 0.1 and len(new_tags) > 1:
        del new_tags[np.random.randint(0, len(new_tags))]
    # dict.fromkeys de-duplicates like set() but keeps insertion order, which
    # set() does not guarantee for strings across interpreter runs.
    return list(dict.fromkeys(new_tags))


# balancing chef data
def balance_and_augment(df, label_col='chef_id', id_col='orig_id'):
    """Bring every class up to the size of the largest one using augmented copies.

    For each label with fewer rows than the most frequent label, the missing
    number of rows is sampled with replacement from that label's rows. The
    sampled 'description' values are passed through ``augment_text`` and the
    'tags' values through ``augment_tags``. Every returned row carries two
    extra columns: 'parent_id' (the ``id_col`` value of the source row) and
    'is_augmented' (True only for generated rows).

    Args:
        df: Input frame; must contain ``label_col``, ``id_col``,
            'description' and 'tags'. It is not modified.
        label_col: Column holding the class label.
        id_col: Column holding a per-row identifier used for lineage.

    Returns:
        The original rows followed by the augmented rows with a fresh integer
        index, or just the annotated original rows when nothing had to be added.

    Raises:
        AssertionError: If ``id_col`` is not a column of ``df``.
    """
    assert id_col in df.columns, f"{id_col} must be present before augmenting."

    class_sizes = df[label_col].value_counts()
    target = class_sizes.max()

    print("\nCurrent class distribution:")
    print(class_sizes.sort_index())
    print(f"→ Target per class: {target}\n")

    extra_frames = []
    for chef, size in class_sizes.items():
        deficit = target - size
        if deficit <= 0:
            # Already at the target size; nothing to generate for this class.
            continue

        members = df.loc[df[label_col] == chef]
        draws = members.sample(n=deficit, replace=True, random_state=42).copy()

        # Record lineage before the content is altered.
        draws['parent_id'] = draws[id_col]
        draws['is_augmented'] = True

        # Text first, then tags, so the global RNG is consumed in a fixed order.
        draws['description'] = draws['description'].apply(augment_text)
        draws['tags'] = draws['tags'].apply(augment_tags)

        extra_frames.append(draws)
        print(f"↑ Augmented class {chef}: +{deficit} samples")

    originals = df.copy()
    originals['parent_id'] = originals[id_col]
    originals['is_augmented'] = False

    if not extra_frames:
        print("No augmentation performed — already balanced.\n")
        return originals

    combined = pd.concat([originals, *extra_frames], ignore_index=True)
    print(f"\nAdded total augmented samples: {sum(map(len, extra_frames))}")
    print(f"New total dataset size: {len(combined)}")
    print("=====================================\n")
    return combined
    
# Split before aug (keep validation clean)
X_raw = df[['description', 'tags', 'chef_id']].copy()

# Stratified 80/20 split so each chef keeps its share in both partitions.
X_train_raw, X_val_raw = train_test_split(
    X_raw, test_size=0.2, random_state=4, stratify=X_raw['chef_id']
)

assert not set(X_train_raw.index).intersection(set(X_val_raw.index)), "Leakage detected between train and val!"


# Reset indices
X_train_raw = X_train_raw.reset_index(drop=True)
X_val_raw = X_val_raw.reset_index(drop=True)
# orig_id is the row's position in the clean training set; augmented copies
# later point back to it through 'parent_id'.
X_train_raw['orig_id'] = X_train_raw.index

print(f"Original training samples: {len(X_train_raw)}")

# aug only training set
# The augmentation helpers draw from NumPy's global RNG. Seed it right before
# the call so re-running this cell (or the whole notebook) produces the same
# augmented rows; the split and the per-class sampling are already seeded.
AUG_SEED = 42
np.random.seed(AUG_SEED)
X_train_aug = balance_and_augment(X_train_raw)
print(f"Training samples after augmentation: {len(X_train_aug)}")


Original training samples: 2399

Current class distribution:
chef_id
1533    323
3288    361
4470    645
5060    427
6357    298
8688    345
Name: count, dtype: int64
→ Target per class: 645

↑ Augmented class 5060: +218 samples
↑ Augmented class 3288: +284 samples
↑ Augmented class 8688: +300 samples
↑ Augmented class 1533: +322 samples
↑ Augmented class 6357: +347 samples

✅ Added total augmented samples: 1471
✅ New total dataset size: 3870

Training samples after augmentation: 3870


## 3. Build Features: Description Embeddings and Tag Encoding

In [44]:
embedding_model = SentenceTransformer('all-mpnet-base-v2')  # 768 dim

# Encode augmented train and untouched val descriptions
desc_train_emb = embedding_model.encode(X_train_aug['description'].tolist(), show_progress_bar=True)
desc_val_emb   = embedding_model.encode(X_val_raw['description'].tolist(), show_progress_bar=True)

# Take the embedding width from the encoded array instead of hard-coding 768,
# so switching to a model with another dimension does not break the frames below.
emb_dim = desc_train_emb.shape[1]
desc_emb_cols = [f'desc_emb_{i}' for i in range(emb_dim)]

desc_train_emb_df = pd.DataFrame(desc_train_emb, columns=desc_emb_cols, index=X_train_aug.index)
desc_val_emb_df   = pd.DataFrame(desc_val_emb,   columns=desc_emb_cols, index=X_val_raw.index)

# One-hot encode tags
# The binarizer is fitted on training tags only, so the validation set cannot
# add columns; tags seen only in validation are not represented.
mlb = MultiLabelBinarizer()
tags_train_encoded = mlb.fit_transform(X_train_aug['tags'])
tags_val_encoded   = mlb.transform(X_val_raw['tags'])

tags_train_df = pd.DataFrame(tags_train_encoded, columns=mlb.classes_, index=X_train_aug.index).add_prefix('tag_')
tags_val_df   = pd.DataFrame(tags_val_encoded,   columns=mlb.classes_, index=X_val_raw.index).add_prefix('tag_')

# Combine embeddings + tag features
X_train = pd.concat([desc_train_emb_df, tags_train_df], axis=1)
X_val   = pd.concat([desc_val_emb_df, tags_val_df], axis=1)

y_train = X_train_aug['chef_id'].reset_index(drop=True)
y_val   = X_val_raw['chef_id'].reset_index(drop=True)

# Lineage info (augmented flag + parent row id) kept alongside the features.
train_meta = X_train_aug[['is_augmented', 'parent_id']].copy()
train_meta.index = X_train.index  # ensure identical index alignment

print(f"\nFinal training shape: {X_train.shape}")
print(f"Validation shape: {X_val.shape}")



Batches: 100%|██████████| 121/121 [00:39<00:00,  3.06it/s]
Batches: 100%|██████████| 19/19 [00:06<00:00,  2.89it/s]


Final training shape: (3870, 1147)
Validation shape: (600, 1147)





## 5. Hyperparameter Tuning

In [45]:
# Standardise features; statistics come from the training matrix only and are
# then applied unchanged to the validation matrix.
scaler_hp = StandardScaler()
X_train_scaled = scaler_hp.fit_transform(X_train)
X_val_scaled = scaler_hp.transform(X_val)

# Inverse regularisation strengths to sweep (smaller C = stronger penalty).
C_values = [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0]
results = []

print(f"\nTesting {len(C_values)} C values with lbfgs solver...")
for C in C_values:
    model = LogisticRegression(
        C=C, max_iter=1000, random_state=42, n_jobs=-1, solver='lbfgs'
    ).fit(X_train_scaled, y_train)

    train_score = model.score(X_train_scaled, y_train)
    val_score = model.score(X_val_scaled, y_val)
    gap = train_score - val_score  # train/validation accuracy difference

    results.append(dict(C=C, train_acc=train_score, val_acc=val_score, gap=gap))
    print(f"  C={C:6.3f} | Train: {train_score:.4f} | Val: {val_score:.4f} | Gap: {gap:.4f}")

# Pick the C with the highest validation accuracy (first one on ties).
results_df = pd.DataFrame(results)
best_idx = results_df['val_acc'].idxmax()
best_C = results_df.at[best_idx, 'C']

banner = "=" * 60
print("\n" + banner)
print(f"Best C: {best_C}")
print(f"Best validation accuracy: {results_df.at[best_idx, 'val_acc']:.4f}")
print(banner)


Testing 7 C values with lbfgs solver...
  C= 0.001 | Train: 0.9444 | Val: 0.8367 | Gap: 0.1078
  C= 0.005 | Train: 0.9822 | Val: 0.8517 | Gap: 0.1305
  C= 0.010 | Train: 0.9930 | Val: 0.8500 | Gap: 0.1430
  C= 0.050 | Train: 1.0000 | Val: 0.8417 | Gap: 0.1583
  C= 0.100 | Train: 1.0000 | Val: 0.8417 | Gap: 0.1583
  C= 0.500 | Train: 1.0000 | Val: 0.8383 | Gap: 0.1617
  C= 1.000 | Train: 1.0000 | Val: 0.8367 | Gap: 0.1633

Best C: 0.005
Best validation accuracy: 0.8517


## 6. K-Fold Cross Validation

In [46]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score

# Cross-validate on the training partition only. Folds are drawn from the
# original (non-augmented) rows; augmented rows are added to a fold's training
# side only when the row they were derived from is not in that fold's
# validation side.

# Originals only for validation candidates
orig_mask = ~train_meta['is_augmented']
orig_idx = X_train.index[orig_mask]

X_orig = X_train.loc[orig_idx]
y_orig = y_train.loc[orig_idx]

skf = StratifiedKFold(n_splits=8, shuffle=True, random_state=42)
fold_scores = []

print("\nLeakage-proof CV (val=originals only, train=originals(excl. fold) + augmented not derived from val):")
for fold, (tr_i, va_i) in enumerate(skf.split(X_orig, y_orig), 1):
    # skf yields positions within X_orig; map them back to index labels.
    val_idx = X_orig.index[va_i]          # original rows for validation
    tr_orig_idx = X_orig.index[tr_i]      # original rows for training

    # aug rows whose parent is not in validation fold
    # NOTE(review): comparing parent_id with index labels relies on orig_id
    # having been taken from the reset training index and on the original rows
    # being placed first when the augmented frame was re-indexed — confirm this
    # still holds if the augmentation step changes.
    aug_ok_mask = (train_meta['is_augmented']) & (~train_meta['parent_id'].isin(val_idx))
    tr_aug_idx = X_train.index[aug_ok_mask]

    # final train = originals (train part) + safe aug
    tr_idx_all = tr_orig_idx.union(tr_aug_idx)

    X_tr = X_train.loc[tr_idx_all]
    y_tr = y_train.loc[tr_idx_all]
    X_va = X_train.loc[val_idx]
    y_va = y_train.loc[val_idx]

    # scale from training only
    scaler_cv = StandardScaler()
    X_tr_s = scaler_cv.fit_transform(X_tr)
    X_va_s = scaler_cv.transform(X_va)

    # Same model configuration as the sweep above, using the selected best_C.
    model_cv = LogisticRegression(C=best_C, max_iter=1000, random_state=42, n_jobs=-1, solver='lbfgs')
    model_cv.fit(X_tr_s, y_tr)
    preds = model_cv.predict(X_va_s)
    acc = accuracy_score(y_va, preds)
    fold_scores.append(acc)
    print(f"  Fold {fold}: {acc:.4f} ({acc:.2%})")

# Summary of per-fold accuracy (np.std is called with its default arguments).
print("-" * 20)
print(f"Mean: {np.mean(fold_scores):.4f} ({np.mean(fold_scores):.2%})")
print(f"Std:  {np.std(fold_scores):.4f} ({np.std(fold_scores):.2%})")
print(f"Min:  {np.min(fold_scores):.4f} ({np.min(fold_scores):.2%})")
print(f"Max:  {np.max(fold_scores):.4f} ({np.max(fold_scores):.2%})")



Leakage-proof CV (val=originals only, train=originals(excl. fold) + augmented not derived from val):
  Fold 1: 0.8567 (85.67%)
  Fold 2: 0.8533 (85.33%)
  Fold 3: 0.8433 (84.33%)
  Fold 4: 0.8567 (85.67%)
  Fold 5: 0.7933 (79.33%)
  Fold 6: 0.8500 (85.00%)
  Fold 7: 0.8400 (84.00%)
  Fold 8: 0.8328 (83.28%)
--------------------
Mean: 0.8408 (84.08%)
Std:  0.0196 (1.96%)
Min:  0.7933 (79.33%)
Max:  0.8567 (85.67%)
