In [6]:
# 02_explore_data_folder
!ls -la /kaggle/input/goemotions/data


total 4320
drwxr-xr-x 3 nobody nogroup       0 Oct 25 17:08 .
drwxr-xr-x 5 nobody nogroup       0 Oct 25 17:08 ..
-rw-r--r-- 1 nobody nogroup  439059 Oct 25 17:08 dev.tsv
-rw-r--r-- 1 nobody nogroup      55 Oct 25 17:08 ekman_labels.csv
-rw-r--r-- 1 nobody nogroup     396 Oct 25 17:08 ekman_mapping.json
-rw-r--r-- 1 nobody nogroup     248 Oct 25 17:08 emotions.txt
drwxr-xr-x 2 nobody nogroup       0 Oct 25 17:08 full_dataset
-rw-r--r-- 1 nobody nogroup     367 Oct 25 17:08 sentiment_dict.json
-rw-r--r-- 1 nobody nogroup     367 Oct 25 17:08 sentiment_mapping.json
-rw-r--r-- 1 nobody nogroup  436706 Oct 25 17:08 test.tsv
-rw-r--r-- 1 nobody nogroup 3519053 Oct 25 17:08 train.tsv


In [7]:
# 03_load_data_to_pandas
from pathlib import Path
import pandas as pd

DATA_ROOT = Path("/kaggle/input/goemotions/data")

cols = ["text", "labels", "id"]

train = pd.read_csv(DATA_ROOT / "train.tsv", sep="\t", names=cols, header=None, quoting=3)
dev   = pd.read_csv(DATA_ROOT / "dev.tsv",   sep="\t", names=cols, header=None, quoting=3)
test  = pd.read_csv(DATA_ROOT / "test.tsv",  sep="\t", names=cols, header=None, quoting=3)

print(f"Train: {len(train)} | Dev: {len(dev)} | Test: {len(test)}")
train.head(5)


Train: 43410 | Dev: 5426 | Test: 5427


Unnamed: 0,text,labels,id
0,My favourite food is anything I didn't have to...,27,eebbqej
1,"Now if he does off himself, everyone will thin...",27,ed00q6i
2,WHY THE FUCK IS BAYLESS ISOING,2,eezlygj
3,To make her feel threatened,14,ed7ypvh
4,Dirty Southern Wankers,3,ed0bdzj


In [8]:
# 04_label_preprocessing_multihot
from pathlib import Path
import numpy as np

EMOTIONS_FP = Path("/kaggle/input/goemotions/data/emotions.txt")
with open(EMOTIONS_FP, "r") as f:
    emotions = [s.strip() for s in f if s.strip()]
NUM_LABELS = len(emotions)  # expect 28 (27 emotions + neutral)

print("NUM_LABELS =", NUM_LABELS)
print("first 10 labels:", emotions[:10])

# convert comma-separated label ids -> multihot vector
def labels_to_multihot(label_str, n=NUM_LABELS):
    if pd.isna(label_str) or str(label_str).strip() == "":
        return np.zeros(n, dtype=int)
    ids = [int(x) for x in str(label_str).split(",") if x != ""]
    vec = np.zeros(n, dtype=int)
    vec[ids] = 1
    return vec

train["label_vec"] = train["labels"].apply(labels_to_multihot)
dev["label_vec"]   = dev["labels"].apply(labels_to_multihot)
test["label_vec"]  = test["labels"].apply(labels_to_multihot)

# quick sanity checks
print("Example row (text, labels, multihot):")
print(train.iloc[0][["text","labels","label_vec"]])
print("\nLabel counts (how many examples include each label):")
label_counts = np.sum(np.stack(train["label_vec"].values), axis=0)
for i, (lbl, cnt) in enumerate(zip(emotions, label_counts)):
    print(f"{i:02d} {lbl:20.20}  {int(cnt)}")


NUM_LABELS = 28
first 10 labels: ['admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring', 'confusion', 'curiosity', 'desire', 'disappointment']
Example row (text, labels, multihot):
text         My favourite food is anything I didn't have to...
labels                                                      27
label_vec    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
Name: 0, dtype: object

Label counts (how many examples include each label):
00 admiration            4130
01 amusement             2328
02 anger                 1567
03 annoyance             2470
04 approval              2939
05 caring                1087
06 confusion             1368
07 curiosity             2191
08 desire                641
09 disappointment        1269
10 disapproval           2022
11 disgust               793
12 embarrassment         303
13 excitement            853
14 fear                  596
15 gratitude             2662
16 grief                 77
17 joy                   1452
18 l

In [9]:
# 05_small_debug_subset
# Create a tiny subset to test the pipeline quickly (keeps first runs short)
DEBUG = True

if DEBUG:
    quick_train = train.sample(min(1000, len(train)), random_state=42).reset_index(drop=True)
    quick_dev   = dev.sample(min(200, len(dev)), random_state=42).reset_index(drop=True)
    quick_test  = test.sample(min(500, len(test)), random_state=1).reset_index(drop=True)
else:
    quick_train, quick_dev, quick_test = train, dev, test

print("DEBUG =", DEBUG)
print("quick sizes -> train:", len(quick_train), "dev:", len(quick_dev), "test:", len(quick_test))
quick_train.head(3)


DEBUG = True
quick sizes -> train: 1000 dev: 200 test: 500


Unnamed: 0,text,labels,id,label_vec
0,The only way this works is if [NAME] is doing ...,27,edupnyh,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,Access should be hindered it's getting destroyed.,3,ediy7lp,"[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,Totally fair. All I was trying to remind every...,4,edv791a,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [10]:
# 06_install_and_import_hf
# Run this once at the top of the notebook to ensure required packages are available.
# It may take ~1-2 minutes the first time.
!pip install -q transformers datasets accelerate evaluate


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m566.1/566.1 kB[0m [31m16.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.7/47.7 MB[0m [31m39.6 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m93.4 MB/s[0m eta [36m0:00:00[0m:00:01[0m0:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m66.8 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m50.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.

In [18]:
# 07b_tokenize_with_torchdataset
# Tokenize using the HF tokenizer and build torch.utils.data.Dataset objects
# This avoids the `datasets` library (so no pyarrow required).

from transformers import AutoTokenizer
import torch
import numpy as np

MODEL_NAME = "distilroberta-base"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

max_length = 30

class GoEmotionsTorchDataset(torch.utils.data.Dataset):
    def __init__(self, texts, label_vecs, tokenizer, max_length=30):
        # texts: pandas Series of strings
        # label_vecs: pandas Series of numpy arrays or lists
        self.tokenized = tokenizer(
            texts.tolist(),
            truncation=True,
            padding="max_length",
            max_length=max_length,
            return_tensors="pt",
        )
        # stack labels into (N, num_labels) float tensor for BCE
        self.labels = torch.tensor(np.stack(label_vecs.values), dtype=torch.float)
        assert self.labels.shape[0] == self.tokenized["input_ids"].shape[0], "mismatch lengths"

    def __len__(self):
        return self.labels.shape[0]

    def __getitem__(self, idx):
        item = {k: v[idx] for k, v in self.tokenized.items()}
        item["labels"] = self.labels[idx]
        return item

# Build datasets from the quick_... pandas DataFrames you already created
train_dataset_torch = GoEmotionsTorchDataset(quick_train["text"], quick_train["label_vec"], tokenizer, max_length=max_length)
dev_dataset_torch   = GoEmotionsTorchDataset(quick_dev["text"], quick_dev["label_vec"], tokenizer, max_length=max_length)
test_dataset_torch  = GoEmotionsTorchDataset(quick_test["text"], quick_test["label_vec"], tokenizer, max_length=max_length)

# quick sanity prints
print("Train size:", len(train_dataset_torch))
print("Dev size:", len(dev_dataset_torch))
print("Test size:", len(test_dataset_torch))

sample = train_dataset_torch[0]
print("sample keys:", list(sample.keys()))
print("input_ids length:", sample["input_ids"].shape, "labels shape:", sample["labels"].shape)
print("first 10 input_ids:", sample["input_ids"][:10].tolist())
print("first label vector indices set:", torch.where(sample["labels"] == 1)[0].tolist())


Train size: 1000
Dev size: 200
Test size: 500
sample keys: ['input_ids', 'attention_mask', 'labels']
input_ids length: torch.Size([30]) labels shape: torch.Size([28])
first 10 input_ids: [0, 133, 129, 169, 42, 1364, 16, 114, 646, 48307]
first label vector indices set: [27]


In [20]:
# 08_manual_train_multiGPU
import os
import torch
import numpy as np
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from torch.utils.data import DataLoader
from torch.optim import AdamW
from sklearn.metrics import f1_score
from tqdm.auto import tqdm

# Config (tweak if OOM)
MODEL_NAME = "distilroberta-base"
NUM_LABELS = 28
OUTDIR = "/kaggle/working/goemotions-checkpoint-manual"
os.makedirs(OUTDIR, exist_ok=True)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
ngpu = torch.cuda.device_count()
print("Device:", device, "GPUs:", ngpu)
for i in range(ngpu):
    print(i, torch.cuda.get_device_name(i))

# load model/tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=NUM_LABELS, problem_type="multi_label_classification")
if ngpu > 1:
    model = torch.nn.DataParallel(model)
model = model.to(device)

# training hyperparams
epochs = 3
per_device_batch_size = 16   # if OOM reduce to 8 or 4
gradient_accumulation_steps = 2
effective_batch = per_device_batch_size * gradient_accumulation_steps * max(1, ngpu)
print("Effective batch (approx):", effective_batch)

lr = 3e-5
weight_decay = 0.01

# dataloaders
train_loader = DataLoader(train_dataset_torch, batch_size=per_device_batch_size, shuffle=True, num_workers=2)
dev_loader   = DataLoader(dev_dataset_torch,   batch_size=per_device_batch_size*2, shuffle=False, num_workers=2)
test_loader  = DataLoader(test_dataset_torch,  batch_size=per_device_batch_size*2, shuffle=False, num_workers=2)

# optimizer & loss
optimizer = AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
criterion = torch.nn.BCEWithLogitsLoss()

# amp scaler for fp16
scaler = torch.cuda.amp.GradScaler(enabled=True)

best_val_f1 = -1.0

def evaluate(loader):
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for batch in loader:
            inputs = {k: v.to(device) for k,v in batch.items() if k in ["input_ids","attention_mask"]}
            labels = batch["labels"].to(device)
            with torch.cuda.amp.autocast(enabled=True):
                outputs = model(**inputs)
                logits = outputs.logits if not isinstance(outputs, tuple) else outputs[0]
            probs = torch.sigmoid(logits).cpu().numpy()
            preds = (probs >= 0.5).astype(int)
            all_preds.append(preds)
            all_labels.append(labels.cpu().numpy().astype(int))
    all_preds = np.vstack(all_preds)
    all_labels = np.vstack(all_labels)
    micro = f1_score(all_labels.reshape(-1), all_preds.reshape(-1), average="micro", zero_division=0)
    return micro, all_preds, all_labels

# training loop
global_step = 0
for epoch in range(1, epochs+1):
    model.train()
    pbar = tqdm(train_loader, desc=f"Epoch {epoch}")
    running_loss = 0.0
    optimizer.zero_grad()
    for step, batch in enumerate(pbar, start=1):
        inputs = {k: v.to(device) for k,v in batch.items() if k in ["input_ids","attention_mask"]}
        labels = batch["labels"].to(device)

        with torch.cuda.amp.autocast(enabled=True):
            outputs = model(**inputs, labels=None)
            logits = outputs.logits if not isinstance(outputs, tuple) else outputs[0]
            loss = criterion(logits, labels)

        scaler.scale(loss / gradient_accumulation_steps).backward()
        running_loss += loss.item()

        if (step % gradient_accumulation_steps) == 0:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            global_step += 1
            pbar.set_postfix({"loss": f"{running_loss/(global_step):.4f}"})

    # end epoch: evaluate
    val_micro, _, _ = evaluate(dev_loader)
    print(f"\nEpoch {epoch} validation micro-F1: {val_micro:.4f}")

    # save best
    if val_micro > best_val_f1:
        best_val_f1 = val_micro
        # if DataParallel, unwrap
        model_to_save = model.module if hasattr(model, "module") else model
        model_to_save.save_pretrained(OUTDIR)
        tokenizer.save_pretrained(OUTDIR)
        print(f"Saved best model to {OUTDIR} (micro-F1={best_val_f1:.4f})")

# final test eval
test_micro, test_preds, test_labels = evaluate(test_loader)
print("Final test micro-F1:", test_micro)
# save predictions (small file)
np.savez_compressed("/kaggle/working/goemotions_preds.npz", preds=test_preds, labels=test_labels)
print("Saved predictions to /kaggle/working/goemotions_preds.npz")


Device: cuda GPUs: 2
0 Tesla T4
1 Tesla T4




RuntimeError: Failed to import transformers.models.roberta.modeling_roberta because of the following error (look up to see its traceback):
module 'torch' has no attribute 'float8_e8m0fnu'

In [21]:
# run this cell first and paste the output if you want me to review it
import torch, sys
print("python:", sys.version.split()[0])
print("torch:", getattr(torch, "__version__", None))
print("torch.cuda:", torch.version.cuda)
print("cuda available:", torch.cuda.is_available())


python: 3.11.13
torch: 2.6.0+cu124
torch.cuda: 12.4
cuda available: True


In [22]:
# 09_train_tfidf_baseline
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score
import joblib
from tqdm.auto import tqdm

# prepare data from quick_train/quick_dev/quick_test (pandas DataFrames)
X_train = quick_train["text"].astype(str).tolist()
y_train = np.stack(quick_train["label_vec"].values).astype(int)

X_dev = quick_dev["text"].astype(str).tolist()
y_dev = np.stack(quick_dev["label_vec"].values).astype(int)

X_test = quick_test["text"].astype(str).tolist()
y_test = np.stack(quick_test["label_vec"].values).astype(int)

print("Sizes:", len(X_train), len(X_dev), len(X_test))

# pipeline: TF-IDF -> OneVsRest(LogisticRegression)
pipeline = Pipeline([
    ("tfidf", TfidfVectorizer(max_features=30000, ngram_range=(1,2), min_df=2)),
    ("clf", OneVsRestClassifier(LogisticRegression(solver="saga", max_iter=200, C=1.0, n_jobs=-1)))
])

# train (fast)
pipeline.fit(X_train, y_train)

# predict dev + test
y_dev_pred = pipeline.predict(X_dev)
y_test_pred = pipeline.predict(X_test)

# metrics (micro & macro, flatten for multi-label as in earlier)
dev_micro = f1_score(y_dev.reshape(-1), y_dev_pred.reshape(-1), average="micro", zero_division=0)
dev_macro = f1_score(y_dev.reshape(-1), y_dev_pred.reshape(-1), average="macro", zero_division=0)
test_micro = f1_score(y_test.reshape(-1), y_test_pred.reshape(-1), average="micro", zero_division=0)
test_macro = f1_score(y_test.reshape(-1), y_test_pred.reshape(-1), average="macro", zero_division=0)

print(f"Dev micro-F1: {dev_micro:.4f} | Dev macro-F1: {dev_macro:.4f}")
print(f"Test micro-F1: {test_micro:.4f} | Test macro-F1: {test_macro:.4f}")

# show a few examples
def show_examples(X, y_true, y_pred, n=6):
    inv = {i: lab for i, lab in enumerate(emotions)}
    for i in range(min(n, len(X))):
        true_idx = np.where(y_true[i]==1)[0].tolist()
        pred_idx = np.where(y_pred[i]==1)[0].tolist()
        print(f"TEXT: {X[i][:200]!s}")
        print("TRUE:", [inv[j] for j in true_idx])
        print("PRED:", [inv[j] for j in pred_idx])
        print("-"*60)

print("\nSample dev predictions:")
show_examples(X_dev, y_dev, y_dev_pred, n=6)

# persist model & vectorizer
joblib.dump(pipeline, "/kaggle/working/goemotions_tfidf_baseline.joblib")
print("Saved baseline to /kaggle/working/goemotions_tfidf_baseline.joblib")


Sizes: 1000 200 500




Dev micro-F1: 0.9595 | Dev macro-F1: 0.5264
Test micro-F1: 0.9582 | Test macro-F1: 0.5272

Sample dev predictions:
TEXT: Haha, my apologies!
TRUE: ['amusement']
PRED: []
------------------------------------------------------------
TEXT: It surprises me that he's a mod some days...
TRUE: ['surprise']
PRED: []
------------------------------------------------------------
TEXT: Seems to be fake. Checked the Morning Mix website, no articles from Nov 13^(th) 2018 that match the title shown.
TRUE: ['neutral']
PRED: []
------------------------------------------------------------
TEXT: I have faith
TRUE: ['neutral']
PRED: []
------------------------------------------------------------
TEXT: Oh. :[
TRUE: ['neutral']
PRED: []
------------------------------------------------------------
TEXT: Tbh I love [NAME] but for [NAME] id be willing to see him go. [NAME] gives [NAME] the best chance he'll ever have of competing.
TRUE: ['neutral']
PRED: []
-----------------------------------------------------

In [23]:
# diag_1_label_mapping_and_stats
import numpy as np
inv = {i: lab for i, lab in enumerate(emotions)}
print("Label index for 'neutral' check:", [(i, emotions[i]) for i in range(len(emotions)) if 'neutral' in emotions[i].lower()])

# show how many positives model predicted per dev sample and per label
print("Per-sample predicted positives (first 20):", np.sum(y_dev_pred, axis=1)[:20])
print("Histogram of positives per sample:", np.bincount(np.sum(y_dev_pred, axis=1).astype(int))[:10])
pred_counts_per_label = np.sum(y_dev_pred, axis=0)
true_counts_per_label = np.sum(y_dev, axis=0)
for i,cnt_t,cnt_p in zip(range(len(emotions)), true_counts_per_label, pred_counts_per_label):
    if i < 10 or cnt_t < 50 or cnt_p < 50:  # print many small/interesting ones
        print(f"{i:02d} {inv[i]:15.15} true:{int(cnt_t):5d} pred:{int(cnt_p):5d}")


Label index for 'neutral' check: [(27, 'neutral')]
Per-sample predicted positives (first 20): [0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0]
Histogram of positives per sample: [190  10]
00 admiration      true:   16 pred:    1
01 amusement       true:    3 pred:    1
02 anger           true:   13 pred:    0
03 annoyance       true:   12 pred:    0
04 approval        true:   15 pred:    0
05 caring          true:    5 pred:    0
06 confusion       true:    6 pred:    0
07 curiosity       true:   10 pred:    0
08 desire          true:    1 pred:    0
09 disappointment  true:    8 pred:    0
10 disapproval     true:   14 pred:    0
11 disgust         true:    3 pred:    0
12 embarrassment   true:    1 pred:    0
13 excitement      true:    2 pred:    0
14 fear            true:    1 pred:    0
15 gratitude       true:   11 pred:    1
16 grief           true:    0 pred:    0
17 joy             true:    6 pred:    0
18 love            true:   11 pred:    1
19 nervousness     true:    1 pred:    0

In [24]:
# diag_2_example_probs_and_features
from sklearn.feature_extraction.text import TfidfVectorizer
# access pipeline parts (pipeline: TfidfVectorizer + OneVsRestClassifier)
tfidf = pipeline.named_steps['tfidf']
clf = pipeline.named_steps['clf']  # OneVsRestClassifier

# find index of example in dev set (first matching substring)
idx = None
for i, txt in enumerate(X_dev):
    if "have faith" in txt.lower():
        idx = i
        break
print("example idx in dev:", idx, "text:", X_dev[idx])

# TF-IDF sparsity info
vec = tfidf.transform([X_dev[idx]])
nz = vec.count_nonzero()
print("TF-IDF non-zero features for this example:", nz)

# per-label decision function / probabilities
# Many estimators expose decision_function; if not, use predict_proba
probas = None
try:
    # OneVsRestClassifier has estimators_ list
    # use predict_proba if available
    probas = clf.predict_proba([X_dev[idx]])  # shape (1, n_labels)
except Exception:
    # fallback to decision_function then convert via sigmoid
    from scipy.special import expit
    dec = np.array([est.decision_function(tfidf.transform([X_dev[idx]])) for est in clf.estimators_]).squeeze().T
    probas = expit(dec)

probas = np.array(probas).reshape(-1)
topk = probas.argsort()[::-1][:6]
print("Top probabilities (label idx:label:prob):")
for i in topk:
    print(i, inv[i], f"{probas[i]:.4f}")
print("Probability for 'neutral' index (if found):", 
      [(i, inv[i], float(probas[i])) for i in range(len(emotions)) if 'neutral' in inv[i].lower()])


example idx in dev: 3 text: I have faith
TF-IDF non-zero features for this example: 1
Top probabilities (label idx:label:prob):
27 neutral 0.3349
0 admiration 0.0694
3 annoyance 0.0597
7 curiosity 0.0533
1 amusement 0.0498
15 gratitude 0.0493
Probability for 'neutral' index (if found): [(27, 'neutral', 0.33494177392167707)]


In [25]:
# fix_A_lower_threshold_quick
import numpy as np
from sklearn.metrics import f1_score

# clf, tfidf, X_dev, y_dev were defined earlier in the baseline cell
# get dev probabilities (shape n_samples x n_labels)
try:
    dev_probas = clf.predict_proba(X_dev)
except Exception:
    # fallback if predict_proba not available
    from scipy.special import expit
    dec = np.vstack([est.decision_function(tfidf.transform(X_dev)) for est in clf.estimators_]).T
    dev_probas = expit(dec)

for thr in [0.5, 0.4, 0.35, 0.3, 0.25, 0.2]:
    preds_thr = (dev_probas >= thr).astype(int)
    micro = f1_score(y_dev.reshape(-1), preds_thr.reshape(-1), average="micro", zero_division=0)
    macro = f1_score(y_dev.reshape(-1), preds_thr.reshape(-1), average="macro", zero_division=0)
    avg_pos = preds_thr.sum()/preds_thr.shape[0]
    print(f"thr={thr:.2f} -> dev micro-F1: {micro:.4f} | macro-F1: {macro:.4f} | avg positives/sample: {avg_pos:.3f}")

# show the example prediction for "I have faith" with chosen threshold
example_idx = idx  # earlier you found idx for "I have faith"
chosen_thr = 0.3
probas_example = dev_probas[example_idx]
preds_example = np.where(probas_example >= chosen_thr)[0].tolist()
print("probas (top6):", sorted([(i, emotions[i], float(probas_example[i])) for i in range(len(probas_example))], key=lambda x: -x[2])[:6])
print("predicted label indices @ thr", chosen_thr, ":", preds_example)
print("predicted label names:", [emotions[i] for i in preds_example])



thr=0.50 -> dev micro-F1: 0.9595 | macro-F1: 0.5264 | avg positives/sample: 0.050
thr=0.40 -> dev micro-F1: 0.9602 | macro-F1: 0.6066 | avg positives/sample: 0.280
thr=0.35 -> dev micro-F1: 0.9591 | macro-F1: 0.6350 | avg positives/sample: 0.440
thr=0.30 -> dev micro-F1: 0.9568 | macro-F1: 0.6546 | avg positives/sample: 0.635
thr=0.25 -> dev micro-F1: 0.9534 | macro-F1: 0.6575 | avg positives/sample: 0.800
thr=0.20 -> dev micro-F1: 0.9509 | macro-F1: 0.6591 | avg positives/sample: 0.920
probas (top6): [(27, 'neutral', 0.33494177392167707), (0, 'admiration', 0.06944683408539752), (3, 'annoyance', 0.05973364217308284), (7, 'curiosity', 0.0533066501290191), (1, 'amusement', 0.049826234031810746), (15, 'gratitude', 0.049270827739021514)]
predicted label indices @ thr 0.3 : [27]
predicted label names: ['neutral']


In [26]:
# fix_B_retrain_vectorizer_more_features
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score
import joblib

pipeline2 = Pipeline([
    ("tfidf", TfidfVectorizer(max_features=40000, ngram_range=(1,3), min_df=1, analyzer="word")),
    # consider adding char ngrams: uncomment next line and comment previous tfidf if desired
    # ("tfidf", TfidfVectorizer(max_features=40000, ngram_range=(1,3), min_df=1, analyzer="char_wb")),
    ("clf", OneVsRestClassifier(LogisticRegression(solver="saga", max_iter=1000, C=1.0, n_jobs=-1)))
])

pipeline2.fit(X_train, y_train)

# evaluate on dev
y_dev_pred2 = pipeline2.predict(X_dev)
dev_micro2 = f1_score(y_dev.reshape(-1), y_dev_pred2.reshape(-1), average="micro", zero_division=0)
dev_macro2 = f1_score(y_dev.reshape(-1), y_dev_pred2.reshape(-1), average="macro", zero_division=0)
print("New pipeline dev micro-F1:", dev_micro2, "macro-F1:", dev_macro2)

# save
joblib.dump(pipeline2, "/kaggle/working/goemotions_tfidf_pipeline2.joblib")
print("Saved improved pipeline to /kaggle/working/goemotions_tfidf_pipeline2.joblib")

# sample the example again
tfidf2 = pipeline2.named_steps["tfidf"]
clf2 = pipeline2.named_steps["clf"]
try:
    dev_probas2 = clf2.predict_proba(X_dev)
except Exception:
    from scipy.special import expit
    dec2 = np.vstack([est.decision_function(tfidf2.transform(X_dev)) for est in clf2.estimators_]).T
    dev_probas2 = expit(dec2)

print("Example nonzero features:", tfidf2.transform([X_dev[idx]]).count_nonzero())
print("Neutral prob for example:", float(dev_probas2[idx, emotions.index("neutral")]))


New pipeline dev micro-F1: 0.9582142857142857 macro-F1: 0.493566000284444
Saved improved pipeline to /kaggle/working/goemotions_tfidf_pipeline2.joblib
Example nonzero features: 2
Neutral prob for example: 0.3627720442782259


In [27]:
# 10_compute_per_label_thresholds_and_eval
import numpy as np
from sklearn.metrics import f1_score
import joblib

# Use pipeline2 if you re-trained; otherwise use pipeline & dev_probas from earlier.
# I'll prefer pipeline2 if it exists, fallback to pipeline.
clf_used = None
tfidf_used = None
try:
    pipeline2  # exists if you ran the improved pipeline
    clf_used = pipeline2.named_steps["clf"]
    tfidf_used = pipeline2.named_steps["tfidf"]
    print("Using pipeline2 (improved).")
except NameError:
    clf_used = pipeline.named_steps["clf"]
    tfidf_used = pipeline.named_steps["tfidf"]
    print("Using original pipeline.")

# compute dev probabilities (n_samples x n_labels)
try:
    dev_probas = clf_used.predict_proba(X_dev)
except Exception:
    from scipy.special import expit
    dev_probas = np.vstack([est.predict_proba(tfidf_used.transform(X_dev))[:,1] for est in clf_used.estimators_]).T

n_labels = dev_probas.shape[1]
best_thrs = np.zeros(n_labels, dtype=float)

# search thresholds on dev for each label
for j in range(n_labels):
    best_f1 = -1.0
    best_t = 0.5
    scores_j = dev_probas[:, j]
    y_true_j = y_dev[:, j]
    # grid search (coarse)
    for t in np.linspace(0.05, 0.95, 19):
        preds_j = (scores_j >= t).astype(int)
        f1j = f1_score(y_true_j, preds_j, zero_division=0)
        if f1j > best_f1:
            best_f1 = f1j
            best_t = t
    best_thrs[j] = best_t

# evaluate with per-label thresholds
preds_dev_opt = (dev_probas >= best_thrs.reshape(1, -1)).astype(int)
micro_opt = f1_score(y_dev.reshape(-1), preds_dev_opt.reshape(-1), average="micro", zero_division=0)
macro_opt = f1_score(y_dev.reshape(-1), preds_dev_opt.reshape(-1), average="macro", zero_division=0)
avg_pos = preds_dev_opt.sum()/preds_dev_opt.shape[0]

print(f"Per-label tuned -> dev micro-F1: {micro_opt:.4f} | macro-F1: {macro_opt:.4f} | avg positives/sample: {avg_pos:.3f}")
print("First 10 per-label thresholds:", [(i, emotions[i], float(best_thrs[i])) for i in range(10)])

# save thresholds and pipeline
np.save("/kaggle/working/goemotions_per_label_thresholds.npy", best_thrs)
joblib.dump(clf_used, "/kaggle/working/goemotions_clf.joblib")
joblib.dump(tfidf_used, "/kaggle/working/goemotions_tfidf.joblib")
print("Saved thresholds + pipeline pieces to /kaggle/working/")


Using pipeline2 (improved).
Per-label tuned -> dev micro-F1: 0.8977 | macro-F1: 0.6121 | avg positives/sample: 2.800
First 10 per-label thresholds: [(0, 'admiration', 0.15), (1, 'amusement', 0.1), (2, 'anger', 0.05), (3, 'annoyance', 0.1), (4, 'approval', 0.05), (5, 'caring', 0.05), (6, 'confusion', 0.05), (7, 'curiosity', 0.05), (8, 'desire', 0.05), (9, 'disappointment', 0.05)]
Saved thresholds + pipeline pieces to /kaggle/working/


In [28]:
# 11_predict_single_sentences_with_thresholds
import numpy as np
import joblib

# load pipeline pieces if not in memory
try:
    clf_used
    tfidf_used
except NameError:
    tfidf_used = joblib.load("/kaggle/working/goemotions_tfidf.joblib")
    clf_used = joblib.load("/kaggle/working/goemotions_clf.joblib")

# load thresholds if available, else set global thr
try:
    best_thrs = np.load("/kaggle/working/goemotions_per_label_thresholds.npy")
    print("Loaded per-label thresholds.")
except Exception:
    best_thrs = None
    print("Per-label thresholds not found; will use global threshold.")

def predict_texts(texts, threshold=None):
    # texts: list[str] or single str
    single = False
    if isinstance(texts, str):
        texts = [texts]; single = True
    Xvec = tfidf_used.transform(texts)
    # compute probs per label
    try:
        probas = clf_used.predict_proba(texts)  # if estimator accepts raw texts
    except Exception:
        # fallback: per-estimator proba using transformed features
        probas = np.vstack([est.predict_proba(Xvec)[:,1] for est in clf_used.estimators_]).T
    if best_thrs is not None:
        thr = best_thrs
    elif threshold is not None:
        thr = np.ones(probas.shape[1]) * threshold
    else:
        thr = np.ones(probas.shape[1]) * 0.30  # default global threshold
    preds = (probas >= thr.reshape(1, -1)).astype(int)
    inv = {i: lab for i, lab in enumerate(emotions)}
    results = []
    for i, t in enumerate(texts):
        label_idxs = np.where(preds[i] == 1)[0].tolist()
        labels = [inv[idx] for idx in label_idxs]
        probs = {inv[j]: float(probas[i, j]) for j in label_idxs}
        results.append({"text": t, "pred_labels": labels, "pred_probs": probs})
    return results[0] if single else results

# examples
print(predict_texts("I have faith"))
print(predict_texts("Haha, my apologies!"))
print(predict_texts("I am furious and disgusted"))
# you can call predict_texts("your sentence here", threshold=0.25) to override


Loaded per-label thresholds.
{'text': 'I have faith', 'pred_labels': ['approval', 'curiosity', 'neutral'], 'pred_probs': {'approval': 0.05340669138068469, 'curiosity': 0.05472486367480756, 'neutral': 0.3627720442782259}}
{'text': 'Haha, my apologies!', 'pred_labels': ['amusement', 'approval'], 'pred_probs': {'amusement': 0.10223449950155794, 'approval': 0.05769728243428152}}
{'text': 'I am furious and disgusted', 'pred_labels': ['approval', 'curiosity'], 'pred_probs': {'approval': 0.05757634854339065, 'curiosity': 0.05382702505079021}}


In [29]:
# 12_zip_artifacts_for_download
import shutil, os
out_dir = "/kaggle/working/goemotions_artifacts"
os.makedirs(out_dir, exist_ok=True)

# copy relevant files
for f in [
    "/kaggle/working/goemotions_tfidf_pipeline2.joblib",
    "/kaggle/working/goemotions_clf.joblib",
    "/kaggle/working/goemotions_tfidf.joblib",
    "/kaggle/working/goemotions_per_label_thresholds.npy"
]:
    if os.path.exists(f):
        shutil.copy(f, out_dir)

!zip -r /kaggle/working/goemotions_artifacts.zip /kaggle/working/goemotions_artifacts
print("Zipped artifacts to /kaggle/working/goemotions_artifacts.zip (see Output tab)")


  adding: kaggle/working/goemotions_artifacts/ (stored 0%)
  adding: kaggle/working/goemotions_artifacts/goemotions_tfidf.joblib (deflated 77%)
  adding: kaggle/working/goemotions_artifacts/goemotions_clf.joblib

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


 (deflated 69%)
  adding: kaggle/working/goemotions_artifacts/goemotions_tfidf_pipeline2.joblib (deflated 70%)
  adding: kaggle/working/goemotions_artifacts/goemotions_per_label_thresholds.npy (deflated 74%)
Zipped artifacts to /kaggle/working/goemotions_artifacts.zip (see Output tab)


In [30]:
# 13_finalize_and_zip_output
import shutil, os, zipfile

OUTDIR = "/kaggle/working/goemotions_final"
os.makedirs(OUTDIR, exist_ok=True)

# copy your key files
for fname in [
    "goemotions_tfidf_baseline.joblib",
    "goemotions_tfidf_pipeline2.joblib",
    "goemotions_clf.joblib",
    "goemotions_tfidf.joblib",
    "goemotions_per_label_thresholds.npy"
]:
    src = f"/kaggle/working/{fname}"
    if os.path.exists(src):
        shutil.copy(src, OUTDIR)

# zip it
zip_path = "/kaggle/working/goemotions_final.zip"
with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as zf:
    for root, _, files in os.walk(OUTDIR):
        for f in files:
            full_path = os.path.join(root, f)
            zf.write(full_path, os.path.relpath(full_path, OUTDIR))

print(f"✅ Zipped everything to {zip_path}\nCheck the 'Output' sidebar → goemotions_final.zip → Download")


✅ Zipped everything to /kaggle/working/goemotions_final.zip
Check the 'Output' sidebar → goemotions_final.zip → Download
