# Imports

In [1]:
!pip install -q transformers accelerate torch pandas scikit-learn tqdm
import logging
import os, re, json, math, random
os.environ["TRANSFORMERS_VERBOSITY"] = "error"
from transformers.utils import logging as hf_logging
hf_logging.set_verbosity_error()                 # transformers’ own logger
logging.getLogger("transformers").setLevel(logging.ERROR)  # stdlib logging fallback
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, StoppingCriteria, StoppingCriteriaList
from tqdm.auto import tqdm
from collections import defaultdict, Counter

# CONFIG

In [None]:
# -----------------------
# CONFIG
# -----------------------
MODEL_ID = "meta-llama/Llama-3.1-8B-Instruct"
HF_TOKEN = "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"

from huggingface_hub import login
login(HF_TOKEN)

# If you already have your exact validation split in a DataFrame named `eval_df`, set this to True
USE_EXISTING_EVAL_DF = False

# If not using existing eval_df, point to your raw dataset CSV with columns: OriginalTweet, Sentiment
TRAIN_CSV = "Data/Corona_NLP_train.csv"
RANDOM_STATE = 42

# Inference params (deterministic)
MAX_NEW_TOKENS = 512
TOP_P = 1.0
BATCH_SIZE = 8  # tweets per batch for generation (tweak for your GPU)
K_PER_CLASS = 2   # few-shot examples per class in each batch
MAX_DEMO_LEN  = 200   # truncate long demos for brevity
MODEL_ID_R1   = "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
OUT_CSV_R1    = "Data/deepseek_zero_shot.csv"
OUT_CSV_R1_FS  = "Data/deepseek_few_shot.csv"
TEMP_SC       = 0.6      # sampling temperature for self-consistency
TOP_P_SC      = 0.95

CONF_THRESHOLD = 70        # adaptive SC: if greedy confidence >= 70, accept (B)
SC_SAMPLES     = 3         # self-consistency: number of samples per tweet

# Output
OUT_CSV_A = "Data/llama_templateA_predictions.csv"
OUT_CSV_B = "Data/llama_templateB_predictions.csv"

# Canonical labels and numeric mapping (for metrics only)
LABELS = ["Extremely Negative", "Negative", "Neutral", "Positive", "Extremely Positive"]
LABEL2ID = {lab: i for i, lab in enumerate(LABELS)}
LOWER2CANON = {lab.lower(): lab for lab in LABELS}

train_df = pd.read_csv(TRAIN_CSV, encoding="latin-1")

train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=RANDOM_STATE, stratify=train_df['Sentiment'])
val_df = val_df.reset_index(drop=True)

# Llama-3.1-8b-Instruct - Zero-Shot - Steup for running the LLM

In [None]:
SYSTEM_PROMPT = (
    "You are a careful annotator for COVID-19 tweet sentiment. "
    "Follow the instructions exactly and output valid JSON only."
)

USER_TEMPLATE = """Classify the sentiment of the tweet below into exactly one of:
["Extremely Negative","Negative","Neutral","Positive","Extremely Positive"].

Guidelines:
- Consider emojis, hashtags, intensifiers (e.g., “soooo”, ALL CAPS), punctuation, and negation.
- Sarcasm or jokes: infer the implied attitude where possible.
- If the tweet is ambiguous or mixed, choose "Neutral".
- OUTPUT FORMAT (JSON only):
{{"label": "<one of the five labels>", "confidence": <0-100 integer>}}

Tweet:
{tweet}
"""

## Load Model + Tokenizer + Pipeline

In [None]:
device = "cuda" if torch.cuda.is_available() else "cpu"
dtype  = torch.bfloat16 if device == "cuda" else torch.float32

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)
# Avoid padding warnings for Llama
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=dtype,
    device_map="auto" if device == "cuda" else None,
)

pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=model.dtype,
    device_map="auto" if device == "cuda" else None,
    return_full_text=False,
)

## Helpers

In [None]:
JSON_RE = re.compile(r"\{.*\}", re.DOTALL)

def build_prompts(tweets):
    prompts = []
    for tw in tweets:
        messages = [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user",   "content": USER_TEMPLATE.format(tweet=tw)},
        ]
        prompts.append(tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True))
    return prompts

def parse_json(text):
    if not text:
        return None, None
    try:
        obj = json.loads(text)
    except Exception:
        m = JSON_RE.search(text)
        if not m:
            return None, None
        try:
            obj = json.loads(m.group(0))
        except Exception:
            return None, None
    return obj.get("label"), obj.get("confidence")

def normalize_pred_label(label_raw):
    if not label_raw:
        return None
    return LOWER2CANON.get(str(label_raw).strip().lower(), None)

def clamp_confidence(c):
    try:
        x = int(c)
        return max(0, min(100, x))
    except Exception:
        return None

def generate_batch(tweets):
    prompts = build_prompts(tweets)
    outs = pipe(
        prompts,
        max_new_tokens=MAX_NEW_TOKENS,
        do_sample=False,
        eos_token_id=tokenizer.eos_token_id,
    )
    # pipeline returns list of [ { "generated_text": ... } ]
    return [o[0]["generated_text"] for o in outs]

## Inference on Validation

In [None]:
rows = []
num_batches = math.ceil(len(val_df) / BATCH_SIZE)

for start in tqdm(range(0, len(val_df), BATCH_SIZE),
                  total=num_batches,
                  desc="Evaluating Llama-3.1 on validation",
                  unit="batch"):
    batch = val_df.iloc[start:start+BATCH_SIZE]
    gens = generate_batch(batch["OriginalTweet"].tolist())
    for i, gen in enumerate(gens):
        gold = batch["Sentiment"].iloc[i]
        lbl_raw, conf_raw = parse_json(gen)
        pred = normalize_pred_label(lbl_raw)
        conf = clamp_confidence(conf_raw)
        rows.append({
            "OriginalTweet": batch["OriginalTweet"].iloc[i],
            "Sentiment": gold,
            "Prediction": pred,
            "Confidence": conf,
        })

pred_df = pd.DataFrame(rows)

In [None]:
valid = pred_df.dropna(subset=["Prediction"]).copy()
y_true = valid["Sentiment"].map(LABEL2ID).astype(int).to_numpy()
y_pred = valid["Prediction"].map(LABEL2ID).astype(int).to_numpy()

acc  = accuracy_score(y_true, y_pred)
f1_w = f1_score(y_true, y_pred, average="weighted")
f1_m = f1_score(y_true, y_pred, average="macro")

print(f"Validation size: {len(val_df)}")
print(f"Evaluated (valid predictions): {len(valid)}")
print(f"Accuracy:   {acc:.4f}")
print(f"F1 (weighted): {f1_w:.4f}")
print(f"F1 (macro):    {f1_m:.4f}")

In [None]:
pred_df.to_csv(OUT_CSV_A, index=False)
print(f"Saved: {OUT_CSV_A}")

# Llama-3.1-8b-Instruct - Zero-Shot - Results

In [3]:
pred_df = pd.read_csv(OUT_CSV_A)
valid = pred_df.dropna(subset=["Prediction"]).copy()
y_true = valid["Sentiment"].map(LABEL2ID).astype(int).to_numpy()
y_pred = valid["Prediction"].map(LABEL2ID).astype(int).to_numpy()

acc  = accuracy_score(y_true, y_pred)
f1_w = f1_score(y_true, y_pred, average="weighted")
f1_m = f1_score(y_true, y_pred, average="macro")

print(f"Validation size: {len(val_df)}")
print(f"Evaluated (valid predictions): {len(valid)}")
print(f"Accuracy:   {acc:.4f}")
print(f"F1 (weighted): {f1_w:.4f}")
print(f"F1 (macro):    {f1_m:.4f}")

Validation size: 8232
Evaluated (valid predictions): 8231
Accuracy:   0.3079
F1 (weighted): 0.2765
F1 (macro):    0.2597


# Llama-3.1-8b-Instruct - Few-Shot - Steup for running the LLM

In [None]:
device = "cuda" if torch.cuda.is_available() else "cpu"
dtype  = torch.bfloat16 if device == "cuda" else torch.float32

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)
tokenizer.padding_side = "left"                  # silence padding warning for decoder-only models
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=dtype,
    device_map="auto" if device == "cuda" else None,
)

pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=model.dtype,
    device_map="auto" if device == "cuda" else None,
    return_full_text=False,   # only the generated continuation
)

# ----------------------------
# Ensure canonical labels in splits (safe even if already canonical)
# (Uses your existing train_df, val_df created in CONFIG section)
# ----------------------------
train_df = train_df.copy()
val_df   = val_df.copy()
train_df["Sentiment"] = train_df["Sentiment"].astype(str).str.strip().str.lower().map(LOWER2CANON)
val_df["Sentiment"]   = val_df["Sentiment"].astype(str).str.strip().str.lower().map(LOWER2CANON)
train_df = train_df.dropna(subset=["OriginalTweet","Sentiment"]).reset_index(drop=True)
val_df   = val_df.dropna(subset=["OriginalTweet","Sentiment"]).reset_index(drop=True)

# ----------------------------
# Few-shot support bank FROM TRAIN ONLY (no leakage)
# ----------------------------
RNG = random.Random(RANDOM_STATE)

support_bank = defaultdict(list)
for _, row in train_df.iterrows():
    tw = str(row["OriginalTweet"]).strip()
    if not tw:
        continue
    if len(tw) > 200:                     # optional: keep examples concise
        tw = tw[:197] + "..."
    support_bank[row["Sentiment"]].append(tw)

# Optional: cap very large classes for faster sampling
for lab in LABELS:
    pool = support_bank.get(lab, [])
    if len(pool) > 5000:
        support_bank[lab] = RNG.sample(pool, 5000)

# ----------------------------
# Template B (paper-inspired) prompt pieces
# ----------------------------
SYSTEM_PROMPT = (
    "You are a careful annotator for COVID-19 tweet sentiment. "
    "Follow the instructions and output the JSON exactly as requested."
)

SHORT_TAG = {
    "Extremely Negative":"ExNeg",
    "Negative":"Neg",
    "Neutral":"Neu",
    "Positive":"Pos",
    "Extremely Positive":"ExPos",
}

# Double braces in JSON so .format() does not try to substitute them
USER_TEMPLATE_B = """{few_shots}
Now classify the tweet below into one label from:
["Extremely Negative","Negative","Neutral","Positive","Extremely Positive"].

Return JSON only: {{"label": "<one of the five labels>", "confidence": <0-100 integer>}}

Tweet:
{tweet}
"""

def build_few_shot_block(k_per_class=2, rng=None):
    rng = rng or RNG
    lines = ["Here are labeled examples:"]
    for lab in LABELS:
        pool = support_bank.get(lab, [])
        if not pool:
            continue
        k = min(k_per_class, len(pool))
        for s in rng.sample(pool, k):
            lines.append(f'[{SHORT_TAG[lab]}] "{s}" → "{lab}"')
    lines.append("")  # blank line
    return "\n".join(lines)

def build_prompts_few_shot(tweets, k_per_class=2, rng=None):
    few_shots = build_few_shot_block(k_per_class=k_per_class, rng=rng)
    prompts = []
    for tw in tweets:
        messages = [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user",   "content": USER_TEMPLATE_B.format(few_shots=few_shots, tweet=tw)},
        ]
        prompts.append(tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True))
    return prompts

# ----------------------------
# Parsing & post-processing helpers
# ----------------------------
JSON_RE = re.compile(r"\{.*\}", re.DOTALL)

def parse_json(text):
    if not text:
        return None, None
    try:
        obj = json.loads(text)
    except Exception:
        m = JSON_RE.search(text)
        if not m:
            return None, None
        try:
            obj = json.loads(m.group(0))
        except Exception:
            return None, None
    return obj.get("label"), obj.get("confidence")

def normalize_pred_label(label_raw):
    if not label_raw:
        return None
    return LOWER2CANON.get(str(label_raw).strip().lower(), None)

def clamp_confidence(c):
    try:
        x = int(c)
        return max(0, min(100, x))
    except Exception:
        return None

# ----------------------------
# Inference loop (few-shot per BATCH)
# ----------------------------
rows = []
num_batches = math.ceil(len(val_df) / BATCH_SIZE)

for start in tqdm(range(0, len(val_df), BATCH_SIZE),
                  total=num_batches,
                  desc="Evaluating Llama-3.1 (Template B) on validation",
                  unit="batch"):
    batch = val_df.iloc[start:start+BATCH_SIZE]
    prompts = build_prompts_few_shot(
        batch["OriginalTweet"].tolist(),
        k_per_class=K_PER_CLASS,
        rng=RNG,  # keep fixed for reproducibility; change per batch if you want diversity
    )
    outs = pipe(
        prompts,
        max_new_tokens=MAX_NEW_TOKENS,
        do_sample=False,                     # deterministic greedy decoding
        eos_token_id=tokenizer.eos_token_id,
    )
    gens = [o[0]["generated_text"] for o in outs]

    for i, gen in enumerate(gens):
        gold = batch["Sentiment"].iloc[i]
        lbl_raw, conf_raw = parse_json(gen)
        pred = normalize_pred_label(lbl_raw)
        conf = clamp_confidence(conf_raw)
        rows.append({
            "OriginalTweet": batch["OriginalTweet"].iloc[i],
            "Sentiment": gold,
            "Prediction": pred,
            "Confidence": conf,
        })

pred_df = pd.DataFrame(rows)

# ----------------------------
# Metrics (numeric mapping for calc only)
# ----------------------------
valid = pred_df.dropna(subset=["Prediction"]).copy()
y_true = valid["Sentiment"].map(LABEL2ID).astype(int).to_numpy()
y_pred = valid["Prediction"].map(LABEL2ID).astype(int).to_numpy()

acc  = accuracy_score(y_true, y_pred)
f1_w = f1_score(y_true, y_pred, average="weighted")
f1_m = f1_score(y_true, y_pred, average="macro")

print(f"Validation size:            {len(val_df)}")
print(f"Evaluated (valid preds):    {len(valid)}")
print(f"Accuracy:                   {acc:.4f}")
print(f"F1 (weighted):              {f1_w:.4f}")
print(f"F1 (macro):                 {f1_m:.4f}")

# ----------------------------
# Save CSV
# ----------------------------
pred_df.to_csv(OUT_CSV_B, index=False)
print(f"Saved: {OUT_CSV_B}")

# Llama-3.1-8b-Instruct - Few-Shot - Results

In [4]:
pred_df = pd.read_csv(OUT_CSV_B)
valid = pred_df.dropna(subset=["Prediction"]).copy()
y_true = valid["Sentiment"].map(LABEL2ID).astype(int).to_numpy()
y_pred = valid["Prediction"].map(LABEL2ID).astype(int).to_numpy()

acc  = accuracy_score(y_true, y_pred)
f1_w = f1_score(y_true, y_pred, average="weighted")
f1_m = f1_score(y_true, y_pred, average="macro")

print(f"Validation size: {len(val_df)}")
print(f"Evaluated (valid predictions): {len(valid)}")
print(f"Accuracy:   {acc:.4f}")
print(f"F1 (weighted): {f1_w:.4f}")
print(f"F1 (macro):    {f1_m:.4f}")

Validation size: 8232
Evaluated (valid predictions): 8232
Accuracy:   0.3173
F1 (weighted): 0.3019
F1 (macro):    0.2984


# Distilled-Deepseek - Zero-Shot - Setup for running the LLM

In [None]:
# ----------------------------
# Model / Tokenizer (direct generate for full control)
# ----------------------------
device = "cuda" if torch.cuda.is_available() else "cpu"
dtype  = torch.bfloat16 if device == "cuda" else torch.float32

tokenizer_r1 = AutoTokenizer.from_pretrained(MODEL_ID_R1, use_fast=True)
tokenizer_r1.padding_side = "left"  # decoder-only models prefer left-padding for generation
if tokenizer_r1.pad_token_id is None:
    tokenizer_r1.pad_token_id = tokenizer_r1.eos_token_id

model_r1 = AutoModelForCausalLM.from_pretrained(
    MODEL_ID_R1,
    torch_dtype=dtype,
    device_map="auto" if device == "cuda" else None,
)

# ----------------------------
# Zero-shot reasoning prompt (short rationale)  (C)
# Use double braces so .format() only fills {tweet}
# ----------------------------
USER_TEMPLATE_ZS = """You are classifying COVID-19 tweets into exactly one label:
["Extremely Negative","Negative","Neutral","Positive","Extremely Positive"].

Think out loud briefly (max 100 words). Then, as your final output, print EXACTLY ONE line:
FINAL_JSON: {{"label": "<one of the five labels>",
             "rationale": "<max 12 words>",
             "confidence": <0-100 integer>}}
- Do NOT use code fences.
- Do NOT add any text after the JSON line.

Tweet:
{tweet}
"""

# ----------------------------
# Early stop: halt immediately after first closing brace '}'  (A)
# ----------------------------
class StopOnJSONEnd(StoppingCriteria):
    def __init__(self, tokenizer, pattern="}"):
        self.pattern_ids = tokenizer(pattern, add_special_tokens=False).input_ids

    def __call__(self, input_ids, scores, **kwargs):
        # Simple suffix check
        seq = input_ids[0].tolist() if input_ids.dim() == 2 else input_ids.tolist()
        L = len(self.pattern_ids)
        if L == 0 or len(seq) < L:
            return False
        return seq[-L:] == self.pattern_ids

stopper = StoppingCriteriaList([StopOnJSONEnd(tokenizer_r1, pattern="}")])

# ----------------------------
# Helpers
# ----------------------------

def build_prompts_zero_shot(tweets):
    prompts = []
    for tw in tweets:
        messages = [{"role": "user", "content": USER_TEMPLATE_ZS.format(tweet=tw)}]
        prompts.append(
            tokenizer_r1.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        )
    return prompts

JSON_RE_GREEDY     = re.compile(r"\{.*\}", re.DOTALL)
JSON_RE_NONGREEDY  = re.compile(r"\{[^{}]*\}")
FINAL_JSON_ANCHOR  = re.compile(r"FINAL_JSON\s*:\s*(\{.*\})", re.IGNORECASE | re.DOTALL)

def parse_json_obj(text: str):
    """Prefer JSON after 'FINAL_JSON:', else take the last {...} block; else None."""
    if not text:
        return None
    # 1) Prefer explicit final JSON
    m = FINAL_JSON_ANCHOR.search(text)
    if m:
        cand = m.group(1).strip()
        try:
            return json.loads(cand)
        except Exception:
            pass
    # 2) Try last greedy {...}
    all_greedy = JSON_RE_GREEDY.findall(text)
    for cand in reversed(all_greedy):
        try:
            return json.loads(cand)
        except Exception:
            continue
    # 3) Try last simple {...}
    candidates = JSON_RE_NONGREEDY.findall(text)
    for cand in reversed(candidates):
        try:
            return json.loads(cand)
        except Exception:
            continue
    return None

def normalize_pred_label(raw):
    if not raw:
        return None
    return LOWER2CANON.get(str(raw).strip().lower(), None)

def clamp_confidence(c):
    try:
        return max(0, min(100, int(c)))
    except Exception:
        return None

def generate_once(prompts, do_sample=False, temperature=0.0, top_p=1.0):
    enc = tokenizer_r1(prompts, return_tensors="pt", padding=True, truncation=True)
    enc = {k: v.to(model_r1.device) for k, v in enc.items()}
    with torch.no_grad():
        out = model_r1.generate(
            **enc,
            max_new_tokens=MAX_NEW_TOKENS,
            do_sample=do_sample,
            temperature=temperature,
            top_p=top_p,
            eos_token_id=tokenizer_r1.eos_token_id,
        )
    prompt_len = enc["input_ids"].shape[1]
    gens = tokenizer_r1.batch_decode(out[:, prompt_len:], skip_special_tokens=True)
    return gens

def majority_vote(prompts, n_samples=SC_SAMPLES, temperature=TEMP_SC, top_p=TOP_P_SC):
    """
    For each prompt, sample n times and majority-vote on the label.
    Returns a list of tuples: (label, confidence, rationale, raw_text).
    """
    repeated = [p for p in prompts for _ in range(n_samples)]
    gens = generate_once(repeated, do_sample=True, temperature=temperature, top_p=top_p)

    results = []
    for i in range(0, len(gens), n_samples):
        group = gens[i:i+n_samples]
        parsed, labels = [], []
        for g in group:
            obj = parse_json_obj(g)
            if obj is None:
                parsed.append((None, None, None, g))
                continue
            lab = normalize_pred_label(obj.get("label"))
            conf = clamp_confidence(obj.get("confidence"))
            rat = obj.get("rationale")
            parsed.append((lab, conf, rat, g))
            labels.append(lab)

        counts = Counter([l for l in labels if l is not None])
        if counts:
            voted_label, _ = counts.most_common(1)[0]
            # choose among winners (highest confidence preferred)
            cands = [t for t in parsed if t[0] == voted_label]  # (lab, conf, rat, raw)
            cands.sort(key=lambda t: (t[1] is not None, t[1]), reverse=True)
            results.append(cands[0])
        else:
            # fallback: first valid sample if any, else keep raw of first gen
            fb = next((t for t in parsed if t[0] is not None), (None, None, None, group[0]))
            results.append(fb)
    return results  # (lab, conf, rat, raw)

# ----------------------------
# Inference (adaptive SC): greedy first, SC only if needed  (B)
# ----------------------------
rows = []
num_batches = math.ceil(len(val_df) / BATCH_SIZE)

for start in tqdm(range(0, len(val_df), BATCH_SIZE),
                  total=num_batches,
                  desc="DeepSeek-R1 (zero-shot, adaptive SC)",
                  unit="batch"):
    batch = val_df.iloc[start:start+BATCH_SIZE]
    prompts = build_prompts_zero_shot(batch["OriginalTweet"].tolist())

    # Pass 1: Greedy (fast, deterministic)
    greedy_outs = generate_once(prompts, do_sample=False)
    parsed_greedy = []
    need_sc_idx = []
    raw_used = greedy_outs[:]  # start with greedy raw responses

    for i, g in enumerate(greedy_outs):
        obj = parse_json_obj(g)
        if obj is None:
            parsed_greedy.append((None, None, None))
            need_sc_idx.append(i)
            continue
        lab = normalize_pred_label(obj.get("label"))
        conf = clamp_confidence(obj.get("confidence"))
        rat = obj.get("rationale")
        if lab is None or conf is None or conf < CONF_THRESHOLD:
            need_sc_idx.append(i)
        parsed_greedy.append((lab, conf, rat))

    # Pass 2: Self-consistency ONLY on low-confidence/invalid items
    if need_sc_idx:
        sc_prompts = [prompts[i] for i in need_sc_idx]
        sc_results = majority_vote(sc_prompts, n_samples=SC_SAMPLES)  # returns (lab, conf, rat, raw)
        for j, idx in enumerate(need_sc_idx):
            lab, conf, rat, raw = sc_results[j]
            parsed_greedy[idx] = (lab, conf, rat)
            raw_used[idx] = raw  # override with chosen SC raw output

    # Collect rows (now includes RawOutput)
    for i, (lab, conf, rat) in enumerate(parsed_greedy):
        rows.append({
            "OriginalTweet": batch["OriginalTweet"].iloc[i],
            "Sentiment":     batch["Sentiment"].iloc[i],
            "Prediction":    lab,
            "Confidence":    conf,
            "Rationale":     rat,
            "RawOutput":     raw_used[i],   # <-- full text kept here
        })

pred_df = pd.DataFrame(rows)

# ----------------------------
# Metrics (numeric mapping for calc only)
# ----------------------------
valid = pred_df.dropna(subset=["Prediction"]).copy()
y_true = valid["Sentiment"].map(LABEL2ID).astype(int).to_numpy()
y_pred = valid["Prediction"].map(LABEL2ID).astype(int).to_numpy()

acc  = accuracy_score(y_true, y_pred)
f1_w = f1_score(y_true, y_pred, average="weighted")
f1_m = f1_score(y_true, y_pred, average="macro")

print(f"Validation size:            {len(val_df)}")
print(f"Evaluated (valid preds):    {len(valid)}")
print(f"Accuracy:                   {acc:.4f}")
print(f"F1 (weighted):              {f1_w:.4f}")
print(f"F1 (macro):                 {f1_m:.4f}")

pred_df.to_csv(OUT_CSV_R1, index=False)
print(f"Saved: {OUT_CSV_R1}")

# Distilled-Deepseek Zero-Shot - Results

In [5]:
pred_df = pd.read_csv(OUT_CSV_R1)
valid = pred_df.dropna(subset=["Prediction"]).copy()
y_true = valid["Sentiment"].map(LABEL2ID).astype(int).to_numpy()
y_pred = valid["Prediction"].map(LABEL2ID).astype(int).to_numpy()

acc  = accuracy_score(y_true, y_pred)
f1_w = f1_score(y_true, y_pred, average="weighted")
f1_m = f1_score(y_true, y_pred, average="macro")

print(f"Validation size: {len(val_df)}")
print(f"Evaluated (valid predictions): {len(valid)}")
print(f"Accuracy:   {acc:.4f}")
print(f"F1 (weighted): {f1_w:.4f}")
print(f"F1 (macro):    {f1_m:.4f}")

Validation size: 8232
Evaluated (valid predictions): 8232
Accuracy:   0.3291
F1 (weighted): 0.2830
F1 (macro):    0.2507


# Distilled-Deepseek Few-Shot - Steup for running the LLM

In [None]:
# ----------------------------
# Model / Tokenizer
# ----------------------------
device = "cuda" if torch.cuda.is_available() else "cpu"
dtype  = torch.bfloat16 if device == "cuda" else torch.float32

tokenizer_r1 = AutoTokenizer.from_pretrained(MODEL_ID_R1, use_fast=True)
tokenizer_r1.padding_side = "left"  # decoder-only models prefer left padding
if tokenizer_r1.pad_token_id is None:
    tokenizer_r1.pad_token_id = tokenizer_r1.eos_token_id

model_r1 = AutoModelForCausalLM.from_pretrained(
    MODEL_ID_R1,
    torch_dtype=dtype,
    device_map="auto" if device == "cuda" else None,
)

# ----------------------------
# Few-shot prompt (double braces around JSON example for .format safety)
# ----------------------------
SHORT_TAG = {
    "Extremely Negative":"ExNeg",
    "Negative":"Neg",
    "Neutral":"Neu",
    "Positive":"Pos",
    "Extremely Positive":"ExPos",
}

USER_TEMPLATE_FS = """You are classifying COVID-19 tweets into exactly one label:
["Extremely Negative","Negative","Neutral","Positive","Extremely Positive"].

Here are labeled examples:
{few_shots}

Now classify the tweet below.

Think out loud briefly (max 100 words). Then, as your final output, print EXACTLY ONE line:
FINAL_JSON: {{"label": "<one of the five labels>",
             "rationale": "<max 12 words>",
             "confidence": <0-100 integer>}}
- Do NOT use code fences.
- Do NOT add any text after the JSON line.

Tweet:
{tweet}
"""

# ----------------------------
# Build few-shot support bank FROM TRAIN ONLY (no leakage)
# ----------------------------
RNG = random.Random(RANDOM_STATE)

train_df = train_df.copy()
val_df   = val_df.copy()
train_df["Sentiment"] = train_df["Sentiment"].astype(str).str.strip().str.lower().map(LOWER2CANON)
val_df["Sentiment"]   = val_df["Sentiment"].astype(str).str.strip().str.lower().map(LOWER2CANON)
train_df = train_df.dropna(subset=["OriginalTweet","Sentiment"]).reset_index(drop=True)
val_df   = val_df.dropna(subset=["OriginalTweet","Sentiment"]).reset_index(drop=True)

support_bank = defaultdict(list)
for _, row in train_df.iterrows():
    tw = str(row["OriginalTweet"]).strip()
    if not tw:
        continue
    if len(tw) > MAX_DEMO_LEN:
        tw = tw[:MAX_DEMO_LEN-3] + "..."
    support_bank[row["Sentiment"]].append(tw)

# (Optional) cap huge classes for faster sampling
for lab in LABELS:
    pool = support_bank.get(lab, [])
    if len(pool) > 5000:
        support_bank[lab] = RNG.sample(pool, 5000)

def build_few_shot_block(k_per_class):
    lines = []
    for lab in LABELS:
        pool = support_bank.get(lab, [])
        if not pool:
            continue
        k = min(k_per_class, len(pool))
        for s in RNG.sample(pool, k):
            lines.append(f'[{SHORT_TAG[lab]}] "{s}" → "{lab}"')
    return "\n".join(lines)

def build_prompts_few_shot(tweets, k_per_class):
    few_shots = build_few_shot_block(k_per_class)
    prompts = []
    for tw in tweets:
        messages = [{"role": "user",
                     "content": USER_TEMPLATE_FS.format(few_shots=few_shots, tweet=tw)}]
        prompts.append(
            tokenizer_r1.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        )
    return prompts

# ----------------------------
# JSON parsing (prefer FINAL_JSON anchor; soft repairs)
# ----------------------------
FINAL_JSON_ANCHOR  = re.compile(r"FINAL_JSON\s*:\s*(\{.*\})", re.IGNORECASE | re.DOTALL)
JSON_RE_GREEDY     = re.compile(r"\{.*\}", re.DOTALL)
JSON_RE_NONGREEDY  = re.compile(r"\{[^{}]*\}")

def _try_load(s):
    try:
        return json.loads(s)
    except Exception:
        return None

def _soft_repair(cand: str):
    t = cand.strip().strip("`")
    t = re.sub(r"\s*```$", "", t)
    t = re.sub(r",\s*}", "}", t)  # trailing comma before '}'
    if not t.endswith("}"):
        t = t + "}"
    return t

def parse_json_obj(text: str):
    if not text:
        return None
    # Prefer JSON after 'FINAL_JSON:'
    m = FINAL_JSON_ANCHOR.search(text)
    if m:
        cand = m.group(1).strip()
        obj = _try_load(cand) or _try_load(_soft_repair(cand))
        if obj is not None:
            return obj
    # Else: last {...} (greedy)
    cands = JSON_RE_GREEDY.findall(text)
    for cand in reversed(cands):
        obj = _try_load(cand) or _try_load(_soft_repair(cand))
        if obj is not None:
            return obj
    # Else: last simple {...}
    cands = JSON_RE_NONGREEDY.findall(text)
    for cand in reversed(cands):
        obj = _try_load(cand) or _try_load(_soft_repair(cand))
        if obj is not None:
            return obj
    return None

def normalize_pred_label(raw):
    if not raw:
        return None
    return LOWER2CANON.get(str(raw).strip().lower(), None)

def clamp_confidence(c):
    try:
        return max(0, min(100, int(c)))
    except Exception:
        return None

# ----------------------------
# Generation helpers
# ----------------------------
def generate_once(prompts, do_sample=False, temperature=0.0, top_p=1.0):
    enc = tokenizer_r1(prompts, return_tensors="pt", padding=True, truncation=True)
    enc = {k: v.to(model_r1.device) for k, v in enc.items()}
    with torch.no_grad():
        out = model_r1.generate(
            **enc,
            max_new_tokens=MAX_NEW_TOKENS,
            do_sample=do_sample,
            temperature=temperature,
            top_p=top_p,
            eos_token_id=tokenizer_r1.eos_token_id,
        )
    prompt_len = enc["input_ids"].shape[1]
    gens = tokenizer_r1.batch_decode(out[:, prompt_len:], skip_special_tokens=True)
    return gens

def majority_vote(prompts, n_samples=SC_SAMPLES, temperature=TEMP_SC, top_p=TOP_P_SC):
    """
    For each prompt, sample n times and majority-vote on the label.
    Returns a list of tuples: (label, confidence, rationale, raw_text).
    """
    repeated = [p for p in prompts for _ in range(n_samples)]
    gens = generate_once(repeated, do_sample=True, temperature=temperature, top_p=top_p)

    results = []
    for i in range(0, len(gens), n_samples):
        group = gens[i:i+n_samples]
        parsed, labels = [], []
        for g in group:
            obj = parse_json_obj(g)
            if obj is None:
                parsed.append((None, None, None, g))
                continue
            lab = normalize_pred_label(obj.get("label"))
            conf = clamp_confidence(obj.get("confidence"))
            rat = obj.get("rationale")
            parsed.append((lab, conf, rat, g))
            labels.append(lab)

        counts = Counter([l for l in labels if l is not None])
        if counts:
            voted_label, _ = counts.most_common(1)[0]
            # choose among winners (highest confidence preferred)
            cands = [t for t in parsed if t[0] == voted_label]  # (lab, conf, rat, raw)
            cands.sort(key=lambda t: (t[1] is not None, t[1]), reverse=True)
            results.append(cands[0])
        else:
            # fallback: first valid sample if any, else keep raw of first gen
            fb = next((t for t in parsed if t[0] is not None), (None, None, None, group[0]))
            results.append(fb)
    return results  # (lab, conf, rat, raw)

# ----------------------------
# Inference (few-shot + adaptive SC) with RawOutput logging
# ----------------------------
rows = []
num_batches = math.ceil(len(val_df) / BATCH_SIZE)

for start in tqdm(range(0, len(val_df), BATCH_SIZE),
                  total=num_batches,
                  desc="DeepSeek-R1 (few-shot, adaptive SC)",
                  unit="batch"):
    batch = val_df.iloc[start:start+BATCH_SIZE]
    prompts = build_prompts_few_shot(
        batch["OriginalTweet"].tolist(),
        k_per_class=K_PER_CLASS
    )

    # Pass 1: Greedy
    greedy_outs = generate_once(prompts, do_sample=False)
    parsed_greedy = []
    need_sc_idx = []
    raw_used = greedy_outs[:]  # start with greedy raw responses

    for i, g in enumerate(greedy_outs):
        obj = parse_json_obj(g)
        if obj is None:
            parsed_greedy.append((None, None, None))
            need_sc_idx.append(i)
            continue
        lab = normalize_pred_label(obj.get("label"))
        conf = clamp_confidence(obj.get("confidence"))
        rat = obj.get("rationale")
        if lab is None or conf is None or conf < CONF_THRESHOLD:
            need_sc_idx.append(i)
        parsed_greedy.append((lab, conf, rat))

    # Pass 2: Self-consistency ONLY where needed
    if need_sc_idx:
        sc_prompts = [prompts[i] for i in need_sc_idx]
        sc_results = majority_vote(sc_prompts, n_samples=SC_SAMPLES)  # (lab, conf, rat, raw)
        for j, idx in enumerate(need_sc_idx):
            lab, conf, rat, raw = sc_results[j]
            parsed_greedy[idx] = (lab, conf, rat)
            raw_used[idx] = raw  # override with SC raw

    # Collect rows (with RawOutput)
    for i, (lab, conf, rat) in enumerate(parsed_greedy):
        rows.append({
            "OriginalTweet": batch["OriginalTweet"].iloc[i],
            "Sentiment":     batch["Sentiment"].iloc[i],  # gold
            "Prediction":    lab,
            "Confidence":    conf,
            "Rationale":     rat,
            "RawOutput":     raw_used[i],
        })

pred_df = pd.DataFrame(rows)

# ----------------------------
# Metrics
# ----------------------------
valid = pred_df.dropna(subset=["Prediction"]).copy()
y_true = valid["Sentiment"].map(LABEL2ID).astype(int).to_numpy()
y_pred = valid["Prediction"].map(LABEL2ID).astype(int).to_numpy()

acc  = accuracy_score(y_true, y_pred)
f1_w = f1_score(y_true, y_pred, average="weighted")
f1_m = f1_score(y_true, y_pred, average="macro")

print(f"Validation size:            {len(val_df)}")
print(f"Evaluated (valid preds):    {len(valid)}")
print(f"Accuracy:                   {acc:.4f}")
print(f"F1 (weighted):              {f1_w:.4f}")
print(f"F1 (macro):                 {f1_m:.4f}")

# Save
pred_df.to_csv(OUT_CSV_R1_FS, index=False)
print(f"Saved: {OUT_CSV_R1_FS}")

# Distilled-Deepseek Few-Shot - Results

In [6]:
pred_df = pd.read_csv(OUT_CSV_R1_FS)
valid = pred_df.dropna(subset=["Prediction"]).copy()
y_true = valid["Sentiment"].map(LABEL2ID).astype(int).to_numpy()
y_pred = valid["Prediction"].map(LABEL2ID).astype(int).to_numpy()

acc  = accuracy_score(y_true, y_pred)
f1_w = f1_score(y_true, y_pred, average="weighted")
f1_m = f1_score(y_true, y_pred, average="macro")

print(f"Validation size: {len(val_df)}")
print(f"Evaluated (valid predictions): {len(valid)}")
print(f"Accuracy:   {acc:.4f}")
print(f"F1 (weighted): {f1_w:.4f}")
print(f"F1 (macro):    {f1_m:.4f}")

Validation size: 8232
Evaluated (valid predictions): 8232
Accuracy:   0.3213
F1 (weighted): 0.2763
F1 (macro):    0.2410
