In [None]:
from google.colab import drive
drive.mount('/content/drive')
!pip install transformers

Mounted at /content/drive


In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizer, RobertaModel
import pandas as pd
import random
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from collections import Counter
# Device string shared by later cells: the GPU when PyTorch can see one, otherwise the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"


In [None]:
# Load the FLAN-T5 checkpoint together with its tokenizer, then switch the
# model to inference mode on the GPU when available (CPU otherwise).
model_name = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
model.to("cuda" if torch.cuda.is_available() else "cpu")
model.eval()

# Closed set of topics the generator may answer with; 'others' is the catch-all bucket.
candidate_labels = [
    "race",
    "religion",
    "gender",
    "sexual orientation",
    "politics",
    "immigration",
    "others",
]

# === Generator ===
def generate_topics(
    text,
    num_return_sequences=20,
    top_k=50,
    top_p=0.95,
    temperature=1.4,
    max_length=20
):
    """Sample topic labels for ``text`` from the few-shot prompted seq2seq model.

    A fixed instruction plus five worked examples is prepended to ``text``; the
    module-level ``model`` then draws sampled continuations and each one is
    mapped onto an entry of the module-level ``candidate_labels``.

    Args:
        text: The text to classify; interpolated into the prompt as-is.
        num_return_sequences: Number of continuations requested from ``generate``.
        top_k: Top-k cut-off forwarded to ``generate``.
        top_p: Nucleus (top-p) cut-off forwarded to ``generate``.
        temperature: Sampling temperature forwarded to ``generate``.
        max_length: ``max_length`` forwarded to ``generate``.

    Returns:
        A list with one label per generated sequence. A continuation that
        contains none of the candidate labels is mapped to ``"others"``.
    """
    prompt_parts = [
        "Choose the most relevant topic for the following text. ",
        "Available topics: race, religion, gender, sexual orientation, politics, immigration, others.\n",
        "Use 'others' if none of the specific categories clearly apply.\n\n",
        "Text: 'The president signed a new healthcare bill.'\nTopic: politics\n",
        "Text: 'She attends church every Sunday.'\nTopic: religion\n",
        "Text: 'He identifies as non-binary and uses they/them pronouns.'\nTopic: gender\n",
        "Text: 'They were denied entry due to their visa status.'\nTopic: immigration\n",
        "Text: 'I love painting and long walks in the forest.'\nTopic: others\n",
        f"Text: {text}\nTopic:",
    ]
    prompt = "".join(prompt_parts)

    encoded_prompt = tokenizer(prompt, return_tensors="pt")
    input_ids = encoded_prompt.input_ids.to(model.device)

    sampling_options = {
        "do_sample": True,
        "top_k": top_k,
        "top_p": top_p,
        "temperature": temperature,
        "num_return_sequences": num_return_sequences,
        "max_length": max_length,
    }
    generated = model.generate(input_ids=input_ids, **sampling_options)

    mapped_labels = []
    for sequence in generated:
        answer = tokenizer.decode(sequence, skip_special_tokens=True).strip().lower()
        # The first candidate (in list order) found as a substring of the answer
        # wins; off-target generations fall back to "others".
        matched = next((label for label in candidate_labels if label in answer), "others")
        mapped_labels.append(matched)
    return mapped_labels

# === Distribution utility ===
def generate_topic_distribution(text, num_return_sequences=30):
    """Turn sampled topic labels for ``text`` into relative frequencies.

    Calls ``generate_topics`` for ``num_return_sequences`` samples and returns a
    dict mapping every entry of the module-level ``candidate_labels`` to the
    fraction of samples that carried that label (0 for labels never sampled).
    """
    tally = Counter(generate_topics(text, num_return_sequences=num_return_sequences))
    n_samples = sum(tally.values())

    distribution = {}
    for label in candidate_labels:
        distribution[label] = tally.get(label, 0) / n_samples
    return distribution


In [None]:
def estimate_p_star_a(
    csv_path,
    text_column='text',
    num_samples_per_text=30,
    candidate_labels=None,
    output_path=None,
    verbose=True
):
    """Estimate the marginal topic distribution P*(a) over the texts in a CSV file.

    Each non-missing text is passed to ``generate_topic_distribution``; the
    per-text distributions are summed and the sum is normalised.

    Args:
        csv_path: Path of the CSV file to read.
        text_column: Name of the column holding the texts.
        num_samples_per_text: Number of topic samples drawn per text.
        candidate_labels: Labels to report in the result. Defaults to the seven
            topics used by the generator. Note that the per-text distributions
            are produced by ``generate_topic_distribution``, which reads the
            module-level ``candidate_labels``; this argument only selects which
            labels appear in the returned table.
        output_path: When given, the resulting table is also written there as CSV.
        verbose: Print progress every 100 texts and a message after saving.

    Returns:
        A DataFrame with columns ``topic`` and ``P*(a)``.

    Raises:
        ValueError: If ``text_column`` holds no non-missing text, since there is
            nothing to estimate a distribution from.
    """
    if candidate_labels is None:
        candidate_labels = ["race", "religion", "gender", "sexual orientation", "politics", "immigration", "others"]

    # Load text data. Missing cells are read as NaN and would otherwise be sent
    # to the model as the literal string "nan", so they are dropped here.
    df = pd.read_csv(csv_path)
    texts = df[text_column].dropna().tolist()

    if not texts:
        raise ValueError(
            f"No texts found in column '{text_column}' of {csv_path}; cannot estimate P*(a)."
        )

    skipped = len(df) - len(texts)
    if verbose and skipped:
        print(f"Skipped {skipped} rows with missing text.")

    # Aggregate topic distributions (Counter.update adds the per-label fractions)
    total_counter = Counter()

    for i, text in enumerate(texts):
        dist = generate_topic_distribution(text, num_return_sequences=num_samples_per_text)
        total_counter.update(dist)
        if verbose and i % 100 == 0:
            print(f"Processed {i}/{len(texts)} rows...")

    # Normalize
    total = sum(total_counter.values())
    p_star_a = {label: total_counter.get(label, 0) / total for label in candidate_labels}

    # Save or return
    p_star_df = pd.DataFrame.from_dict(p_star_a, orient='index', columns=['P*(a)']).reset_index()
    p_star_df = p_star_df.rename(columns={'index': 'topic'})

    if output_path:
        p_star_df.to_csv(output_path, index=False)
        if verbose:
            print(f"Saved to {output_path}")

    return p_star_df


In [None]:
# Estimate the topic prior P*(a) on the Twitter test set (30 samples per text),
# persist it to Drive, and print the resulting table.
p_star_df = estimate_p_star_a(
    '/content/drive/MyDrive/test_twitter.csv',
    output_path='/content/drive/MyDrive/p_star_a_test_twitter_distribution.csv',
    num_samples_per_text=30,
)
print(p_star_df)


Processed 0/15475 rows...
Processed 100/15475 rows...
Processed 200/15475 rows...
Processed 300/15475 rows...
Processed 400/15475 rows...
Processed 500/15475 rows...
Processed 600/15475 rows...
Processed 700/15475 rows...
Processed 800/15475 rows...
Processed 900/15475 rows...
Processed 1000/15475 rows...
Processed 1100/15475 rows...
Processed 1200/15475 rows...
Processed 1300/15475 rows...
Processed 1400/15475 rows...
Processed 1500/15475 rows...
Processed 1600/15475 rows...
Processed 1700/15475 rows...
Processed 1800/15475 rows...
Processed 1900/15475 rows...
Processed 2000/15475 rows...
Processed 2100/15475 rows...
Processed 2200/15475 rows...
Processed 2300/15475 rows...
Processed 2400/15475 rows...
Processed 2500/15475 rows...
Processed 2600/15475 rows...
Processed 2700/15475 rows...
Processed 2800/15475 rows...
Processed 2900/15475 rows...
Processed 3000/15475 rows...
Processed 3100/15475 rows...
Processed 3200/15475 rows...
Processed 3300/15475 rows...
Processed 3400/15475 rows.

In [None]:
import torch
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from collections import Counter, defaultdict
import numpy as np
from tqdm import tqdm

# Load test set and P*(a).
# The prior P*(a) was estimated on test_twitter.csv and the predictions are
# written to test_twitter_causal_predictions.csv, so the evaluation data has to
# be that same test set rather than the training file.
df = pd.read_csv('/content/drive/MyDrive/test_twitter.csv')
p_star_df = pd.read_csv('/content/drive/MyDrive/p_star_a_test_twitter_distribution.csv')
p_star = dict(zip(p_star_df['topic'], p_star_df['P*(a)']))

# Fail early with a readable message if the evaluation file lacks the columns
# that the prediction loop reads.
missing_columns = {'text', 'hate_label'} - set(df.columns)
if missing_columns:
    raise KeyError(f"Evaluation CSV is missing required columns: {sorted(missing_columns)}")

# Load conditional model P(Y | x, a)
# NOTE: this rebinds `model`, `tokenizer` and `candidate_labels`, which
# generate_topics() reads as globals; re-run the FLAN-T5 cell before calling it again.
model_path = "/content/drive/MyDrive/hate_model_conditional"
model = AutoModelForSequenceClassification.from_pretrained(model_path).to("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained(model_path)
model.eval()

# Same candidate label list used in training
candidate_labels = list(p_star.keys())

# Compute P(Y | x, a) * P*(a) over topic samples
def compute_hate_probability(text, p_star, model, tokenizer, device):
    """Marginalise the topic-conditional classifier over the topic prior.

    Computes ``sum_a P(Y | x, a) * P*(a)`` by building one input
    ``"[TOPIC: a] text"`` per topic and scoring all of them in a single batched
    forward pass.

    Args:
        text: The text to score.
        p_star: Mapping from topic label to its prior weight P*(a).
        model: Callable classifier; ``model(**encoded)`` must return an object
            with a ``logits`` tensor of shape [num_inputs, num_classes].
        tokenizer: Callable tokenizer accepting a list of strings and returning
            an object with a ``.to(device)`` method usable as ``**kwargs``.
        device: Device the encoded inputs are moved to.

    Returns:
        A NumPy array with one weighted probability per class; the caller treats
        it as ``[P(not hate), P(hate)]``.

    Raises:
        ValueError: If ``p_star`` is empty, since there is nothing to sum over.
    """
    if not p_star:
        raise ValueError("p_star must contain at least one topic.")

    topics = list(p_star.keys())
    # Construct inputs: "[TOPIC: a_label] text", one per topic
    inputs = [f"[TOPIC: {a_label}] {text}" for a_label in topics]
    encoded = tokenizer(
        inputs, return_tensors="pt", truncation=True, padding='max_length', max_length=128
    ).to(device)

    with torch.no_grad():
        logits = model(**encoded).logits
        probs = torch.softmax(logits, dim=1)  # shape: [num_topics, num_classes]
        # Prior weights aligned with the rows of `probs`
        weights = torch.tensor(
            [float(p_star[a_label]) for a_label in topics],
            dtype=probs.dtype,
            device=probs.device,
        )
        total_prob = (probs * weights.unsqueeze(1)).sum(dim=0)  # shape: [num_classes]

    return total_prob.cpu().numpy()  # [P(not hate), P(hate)]

# Score every row with the topic-marginalised classifier and keep the outcome per row.
results = []
all_preds = []
all_labels = []

for _, row in tqdm(df.iterrows(), total=len(df)):
    text, true_label = row['text'], row['hate_label']

    probs = compute_hate_probability(text, p_star, model, tokenizer, device)
    # Index 1 is used as the hate-class probability; predict hate at 0.5 and above.
    prob_hate = float(probs[1])
    pred_label = 1 if prob_hate >= 0.5 else 0

    record = {
        "text": text,
        "true_label": true_label,
        "predicted_label": pred_label,
        "prob_hate": prob_hate
    }
    results.append(record)
    all_preds.append(pred_label)
    all_labels.append(true_label)

# Accuracy = fraction of rows whose prediction equals the gold label
matches = np.array(all_preds) == np.array(all_labels)
accuracy = np.mean(matches)
print(f"Transported model accuracy: {accuracy:.4f}")

# Persist the per-row predictions to Drive
results_df = pd.DataFrame(results)
results_df.to_csv('/content/drive/MyDrive/test_twitter_causal_predictions.csv', index=False)
print("Predictions saved to /content/drive/MyDrive/test_twitter_causal_predictions.csv")


100%|██████████| 15475/15475 [16:24<00:00, 15.72it/s]


Transported model accuracy: 0.9574
Predictions saved to /content/drive/MyDrive/test_twitter_causal_predictions.csv
