In [None]:

# Load your full data 
#df = ...

# Extract year-month (e.g., 2020-04) for grouping.
# The .dt accessor requires df["date"] to already be datetime-like.
df["year_month"] = df["date"].dt.to_period("M")

# Randomly select up to N_PER_MONTH paragraphs per month.
# DataFrame.sample raises ValueError when n exceeds the group size (sampling
# without replacement), so n is capped at the group length: months with fewer
# than N_PER_MONTH paragraphs contribute all of their rows instead of crashing.
N_PER_MONTH = 4
df_sampled = (
    df.groupby("year_month", group_keys=False)
      .apply(lambda x: x.sample(n=min(N_PER_MONTH, len(x)), random_state=42))
      .reset_index(drop=True)
)

# Add an empty label column for manual annotation
df_sampled["label"] = ""
df_sampled.to_csv("../data/processed/tr_speech_sample.csv", index=False, encoding="utf-8-sig")
df_sampled

In [None]:
# Function: Zero-shot Classification for migration threat  

import pandas as pd
from transformers import pipeline
from tqdm import tqdm

def f_zeroshot_threat_tr(df, text_col, model_name, labels, batch_size):
    """Score each text in ``df[text_col]`` against ``labels`` with a zero-shot classifier.

    A ``zero-shot-classification`` pipeline is created for ``model_name`` and
    the texts are fed to it in consecutive slices of ``batch_size`` items.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the texts. It is modified in place: a new column is added.
    text_col : str
        Name of the column in ``df`` that contains the texts to classify.
    model_name : str
        Model identifier passed to ``pipeline``. The part after the last ``/``,
        with ``-`` replaced by ``_``, becomes the suffix of the new column name.
    labels : list of str
        Candidate labels handed to the classifier.
    batch_size : int
        Number of texts sent to the classifier per call; must be >= 1.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with an added ``label_scores_<model_suffix>``
        column whose values are ``{label: score}`` dicts, one per row.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    """
    # Fail fast, before the model is loaded: a zero step makes range() raise and
    # a negative one silently yields no batches (then the column assignment
    # below fails with a confusing length mismatch).
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    classifier = pipeline('zero-shot-classification', model=model_name)
    texts = df[text_col].tolist()

    all_scores = []

    for start in tqdm(range(0, len(texts), batch_size), desc="Classifying texts in batches"):
        batch = texts[start:start + batch_size]
        results = classifier(batch, labels)
        # A one-element batch may come back as a bare dict rather than a list of
        # dicts (depends on the transformers version); normalise so the loop
        # below never iterates over dict keys.
        if isinstance(results, dict):
            results = [results]
        all_scores.extend(
            dict(zip(result['labels'], result['scores'])) for result in results
        )

    # Add label scores column
    model_suffix = model_name.split('/')[-1].replace('-', '_')
    model_scores_col = f"label_scores_{model_suffix}"
    df[model_scores_col] = all_scores

    return df

In [None]:
# Run the zero-shot migration-threat classifier on the sampled speeches

# Model checkpoints to score with; the loop below calls the classifier once per entry.
model_names = ["facebook/bart-large-mnli"]

# Candidate labels passed to the classifier.
# NOTE(review): "explicitely" is misspelled. It is left untouched here because the
# string is fed to the model and is also used as a key of the per-row score dicts;
# correcting it would change both. Confirm before fixing.
labels = [
    "explicitely threatening to send migrants",
    "implicitly threatening to send migrants",
    "criticizing based on migration policy",
    "criticizing based on migration",
    "cooperative migration discourse",
    "neutral or irrelevant",
]

# Number of texts handed to the classifier per call
batch_size = 32

for model in model_names:
    df_results = f_zeroshot_threat_tr(
        df=df_sampled,
        text_col="text",
        model_name=model,
        labels=labels,
        batch_size=batch_size,
    )

In [None]:
# Spread the label scores into separate columns (one column per label).
# Each cell of the scores column holds a {label: score} dict; building the frame
# straight from the list of dicts avoids the slow row-by-row .apply(pd.Series).
scores_col = "label_scores_bart_large_mnli"
scores_df = pd.DataFrame(df_sampled[scores_col].tolist(), index=df_sampled.index)

# Drop score columns left over from an earlier run of this cell, so re-running
# it replaces them instead of appending duplicate column names.
df_sampled = pd.concat(
    [df_sampled.drop(columns=scores_df.columns, errors="ignore"), scores_df],
    axis=1,
)
df_sampled.to_csv("../data/processed/tr_speech_sample_label2.csv", index=False, encoding="utf-8-sig")