# **Sentiment Analysis on Bluesky Social and Politisky Datasets**

In [None]:
import pandas as pd
import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import random
import gzip, json, pandas as pd
import re
from transformers import pipeline



In [None]:
# Seed Python's built-in `random` module so any use of it is repeatable.
# The DataFrame sampling at the end of the notebook passes its own random_state.
random.seed(42)


In [None]:
# Root folder for inputs: the feed_posts/*.jsonl.gz files and the Politisky parquet are read from here.
INPUT_DIR = 'datasets'
# Root folder for outputs: the annotated CSVs and the human-annotation samples are written below it.
OUTPUT_DIR = 'annotated_datasets'

In [None]:
def load_jsonl_gz_fixed(path):
    """Load a gzip-compressed JSON Lines file into a DataFrame.

    Each non-blank line is parsed as one JSON document and becomes one row.
    Blank or whitespace-only lines (for example a trailing empty line at the
    end of the file) are skipped instead of making ``json.loads`` fail.

    If the resulting frame has a ``date`` column, it is converted to
    datetimes using the fixed ``%Y%m%d%H%M`` layout; values that do not match
    that layout become ``NaT`` (``errors="coerce"``).

    Parameters
    ----------
    path : str or path-like
        Location of the ``.jsonl.gz`` file.

    Returns
    -------
    pandas.DataFrame
        One row per JSON line, in file order.
    """
    rows = []
    with gzip.open(path, "rt", encoding="utf-8") as f:
        for line in f:
            # An empty line is not valid JSON; ignore it rather than abort the load.
            if not line.strip():
                continue
            rows.append(json.loads(line))

    df = pd.DataFrame(rows)

    if "date" in df.columns:
        # Cast to str first so numeric dates are parsed by the same format string.
        df["date"] = pd.to_datetime(
            df["date"].astype(str),
            format="%Y%m%d%H%M",
            errors="coerce"
        )

    return df

In [None]:
dfs = []

# os.listdir returns entries in arbitrary order. Iterating in sorted order keeps
# the row order of df_all stable between runs and machines; the later
# drop_duplicates and the seeded .sample() calls both depend on that order.
for file in sorted(os.listdir(f"{INPUT_DIR}/feed_posts")):
    if file.endswith(".jsonl.gz"):
        tmp = load_jsonl_gz_fixed(f"{INPUT_DIR}/feed_posts/{file}")
        # Tag every row with the feed it came from (file name without the extension).
        tmp["feed"] = file.replace(".jsonl.gz", "")
        dfs.append(tmp)

# Stack all feeds into one frame with a fresh 0..n-1 index.
df_all = pd.concat(dfs, ignore_index=True)

  df_all = pd.concat(dfs, ignore_index=True)


In [None]:
# Keep only posts whose `langs` value is a list that contains the code 'eng'.
df_all = df_all[
    df_all["langs"].apply(
        lambda langs: isinstance(langs, list) and "eng" in langs
    )
]


In [None]:
# Load the Politisky stance dataset from the parquet file under INPUT_DIR.
df_politisky = pd.read_parquet(f"{INPUT_DIR}/llm_annotated_full_user_stance_dataset.parquet")


`Context` is the text column of the Politisky dataset. ***context_to_text*** normalizes it into a single plain string: array and list values are joined with spaces, and missing values become an empty string.



In [None]:
def context_to_text(x):
    """Flatten one ``Context`` cell into a plain string.

    NumPy arrays and lists have each item converted with ``str`` and joined by
    single spaces. Any other value becomes ``""`` when ``pd.isna`` reports it
    as missing, and ``str(x)`` otherwise.
    """
    # Sequences are handled first: pd.isna on an array or list is element-wise,
    # not a single truth value.
    if isinstance(x, (np.ndarray, list)):
        return " ".join(map(str, x))
    return "" if pd.isna(x) else str(x)
# Overwrite the Context column with its flattened string form.
df_politisky["Context"] = df_politisky["Context"].apply(context_to_text)


In [None]:
# Show the column names of both frames before mapping them onto a shared schema.
print(df_politisky.columns)
print(df_all.columns)

Index(['UserId', 'TargetEntity', 'Context', 'LLMAnswerContent',
       'LLMAnswerUsage', 'SourcePosts', 'Spans', 'Reason', 'StanceLabel',
       'ConfidenceLevel'],
      dtype='object')
Index(['post_id', 'user_id', 'instance', 'date', 'text', 'langs', 'like_count',
       'reply_count', 'repost_count', 'reply_to', 'replied_author',
       'thread_root', 'thread_root_author', 'quotes', 'quoted_author',
       'labels', 'feed'],
      dtype='object')


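Both datasets are reduced to the fields used downstream (`id`, `text`, `tags_or_target`, plus a `source` marker), and rows whose text is missing or not longer than 10 characters are dropped.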


In [None]:
# Bluesky posts mapped onto the shared schema: id, text, tags_or_target, source.
df_bluesky_unified = (
    df_all
    .rename(columns={"post_id": "id", "labels": "tags_or_target"})
    .loc[:, ["id", "text", "tags_or_target"]]
    .copy()
)
df_bluesky_unified["source"] = "bluesky"

# Filtered view of the Bluesky posts: text present and longer than 10 characters.
df_bluesky_social_unified = df_bluesky_unified[
    df_bluesky_unified["text"].notna()
    & df_bluesky_unified["text"].str.len().gt(10)
]

# Politisky rows mapped onto the same schema (UserId -> id, Context -> text).
df_politisky_unified = (
    df_politisky
    .rename(columns={
        "UserId": "id",
        "TargetEntity": "tags_or_target",
        "Context": "text",
    })
    .loc[:, ["id", "text", "tags_or_target"]]
    .copy()
)
df_politisky_unified["source"] = "politisky"

# Same text filter as for Bluesky, applied in place of the unfiltered frame.
df_politisky_unified = df_politisky_unified[
    df_politisky_unified["text"].notna()
    & df_politisky_unified["text"].str.len().gt(10)
]



In [None]:
# Row count of the unfiltered Bluesky frame; the text-length filter result lives in df_bluesky_social_unified.
len(df_bluesky_unified)

127556

In [None]:
# Display the filtered Bluesky frame.
df_bluesky_social_unified

Unnamed: 0,id,text,tags_or_target,source
0,2238814,üìç ‚ôøÔ∏è #Disability feed pinned post üìç\n\nAny pos...,,bluesky
1,165204275,"I got curious about this guy, and here is wher...",,bluesky
3,57985819,Hi friends. I don't know how much I'll be arou...,,bluesky
4,39046686,The Ministry of Disabled People is limiting th...,,bluesky
5,39046719,This is the surprise disability support fundin...,,bluesky
...,...,...,...,...
168457,39918937,Have always thought a benefit of studying earl...,,bluesky
168459,82760110,So‚Ä¶ #OnThisDay in #Chartism #C19th üóÉÔ∏è \nAfter ...,,bluesky
168460,32310989,"Gorgeous decorations! ""Shelving books with th...",,bluesky
168461,55090528,Wednesday at Wofford. @@77112 will discuss mat...,,bluesky


In [None]:

def preprocess_text(text):
    """Normalize a post for the sentiment models.

    Lower-cases the text, then removes URLs (``http...``), ``@mentions`` and
    ``#hashtags`` in that order, collapses every whitespace run to a single
    space and trims both ends.
    """
    cleaned = text.lower()
    # Order matters only in that all three removals happen before whitespace is collapsed.
    for pattern in (r"http\S+", r"@\w+", r"#\w+"):
        cleaned = re.sub(pattern, "", cleaned)
    return re.sub(r"\s+", " ", cleaned).strip()


In [None]:
# Drop rows whose text is missing or consists only of whitespace.
df_bluesky_social_unified = df_bluesky_social_unified[
    df_bluesky_social_unified["text"].notna()
    & (df_bluesky_social_unified["text"].astype(str).str.strip() != "")
]

# Same rule for the Politisky rows.
df_politisky_unified = df_politisky_unified[
    df_politisky_unified["text"].notna()
    & (df_politisky_unified["text"].astype(str).str.strip() != "")
]


In [None]:
# Keep the first row for every distinct text value in each dataset.
df_bluesky_social_unified = df_bluesky_social_unified.drop_duplicates(
    subset=["text"], keep="first"
)
df_politisky_unified = df_politisky_unified.drop_duplicates(
    subset=["text"], keep="first"
)


In [None]:
# Keep texts of at least 10 characters.
# .copy() detaches each result from the frame it was sliced from: the following
# cells add new columns (clean_text, sent_bert, sent_zeroshot, zs_conf) to these
# frames, and assigning to a boolean-indexed slice can raise
# SettingWithCopyWarning in pandas versions without copy-on-write.
df_bluesky_social_unified = df_bluesky_social_unified[
    df_bluesky_social_unified["text"].str.len() >= 10
].copy()

df_politisky_unified = df_politisky_unified[
    df_politisky_unified["text"].str.len() >= 10
].copy()

In [None]:
# Add a clean_text column holding the preprocess_text() result for every row's text.
df_bluesky_social_unified["clean_text"] = df_bluesky_social_unified["text"].apply(preprocess_text)
df_politisky_unified["clean_text"] = df_politisky_unified["text"].apply(preprocess_text)


In [None]:

# Sentiment pipeline backed by the DistilBERT checkpoint fine-tuned on SST-2;
# predict_bert() maps its "POSITIVE" label to "positive" and anything else to "negative".
bert_sentiment = pipeline(
    "sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english"
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Device set to use cpu


In [None]:
from tqdm import tqdm
# Enables the .progress_apply() calls used for the BERT predictions below;
# the `tqdm` name itself is also used by batched_zeroshot_with_conf().
tqdm.pandas()

def predict_bert(text):
    """Return ``"positive"`` or ``"negative"`` for ``text`` using ``bert_sentiment``.

    Parameters
    ----------
    text : str
        Text to classify. Only its first 512 characters are sent to the model.

    Returns
    -------
    str
        ``"positive"`` when the pipeline's label is ``"POSITIVE"``, otherwise
        ``"negative"``.
    """
    # The slice limits characters, not tokens. A text that tokenizes to roughly
    # one token per character can still exceed the model's maximum sequence
    # length once special tokens are added, so the tokenizer is also asked to
    # truncate.
    out = bert_sentiment(text[:512], truncation=True)[0]["label"]
    return "positive" if out == "POSITIVE" else "negative"

# Label every cleaned Bluesky post with predict_bert(), one text per call, with a progress bar.
df_bluesky_social_unified["sent_bert"] = df_bluesky_social_unified["clean_text"].progress_apply(predict_bert)


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 125590/125590 [09:48<00:00, 213.58it/s]


In [None]:
# Label every cleaned Politisky text with predict_bert(), one text per call, with a progress bar.
df_politisky_unified["sent_bert"] = df_politisky_unified["clean_text"].progress_apply(predict_bert)


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 15964/15964 [48:40<00:00,  5.47it/s]


Correct Zeroshot

In [7]:
# `pipeline` is already imported in the first cell; the repeated import lets this cell run on its own.
from transformers import pipeline

# Zero-shot classifier used by batched_zeroshot_with_conf() with the
# candidate labels positive / neutral / negative.
zero_shot = pipeline(
    "zero-shot-classification",
    model="facebook/bart-large-mnli"
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Device set to use cuda:0


In [None]:
# Collect the non-blank cleaned Bluesky texts (cut to 512 characters) together
# with their row labels, so predictions can be written back to the right rows.
valid_texts = []
valid_indices = []

for idx, text in df_bluesky_social_unified["clean_text"].items():
    if not isinstance(text, str) or not text.strip():
        continue
    valid_indices.append(idx)
    valid_texts.append(text[:512])


In [5]:
# Collect the non-blank cleaned Politisky texts (cut to 512 characters) together
# with their row labels, so predictions can be written back to the right rows.
valid_texts_politisky = []
valid_indices_politisky = []

for idx, text in df_politisky_unified["clean_text"].items():
    if not isinstance(text, str) or not text.strip():
        continue
    valid_indices_politisky.append(idx)
    valid_texts_politisky.append(text[:512])

In [6]:
def batched_zeroshot_with_conf(
    texts,
    batch_size=8,
    labels=("positive", "neutral", "negative")
):
    """Classify ``texts`` with the ``zero_shot`` pipeline, ``batch_size`` at a time.

    Parameters
    ----------
    texts : list of str
        Inputs to classify; consecutive slices are passed to ``zero_shot``.
    batch_size : int, default 8
        Number of texts handed to ``zero_shot`` per call. Must be positive.
    labels : sequence of str
        Candidate labels forwarded as ``candidate_labels``.

    Returns
    -------
    tuple of (list, list)
        ``(preds, confs)``, aligned with ``texts``. ``preds[i]`` is the first
        entry of the result's ``"labels"`` and ``confs[i]`` the first entry of
        its ``"scores"`` as a float (presumably the best-scoring label; confirm
        against the pipeline documentation). Both are ``None`` for every text
        of a batch whose pipeline call raised.

    Raises
    ------
    ValueError
        If ``batch_size`` is not positive. A negative step would make the
        range empty and silently return no predictions at all.
    """
    import warnings  # local import: the notebook's import cell does not provide it

    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size!r}")

    candidate_labels = list(labels)
    preds = []
    confs = []

    for i in tqdm(range(0, len(texts), batch_size)):
        batch = texts[i:i + batch_size]

        try:
            results = zero_shot(
                batch,
                candidate_labels=candidate_labels
            )
        except Exception as e:
            # Best effort: keep going, but say which slice was lost and why
            # instead of dropping the error silently.
            warnings.warn(
                f"zero-shot classification failed for texts[{i}:{i + len(batch)}] "
                f"({type(e).__name__}: {e}); storing None for this batch",
                RuntimeWarning,
                stacklevel=2,
            )
            preds.extend([None] * len(batch))
            confs.extend([None] * len(batch))
            continue

        # A single result may come back as a bare dict rather than a list.
        if isinstance(results, dict):
            results = [results]

        for r in results:
            preds.append(r["labels"][0])
            confs.append(float(r["scores"][0]))

    return preds, confs


In [None]:
# Pre-create both result columns so rows without a prediction hold None.
df_bluesky_social_unified["sent_zeroshot"] = None
df_bluesky_social_unified["zs_conf"] = None

# Classify only the non-blank texts collected in valid_texts.
preds, confs = batched_zeroshot_with_conf(
    valid_texts,
    batch_size=8
)

# valid_indices[i] is the row label of valid_texts[i]; write each result back to its row.
for idx, p, c in zip(valid_indices, preds, confs):
    df_bluesky_social_unified.at[idx, "sent_zeroshot"] = p
    df_bluesky_social_unified.at[idx, "zs_conf"] = c


  0%|          | 10/15684 [00:07<2:34:18,  1.69it/s]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 15684/15684 [2:22:13<00:00,  1.84it/s]


In [8]:
# Pre-create both result columns, as the Bluesky cell does, so they exist even
# when there are no valid texts and rows without a prediction hold None.
df_politisky_unified["sent_zeroshot"] = None
df_politisky_unified["zs_conf"] = None

# Classify only the non-blank texts collected in valid_texts_politisky.
preds_p, confs_p = batched_zeroshot_with_conf(
    valid_texts_politisky,
    batch_size=8
)

# valid_indices_politisky[i] is the row label of valid_texts_politisky[i].
for idx, p, c in zip(valid_indices_politisky, preds_p, confs_p):
    df_politisky_unified.at[idx, "sent_zeroshot"] = p
    df_politisky_unified.at[idx, "zs_conf"] = c


  1%|          | 10/1995 [00:10<30:51,  1.07it/s]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1995/1995 [32:34<00:00,  1.02it/s]


In [None]:
# DataFrame.to_csv does not create missing folders, so make sure the target exists first.
os.makedirs(f"{OUTPUT_DIR}/llm", exist_ok=True)

# reset_index() turns the original row labels into a regular column of the CSV.
df_bluesky_social_unified.reset_index().to_csv(
    f"{OUTPUT_DIR}/llm/df_social_annotated.csv",
    index=False
)

In [None]:
# DataFrame.to_csv does not create missing folders, so make sure the target exists first.
os.makedirs(f"{OUTPUT_DIR}/llm", exist_ok=True)

# reset_index() turns the original row labels into a regular column of the CSV.
df_politisky_unified.reset_index().to_csv(
    f"{OUTPUT_DIR}/llm/df_politisky_annotated.csv",
    index=False
)

A random sample of 200 rows is saved from each dataset so that humans can annotate it; the scores are then calculated against those annotations.

In [None]:
# Fixed random_state so the same 200 Bluesky rows are drawn on every run.
df_sample_200 = df_bluesky_social_unified.sample(n=200, random_state=42)

# DataFrame.to_csv does not create missing folders, so make sure the target exists first.
os.makedirs(f"{OUTPUT_DIR}/human", exist_ok=True)

df_sample_200.reset_index().to_csv(
    f"{OUTPUT_DIR}/human/social_sentiment_test_sample_200.csv",
    index=False
)


In [None]:
# Fixed random_state so the same 200 Politisky rows are drawn on every run.
df_sample_200_p = df_politisky_unified.sample(n=200, random_state=42)

# DataFrame.to_csv does not create missing folders, so make sure the target exists first.
os.makedirs(f"{OUTPUT_DIR}/human", exist_ok=True)

df_sample_200_p.reset_index().to_csv(
    f"{OUTPUT_DIR}/human/politisky_sentiment_test_sample_200.csv",
    index=False
)