In [57]:
import os, time, random, re, json, requests
from bs4 import BeautifulSoup
import pandas as pd
import matplotlib.pyplot as plt

DATA_DIR = 'data/lab'
RAW_API_DIR = os.path.join(DATA_DIR, 'raw_api')
RAW_HTML_DIR = os.path.join(DATA_DIR, 'raw_html')
for d in [DATA_DIR, RAW_API_DIR, RAW_HTML_DIR]:
    os.makedirs(d, exist_ok=True)

HEADERS = {"User-Agent": "MADS-LLM-2026-Project (+https://example.edu)"}

def sleep_politely(sleep_range = (0.2,0.5)):
    SLEEP_RANGE = sleep_range
    time.sleep(random.uniform(*SLEEP_RANGE))

In [58]:
# Authentication using app password. OpenAI. (2023). ChatGPT (Mar 14 version) [Large language model]. https://chat.openai.com/chat. In-text citations are simply (OpenAI, 2023) or OpenAI (2023).
pds = "https://bsky.social/xrpc/com.atproto.server.createSession"
payload = {"identifier": "celinavelazquez.bsky.social", "password": "nv22-lumi-pofs-wui7"}

resp = requests.post(pds, json=payload, timeout=60)
resp.raise_for_status()
session = resp.json()
access_jwt = session["accessJwt"]

HEADERS = {"Authorization": f"Bearer {access_jwt}"}

# Refresh token
def refresh_token():
    global access_jwt, refresh_jwt, HEADERS
    r = requests.post(
        "https://bsky.social/xrpc/com.atproto.server.refreshSession",
        headers={"Authorization": f"Bearer {refresh_jwt}"},
        timeout=60
    )
    r.raise_for_status()
    session = r.json()
    access_jwt = session["accessJwt"]
    refresh_jwt = session.get("refreshJwt", refresh_jwt)
    HEADERS = {"Authorization": f"Bearer {access_jwt}"}


In [None]:
# Get posts with the search term "politics"
TOPIC = "politics"
BASE = "https://bsky.social/xrpc/app.bsky.feed.searchPosts"
params = {"q": TOPIC, "limit":100, "since":"2026-01-01T00:00:00Z", "lang":"en"}

# Pagination. OpenAI. (2023). ChatGPT (Mar 14 version) [Large language model]. https://chat.openai.com/chat. In-text citations are simply (OpenAI, 2023) or OpenAI (2023).
# Pagination State
all_posts = []
cursor = None

#Pagination Loop
while True:
    page_params = params.copy()
    if cursor:
        page_params["cursor"] = cursor

    r = requests.get(BASE, params=page_params, headers=HEADERS, timeout=60)
    print(r.status_code, r.text[:200])
    r.raise_for_status()

    data = r.json()
    posts = data.get('posts', [])
    all_posts.extend(posts)

    cursor = data.get("cursor")
    if not cursor or not posts:
        break

sleep_politely()

len(all_posts)

200 {"posts":[{"uri":"at://did:plc:6c2cg6rhjybke42mkfuensec/app.bsky.feed.post/3mdrsoyg32k2r","cid":"bafyreid2n5nkzuyglljghugnldle2e3h23k7hm3dm3wsuzorw2zqctkkhq","author":{"did":"did:plc:6c2cg6rhjybke42mk
200 {"posts":[{"uri":"at://did:plc:7atfw5iz77vwsq5gp3l2itml/app.bsky.feed.post/3mdrq7qcaq22q","cid":"bafyreia5lhoscpuv7qmkvt7gdk6g26vwaptsfcgu46wuoq2uicb7ksc3iu","author":{"did":"did:plc:7atfw5iz77vwsq5gp
200 {"posts":[{"uri":"at://did:plc:nb6uhiglzowsdcbgwv2itwa7/app.bsky.feed.post/3mdrnze36cs2c","cid":"bafyreigjgz5q4fky2g2mqm3msrr7tqp54wzx5hwdcphopxwmeivr5ow3mm","author":{"did":"did:plc:nb6uhiglzowsdcbgw
200 {"posts":[{"uri":"at://did:plc:tpj7ajdxqvhxidtesrgru6uc/app.bsky.feed.post/3mdrlvmsfas2b","cid":"bafyreienyqheidejidg3gtmdv7u7ckwkrlcvef34mazgmblyvgroae3aym","author":{"did":"did:plc:tpj7ajdxqvhxidtes
200 {"posts":[{"uri":"at://did:plc:xzyfmdn54o5pumz6pagr2fbo/app.bsky.feed.post/3mdrjv5exts2s","cid":"bafyreibck7kakzvveoiqccfshs6nojec6hwbrthuln7p7k6hc4grqp5ao4","author":{"did":"d

9852

In [None]:
# flatten json
df = pd.json_normalize(all_posts)

In [None]:
# save to CSV file
df.to_csv(r"C:\Users\celin\OneDrive\Education\USD-MS-ADS\ADS-509-Applied-Text-Mining\Final-Project\ADS-509_LLM\scripts\bsky_posts-2.csv")

In [None]:
BASE = "https://bsky.social/xrpc/app.bsky.feed.getPostThread"

all_threads = []
missing_uris = []
failed_uris = []

batch_size = 100
uris = df["uri"].dropna().reset_index(drop=True)

for start in range(0, len(uris), batch_size):
    batch = uris.iloc[start:start + batch_size]

    for uri in batch:
        uri = str(uri).strip()
        page_params_t = {"uri": uri}

        max_retries = 5
        for attempt in range(max_retries):
            try:
                r_t = requests.get(
                    BASE,
                    params=page_params_t,
                    headers=HEADERS,
                    timeout=60
                )

                if r_t.status_code in (502, 503, 504):
                    sleep_politely((1.0 + attempt, 2.0 + attempt))
                    continue

                if r_t.status_code == 400:
                    err = r_t.json()
                    if err.get("error") == "ExpiredToken":
                        refresh_token()
                        continue  # retry same URI with new token
                    if err.get("error") == "NotFound":
                        missing_uris.append(uri)
                        break  # stop retrying this URI; move to next uri
                    r_t.raise_for_status()  # other 400s are real problems

            except requests.exceptions.Timeout:
                sleep_politely((1.0 + attempt, 2.0 + attempt))

        else:
            failed_uris.append(uri)
            continue

        data = r_t.json()
        thread = data.get("thread")
        if thread is not None:
            all_threads.append(thread)
            
len(all_threads)


In [None]:
# flatten json
df_t = pd.json_normalize(all_threads)


400
{"error":"NotFound","message":"Post not found: at://did:plc:ny4t5aqqh54xdpfx75slvmre/app.bsky.feed.post/3mdrsgbi6mec2"}


In [None]:
df_t.to_csv(r"C:\Users\celin\OneDrive\Education\USD-MS-ADS\ADS-509-Applied-Text-Mining\Final-Project\ADS-509_LLM\scripts\bsky_threads.csv")