In [5]:
import os, time, random, re, json, requests
from bs4 import BeautifulSoup
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime, timedelta, timezone

DATA_DIR = 'data/lab'
RAW_API_DIR = os.path.join(DATA_DIR, 'raw_api')
RAW_HTML_DIR = os.path.join(DATA_DIR, 'raw_html')
for d in [DATA_DIR, RAW_API_DIR, RAW_HTML_DIR]:
    os.makedirs(d, exist_ok=True)

HEADERS = {"User-Agent": "MADS-LLM-2026-Project (+https://example.edu)"}

def sleep_politely(sleep_range = (0.2,0.5)):
    SLEEP_RANGE = sleep_range
    time.sleep(random.uniform(*SLEEP_RANGE))

In [6]:
# Authentication using app password. OpenAI. (2023). ChatGPT (Mar 14 version) [Large language model]. https://chat.openai.com/chat. In-text citations are simply (OpenAI, 2023) or OpenAI (2023).
pds = "https://bsky.social/xrpc/com.atproto.server.createSession"
payload = {"identifier": "celinavelazquez.bsky.social", "password": "nv22-lumi-pofs-wui7"}

resp = requests.post(pds, json=payload, timeout=60)
resp.raise_for_status()
session = resp.json()
access_jwt = session["accessJwt"]

HEADERS = {"Authorization": f"Bearer {access_jwt}"}

# Refresh token
def refresh_token():
    global access_jwt, refresh_jwt, HEADERS
    r = requests.post(
        "https://bsky.social/xrpc/com.atproto.server.refreshSession",
        headers={"Authorization": f"Bearer {refresh_jwt}"},
        timeout=60
    )
    r.raise_for_status()
    session = r.json()
    access_jwt = session["accessJwt"]
    refresh_jwt = session["refreshJwt"]
    HEADERS = {"Authorization": f"Bearer {access_jwt}"}


In [7]:
# Get posts with the search term "politics"
TOPIC = "politics"
BASE = "https://bsky.social/xrpc/app.bsky.feed.searchPosts"
since_dt = datetime.now(timezone.utc) - timedelta(days=30)
since_str = since_dt.strftime("%Y-%m-%dT%H:%M:%SZ")
params = {"q": TOPIC, "limit":100, "since":since_str, "lang":"en"}

# Pagination. OpenAI. (2023). ChatGPT (Mar 14 version) [Large language model]. https://chat.openai.com/chat. In-text citations are simply (OpenAI, 2023) or OpenAI (2023).
# Pagination State
all_posts = []
cursor = None

#Pagination Loop
while True:
    page_params = params.copy()
    if cursor:
        page_params["cursor"] = cursor

    r = requests.get(BASE, params=page_params, headers=HEADERS, timeout=60)
    print(r.status_code, r.text[:200])
    r.raise_for_status()

    data = r.json()
    posts = data.get('posts', [])
    all_posts.extend(posts)

    cursor = data.get("cursor")
    if not cursor or not posts:
        break

sleep_politely()

len(all_posts)

200 {"posts":[{"uri":"at://did:plc:cmvc2f5b5fou3ijbgagiovlj/app.bsky.feed.post/3mdt3gudkxc2i","cid":"bafyreigtagkrskwc65ymqwr5gd2f2nsjipkzhm377bicvaobrq7s4oe3sm","author":{"did":"did:plc:cmvc2f5b5fou3ijbg
200 {"posts":[{"uri":"at://did:plc:5abf2lqvnbkke7ovdrxivvhq/app.bsky.feed.post/3mdt2g7xwak2l","cid":"bafyreif24jes5q5vlooox54ze2izmpwj6fjlz4ef27gulc6v2cw5ozrsmq","author":{"did":"did:plc:5abf2lqvnbkke7ovd
200 {"posts":[{"uri":"at://did:plc:lwm5ivkqinydevqzog6eeuok/app.bsky.feed.post/3mdsze5owy22p","cid":"bafyreieosqwfdtcpcd4numkdyv6dzxf7trl73pt5vwjpiui37dgyv6f47a","author":{"did":"did:plc:lwm5ivkqinydevqzo
200 {"posts":[{"uri":"at://did:plc:kzwojb7vgizzhejvp5z6ygig/app.bsky.feed.post/3mdsyf73i622c","cid":"bafyreidaq26ihfxgyhntng525qpqm2whdcj7n24btilv7dmsqv3t7a2c4i","author":{"did":"did:plc:kzwojb7vgizzhejvp
200 {"posts":[{"uri":"at://did:plc:5hurbrrfyhq5m2gcjyu5arf4/app.bsky.feed.post/3mdsxjbaibc2a","cid":"bafyreicawjuhgprnhny2x7xo5rfiynkuuopy5cmyhsqhmstua33adsmw2q","author":{"did":"d

9844

In [8]:
# flatten json
df = pd.json_normalize(all_posts)

In [82]:
# save to CSV file
df.to_csv(r"C:\Users\celin\OneDrive\Education\USD-MS-ADS\ADS-509-Applied-Text-Mining\Final-Project\ADS-509_LLM\scripts\bsky_posts.csv")

In [None]:
# Get threads/comments
BASE = "https://bsky.social/xrpc/app.bsky.feed.getPostThread"

all_threads = []
missing_uris = []
failed_uris = []
out_csv = r"C:\Users\celin\OneDrive\Education\USD-MS-ADS\ADS-509-Applied-Text-Mining\Final-Project\ADS-509_LLM\scripts\bsky_threads.csv"

batch_size = 100
uris = df["uri"].dropna().reset_index(drop=True)

import os
if os.path.exists(out_csv):
    done_uris = set(pd.read_csv(out_csv, usecols=["thread.post.uri"])["thread.post.uri"].astype(str))
else:
    done_uris = set()


for start in range(0, len(uris), batch_size):
    batch = uris.iloc[start:start + batch_size]

    for uri in batch:
        uri = str(uri).strip()
        if uri in done_uris:
            continue  # already saved to CSV; skip
        page_params_t = {"uri": uri}

        max_retries = 5
        skip_uri = False

        for attempt in range(max_retries):
            try:
                r_t = requests.get(
                    BASE,
                    params=page_params_t,
                    headers=HEADERS,
                    timeout=60
                )

                if r_t.status_code in (502, 503, 504):
                    sleep_politely((1.0 + attempt, 2.0 + attempt))
                    continue

                if r_t.status_code == 400:
                    err = r_t.json()
                    if err.get("error") == "ExpiredToken":
                        refresh_token()
                        continue
                    if err.get("error") == "NotFound":
                        missing_uris.append(uri)
                        skip_uri = True
                        break
                    r_t.raise_for_status()

                r_t.raise_for_status()
                break  # SUCCESS

            except requests.exceptions.Timeout:
                sleep_politely((1.0 + attempt, 2.0 + attempt))

        else:
            failed_uris.append(uri)
            continue

        if skip_uri:
            continue

        data = r_t.json()
        thread = data.get("thread")
        if thread is not None:
            all_threads.append(thread)

            df_out = pd.json_normalize([thread])
            df_out.to_csv(
                out_csv,
                mode="a",
                header=not os.path.exists(out_csv),
                index=False
            )

len(all_threads)


ValueError: Usecols do not match columns, columns expected but not found: ['thread.post.uri']

In [None]:
# flatten json
len(missing_uris), len(failed_uris)

pd.Series(missing_uris, name="uri").to_csv(r"C:\Users\celin\OneDrive\Education\USD-MS-ADS\ADS-509-Applied-Text-Mining\Final-Project\ADS-509_LLM\scripts\missing_uris.csv", index=False)
pd.Series(failed_uris, name="uri").to_csv(r"C:\Users\celin\OneDrive\Education\USD-MS-ADS\ADS-509-Applied-Text-Mining\Final-Project\ADS-509_LLM\scripts\failed_uris.csv", index=False)



In [64]:
df_t.to_csv(r"C:\Users\celin\OneDrive\Education\USD-MS-ADS\ADS-509-Applied-Text-Mining\Final-Project\ADS-509_LLM\scripts\bsky_threads.csv")