In [1]:
import os, csv, time, random, re, json, requests
import pandas as pd
from bs4 import BeautifulSoup
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime, timedelta, timezone

# Output locations: one root folder plus a subfolder per raw source type.
DATA_DIR = 'data/lab'
RAW_API_DIR, RAW_HTML_DIR = (
    os.path.join(DATA_DIR, leaf) for leaf in ('raw_api', 'raw_html')
)
# Create the folders up front; exist_ok makes re-running this cell harmless.
for d in (DATA_DIR, RAW_API_DIR, RAW_HTML_DIR):
    os.makedirs(d, exist_ok=True)

# Default request headers: a User-Agent string naming the project.
HEADERS = {"User-Agent": "MADS-LLM-2026-Project (+https://example.edu)"}

def sleep_politely(sleep_range=(0.2, 0.5)):
    """Pause for a random duration drawn uniformly from ``sleep_range``.

    Args:
        sleep_range: ``(low, high)`` bounds in seconds; the pair is unpacked
            into ``random.uniform``.
    """
    delay = random.uniform(*sleep_range)
    time.sleep(delay)

In [3]:
# Authentication using an app password.
# OpenAI. (2023). ChatGPT (Mar 14 version) [Large language model]. https://chat.openai.com/chat.
pds = "https://bsky.social/xrpc/com.atproto.server.createSession"

# Credentials are read from the environment so the app password is never stored in the
# notebook. SECURITY: an app password used to be hard-coded in this cell -- revoke it in
# the Bluesky settings and issue a new one.
bsky_app_password = os.environ.get("BSKY_APP_PASSWORD")
if not bsky_app_password:
    raise RuntimeError(
        "Set the BSKY_APP_PASSWORD environment variable to a Bluesky app password "
        "before running this cell."
    )
payload = {
    "identifier": os.environ.get("BSKY_IDENTIFIER", "celinavelazquez.bsky.social"),
    "password": bsky_app_password,
}

resp = requests.post(pds, json=payload, timeout=60)
resp.raise_for_status()
session = resp.json()
access_jwt = session["accessJwt"]
# refresh_token() reads this global; without it the first ExpiredToken response
# would end in a NameError instead of a refreshed session.
refresh_jwt = session["refreshJwt"]

# Add the bearer token while keeping headers that are already configured
# (e.g. the User-Agent from the setup cell).
HEADERS = {**HEADERS, "Authorization": f"Bearer {access_jwt}"}

# Refresh token
def refresh_token():
    """Exchange the stored refresh token for a new session and update the auth globals.

    Posts to ``com.atproto.server.refreshSession`` with the module-level
    ``refresh_jwt`` as bearer token, stores the returned ``accessJwt`` and
    ``refreshJwt`` in ``access_jwt`` / ``refresh_jwt``, and updates the
    ``Authorization`` entry of ``HEADERS``.

    Raises:
        NameError: if ``refresh_jwt`` has not been assigned at module level.
        requests.HTTPError: if the refresh endpoint answers with an error status.
    """
    global access_jwt, refresh_jwt, HEADERS
    r = requests.post(
        "https://bsky.social/xrpc/com.atproto.server.refreshSession",
        headers={"Authorization": f"Bearer {refresh_jwt}"},
        timeout=60
    )
    r.raise_for_status()
    session = r.json()
    access_jwt = session["accessJwt"]
    refresh_jwt = session["refreshJwt"]
    # Replace only the Authorization entry so any other configured header
    # (e.g. a User-Agent) survives the refresh instead of being dropped.
    HEADERS = {**globals().get("HEADERS", {}), "Authorization": f"Bearer {access_jwt}"}


In [4]:
# Get posts with the search term "politics"
TOPIC = "politics"
BASE = "https://bsky.social/xrpc/app.bsky.feed.searchPosts"
# Restrict the search to the last 30 days, expressed as a UTC timestamp string.
since_dt = datetime.now(timezone.utc) - timedelta(days=30)
since_str = since_dt.strftime("%Y-%m-%dT%H:%M:%SZ")
params = {"q": TOPIC, "limit":100, "since":since_str, "lang":"en"}

# Pagination. OpenAI. (2023). ChatGPT (Mar 14 version) [Large language model]. https://chat.openai.com/chat.
# Pagination state: accumulated posts and the cursor returned by the previous page.
all_posts = []
cursor = None

# Pagination loop: request pages until the API stops returning a cursor or posts.
while True:
    page_params = params.copy()
    if cursor:
        page_params["cursor"] = cursor

    r = requests.get(BASE, params=page_params, headers=HEADERS, timeout=60)
    # Printed before raise_for_status so the body of an error response is visible.
    print(r.status_code, r.text[:200])
    r.raise_for_status()

    data = r.json()
    posts = data.get('posts', [])
    all_posts.extend(posts)

    cursor = data.get("cursor")
    if not cursor or not posts:
        break

    # Pause before requesting the next page. This call used to sit after the loop,
    # so every page was fetched back-to-back with no delay at all.
    sleep_politely()

len(all_posts)

200 {"posts":[{"uri":"at://did:plc:q2pe2753p53fdhrnt2ylsnhr/app.bsky.feed.post/3mdsxtmsqlk2t","cid":"bafyreifuuh7rziec5ewmueejfz6k4a6mcfu45bdvwrykehylszq2i4lkde","author":{"did":"did:plc:q2pe2753p53fdhrnt
200 {"posts":[{"uri":"at://did:plc:awkwhvolwrmulrsve5t3sugo/app.bsky.feed.post/3mdswxi6ub22v","cid":"bafyreie4hjtnitgjeheeht5loqwfw7jcxeewphyxsoikya33gljniccsvm","author":{"did":"did:plc:awkwhvolwrmulrsve
200 {"posts":[{"uri":"at://did:plc:lm2uhaoqoe6yo76oeihndfyi/app.bsky.feed.post/3mdsw2gfwho2l","cid":"bafyreiah5rptgidxd7lh2eewq63cqnijmffn5sp6osznvjr6nkizoo5rf4","author":{"did":"did:plc:lm2uhaoqoe6yo76oe
200 {"posts":[{"uri":"at://did:plc:2d34teyc5y23q57eb6i6zmqj/app.bsky.feed.post/3mdsv66og3v2m","cid":"bafyreifltfgcc4qeba3abxhbqcdvvnzaj4ssz4xvkyqunclexfbpazglpa","author":{"did":"did:plc:2d34teyc5y23q57eb
200 {"posts":[{"uri":"at://did:plc:r63juenfni5lyahm4prwvhdg/app.bsky.feed.post/3mdsuazvio22q","cid":"bafyreibcf45t2jp76kcxnimsqp7xs6hurk4rah2m3vtt5ijqwa4vzceti4","author":{"did":"d

9841

In [5]:
# Flatten the nested post JSON into a table: one row per post, with nested
# keys turned into dotted column names.
df = pd.json_normalize(data=all_posts)

In [6]:
# Save the flattened posts to CSV.
# Keep writing to the original project folder when it exists on this machine;
# otherwise fall back to DATA_DIR (created in the setup cell) so the file lands
# in a predictable place on any other machine.
posts_dir = r"C:\Users\celin\OneDrive\Education\USD-MS-ADS\ADS-509-Applied-Text-Mining\Final-Project\ADS-509_LLM\scripts"
if not os.path.isdir(posts_dir):
    posts_dir = DATA_DIR
df.to_csv(os.path.join(posts_dir, "bsky_posts.csv"))

In [7]:
# Fetch the full thread for every collected post URI and append it to a CSV,
# one row per URI, so an interrupted run can resume where it stopped.
BASE = "https://bsky.social/xrpc/app.bsky.feed.getPostThread"

missing_uris = []  # URIs the API answered with 400 / "NotFound"
failed_uris = []   # URIs that still failed after every retry attempt

# Keep writing to the original project folder when it exists on this machine;
# otherwise fall back to DATA_DIR so the crawl has a predictable output location.
threads_dir = r"C:\Users\celin\OneDrive\Education\USD-MS-ADS\ADS-509-Applied-Text-Mining\Final-Project\ADS-509_LLM\scripts"
if not os.path.isdir(threads_dir):
    threads_dir = DATA_DIR
out_csv = os.path.join(threads_dir, "bsky_threads_v2.csv")

max_retries = 5
# Transient HTTP statuses retried with backoff. 429 (rate limited) is included so a
# throttled request is retried and, at worst, recorded in failed_uris instead of
# aborting the whole crawl through raise_for_status().
retry_statuses = (429, 502, 503, 504)
# Network-level failures handled the same way as the transient statuses.
retry_errors = (requests.exceptions.Timeout, requests.exceptions.ConnectionError)

uris = df["uri"].dropna().astype(str).str.strip().reset_index(drop=True)

# Resume set (safe because schema is stable)
if os.path.exists(out_csv) and os.path.getsize(out_csv) > 0:
    done_uris = set(pd.read_csv(out_csv, usecols=["uri"])["uri"].astype(str))
else:
    done_uris = set()

for uri in uris:
    if uri in done_uris:
        continue

    page_params_t = {"uri": uri}
    skip_uri = False

    for attempt in range(max_retries):
        # Wait window grows by one second with every attempt.
        backoff = (1.0 + attempt, 2.0 + attempt)
        try:
            r_t = requests.get(BASE, params=page_params_t, headers=HEADERS, timeout=60)

            if r_t.status_code in retry_statuses:
                sleep_politely(backoff)
                continue

            if r_t.status_code == 400:
                error_name = r_t.json().get("error")
                if error_name == "ExpiredToken":
                    # Get a fresh access token, then retry this URI.
                    refresh_token()
                    continue
                if error_name == "NotFound":
                    missing_uris.append(uri)
                    skip_uri = True
                    break
                # Any other 400 falls through to raise_for_status() below.

            r_t.raise_for_status()
            break

        except retry_errors:
            sleep_politely(backoff)

    else:
        # The retry loop ran out of attempts without reaching a `break`.
        failed_uris.append(uri)
        continue

    if skip_uri:
        continue

    thread = r_t.json().get("thread")
    if thread is None:
        continue

    # Append one stable row: uri + raw thread json.
    # An existing but empty file still needs the header; otherwise the resume
    # read above cannot find the "uri" column on the next run.
    needs_header = not os.path.exists(out_csv) or os.path.getsize(out_csv) == 0
    with open(out_csv, "a", newline="", encoding="utf-8") as f:
        w = csv.DictWriter(f, fieldnames=["uri", "thread_json"])
        if needs_header:
            w.writeheader()
        w.writerow({"uri": uri, "thread_json": json.dumps(thread, ensure_ascii=False)})

    done_uris.add(uri)

len(done_uris)


9833

In [None]:
# Persist the URIs that could not be collected, then show how many of each there were.
# Keep writing to the original project folder when it exists on this machine;
# otherwise fall back to DATA_DIR so the reports land in a predictable place.
report_dir = r"C:\Users\celin\OneDrive\Education\USD-MS-ADS\ADS-509-Applied-Text-Mining\Final-Project\ADS-509_LLM\scripts"
if not os.path.isdir(report_dir):
    report_dir = DATA_DIR

pd.Series(missing_uris, name="uri").to_csv(os.path.join(report_dir, "missing_uris.csv"), index=False)
pd.Series(failed_uris, name="uri").to_csv(os.path.join(report_dir, "failed_uris.csv"), index=False)

# Kept as the last expression so the notebook displays it; as a mid-cell
# expression the counts were computed and silently discarded.
len(missing_uris), len(failed_uris)



In [64]:
# NOTE(review): `df_t` is not defined in any cell visible in this part of the notebook,
# and this cell's execution count (64) is out of sequence with the cells above --
# presumably it relies on state from a cell that was removed. Confirm, then either
# restore the cell that builds `df_t` or delete this export; as written it raises
# NameError on a fresh kernel unless `df_t` is created elsewhere.
df_t.to_csv(r"C:\Users\celin\OneDrive\Education\USD-MS-ADS\ADS-509-Applied-Text-Mining\Final-Project\ADS-509_LLM\scripts\bsky_threads.csv")