# Quick Introduction:
Hello, this is a jupyter notebook created by Adel for the medium article {article title}. The goal of the coade is to analyze the culture of a goegraphical region according to how people associated with said region tweet online. I recomend reading the article for more information.

# Step 0: Sanity Checks
Here we make sure everything loads and works as intended

In [70]:
# package load 
import os
from dotenv import load_dotenv
import re
import time
from typing import List, Dict


import json
from pathlib import Path

import requests
import pandas as pd
import numpy as np
import requests
from pathlib import Path
from collections import defaultdict
from tqdm import tqdm
from sklearn.decomposition import PCA
import umap

In [71]:
# This loads variables from .env into environment variables
load_dotenv()

BEARER_TOKEN = os.getenv("X_BEARER_TOKEN")
if BEARER_TOKEN is not None:
    print("Token loaded successfully from .env")
else:
    print("Token not found in .env, refer to .env.example")

HEADERS = {"Authorization": f"Bearer {BEARER_TOKEN}"}
BASE_URL = "https://api.x.com/2"

    
# maybe test token works by seeing if elon's user can be pinged

Token loaded successfully from .env


# Step 1: Finding X users to Scrape and scraping their sweet sweet data

## step 1.1: Load the config for our target area

In [77]:
# select target, default available is "uk", "nyc", "singapore"
target_area = "uk"

CONFIG_PATH = Path("..") / "configs" / f"{target_area}.json"

with open (CONFIG_PATH) as f:
    CFG = json.load(f)

print(f"Loaded config for {target_area}:")
cfg_copy = CFG.copy()
if len(CFG['bio_keywords']) > 5:
    cfg_copy['bio_keywords'] = CFG['bio_keywords'][:5] + ['...']
print(json.dumps(cfg_copy, indent=2))
del cfg_copy # only used for example print

Loaded config for uk:
{
  "region_name": "uk",
  "local_seeds": "../data/local_seeds/uk.json",
  "users_output": "../data/users/uk_users.jsonl",
  "users_adjacent_output": "../data/users_adjacent/uk_users.jsonl",
  "tweets_output": "../data/tweets/uk_tweets.jsonl",
  "bio_keywords": [
    "uk",
    "united kingdom",
    "england",
    "scotland",
    "wales",
    "..."
  ],
  "max_mentions_per_seed": 1,
  "max_tweets_per_user": 2
}


In [73]:
with open(CFG["local_seeds"]) as f:
    seeds = json.load(f)

seed_types = list(seeds.keys())
print(f"{target_area} seed types are:", ", ".join(seed_types))

# Peek at the first few in one category, e.g. sports
print("\nFirst few sports seeds:")
seeds["sports"][:5]

uk seed types are: sports, music, tech_lifestyle, comedy

First few sports seeds:


['@premierleague', '@Arsenal']

## Step 1.2: Clean + Validate seed handles

### step 1.2.1 Filter for valid seeds

In [74]:
def is_valid_username(u: str) -> bool:
    return bool(re.fullmatch(r"[A-Za-z0-9_]{1,15}", u))

seed_types = list(seeds.keys())

valid_seeds = []
invalid_seeds = []

for seed_types in seeds:
    for seed in seeds[seed_types]:
        clean = seed.lstrip("@") if seed.startswith("@") else seed
        if is_valid_username(clean):
            valid_seeds.append(clean)
        else:
            invalid_seeds.append(seed)

print(f"Valid {target_area} seeds: {len(valid_seeds)} | Invalid: {len(invalid_seeds)}")

Valid uk seeds: 8 | Invalid: 0


### step 1.2.2: Resolve seeds via `/2/users/by` (X API v2)
API level validation: "do these usernames actually exist, lets get their metadata"

In [75]:
def lookup_usernames(usernames: List[str]) -> List[Dict]:
    """
    X API v2: Lookup users by username.
    
    HTTP:
        GET /2/users/by
    
    Docs:
        https://developer.x.com/en/docs/twitter-api/users/lookup/api-reference/get-users-by
    """
    results = []
    for i in range(0, len(usernames), 100):
        batch = usernames[i:i+100]
        params = {
            "usernames": ",".join(batch),
            "user.fields": "id,username,name,location,description,public_metrics"
        }
        try:
            r = requests.get(f"{BASE_URL}/users/by", headers=HEADERS, params=params)
            r.raise_for_status()
            data = r.json().get("data", [])
            results.extend(data)
            print(f"Batch {i//100}: {len(data)}/{len(batch)} users")
        except requests.HTTPError as e:
            print(f"Batch {i//100} error: {e}")
        time.sleep(0.5)  # to avoid have timeouts & ensure expected behavior
    return results

In [76]:
seed_users = lookup_usernames(valid_seeds)
print(f"Resolved {len(seed_users)} seed user objects out of {len(valid_seeds)} valid usernames")
# TODO: Should we save seeds ... maybe seeds_unfiltered and seeds_filtered directories respectfully

Batch 0: 8/8 users
Resolved 8 seed user objects out of 8 valid usernames


### Step 1.2.3 Find Adjacent Accounts to Seeds

In [78]:
# from the PDF – this stays as "find adjacent users by mentions"
def search_region_mentions_batched(
    seeds: List[Dict],
    batch_size: int = 20,
    max_adj: int = 3000,
) -> List[Dict]:
    """
    Find 'adjacent' users who mention the seed accounts.

    Uses:
        X API v2: GET /2/tweets/search/recent

    Query pattern (for UK example):
        "UK (@premierleague OR @Arsenal OR @ChelseaFC ...)"
    """
    # usernames of your seed accounts
    usernames = [u["username"] for u in seeds]

    # simple base region term – you could later make this more complex
    region_term = CFG.get("tweet_region_term", CFG["region_name"])  # e.g. "UK"

    all_adjacent: Dict[str, Dict] = {}
    total_fetched = 0

    for i in range(0, len(usernames), batch_size):
        batch = usernames[i:i + batch_size]
        batch_handles = [f"@{u}" for u in batch]
        query = f'{region_term} ({" OR ".join(batch_handles)})'

        print(f"Batch {i//batch_size + 1}: {len(batch)} seeds → {query[:120]}...")

        url = f"{BASE_URL}/tweets/search/recent"
        params = {
            "query": query,
            "max_results": 100,
            "tweet.fields": "author_id",
            "expansions": "author_id",
            "user.fields": "id,username,name,location,description,public_metrics",
        }

        fetched_in_batch = 0

        while fetched_in_batch < 500 and total_fetched < max_adj:
            try:
                r = requests.get(url, headers=HEADERS, params=params)
                r.raise_for_status()
                resp = r.json()

                # users come back in the 'includes.users' block
                includes = resp.get("includes", {}).get("users", [])
                for user in includes:
                    uid = user["id"]
                    if uid not in all_adjacent:
                        all_adjacent[uid] = user
                        total_fetched += 1
                        fetched_in_batch += 1

                token = resp.get("meta", {}).get("next_token")
                if not token:
                    break

                params["next_token"] = token
                time.sleep(0.5)  # avoid rate limits

            except Exception as e:
                print(f"  Error: {e}")
                break

        print(f"  → {fetched_in_batch} new users (total: {total_fetched})")
        time.sleep(2)

        if total_fetched >= max_adj:
            print("Reached max_adj limit, stopping.")
            break

    return list(all_adjacent.values())


In [79]:
adjacent_users = search_region_mentions_batched(
    seed_users,
    batch_size=20,
    max_adj=CFG.get("max_adjacent_users", 200),
)

adj_dir = Path("../data/users_adjacent")
adj_dir.mkdir(parents=True, exist_ok=True)

adj_path = adj_dir / f"{target_area}_adjacent_users.jsonl"
with adj_path.open("w", encoding="utf-8") as f:
    for u in adjacent_users:
        f.write(json.dumps(u) + "\n")

print(f"Saved {len(adjacent_users)} adjacent users → {adj_path}")

Batch 1: 8 seeds → uk (@premierleague OR @Arsenal OR @TheO2 OR @RoyalAlbertHall OR @techUK OR @LDNTechWeek OR @Tate OR @britishmuseum)...
  → 212 new users (total: 212)
Reached max_adj limit, stopping.
Saved 212 adjacent users → ../data/users_adjacent/uk_adjacent_users.jsonl


### 1.2.4 Filter Adjacent Users by bio (confirmed locals)

In [80]:
"""
if wanted to load from file
adj_path = Path("../data/users_adjacent") / f"{target_area}_adjacent_users.jsonl"
with adj_path.open() as f:
     adj_rows = [json.loads(line) for line in f]

adj_df = pd.DataFrame(adj_rows)
"""

adj_df = pd.DataFrame(adjacent_users)

# Combine description + location into a single "bio-like" field
adj_df["bio"] = (
    adj_df["description"].fillna("") + " " +
    adj_df["location"].fillna("")
)

adj_df["is_in_region"] = adj_df["bio"].apply(bio_matches_region)

region_users_df = adj_df[adj_df["is_in_region"]].copy()
print("Adjacent users:", len(adj_df), "| confirmed locals:", len(region_users_df))

# Save to the final users_output path from config
final_path = Path(CFG["users_output"])
final_path.parent.mkdir(parents=True, exist_ok=True)
region_users_df.to_json(final_path, orient="records", lines=True)

print(f"Saved {len(region_users_df)} confirmed local users → {final_path}")

Adjacent users: 212 | confirmed locals: 121
Saved 121 confirmed local users → ../data/users/uk_users.jsonl


### 1.2.5 Get tweets of confirmed local users

# Scratch

In [45]:
bio_keywords = [kw.lower() for kw in CFG["bio_keywords"]]

def bio_matches_region(bio: str) -> bool:
    if not bio:
        return False
    bio_l = bio.lower()
    return any(kw in bio_l for kw in bio_keywords)

In [None]:
def search_mentions_for_handle(handle: str, max_results: int = 50):
    """
    X API v2: Recent search
    HTTP:
        GET /2/tweets/search/recent
    Docs:
        https://developer.x.com/en/docs/twitter-api/tweets/search/api-reference/get-tweets-search-recent
    """
    url = f"{BASE_URL}/tweets/search/recent"
    params = {
        "query": f"@{handle} -is:retweet",
        "tweet.fields": "author_id,created_at",
        "max_results": max_results,
    }
    r = requests.get(url, headers=HEADERS, params=params)
    r.raise_for_status()
    return r.json().get("data", [])
