We want to:
- Load the snapshots of X and BlueSky data
- Format them into threads (give replies/quotes their necessary context)
- Filter by the politician-focused keyword lists
- Export for narrative extraction

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import io
import json
import os
import pickle
import re
import uuid

from itertools import product 

from dotenv import load_dotenv
from tqdm import tqdm

In [2]:
pd.set_option('display.max_columns', 100)

# Load data
From Andrea's cleaned snapshots  

2 types of filteres: text filtering on posts with keywords, user filtering with usernames (users and replies/quotes of them), then perform OR to get relevant posts.

## Filters

In [10]:
# Load keywords
k_path = './data/keywords/bluesky_keywords_politicians.txt'
with open(k_path, 'r') as f:
    keywords_bluesky = f.readlines()
keywords_bluesky = [k.strip() for k in keywords_bluesky]

k_path = './data/keywords/x_keywords_politicians.txt'
with open(k_path, 'r') as f:
    keywords_x = f.readlines()
keywords_x = [k.strip() for k in keywords_x]

In [11]:
# Load usernames
k_path = './data/keywords/bluesky_usernames_politicians.txt'
with open(k_path, 'r') as f:
    usernames_bluesky = f.readlines()
usernames_bluesky = [k.strip().split('@')[1] for k in usernames_bluesky]

k_path = './data/keywords/x_usernames_politicians.txt'
with open(k_path, 'r') as f:
    usernames_x = f.readlines()
usernames_x = [k.strip().split('@')[1] for k in usernames_x]

In [12]:
def filter_x(
    interactions, posts, users,
    keywords, usernames,
):
    # Get relevant users (used to get all their posts)
    relevant_users = users[
        users['username'].isin(usernames)
    ]

    # Get relevant posts by keyword filter
    relevant_posts = posts[
        posts['full_text'].str.contains(
            '|'.join(keywords), case=False, na=False, regex=True
        )
    ]

    # Get all tweet IDs for relevant users and relevant posts
    relevant_tweet_ids = set(
        relevant_users['tweet_id'].apply(lambda x: x.split(',')).explode().tolist() + \
        relevant_posts['tweet_id'].tolist()
    )

    # Get interactions, posts and users for relevant tweets
    filt_interactions = interactions[
        interactions['tweet_id'].isin(relevant_tweet_ids)
    ].reset_index(drop=True)

    filt_posts = posts[
        posts['tweet_id'].isin(relevant_tweet_ids)
    ].reset_index(drop=True)

    users.loc[:,'tweet_ids'] = users['tweet_id'].apply(lambda x: x.split(','))
    users_exp = users.explode('tweet_ids')
    filt_user_ids = users_exp[
        users_exp['tweet_ids'].isin(relevant_tweet_ids)
    ]['user_id'].unique()
    filt_users = users[
        users['user_id'].isin(filt_user_ids)
    ].reset_index(drop=True)

    return filt_interactions, filt_posts, filt_users

In [6]:
def filter_bluesky(
    interactions, posts, users,
    keywords, usernames,
):
    # Get relevant users (used to get all their posts)
    relevant_users = users[
        users['username'].isin(usernames)
    ]

    # Get relevant posts by keyword filter
    relevant_posts = posts[
        posts['record_text'].str.contains(
            '|'.join(keywords), case=False, na=False, regex=True
        )
    ]

    # Get all uris for relevant users and relevant posts
    relevant_uris = set(interactions[
        interactions['did'].isin(relevant_users['did'].unique()) |
        interactions['to_did'].isin(relevant_users['did'].unique()) |
        interactions['uri'].isin(relevant_posts['uri'].unique())
    ]['uri'].unique().tolist())

    # Get interactions, posts and users for relevant posts
    filt_interactions = interactions[
        interactions['uri'].isin(relevant_uris)
    ].reset_index(drop=True)

    filt_posts = posts[
        posts['uri'].isin(relevant_uris)
    ].reset_index(drop=True)

    filt_user_dids = set(
        filt_interactions['did'].tolist() + filt_interactions['to_did'].tolist()
    )
    filt_users = users[
        users['did'].isin(filt_user_dids)
    ].reset_index(drop=True)

    return filt_interactions, filt_posts, filt_users

## Load
Load day-by-day for X and Bluesky and export to consolidated interactions, posts, users files.

In [7]:
x_path = './data/snapshots/x/clean'
bluesky_path = './data/snapshots/bluesky/clean'

output_dir = './data/consolidated'

### X

In [13]:
# X
all_interactions = []
all_posts = []
all_users = []

dirs = os.listdir(x_path)
for dirname in tqdm(dirs):
    path = os.path.join(x_path, dirname)
    fnames_interactions = ['interactions.csv', f'interactions_{dirname}.csv']
    fnames_posts = ['posts.csv', f'posts_{dirname}.csv']
    fnames_users = ['users.csv', f'users_{dirname}.csv']
    for fname in fnames_interactions:
        path_ = os.path.join(path, fname)
        if os.path.exists(path_):
            interactions = pd.read_csv(path_, sep='\t')
    for fname in fnames_posts:
        path_ = os.path.join(path, fname)
        if os.path.exists(path_):
            posts = pd.read_csv(path_, sep='\t')
    for fname in fnames_users:
        path_ = os.path.join(path, fname)
        if os.path.exists(path_):
            users = pd.read_csv(path_, sep='\t')
    
    filt_interactions, filt_posts, filt_users = filter_x(
        interactions, posts, users,
        keywords_x, usernames_x,
    )
    all_interactions.append(filt_interactions)
    all_posts.append(filt_posts)
    all_users.append(filt_users)

df_interactions = pd.concat(all_interactions).reset_index(drop=True)
df_posts = pd.concat(all_posts).reset_index(drop=True)
df_users = pd.concat(all_users).reset_index(drop=True)
print(df_interactions.shape, df_posts.shape, df_users.shape)

  users = pd.read_csv(path_, sep='\t')
  users = pd.read_csv(path_, sep='\t')
  users = pd.read_csv(path_, sep='\t')
  posts = pd.read_csv(path_, sep='\t')
  posts = pd.read_csv(path_, sep='\t')
  posts = pd.read_csv(path_, sep='\t')
  users = pd.read_csv(path_, sep='\t')
  users = pd.read_csv(path_, sep='\t')
  posts = pd.read_csv(path_, sep='\t')
  users = pd.read_csv(path_, sep='\t')
  posts = pd.read_csv(path_, sep='\t')
  users = pd.read_csv(path_, sep='\t')
  users = pd.read_csv(path_, sep='\t')
  users = pd.read_csv(path_, sep='\t')
  users = pd.read_csv(path_, sep='\t')
  users = pd.read_csv(path_, sep='\t')
100%|██████████| 50/50 [01:50<00:00,  2.22s/it]


(385208, 5) (368354, 34) (17933, 24)


In [14]:
# fname = 'interactions_x_20250301_20250419.csv'
# df_interactions.to_csv(os.path.join(output_dir, fname), index=False)

# fname = 'posts_x_20250301_20250419.csv'
# df_posts.to_csv(os.path.join(output_dir, fname), index=False)

# fname = 'users_x_20250301_20250419.csv'
# df_users.to_csv(os.path.join(output_dir, fname), index=False)

# print('Wrote to file')

Wrote to file


### Bluesky

In [52]:
# Bluesky
all_interactions = []
all_posts = []
all_users = []

dirs = os.listdir(bluesky_path)
for dirname in tqdm(dirs):
    path = os.path.join(bluesky_path, dirname)
    fnames_interactions = ['interactions.csv', f'interactions_{dirname}.csv']
    fnames_posts = ['posts.csv', f'posts_{dirname}.csv']
    fnames_users = ['users.csv', f'users_{dirname}.csv']
    for fname in fnames_interactions:
        path_ = os.path.join(path, fname)
        if os.path.exists(path_):
            interactions = pd.read_csv(path_, sep='\t')
    for fname in fnames_posts:
        path_ = os.path.join(path, fname)
        if os.path.exists(path_):
            posts = pd.read_csv(path_, sep='\t')
    for fname in fnames_users:
        path_ = os.path.join(path, fname)
        if os.path.exists(path_):
            users = pd.read_csv(path_, sep='\t')
    
    filt_interactions, filt_posts, filt_users = filter_bluesky(
        interactions, posts, users,
        keywords_bluesky, usernames_bluesky,
    )
    all_interactions.append(filt_interactions)
    all_posts.append(filt_posts)
    all_users.append(filt_users)

df_interactions = pd.concat(all_interactions).reset_index(drop=True)
df_posts = pd.concat(all_posts).reset_index(drop=True)
df_users = pd.concat(all_users).reset_index(drop=True)
print(df_interactions.shape, df_posts.shape, df_users.shape)

  posts = pd.read_csv(path_, sep='\t')
  posts = pd.read_csv(path_, sep='\t')
  posts = pd.read_csv(path_, sep='\t')
  posts = pd.read_csv(path_, sep='\t')
  posts = pd.read_csv(path_, sep='\t')
  posts = pd.read_csv(path_, sep='\t')
  posts = pd.read_csv(path_, sep='\t')
  posts = pd.read_csv(path_, sep='\t')
  posts = pd.read_csv(path_, sep='\t')
  posts = pd.read_csv(path_, sep='\t')
  posts = pd.read_csv(path_, sep='\t')
100%|██████████| 50/50 [04:31<00:00,  5.43s/it]


(460458, 6) (349796, 27) (274495, 10)


In [None]:
# fname = 'interactions_bluesky_20250301_20250419.csv'
# df_interactions.to_csv(os.path.join(output_dir, fname), index=False)

# fname = 'posts_bluesky_20250301_20250419.csv'
# df_posts.to_csv(os.path.join(output_dir, fname), index=False)

# fname = 'users_bluesky_20250301_20250419.csv'
# df_users.to_csv(os.path.join(output_dir, fname), index=False)

# print('Wrote to file')

Wrote to file
