TODO Should have clustered agents and objects together since they should have some overlap.

TODO Read deduplicated agent-action-object and narratives  
TODO Read social media data (users, posts, relationsbetween posts)  
TODO Create triples  
TODO Upload to Neo4j

In [None]:
import pandas as pd
import numpy as np

from neo4j import GraphDatabase

import datetime
import json
import os
import pickle
from tqdm import tqdm
from urllib.parse import urlparse

from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_distances

from dotenv import load_dotenv

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
load_dotenv('./.env')

True

In [4]:
uri = os.getenv('NEO4J_URI')
username = os.getenv('NEO4J_USERNAME')
password = os.getenv('NEO4J_PASSWORD')
auth = (username, password)

In [5]:
# Test connection
with GraphDatabase.driver(uri, auth=auth) as driver:
    driver.verify_connectivity()

# Load data
Load extractions and social media data

In [6]:
data_dir = './data/consolidated'
fnames = os.listdir(data_dir)
for fname in fnames:
    new_object_name = '_'.join(fname.split('_')[:2])
    globals()[new_object_name] = pd.read_csv(os.path.join(data_dir, fname))
    print(new_object_name, ':', globals()[new_object_name].shape)

interactions_x : (385208, 5)


  globals()[new_object_name] = pd.read_csv(os.path.join(data_dir, fname))


posts_x : (368354, 34)
users_x : (17933, 24)
interactions_bluesky : (460458, 6)


  globals()[new_object_name] = pd.read_csv(os.path.join(data_dir, fname))


posts_bluesky : (349796, 27)
users_bluesky : (274495, 10)


In [7]:
# Cleaned extractions
extraction_path = './data/extractions/'
df_reddit = pd.read_pickle(os.path.join(extraction_path, 'df_reddit_extractions_clean.pkl'))
print(df_reddit.shape)
df_bluesky = pd.read_pickle(os.path.join(extraction_path, 'df_bluesky_extractions_clean.pkl'))
print(df_bluesky.shape)
df_x = pd.read_pickle(os.path.join(extraction_path, 'df_x_extractions_clean.pkl'))
print(df_x.shape)
print("Read from file")

(919, 14)
(38870, 12)
(54397, 12)
Read from file


In [8]:
# Load clusters
fname = './data/scoring/clustering_scores_agglomerative_best_v2.pkl'
with open(fname, 'rb') as f:
    df_clusters = pickle.load(f)
print(df_clusters.shape)
df_clusters

(5, 5)


Unnamed: 0,dataset,col,n_clusters,ss,labels
0,overall,agent_norm_clean,3500,0.18087,"[2361, 2283, 64, 2621, 594, 918, 130, 1460, 33..."
1,overall,action_or_event_norm_clean,10000,0.154503,"[6252, 3282, 5052, 7373, 6914, 6914, 8030, 426..."
2,overall,object_norm_clean,5500,0.201533,"[1915, 614, 279, 4714, 4541, 198, 1190, 3718, ..."
3,overall,narrative_clean,13000,0.102188,"[5087, 5539, 3618, 308, 3110, 1824, 5032, 4532..."
4,overall,agent_or_object_norm_clean,8000,0.190217,"[1190, 647, 789, 6285, 842, 1479, 6844, 858, 7..."


# Preprocessing
- Get clusters for each extraction
- Get some cluster statistics (to help decide whether to filter out uncommon clusters)

In [9]:
# Collect inputs to be clustered
cols_to_cluster = [
    'agent_norm_clean',
    'action_or_event_norm_clean',
    'object_norm_clean',
    'narrative_clean'
]

# Dataset name to dataframe
dataset2df = {
    'reddit': df_reddit,
    'bluesky': df_bluesky,
    'x': df_x,
}

# Get overall unique values per column
overall_col2values = {}
for col in cols_to_cluster:
    values = []
    for df in dataset2df.values():
        values.append(df[col])
    values = pd.concat(values).unique()
    values = [v for v in values if not pd.isna(v)]
    overall_col2values[col] = values
    print(f'Overall\t\tColumn: {col}\t\t# of values: {len(values)}')

# Combine agents and objects
values = []
for df in dataset2df.values():
    values.append(df['agent_norm_clean'])
    values.append(df['object_norm_clean'])
values = pd.concat(values).unique()
values = [v for v in values if not pd.isna(v)]
overall_col2values['agent_or_object_norm_clean'] = values
print(f'Overall\t\tColumn: agent_or_object_norm_clean\t\t# of values: {len(values)}')

cols_to_cluster.append('agent_or_object_norm_clean')


Overall		Column: agent_norm_clean		# of values: 10200


Overall		Column: action_or_event_norm_clean		# of values: 50118
Overall		Column: object_norm_clean		# of values: 22302
Overall		Column: narrative_clean		# of values: 81306
Overall		Column: agent_or_object_norm_clean		# of values: 30639


In [10]:
# Mapping from column to mapping from value to cluster index
overall_col2value2cluster_id = {}
for col in cols_to_cluster:
    cluster_ids = df_clusters[df_clusters['col'] == col]['labels'].iloc[0].tolist()
    value2cluster_id = {
        value: cluster_id for value, cluster_id in zip(
            overall_col2values[col], cluster_ids
        )
    }
    print('Col:', col, '# of values:', len(cluster_ids))
    overall_col2value2cluster_id[col] = value2cluster_id

Col: agent_norm_clean # of values: 10200
Col: action_or_event_norm_clean # of values: 50118
Col: object_norm_clean # of values: 22302
Col: narrative_clean # of values: 81306
Col: agent_or_object_norm_clean # of values: 30639


In [11]:
# Create dataframes for cluster analysis
overall_col2df_membership = {}
for col in cols_to_cluster:
    df_membership = pd.DataFrame({
        'text': overall_col2value2cluster_id[col].keys(),
        'cluster_id': overall_col2value2cluster_id[col].values(),
    })
    overall_col2df_membership[col] = df_membership

In [12]:
# Create grouped dataframes for membership analysis and labelling
overall_col2df_membership_grouped = {}
for col in cols_to_cluster:
    df_membership_grouped = overall_col2df_membership[col]\
        .groupby(['cluster_id']).agg({'text': ['count', list]})
    df_membership_grouped.columns = ['n', 'text']
    df_membership_grouped = df_membership_grouped.sort_values(['n'], ascending=False)
    overall_col2df_membership_grouped[col] = df_membership_grouped

In [13]:
pd.set_option('display.max_colwidth', 500)

## Utilities
To find the central extraction in clusters of extractions (for labelling)

In [14]:
# model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2', device='cuda:3')

In [15]:
def find_central_text(text_list, model):
    # Utility to find central element of text_list in embedding space of model
    if len(text_list) == 0:
        return None

    # Embed all texts
    embeddings = model.encode(text_list, show_progress_bar=False)

    # Compute the center of embeddings
    embedding_center = np.mean(embeddings, axis=0)

    # Find the closest embedding to the center
    distances = cosine_distances([embedding_center], embeddings)[0]
    closest_idx = np.argmin(distances)

    # Return the text that is closest
    return text_list[closest_idx]

In [16]:
# # Assign cluster labels
# for col in cols_to_cluster:
#     overall_col2df_membership_grouped[col].loc[:,'cluster_label'] = \
#         overall_col2df_membership_grouped[col]['text'].apply(
#             find_central_text, model=model
#         )
#     print('Assigned labels for:', col)

In [17]:
fname = './data/scoring/overall_col2df_membership_grouped_v2.pkl'

# # Export cluster labels
# with open(fname, 'wb') as f:
#     pickle.dump(overall_col2df_membership_grouped, f)
# print('Wrote to:', fname)

# Read cluster labels
with open(fname, 'rb') as f:
    overall_col2df_membership_grouped = pickle.load(f)
print('Read from:', fname)

Read from: ./data/scoring/overall_col2df_membership_grouped_v2.pkl


## Agents + Objects

In [18]:
overall_col2df_membership_grouped['agent_or_object_norm_clean'][:10]

Unnamed: 0_level_0,n,text,cluster_label
cluster_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1931,157,"[tariffs, retaliatory tariffs, tariffs imposed by donald trump, the tariffs are a pretext for planned tariffs, us tariffs, tariff retaliations, tariff, tariffs paid by american importers, u.s. tariffs, tariffs set by donald trump, more tariffs, united states tariffs, dumb tariffs, trump tariff announcement, counter tariffs, tariff policy, tariff imposed by donald trump, legal justification for the tariffs, tariff policies, trade tariffs, reciprocal tariffs, the question about us tariffs, ame...",tariffs on the us
3697,124,"[🦋sarrah🦋, utinam🇨🇦🐈‍⬛🏴󠁧󠁢󠁷󠁬󠁳󠁿🐐🇺🇦🦦🐦‍🔥, solomonchik🇨🇦, macscot🇨🇦, sandyp🐳🐬🌿🐈‍⬛🇵🇸🇺🇦💙⚾️, smokey🇨🇦🇨🇦🇨🇦🇺🇦🇺🇦🇺🇦, maggiedbar🇨🇦, sgtmoredread🇨🇦, 🇨🇦leisa, karenincalgary🏈💙💛, polli🧚‍♀️, lorbsy🇨🇦, ldora🍁🏳️‍🌈🏳️‍⚧️🍉🌻🍄🌱, claudette🇨🇦, smittycanada1🇨🇦, 햏, 🇨🇦susane1960🇨🇦, gigib🇨🇦🪶🧡💛🤍🖤🇵🇪, ✡️abe🪶, 🐝, carol❤️cookies, marm🇨🇦, sueasidle49🤘🏻🎸🤘🏻🎵🤘🏻💙🤘🏻, 🦋💙indigenousartist💙🦋, 🍁mel🇨🇦, g g 🇨🇦🇨🇦🇨🇦, jmspikeguy🏴󠁧󠁢󠁷󠁬󠁳󠁿, dorothy🇨🇦, eridan🦈, ⚜️chipsauxbbq🇨🇦, shane🇨🇦, spazri💙🦋🇨🇦🇺🇦🏳️‍🌈🎼🎤, 🇨🇦🍁, hgvancouver🇨🇦, mgb💙, ,🇨🇦🍁💪⚖️, mason...",🦋sarrah🦋
3458,79,"[25% tariffs on canada, tariffs against canada, u.s. tariffs against canada, canadian tariffs on us goods, us tariffs on canada, canada's counter-tariffs against the us, retaliatory tariffs on canada, canada's tariffs on the us, canada's retaliatory tariffs, tariffs on canadian goods, canadian government regarding tariffs, canada's tariffs, tariffs on canada, mexico, and the eu, american tariffs on canadian goods, tariffs for importing canadian goods, canadian tariffs, tariffs in canada, tar...",tariffs against canada
255,73,"[elections canada, canadian elections, canadian election candidate, canadian federal election, the future of canada and its elections, canadian election, ontario elections, canada's elections, next canadian election, canadian presidential election, 2016 canadian presidential election, canadian presidential elections, canadian general election, canadian election campaign, current canadian election, presidential election in canada, canada's election, the canadian election, provincial elections...",canada's elections
1984,53,"[youngjay76, alaskagurl2000, miscellany101, bluestar47, monica1082, suza27, ikaren312, bindigirl5, forsaken61453, miasma667, bestiario123, umpire8911, tginette95, robinsnest333, tomas-tatru05, k1da1983, ruthg56, lxeagle17, riderbabe52, bnash61, nyl776@bluesky hi hi, ginger624, bakerbrown6, duenorth1@blue.sky, morey000, madeleine333_meidas, justme205, howell2913terry, silverbeard2026, nibnub47, bretjk65, lurelin2222, roadrunnerj21, fotus47, jubilado0505, myabradshaw78, tardis444, cgy2022, use...",tginette95
2161,51,"[fentanyl crisis, fentanyl issues, fentanyl, fentanyl issue, fentanyl operations, the problem drug is fentanyl, fentanyl availability, the fentanyl is a legal device, measures against fentanyl, fentanyl and firearms, fentanyl allegations, fentanyl problem, fentanyl flow, oxycontin and fentanyl, fentanyl supply, fentanyl claims, fentanyl angle, narrative about fentanyl, claims of fentanyl inflow, the narrative on fentanyl, chinese fentanyl, fentanyl labs, 43 lbs of fentanyl, on the fentanyl i...",fentanyl situation
789,45,"[kash patel, randeep dhalla, pushkar singh dhami, veerender singh jubbal, sakoon singh, kirit james singh, ira bhaskar, virinder singh kalra, yatan pal singh balhara, avtar singh lalpurwal, harcharan singh bhullar, sarwan singh pandher, balbir singh rajewal, ruldu singh mansa, kuldip singh, simran jeet singh, daleep singh, n. biren singh, davinder singh, shivani singh ghoshi, ranjit singh, shafali verma, gaganpreet singh randhawa, gaurav yadav, gurpreet singh, pankaj mishra, karanbir singh, ...",gurratan singh
923,43,"[trump's tariffs, donald trump's tariffs, warren buffett's comment on tariffs, impact of trump's tariffs, prime minister's tariff decision, trump tariffs, president trump's tariffs, donald trump and the auto tariffs, trump's tariffs question, donald trump tariffs, tariff dispute with trump, trump's tariff policies, donald trump's individual trade decisions, his electoral promise on tariffs, trump's tariffs on imports, donald trump and tariffs, donald trump's tariff threats, trump tariff, don...",trump's tariffs
371,43,"[$288 million, 1 billion dollars, $100 million worth of condoms, $1.3 billion, 125 billion dollars, $100 billion of net worth, $23 million, $2.4 billion, large amounts of wealth, 250 million dollars, 400 million dollars, 10 million dollars, 30 million dollar net worth, $5 million, 500 million dollars, 93 billion dollars, $10 million cheque, 150 billion euros, billions of dollars, $6.5 billion, $5 billion, 187 billion dollars, 6.5 billion dollars, 200 million dollars, 300 billion dollars, hun...",150 billion dollars
5633,41,"[liberal party of canada, new democratic party, green party of canada, liberal party, new democratic party, bloc québécois, green party, liberal party of canada and new democratic party, ndp or bloc québécois, pcs and ndp, bloc québécois, new democratic party, green party, liberal party of canada, new democratic party and people's party of canada, ndp and bloc, ontario liberals and ndp, liberal or ndp parties, conservative party of canada and new democratic party, us democrats and trudeau's ...",liberal party and ndp


## Agents

In [19]:
overall_col2df_membership_grouped['agent_norm_clean'][:10]

Unnamed: 0_level_0,n,text,cluster_label
cluster_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
43,116,"[🦋sarrah🦋, utinam🇨🇦🐈‍⬛🏴󠁧󠁢󠁷󠁬󠁳󠁿🐐🇺🇦🦦🐦‍🔥, solomonchik🇨🇦, macscot🇨🇦, sandyp🐳🐬🌿🐈‍⬛🇵🇸🇺🇦💙⚾️, smokey🇨🇦🇨🇦🇨🇦🇺🇦🇺🇦🇺🇦, maggiedbar🇨🇦, sgtmoredread🇨🇦, 🇨🇦leisa, karenincalgary🏈💙💛, polli🧚‍♀️, lorbsy🇨🇦, ldora🍁🏳️‍🌈🏳️‍⚧️🍉🌻🍄🌱, claudette🇨🇦, smittycanada1🇨🇦, 햏, 🇨🇦susane1960🇨🇦, gigib🇨🇦🪶🧡💛🤍🖤🇵🇪, ✡️abe🪶, 🐝, carol❤️cookies, marm🇨🇦, sueasidle49🤘🏻🎸🤘🏻🎵🤘🏻💙🤘🏻, 🦋💙indigenousartist💙🦋, 🍁mel🇨🇦, jmspikeguy🏴󠁧󠁢󠁷󠁬󠁳󠁿, dorothy🇨🇦, eridan🦈, ⚜️chipsauxbbq🇨🇦, shane🇨🇦, spazri💙🦋🇨🇦🇺🇦🏳️‍🌈🎼🎤, 🇨🇦🍁, hgvancouver🇨🇦, mgb💙, ,🇨🇦🍁💪⚖️, masonman250🇨🇦, 💙 ...",🦋sarrah🦋
401,53,"[youngjay76, miscellany101, aprile529, gaming_mama1967, bluestar47, monica1082, suza27, ikaren312, forsaken61453, miasma667, bestiario123, womanatmile0, tginette95, robinsnest333, tomas-tatru05, k1da1983, wednesday284, ruthg56, lxeagle17, mariamscog49, riderbabe52, bnash61, nyl776@bluesky hi hi, ginger624, bakerbrown6, duenorth1@blue.sky, madeleine333_meidas, justme205, howell2913terry, lynn1219, nibnub47, janeen123, bretjk65, mimi828, lurelin2222, tardis444, angelwoman501, cgy2022, ronxyz00...",luciath71
0,44,"[ndp (new democratic party), ndp liberal government, cpc caucus, ontario liberals, british columbia new democratic party, ndp, liberal party of canada, new democratic party, green party of canada, buffalo party of alberta, liberal party, new democratic party, bloc québécois, green party, ndp partisan, ndp candidate, alberta ndp, liberal party of canada and new democratic party, alberta new democratic party, saskatchewan party, ndp leader, manitoba ndp, ndp or bloc québécois, liberal caucus, ...",alberta ndp
74,37,"[🇨🇦 pauladawn🇨🇦, 🇨🇦 lulu💙💙💙, kimmie💙🌈🌊🐕🦋 🇨🇦, yvettejr718☮️💙♋️ 🌕♒️🌅♎️🏳️‍🌈🏳️‍⚧️, 🇨🇦 👊🇨🇦, 🎶🌈amber waves🌈🎶, 🇨🇦🇸🇪 🧡 🇺🇦🇵🇸, 🇨🇦 🇺🇦, ⏱📋david steinberg⏱📋, delvin 🇨🇦 🇫🇮, ᴘᴀᴡꜱ & ᴘᴏᴜᴛɪɴᴇ ❤️🇨🇦💙🇺🇦, n.j.c. 🍁 🇨🇦 🍁, 🇨🇦canadian harpy🇧🇲 🦫, b🎗️ 🇨🇦 🇺🇦 🇵🇸🏳️‍🌈🏳️‍⚧️, uɯzsoʇ 🎗🇨🇦🇺🇦🇬🇱🇫🇷🇪🇺, 🇨🇦 flushfoot 🦫🍁, x - 🇨🇦ferfer 🇨🇦, 🏳️‍🌈jorge godoy🇪🇺, 𝐭𝐚𝐥𝐞𝐬-𝐭𝐲𝐩𝐨𝐬, 🍁jocey designs🇨🇦, 🇨🇦norsk 🇨🇦 📎, 𝕠𝕤𝕔𝕒𝕣 𝕩-𝕣𝕒𝕪, 🇨🇦manitoba doubleair 🇧🇪🇺🇦, ᵀᴴᴱ ꀤngᒪoᖇioᑌᔕ🇨🇦¹, 🍁dountoothers 🇨🇦, 🇬🇧gyn 🇺🇸gynarchynow🇺🇦🥄, ⊶ 𝕎𝕋⑦, 🦋🧿tempestori 🌬🌊, 曹昌倫, ✨sabr...",🇨🇦 👊🇨🇦
168,34,"[canadians, canadian citizens, friends from canada, canadian public, people of quebec, canadian expatriates, majority of canadians, the citizens of canada, canadian people, public/canadians, citizens of canada, ontarians, albertans, canadian population, many canadians, canadian coworkers, most canadians, the canadian public, canadian public sentiment, some canadians, alberta residents, canadian consumers, canadian populace, canadian tourists, citizens of ontario, quebecers, quebeckers, any c...",citizens of canada
100,30,"[stephen harper and jeff flarety, pierre poilievre and stephen harper, stephen harper and pierre poilievre, justin trudeau, mark carney, chrystia freeland, david eby, liberal party of canada, mark carney and all premiers, canadian leaders, canadian political leaders, canadian government leaders, stephen harper, pierre poilievre, doug ford, canadian politicians, justin trudeau & mélanie joly, carney-trudeau, former canadian presidents, melanie joly, chrystia freeland, justin trudeau, canada's...",canadian political leaders
844,29,"[liberal party of canada (lpc), liberal party of canada, conservative party of canada, federal conservative voters, british columbia conservative party, conservative provinces, provincial conservative parties, conservative party of canada representative, liberal party canada, people's party of canada, canadian conservative party, federally elected conservatives, conservative provincial governments, the right wing in canada, canadian people's party, electing a conservative provincial governme...",conservative party of canada
498,28,"[justin trudeau and his government, canada’s next government, people attacking trudeau, trudeau government, justin trudeau's government, critics of trudeau, justin trudeau critic, the trudeau government, trudeau haters, justin trudeau's administration, justin trudeau's team, opponents of pm trudeau, pierre trudeau's government, anti-trudeau movement, justin trudeau and the federal government, trudeau supporters, team trudeau, trudeau governments, trudeau's ego, trudeau government backed by w...",justin trudeau's government
368,26,"[us voters, voters, white voters, potential voters, men voters, the voters, rural voters, american voters, young voters, eligible voters, non-voters, voter, kentucky voters, swing voters, the electorate, voters supporting the current leader, general voting public, electors, individual voters, australian voters, young male voters, german voters, women voters, primary voters, non-blue voters, nl voters]",voters
279,25,"[canadian government, canadian federal government, alberta government, canada's biggest oil lobby group, provincial governments of canada, governments of quebec, ontario, manitoba and british columbia, the government of canada, government of canada, federal government of canada, the canadian government, ontario government, canadian government (cda), provincial governments, canadian federal administration, canadian cabinet, alberta delegation, government of ontario, cabinet of canada, canada'...",canadian government


## Action

In [20]:
overall_col2df_membership_grouped['action_or_event_norm_clean'][:10]

Unnamed: 0_level_0,n,text,cluster_label
cluster_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
249,85,"[blame for tariffs, anticipate profits from tariffs, decide to impose tariffs, responsibility for tariffs, proposed counter-tariffs, advocating and educating about tariffs, explain tariffs to trump, is blamed for tariffs, attributing responsibility for tariffs, justify tariffs, justified tariffs, revising tariffs, acceptance of pain from tariffs, not imposing tariffs, justifies tariffs, shifting blame for tariffs, reintroducing tariffs, keeping counter tariffs, impose tariffs in response to,...",justify tariffs
377,75,"[imposed a 100% tariff on electric vehicles imported from china, imposed export tariffs, imposed tariffs, responded to tariffs, imposed unilateral tariffs, threatened further tariffs, responded to tariffs by imposing duties, led opposition against tariffs, utilized tariffs as a threat, advocated for tariffs to finance tax reductions, responded with tariffs, refused to remove tariffs, caused tariff war, refused to pause tariffs, initiated tariffs, communicated opposition to tariffs, tried to ...",decided to initiate tariffs
555,71,"[potential candidacy for prime minister, is considered as a potential prime minister, candidate for prime minister, wants to become prime minister, considered as prime minister, be recommended for prime minister, identified as next prime minister, is expected to become prime minister, is leading the race for prime minister, unlikely_to_be_prime_minister, is prime minister, is considered a strong candidate for prime minister, is predicted to become prime minister, expresses ambition for prime...",is regarded as the next prime minister
2095,70,"[threatening tariffs, issue threat of tariffs, respond to tariffs, responds to tariffs, react to tariffs, implemented retaliatory tariffs, imposed retaliatory tariffs, issue retaliatory tariffs, threatens with counter-tariffs, respond to potential tariffs, there will be counter tariffs, impose retaliatory tariffs, retaliating against tariffs, impose_retaliatory_tariff, indicated tariffs are due to territorial aggression, countered with tariffs of 155 billion dollars, threatened tariffs, took...",retaliating against tariffs
69,70,"[suggest_retaliate_with_tariffs, reiterated readiness for tariffs, responses to tariffs, response to tariffs in a phased manner, outlines response to tariffs, announce_retaliatory_tariffs, proposed retaliatory tariffs, announce retaliatory tariffs, outline response to tariffs, outlined response to tariffs, indicate retaliatory tariffs, outlining response to tariffs, campaign on retaliatory tariffs, confirms response with tariffs, promised a mirror response to tariffs, signal readiness to imp...",announcing retaliatory tariffs
4655,68,"[paid_for_tariffs, propose tariff, impose_tariffs, implementing tariffs, implement a flat 50% tariff, proposed tariffs, propose 100% tariff, implement_export_tariff, set_tariffs_to_zero, covered tariffs, impose_tariff, implement tariff policy, plan tariffs, announce reciprocal tariffs, proposes tariff, introducing tariffs on, implement targeted tariffs, proposed tariff, propose a tariff, suggesting tariff, call for tariff, proposing a tariff, enforce tariffs, utilizing tariff revenue for sup...",implementing tariffs against
382,61,"[call for an election, call a snap election, hold special elections, hold election, hold an election, postpone elections, call snap election, call a general election, call early election, hold general election, start a special election, consider calling an election, postpone election, decide to hold an election, decline to call special election, call an early snap election, call_to_postpone_elections, encourage_special_elections, support special elections, support_special_elections, urge for...",call for an election
191,60,"[confirmation of tariffs, increase tariffs, announced incremental tariff implementation, will raise tariffs, prepare tariffs, may increase tariffs, promise to extend tariff, stated current tariff status, prepared for tariffs, will impose stronger tariffs, increased tariffs, impose additional tariffs, clarified tariff situation, changes position on tariffs, will raise tariffs or prices, will implement tariffs on us goods, has people pursuing tariffs due to new tax, will impose 25% tariffs, ne...",increases tariffs
4229,56,"[impose tariffs, imposing tariffs, impose tariff, introduce tariffs, announces he will raise tariffs further, imposing a tariff, announced increase in tariffs, impose export tariff, imposition of tariffs, announce 25% tariffs, impose additional export tariffs, impose export tariffs, impose tariffs on the us, plans to increase tariffs, imposes tariffs, announcing continued 25% tariffs, implement tariffs on china, imposing tariffs on, impose tariffs on, impose tariff measures, impose a tariff ...",imposing tariffs on
82,52,"[is damaging the economy of canada, have left canada economically illiterate, have led to canada's demise, have led to the demise of canada, has led to the demise of canada, has negatively impacted canada, is seen as detrimental to canada's economy, have caused financial damage to canada, has dragged canada down to the same level as donald trump, has left canada vulnerable, has neglected canadian workers, has made canada vulnerable, has destroyed canada, has ruined canada, are a significant ...",has negatively impacted canada


## Object

In [21]:
overall_col2df_membership_grouped['object_norm_clean'][:10]

Unnamed: 0_level_0,n,text,cluster_label
cluster_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
133,86,"[25% tariffs on canada, tariffs against canada, u.s. tariffs against canada, trudeau's announcement of retaliatory tariffs, canadian tariffs on us goods, us tariffs on canada, canada's counter-tariffs against the us, retaliatory tariffs on canada, canada's tariffs on the us, canada's retaliatory tariffs, tariffs on canadian goods, canadian government regarding tariffs, canada's tariffs, tariffs on canada, mexico, and the eu, american tariffs on canadian goods, tariffs for importing canadian ...",tariffs against canada
2682,84,"[tariffs imposed by donald trump, us tariffs, tariffs paid by american importers, u.s. tariffs, tariffs set by donald trump, united states tariffs, dumb tariffs, tariff imposed by donald trump, the question about us tariffs, american tariffs, unjust tariffs, tariffs list, don's tariffs, question about us tariffs, tariffs of 155 billion dollars, usa tariffs, tariffs imposed by the trump administration, tariffs on the u.s., 38% of products subject to tariffs, tariffs on usa goods, tariffs and ...",tariffs on the us
852,66,"[the future of canada and its elections, canadian election, elections canada, canadian electoral system, canada's elections, next canadian election, canadian presidential election, canadian elections, 2016 canadian presidential election, canadian presidential elections, canadian general election, current canadian election, presidential election in canada, canada's election, canada's electoral process, the canadian election, canada and upcoming elections, canadian federal election, canada ele...",canadian elections
149,54,"[fentanyl issues, fentanyl, fentanyl issue, fentanyl operations, fentanyl_seizures, the problem drug is fentanyl, fentanyl availability, the fentanyl is a legal device, measures against fentanyl, fentanyl and firearms, fentanyl allegations, fentanyl problem, fentanyl flow, oxycontin and fentanyl, fentanyl supply, trans individuals affected by fentanyl, fentanyl claims, fentanyl angle, fentanyl seizures, fentanyl levels in maple syrup, claims of fentanyl inflow, chinese fentanyl, fentanyl lab...",fentanyl situation
813,49,"[justin trudeau and donald trump, justin trudeau & keir starmer, françois legault and justin trudeau, justin trudeau and mark carney, brian mulroney and stephen harper, kamala harris and justin trudeau, justin trudeau and claudia sheinbaum, melania trump and justin trudeau, justin trudeau, donald trump, barron and trudeau, trudeau and trump, baron and justin trudeau, justin trudeau and shinebaum, trudeau and guilbeault, justin trudeau and charlie angus, trudeau and chretien, pierre trudeau a...",justin trudeau and sophie grégoire
3728,48,"[2020 united states presidential election, 2026 elections, 2024 elections, 2024 election, presidential election in 2028, 2025 presidential election, 2020 u.s. presidential election, 2020 us presidential election, 2026_elections, 2028 election, 2028 presidential election, 2024 presidential election, fall 2025 elections, november 2026 elections, election until october 2026, federal election by october 20, 2025, 2021 election, april 1, 2025 election, next election for president of the united st...",2024 us election
1095,46,"[tariffs, tariff, more tariffs, tariff wars, trade tariffs, the trade issues caused by tariffs, the economic crisis due to tariffs, economic consequences of the tariff war, tariffs and other priorities, tariff/trade war situation, focus of tariffs, concerns about tariffs, the tariff situation, all tariffs, tariff negotiations, tariff war, tariff war ramifications, t*'s tariff problem, discussions on tariffs in the us, tariff fight, higher tariffs, tariff battle, root problem of tariffs, econ...",the tariff situation
287,44,"[the tariffs are a pretext for planned tariffs, tariff retaliations, federal tariff response, retaliatory tariffs, trump tariff announcement, counter tariffs, tariff policy, tariff policies, response in the tariff war against the united states, support for blanket tariffs, position on tariffs, a panel on response to american tariffs, tariffs plan, tariff threats, counter-tariffs and boycotts, upcoming global tariff threats in april, tariff retaliation, tariff reductions proposal, removing ta...",retaliatory tariffs
517,43,"[legal justification for the tariffs, reciprocal tariffs, tariff act, tariff question, tariff issue, conditional tariff removal, claim about tariffs, reciprocal tariff, dollar for dollar tariffs, tariff revenues, tariff revenue, tariff problem, response tariffs, tariff situation, tariff carve-outs, tariff funds, google searches for 'what is a tariff', tariff suspension, market-moving tariffs, tariff process, tariffs statement, similar tariffs, tariff system, tariff position, tariff managemen...",tariff question
496,43,"[pierre poilievre, poilievre's strategy, associates of pierre poilievre, >> poilievre, poilievre, constituents of pierre poilievre, the necessity to stop poilievre, the actions of pierre poilievre, rise of pierre poilievre, pierre poilievre (pp), pierre poilievre's interventions, poilievre's useful idiot, pierre poilievre's television advertisements, pierre poilievre's voice, comparison with pierre poilievre, pierre poilievre's response, pierre poilièvre, pierre poilievre's statements, pierr...",pierre poilievre's position


## Narrative

In [22]:
overall_col2df_membership_grouped['narrative_clean'][:10]

Unnamed: 0_level_0,n,text,cluster_label
cluster_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
5013,221,"[the election truth alliance claims to have uncovered evidence suggesting potential tampering of voting machines in the ongoing discussions about election integrity., the election truth alliance claims to have discovered evidence of discrepancies in the 2024 presidential election, prompting discussions about the integrity of the election process., election truth has undertaken a detailed examination of the 2024 election integrity by reviewing various statistics and data., election truth has ...","the election truth alliance is calling for an investigation into the 2024 election, claiming to have found evidence of anomalies suggesting it was rigged in favor of trump."
9787,178,"[trudeau is resigning from his position as prime minister under pressure, reflecting internal political turmoil., justin trudeau is resigning while maintaining a strong leadership for canada., justin trudeau has resigned from his leadership role in the liberal party., justin trudeau has resigned, impacting the political landscape ahead of the next election., justin trudeau is expected to resign next week amidst criticism over tariffs that will burden canadian citizens., prime minister trudea...",justin trudeau is resigning as prime minister following the liberal party's decision to choose his successor.
1713,163,"[carney is portrayed as intentionally dishonest and reckless, undermining his credibility and potentially losing voter support., mark carney has been accused of dishonesty multiple times before even being elected., mark carney has been exposed for dishonesty multiple times., mark carney is accused of fabricating statements on a regular basis., accusation made against mark carney concerning dishonesty regarding the phoenix project., mark carney's credibility is under scrutiny as he is accused...",mark carney has been accused of dishonest behavior.
2640,149,"[nathan taylor from election truth alliance is uncovering election discrepancies in the united states., nathan taylor from the election truth alliance raises concerns about election result discrepancies during a discussion on the mark thompson show., nathan taylor from the election truth alliance discusses potential discrepancies in election results on a forum aimed at transparency., nathan taylor from the election truth alliance is addressing concerns about discrepancies in election results...",nathan taylor from the election truth alliance addresses concerns regarding potential discrepancies in election results during an appearance on the mark thompson show.
321,144,"[donald trump announced a 25% tariff on canadian goods, escalating trade tensions., donald trump threatens tariffs against canada, possibly as a distraction from other news., trump is expected to impose tariffs on canada, raising concerns about economic impacts., donald trump announced that previously set tariffs on canada and mexico would take effect shortly, contributing to a decline in us stocks., trump has imposed a significant tariff on canadian imports affecting trade relations., donal...",donald trump is taking economic action against canada by imposing tariffs.
6953,130,"[justin trudeau is taking a stand against donald trump's trade policies by announcing canada's response to the ongoing trade war., pm trudeau and his team announced retaliatory tariffs in response to u.s. government actions., justin trudeau announced retaliatory tariffs against the u.s. in response to trade disputes., justin trudeau is asserting canada's resolve to respond decisively against u.s. tariffs imposed by president trump., justin trudeau addressed concerns regarding tariffs after w...",justin trudeau is taking measures to respond to tariffs imposed on canada.
10106,123,"[european leaders, including emmanuel macron, convened to discuss urgent matters with ukrainian president volodymyr zelensky., european leaders, including olaf scholz, convened to discuss urgent matters with ukrainian president volodymyr zelensky., european leaders, including giorgia meloni, convened to discuss urgent matters with ukrainian president volodymyr zelensky., european leaders, including donald tusk, convened to discuss urgent matters with ukrainian president volodymyr zelensky., ...","european leaders, including mette frederiksen, held an emergency meeting with ukrainian president volodymyr zelensky to discuss urgent issues."
2595,116,"[justin trudeau reaffirms canada's support for ukraine amidst the ongoing conflict with russia., justin trudeau reaffirms canada's support for ukraine in the context of ongoing conflict., justin trudeau is promoting canada internationally to highlight its support for ukraine., pm trudeau's presence with zelensky in the eu emphasizes canada's support for ukraine., pm trudeau is reaffirming canada's commitment to support ukraine amidst rising international tensions., justin trudeau is expressi...",justin trudeau is expressing canada's support for ukraine amid ongoing conflicts.
2383,108,"[trudeau stated that canada will have a strong and proportional response to anticipated tariffs from trump., justin trudeau confirmed canada's preparedness to implement targeted tariffs in response to potential actions by donald trump., justin trudeau has declared a 25% tariff on goods from the united states as part of escalating trade tensions between the two countries., justin trudeau acknowledges the tariffs imposed by trump and plans to respond thoughtfully to mitigate impacts on canadia...",justin trudeau has announced retaliatory tariffs against the u.s. due to trump's tariff policies.
803,106,"[the negative sentiment towards jagmeet singh indicates a broader skepticism about alternate leadership options., jagmeet singh is also being criticized for contributing to unproductive debates in parliament., jagmeet singh is criticized for his actions which may undermine the party's efforts., jagmeet singh faces criticism from user, who considers him unsuitable as leader., jagmeet singh is being criticized for his perceived alignment with conservative positions, suggesting he is not fit to...",jagmeet singh is being criticized for his political stance and actions.


## Normalizing dataframes

In [120]:
def parse_created_at(value):
    """Convert timestamp into a Python datetime object."""
    if pd.isna(value):
        return None
    
    if isinstance(value, (float, int)):
        return datetime.datetime.utcfromtimestamp(value)
    
    if isinstance(value, str):
        try:
            return datetime.datetime.fromisoformat(value.replace('Z', '+00:00'))
        except ValueError:
            return None  # or handle differently
    
    return None

In [121]:
if not 'full_text' in df_x:
    df_x = df_x.set_index('record_id').join(
        posts_x.drop_duplicates(['tweet_id'], keep='last').set_index('tweet_id')[[
            'full_text', 'created_at',
        ]]
    ).reset_index()
print(df_x.shape)

(54397, 14)


In [122]:
if not 'record_text' in df_bluesky:
    df_bluesky = df_bluesky.set_index('record_id').join(
        posts_bluesky.drop_duplicates(['uri'], keep='last').set_index('uri')[[
            'record_text', 'record_createdAt',
        ]]
    ).reset_index()
print(df_bluesky.shape)

(38870, 14)


# KG Definition

In [123]:
pd.set_option('display.max_colwidth', 50)
pd.set_option('display.max_columns', 100)

In [237]:
# Mappings from platform-specific column names to universal ones
platform2col_norm2col = {
    'Reddit': {
        'created_at': 'created_utc',
    },
    'Bluesky': {
        'text': 'record_text',
        'created_at': 'record_createdAt',
        'record_id': 'uri',
        'like_count': 'like_activity',
        'user_id': 'did',
        'followers_count': 'follower_count',
        'post_count': 'status_count',
        'from_id': 'did',
        'to_id': 'to_did',
        'interaction_type': 'mentiontype',
        'author_id': 'did',
    },
    'X': {
        'text': 'full_text',
        'created_at': 'created_at',
        'record_id': 'tweet_id',
        'like_count': 'like_count',
        'user_id': 'user_id',
        'profile_url': 'url',
        'post_count': 'tweet_count',
        'from_id': 'from_user_id',
        'to_id': 'to_user_id',
        'interaction_type': 'tweet_type',
        'author_id': 'author_id',
    },
}

In [266]:
class SocialMediaKnowledgeGraph:
    def __init__(self, uri, user, password, database="neo4j"):
        """Initialize connection to Neo4j database."""
        self.driver = GraphDatabase.driver(uri, auth=(user, password))
        self.database = database

    def close(self):
        """Close the driver connection."""
        self.driver.close()

    def clear_database(self):
        """Clear all nodes and relationships in the database."""
        with self.driver.session(database=self.database) as session:
            session.run("MATCH (n) DETACH DELETE n")

    def create_indices(self):
        """Create indices for better performance."""
        with self.driver.session(database=self.database) as session:
            session.run("CREATE CONSTRAINT IF NOT EXISTS FOR (p:Post) REQUIRE p.id IS UNIQUE")
            session.run("CREATE CONSTRAINT IF NOT EXISTS FOR (u:User) REQUIRE u.id IS UNIQUE")
            session.run("CREATE CONSTRAINT IF NOT EXISTS FOR (e:Entity) REQUIRE e.name IS UNIQUE")
            session.run("CREATE CONSTRAINT IF NOT EXISTS FOR (a:Action) REQUIRE a.name IS UNIQUE")
            session.run("CREATE CONSTRAINT IF NOT EXISTS FOR (n:Narrative) REQUIRE n.name IS UNIQUE")

    def create_post_node(self, tx, post_info):
        """Create a Post node."""
        query = """
        MERGE (p:Post {id: $id})
        SET p.text = $text,
            p.platform = $platform,
            p.created_at = $created_at,
            p.score = $score,
            p.impressions = $impressions,
            p.quotes = $quotes,
            p.likes = $likes,
            p.bookmarks = $bookmarks,
            p.retweets = $retweets,
            p.replies = $replies,
            p.url = $url
        RETURN p
        """
        tx.run(query, **post_info)

    def create_user_node(self, tx, user_info):
        """Create a User node."""
        query = """
        MERGE (u:User {id: $id})
        SET u.name = $name,
            u.username = $username,
            u.description = $description,
            u.profile_url = $profile_url,
            u.created_at = $created_at,
            u.location = $location,
            u.verified = $verified,
            u.followers_count = $followers_count,
            u.following_count = $following_count,
            u.post_count = $post_count
        RETURN u
        """
        tx.run(query, **user_info)

    def create_entity_node(self, tx, entity_info):
        """Create an Entity node."""
        query = """
        MERGE (e:Entity {name: $name})
        SET e.raw_names = $raw_names,
            e.cluster_id = $cluster_id
        RETURN e
        """
        tx.run(query, **entity_info)

    def create_action_node(self, tx, action_info):
        """Create an Action node."""
        query = """
        MERGE (a:Action {name: $name})
        SET a.raw_names = $raw_names,
            a.cluster_id = $cluster_id
        RETURN a
        """
        tx.run(query, **action_info)

    def create_narrative_node(self, tx, narrative_info):
        """Create a Narrative node."""
        query = """
        MERGE (n:Narrative {name: $name})
        SET n.raw_names = $raw_names,
            n.cluster_id = $cluster_id
        RETURN n
        """
        tx.run(query, **narrative_info)

    def create_post_user_relationship(self, tx, post_id, user_id):
        """Connect a User to a Post."""
        query = """
        MATCH (p:Post {id: $post_id})
        MATCH (u:User {id: $user_id})
        MERGE (u)-[:POSTED]->(p)
        """
        tx.run(query, post_id=post_id, user_id=user_id)

    def create_post_extraction_relationships(
        self, tx, post_id, extraction_id, agent_name, action_name, object_name,
        narrative_name
    ):
        """Connect Post to Agent, Action, Object, and Narrative nodes."""
        query = """
        MATCH (p:Post {id: $post_id})
        MATCH (a:Entity {name: $agent_name})
        MATCH (ac:Action {name: $action_name})
        MATCH (o:Entity {name: $object_name})
        MATCH (n:Narrative {name: $narrative_name})
        
        MERGE (p)-[:MENTIONS {extraction_id: $extraction_id, post_id: $post_id}]->(a)
        MERGE (p)-[:MENTIONS {extraction_id: $extraction_id, post_id: $post_id}]->(ac)
        MERGE (p)-[:MENTIONS {extraction_id: $extraction_id, post_id: $post_id}]->(o)
        MERGE (p)-[:CONTAINS_NARRATIVE {extraction_id: $extraction_id, post_id: $post_id}]->(n)
        """
        tx.run(query, post_id=post_id, extraction_id=extraction_id,
               agent_name=agent_name, action_name=action_name,
               object_name=object_name, narrative_name=narrative_name)

    def create_extraction_chain_relationships(
        self, tx, post_id, extraction_id, agent_name, action_name, object_name,
        narrative_name
    ):
        """Connect Agent → Action → Object + link to Narrative."""
        query = """
        MATCH (a:Entity {name: $agent_name})
        MATCH (ac:Action {name: $action_name})
        MATCH (o:Entity {name: $object_name})
        MATCH (n:Narrative {name: $narrative_name})
        
        MERGE (a)-[:DOES {extraction_id: $extraction_id, post_id: $post_id}]->(ac)
        MERGE (ac)-[:AFFECTS {extraction_id: $extraction_id, post_id: $post_id}]->(o)
        MERGE (a)-[:PART_OF_NARRATIVE {extraction_id: $extraction_id, post_id: $post_id}]->(n)
        MERGE (ac)-[:PART_OF_NARRATIVE {extraction_id: $extraction_id, post_id: $post_id}]->(n)
        MERGE (o)-[:PART_OF_NARRATIVE {extraction_id: $extraction_id, post_id: $post_id}]->(n)
        """
        tx.run(query, post_id=post_id, extraction_id=extraction_id,
               agent_name=agent_name, action_name=action_name,
               object_name=object_name, narrative_name=narrative_name)

    def create_user_interaction(
        self, tx, from_user_id, to_user_id, interaction_type
    ):
        """Connect Users based on interactions (mention, reply, quote)."""
        query = """
        MATCH (u1:User {id: $from_user_id})
        MATCH (u2:User {id: $to_user_id})
        MERGE (u1)-[r:INTERACTS_WITH {type: $interaction_type}]->(u2)
        """
        tx.run(
            query, from_user_id=from_user_id, to_user_id=to_user_id,
            interaction_type=interaction_type
        )

    def lookup_cluster_info(
        self, text, mapping_df, mapping_df_grouped, default=(None, None, [])
    ):
        """Helper to find cluster id, label and raw names for a given text."""
        # Mapping_df must have "text" and "cluster_label" columns
        match = mapping_df[mapping_df['text'] == text]
        if not match.empty:
            cluster_id = match.iloc[0]['cluster_id']
            label = mapping_df_grouped.loc[cluster_id]['cluster_label']
            raw_names = mapping_df_grouped.loc[cluster_id]['text']
            return cluster_id, label, raw_names
        else:
            return default if default else (None, None, [])

    def create_knowledge_graph(
        self,
        df_reddit,
        df_bluesky,
        df_x,
        posts_x,
        posts_bluesky,
        users_x,
        users_bluesky,
        interactions_x,
        interactions_bluesky,
        overall_col2df_membership,
        overall_col2df_membership_grouped,
    ):
        """Build the full knowledge graph from multiple datasets."""
        
        # Create indices first
        self.create_indices()

        with self.driver.session(database=self.database) as session:
            # ==== Process Posts and Extractions ====
            # for platform_name, df_posts in zip(
            #     ['Reddit', 'Bluesky', 'X'],
            #     [df_reddit, df_bluesky, df_x]
            # ):
            #     print(f"Processing platform: {platform_name}")
                
            #     for post_id, post_df in tqdm(df_posts.groupby('record_id')):
            #         # Create Post node
            #         sample_row = post_df.iloc[0]
                    
            #         post_info = {
            #             "id": post_id,
            #             "text": sample_row[platform2col_norm2col[platform_name].get('text', 'text')],
            #             "platform": platform_name,
            #             "created_at": parse_created_at(
            #                 sample_row[platform2col_norm2col[platform_name].get('created_at')]
            #             ),
            #             "score": sample_row.get('score', None),
            #             # will fill later from full post metadata
            #             "impressions": None,
            #             "quotes": None,
            #             "likes": None,
            #             "bookmarks": None,
            #             "retweets": None,
            #             "replies": None,
            #             "url": sample_row.get('permalink', None)
            #         }
            #         session.execute_write(self.create_post_node, post_info)

            #         extraction_counter = 0
                    
            #         for _, row in post_df.iterrows():
            #             agent_raw = row['agent_norm_clean']
            #             action_raw = row['action_or_event_norm_clean']
            #             object_raw = row['object_norm_clean']
            #             narrative_raw = row['narrative_clean']
                        
            #             # Skip incomplete extractions
            #             if pd.isna(agent_raw) or pd.isna(action_raw) or pd.isna(object_raw) or pd.isna(narrative_raw):
            #                 continue
                        
            #             # Lookup cluster labels
            #             agent_cluster_id, agent_cluster_label, agent_cluster_raw = \
            #                 self.lookup_cluster_info(
            #                 agent_raw, overall_col2df_membership['agent_or_object_norm_clean'],
            #                 overall_col2df_membership_grouped['agent_or_object_norm_clean']
            #             )
            #             action_cluster_id, action_cluster_label, action_cluster_raw = \
            #                 self.lookup_cluster_info(
            #                 action_raw, overall_col2df_membership['action_or_event_norm_clean'],
            #                 overall_col2df_membership_grouped['action_or_event_norm_clean']
            #             )
            #             object_cluster_id, object_cluster_label, object_cluster_raw = \
            #                 self.lookup_cluster_info(
            #                 object_raw, overall_col2df_membership['agent_or_object_norm_clean'],
            #                 overall_col2df_membership_grouped['agent_or_object_norm_clean']
            #             )
            #             narrative_cluster_id, narrative_cluster_label, narrative_cluster_raw = \
            #                 self.lookup_cluster_info(
            #                 narrative_raw, overall_col2df_membership['narrative_clean'],
            #                 overall_col2df_membership_grouped['narrative_clean']
            #             )

            #             # Create Entity Nodes
            #             session.execute_write(self.create_entity_node, {
            #                 "name": agent_cluster_label,
            #                 "raw_names": agent_cluster_raw,
            #                 "cluster_id": agent_cluster_id,
            #             })
            #             session.execute_write(self.create_action_node, {
            #                 "name": action_cluster_label,
            #                 "raw_names": action_cluster_raw,
            #                 "cluster_id": action_cluster_id,
            #             })
            #             if object_raw and isinstance(object_raw, str):
            #                 session.execute_write(self.create_entity_node, {
            #                     "name": object_cluster_label,
            #                     "raw_names": object_cluster_raw,
            #                     "cluster_id": object_cluster_id,
            #                 })
            #             session.execute_write(self.create_narrative_node, {
            #                 "name": narrative_cluster_label,
            #                 "raw_names": narrative_cluster_raw,
            #                 "cluster_id": narrative_cluster_id,
            #             })

            #             # Create Relationships
            #             session.execute_write(
            #                 self.create_post_extraction_relationships,
            #                 post_id, extraction_counter,
            #                 agent_cluster_label, action_cluster_label,
            #                 object_cluster_label, narrative_cluster_label,
            #             )
            #             session.execute_write(
            #                 self.create_extraction_chain_relationships,
            #                 post_id, extraction_counter,
            #                 agent_cluster_label, action_cluster_label,
            #                 object_cluster_label, narrative_cluster_label
            #             )
                        
            #             extraction_counter += 1

            # # ==== Enrich Posts with Likes/Retweets/Reactions ====
            # print("Enriching posts with full metadata...")
            # self._update_posts_from_metadata(session, posts_x, "X")
            # self._update_posts_from_metadata(session, posts_bluesky, "Bluesky")

            # # ==== Process Users ====
            # print("Creating Users...")
            # self._create_users(session, users_x, "X")
            # self._create_users(session, users_bluesky, "Bluesky")

            # # ==== Create POSTED Relationships ====
            # print("Creating POSTED relationships...")
            # self._create_post_user_relationships(session, posts_x, "X")
            # self._create_post_user_relationships(session, posts_bluesky, "Bluesky")

            # ==== Process User Interactions ====
            print("Creating User Interactions...")
            self._create_user_interactions(session, interactions_x, platform="X")
            self._create_user_interactions(session, interactions_bluesky, platform="Bluesky")
            
            print("DONE Knowledge Graph creation complete!")

    def _update_posts_from_metadata(self, session, posts_df, platform):
        """Update Post nodes with engagement metadata (likes, retweets, replies)."""
        if posts_df is None:
            return
        
        for _, row in tqdm(posts_df.iterrows(), total=len(posts_df), desc=f"Updating posts ({platform})"):
            post_id = row.get(platform2col_norm2col[platform].get('record_id', 'record_id'))
            if not post_id:
                continue

            impressions = row.get('impression_count', None)
            quotes = row.get('quote_count', None)
            likes = row.get(platform2col_norm2col[platform].get('like_count', 'like_count'), None)
            bookmarks = row.get('bookmark_count', None)
            retweets = row.get('retweet_count', None)
            replies = row.get('reply_count', None)

            query = """
            MATCH (p:Post {id: $post_id})
            SET p.impressions = $impressions,
                p.quotes = $quotes,
                p.likes = $likes,
                p.bookmarks = $bookmarks,
                p.retweets = $retweets,
                p.replies = $replies
            """
            session.run(
                query, post_id=str(post_id),
                impressions=impressions,
                quotes=quotes,
                likes=likes,
                bookmarks=bookmarks,
                retweets=retweets,
                replies=replies,
            )

    def _create_users(self, session, users_df, platform):
        """Create User nodes."""
        if users_df is None:
            return
        
        for _, row in tqdm(users_df.iterrows(), total=len(users_df), desc=f"Creating users ({platform})"):
            user_id = row.get(platform2col_norm2col[platform].get('user_id', 'user_id'))
            if not user_id:
                continue

            user_info = {
                "id": str(user_id),
                "name": row.get('name', ''),
                "username": row.get('username', ''),
                "description": row.get('description', ''),
                "profile_url": row.get(
                    platform2col_norm2col[platform].get('profile_url', 'profile_url'),
                    None
                ),
                "created_at": parse_created_at(
                    row.get('created_at', None)
                ),
                "location": row.get('location', None),
                "verified": row.get('verified', None),
                "followers_count": row.get(
                    platform2col_norm2col[platform].get('followers_count', 'followers_count'),
                    None
                ),
                "following_count": row.get('following_count', None),
                "post_count": row.get(
                    platform2col_norm2col[platform].get('post_count', 'post_count'),
                    None
                ),
            }

            session.execute_write(self.create_user_node, user_info)

    def _create_user_interactions(self, session, interactions_df, platform):
        """Create User-to-User interactions."""
        if interactions_df is None:
            return
        
        for _, row in tqdm(
            interactions_df.iterrows(), total=len(interactions_df),
            desc=f"Creating interactions ({platform})"
        ):
            from_id = row.get(platform2col_norm2col[platform].get('from_id', 'from_id'))
            to_id = row.get(platform2col_norm2col[platform].get('to_id', 'to_id'))
            if not (from_id and to_id):
                continue

            interaction_type = row.get(
                platform2col_norm2col[platform].get('interaction_type', 'interaction_type')
            )
            
            if pd.isna(interaction_type):
                continue

            session.execute_write(
                self.create_user_interaction, str(from_id), str(to_id), interaction_type
            )
    
    def _create_post_user_relationships(self, session, posts_df, platform):
        """Create POSTED relationships between Users and Posts."""
        if posts_df is None:
            return
        
        for _, row in tqdm(posts_df.iterrows(), total=len(posts_df), desc=f"Creating POSTED relationships ({platform})"):
            post_id = row.get(platform2col_norm2col[platform].get('record_id', 'record_id'))
            author_id = row.get(platform2col_norm2col[platform].get('author_id', 'author_id'))
            
            if post_id and author_id:
                session.execute_write(
                    self.create_post_user_relationship, str(post_id), str(author_id)
                )

# Create KG

## Filter X and Bluesky data

In [241]:
# Filter posts_x by statistics
filtered_posts_x = posts_x[
    (posts_x['impression_count'] >= 100_000) | 
    (posts_x['quote_count'] >= 500) | 
    (posts_x['like_count'] >= 100_000) |
    (posts_x['retweet_count'] >= 50_000)
]
keep_x_post_ids = set(filtered_posts_x['tweet_id'].astype(str))
print('# to keep:', len(keep_x_post_ids))

# to keep: 3066


In [242]:
# Filter posts_bluesky by statistics
filtered_posts_bluesky = posts_bluesky[
    posts_bluesky['follower_activity'] >= 50
]
keep_bluesky_post_ids = set(filtered_posts_bluesky['uri'])
print('# to keep:', len(keep_bluesky_post_ids))

# to keep: 3528


In [243]:
# Filter extraction dataframes
filtered_df_x = df_x[
    df_x['record_id'].astype(str).isin(keep_x_post_ids)
]
print(filtered_df_x.shape)
filtered_df_bluesky = df_bluesky[
    df_bluesky['record_id'].astype(str).isin(keep_bluesky_post_ids)
]
print(filtered_df_bluesky.shape)

(935, 14)
(619, 14)


In [244]:
# Filter users and interactions
# X
filtered_users_x = users_x[
    users_x['user_id'].isin(filtered_posts_x['author_id'].values)
].drop_duplicates(['user_id'], keep='last')
print(filtered_users_x.shape)

filtered_interactions_x = interactions_x[
    interactions_x['tweet_id'].isin(filtered_posts_x['tweet_id'].values)
].drop_duplicates(['tweet_id'], keep='last')
print(filtered_interactions_x.shape)

# Bluesky
filtered_users_bluesky = users_bluesky[
    users_bluesky['did'].isin(filtered_posts_bluesky['did'].values)
].drop_duplicates(['did'], keep='last')
print(filtered_users_bluesky.shape)

filtered_interactions_bluesky = interactions_bluesky[
    interactions_bluesky['uri'].isin(filtered_posts_bluesky['uri'].values)
].drop_duplicates(['uri'], keep='last')
print(filtered_interactions_bluesky.shape)

(431, 24)
(3066, 5)
(1402, 10)
(3528, 6)


## Run

In [267]:
def main():
    # Initialize knowledge graph
    kg = SocialMediaKnowledgeGraph(
        uri=uri,
        user=username,
        password=password,
        database="neo4j",
    )

    try:
        # Clear existing data
        # kg.clear_database()

        # Create knowledge graph
        kg.create_knowledge_graph(
            df_reddit=df_reddit,
            df_bluesky=filtered_df_bluesky,
            df_x=filtered_df_x,
            posts_x=filtered_posts_x,
            posts_bluesky=filtered_posts_bluesky,
            users_x=filtered_users_x,
            users_bluesky=filtered_users_bluesky,
            interactions_x=filtered_interactions_x,
            interactions_bluesky=filtered_interactions_bluesky,
            overall_col2df_membership=overall_col2df_membership,
            overall_col2df_membership_grouped=overall_col2df_membership_grouped,
        )

    finally:
        # Close the driver connection
        kg.close()

In [None]:
print("Creating KG")
main()

Creating KG
Creating User Interactions...


Creating interactions (X):   1%|          | 26/3066 [00:04<08:50,  5.73it/s]

# Queries

In [None]:
"""
1. 2-hop neighborhood around a given post

MATCH (p:Post {id: 'ID'})
OPTIONAL MATCH (p)-[r1]->(n1)
OPTIONAL MATCH (n1)-[r2]->(n2)
RETURN p, r1, n1, r2, n2

2. Ego-graph around a given post

MATCH (p:Post {id: 'ID'})
OPTIONAL MATCH (p)-[r1]->(n1)
OPTIONAL MATCH (n1)-[r2]->(n2)
WHERE n2 IN [p, n1]
RETURN p, r1, n1, r2, n2
"""

In [None]:
# Example queries from ChatGPT
"""
1. Find the Top Agents by Number of Posts

MATCH (a:Agent)<-[:HAS_AGENT]-(p:Post)
RETURN a.name AS agent, COUNT(p) AS post_count
ORDER BY post_count DESC
LIMIT 10

✅ What it does: Shows the most frequently mentioned agents across posts.
2. Find Agent → Action → Object Chains

MATCH (agent:Agent)-[:PERFORMS_ACTION]->(action:Action)-[:AFFECTS_OBJECT]->(object:Object)
RETURN agent.name AS agent, action.name AS action, object.name AS object
LIMIT 20

✅ What it does: Lists out extracted semantic triples: who is doing what to whom.
3. Top Narratives by Number of Posts

MATCH (n:Narrative)<-[:HAS_NARRATIVE]-(p:Post)
RETURN n.name AS narrative, COUNT(p) AS post_count
ORDER BY post_count DESC
LIMIT 10

✅ What it does: Shows the most common narrative clusters.
4. Posts and Their Associated Extraction Chains

MATCH (p:Post)-[:HAS_AGENT]->(a:Agent),
      (p)-[:HAS_ACTION]->(ac:Action),
      (p)-[:HAS_OBJECT]->(o:Object),
      (p)-[:HAS_NARRATIVE]->(n:Narrative)
RETURN p.record_id AS post_id, a.name AS agent, ac.name AS action, o.name AS object, n.name AS narrative
LIMIT 25

✅ What it does: For each post, shows its extractions: agent, action, object, narrative.
5. Interactions Between Users (Replies, Mentions, Quotes)

MATCH (u1:User)-[i:INTERACTS_WITH]->(u2:User)
RETURN u1.username AS from_user, i.type AS interaction_type, u2.username AS to_user
LIMIT 25

✅ What it does: Displays user-user social interactions (mention, reply, quote).
6. Find All Posts Describing a Specific Narrative

MATCH (n:Narrative {name: "your narrative name"})<-[:HAS_NARRATIVE]-(p:Post)
RETURN p.record_id, p.text, p.platform
LIMIT 10

✅ What it does: Find posts that match a given narrative cluster.

(Replace "your narrative name" with a real narrative cluster name!)
"""