# Importing all relevant libraries

In [20]:
from datasets import load_dataset, concatenate_datasets
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import numpy as np
from collections import defaultdict
import netwulf as nw

# Load dataset from Hugging Face

In [4]:
dataset = load_dataset('m-newhauser/senator-tweets')

In [5]:
#convert DatasetDict to DataFrame
datasets_to_combine = [dataset[split] for split in dataset.keys()]
combined_dataset = concatenate_datasets(datasets_to_combine)
df = combined_dataset.to_pandas()

In [23]:
df

Unnamed: 0,date,id,username,text,party,labels,embeddings
0,2021-10-13 19:47:44,1448374915636383745,SenatorHassan,Happy th birthday to the @USNavy! The strength...,Democrat,1,"[-0.026915843, 0.087234065, 0.018707331, -0.03..."
1,2021-06-30 14:53:13,1410250073003462656,SenatorMenendez,The greatest generation's investment in infras...,Democrat,1,"[0.024044158, -0.0048382296, 0.09699756, -0.03..."
2,2021-08-08 01:11:29,1424176405881966599,SenBillCassidy,"Thanks to @SenTedCruz and @SenatorWarnock, th...",Republican,0,"[-0.002620128, -0.042515174, 0.065084696, 0.01..."
3,2021-04-14 14:02:49,1382333523567185921,SenBlumenthal,/ To get lasting change we cant just lock up t...,Democrat,1,"[-0.045103785, 0.0762336, -0.011798679, -0.044..."
4,2021-12-11 16:06:38,1469700160934621188,SenatorBraun,Today were celebrating years of the Hoosier st...,Republican,0,"[-0.038810886, 0.11611319, 0.06621017, -0.0184..."
...,...,...,...,...,...,...,...
99688,2021-11-09 18:39:29,1458142213574049797,SenatorWicker,The world is a more dangerous place than ever ...,Republican,0,"[0.06120153, -0.009898015, 0.044701263, 0.0103..."
99689,2021-04-02 15:54:50,1378013060816855041,SenRickScott,"Great news from @MyFWC: This weekend, Florida ...",Republican,0,"[0.00034493094, 0.008543573, 0.061530214, -0.0..."
99690,2021-10-14 15:39:01,1448674715711807497,RogerMarshallMD,"Driving through Kansas, you cant help but noti...",Republican,0,"[0.031099945, 0.00020814339, 0.06814072, 0.061..."
99691,2021-06-01 22:37:46,1399857734447677441,SenMarkey,I stand with the workers of Pavement Coffee an...,Democrat,1,"[-0.0664301, 0.042548176, 0.07595737, 0.075365..."


# Find number of senators and mentions

In [15]:
unique_usernames = set(df['username'].unique())

valid_mentions_count = 0
for index, row in df.iterrows():
    text = row['text']
    tokens = text.split('@')[1:]
    for token in tokens:
        potential_username = token.split()[0] if ' ' in token else token
        if potential_username in unique_usernames:
            valid_mentions_count += 1

print(f"Number of senators: {len(unique_usernames)}")
print(f"Number of valid mentions: {valid_mentions_count}")

Number of senators: 99
Number of valid mentions: 4744


# Build Network

In [16]:
G = nx.Graph()
edges = defaultdict(int)

for index, row in df.iterrows():
    sender = row['username']
    text = row['text']
    unique_usernames = set(df['username'].unique())
    
    tokens = text.split('@')[1:] 
    
    for token in tokens:
        potential_username = token.split()[0] if ' ' in token else token
        if potential_username in unique_usernames and potential_username != sender:
            edge = tuple(sorted((sender, potential_username)))
            edges[edge] += 1

for (source, target), weight in edges.items():
    G.add_edge(source, target, weight=weight)

In [24]:
#add party attribute
for node in G.nodes:
    party = df[df['username'] == node]['party'].values[0]
    G.nodes[node]['party'] = party

In [25]:
num_nodes = G.number_of_nodes()
num_edges = G.number_of_edges()
print(num_nodes, num_edges)

99 1608


In [30]:
print(G.nodes['SenSchumer']['party'])

Democrat
