In [1]:
import pandas as pd
import numpy as np

In [2]:
followers = pd.read_csv('followers.csv')
following = pd.read_csv('following.csv')

In [3]:
print(followers['relation'].value_counts())
print(following['relation'].value_counts())

followers    1116655
Name: relation, dtype: int64
following    2626979
Name: relation, dtype: int64


In [5]:
followers.head()

Unnamed: 0,source_id,relation,target_id
0,u980749991491682304,followers,u1480979504696864775
1,u2704715387,followers,u1483909830159085571
2,u753165526566899713,followers,u1484420078
3,u114820052,followers,u18876842
4,u1924484352,followers,u3408319427


In [6]:
following.head()

Unnamed: 0,source_id,relation,target_id
0,u105387876,following,u402576793
1,u148520716,following,u59653593
2,u1276438425457967110,following,u1389155636693381120
3,u1445432327367237638,following,u848348952084828160
4,u1445432327367237638,following,u850507814023942144


In [8]:
label = pd.read_csv('twi22/label.csv')
label.head()

Unnamed: 0,id,label
0,u1217628182611927040,human
1,u2664730894,human
2,u1266703520205549568,human
3,u1089159225148882949,human
4,u36741729,bot


## Select the interested topics (hashtags)

In [9]:
# select the topic
#tweet = pd.read_csv('topic_tweet1_covid.csv')
tweet = pd.read_csv('topic_tweet2_war.csv') 
#tweet = pd.read_csv('topic_tweet3_climate.csv') 

tweet['uid'] = 'u'+tweet['uid'].astype(str)
tweet.head()

Unnamed: 0,uid,tweet
0,u763596433534496768,@spbail Is this post-Covid lethargy? I’m so sc...
1,u1291883446307037187,https://t.co/tKm8HtIAlS\n\nHealth Canada autho...
2,u1291883446307037187,https://t.co/OANVXAlvDU\n\nA team of internati...
3,u1291883446307037187,https://t.co/IZm3Rttpw9\n\nJesus is the Truth ...
4,u1291883446307037187,https://t.co/RUjr0MJgFw\n\n(CNN)The world coul...


In [20]:
select_user_id = set(tweet['uid'])&set(label['id'])
print(len(select_user_id))
select_user_id = pd.DataFrame({'select_uid': list(select_user_id)})
select_user_id.head()

269623


Unnamed: 0,select_uid
0,u91032331
1,u110830686
2,u1256595851859726337
3,u706621126
4,u41090635


In [23]:
# 只保留网络中讨论过这个话题的用户
select_followers = pd.merge(followers, select_user_id, left_on='source_id', right_on='select_uid', how='right')
select_followers.dropna(axis=0, inplace=True)
select_followers = pd.merge(select_followers, select_user_id, left_on='target_id', right_on='select_uid', how='right')
select_followers.dropna(axis=0, inplace=True)
select_followers = select_followers[['source_id', 'target_id']]

select_following = pd.merge(following, select_user_id, left_on='source_id', right_on='select_uid', how='right')
select_following.dropna(axis=0, inplace=True)
select_following = pd.merge(select_following, select_user_id, left_on='target_id', right_on='select_uid', how='right')
select_following.dropna(axis=0, inplace=True)
select_following = select_following[['source_id', 'target_id']]
# print(len(select_followers), len(select_following))

# 转化为(2, Num_edges)格式的numpy array, 其中第0行节点follow第1行节点
edges_1 = np.array([select_followers['target_id'].to_list(), select_followers['source_id'].to_list()])
edges_2 = np.array([select_following['source_id'].to_list(), select_following['target_id'].to_list()])
edges = np.concatenate([edges_1, edges_2], axis=1)
print("Check num of edges: ", edges.shape)

# 相应主题下有bot label和relation的uid（用于最终GNN training和effect estimatin）
final_uid = (set(select_user_id['select_uid'])&set(edges[0]))|(set(select_user_id['select_uid'])&set(edges[1]))
print("Num of user selected: ", len(final_uid))

339849 1016433


In [33]:
# Using networkx for network data analysis
import networkx as nx
G = nx.DiGraph()
# add the edges (directed)
for e in range(len(edges[0])):
    G.add_edge(edges[0][e], edges[1][e])
print("Num nodes: ", len(G.nodes), "Num edges: ", len(G.edges))

# add the node label (bot or human)
label.set_index('id', inplace=True)
for node_id in G.nodes():
    G.add_nodes_from([(node_id, {'label': label.loc[node_id, 'label']})])
print("Num nodes: ", len(G.nodes))
print("Num edges: ", len(G.edges))

Num nodes:  195491
Num edges:  1341914


In [47]:
# data analysis
import matplotlib.pyplot as plt
# 1. check the node degree
degree_hist = nx.degree_histogram(G)
degree_stat = pd.DataFrame({'Degree': range(len(degree_hist)), 'Node Count': degree_hist})
print("Node degree stat: ", degree_stat)
# 2. check the average in/out degree for human/bot
node_labels = nx.get_node_attributes(G, 'label')
human_nodes = [node for node, label in node_labels.items() if label=='human']
bot_nodes = [node for node, label in node_labels.items() if label=='bot']
ind0 = sum(G.in_degree(node) for node in human_nodes)/len(human_nodes)
ind1 = sum(G.in_degree(node) for node in bot_nodes)/len(bot_nodes)
otd0 = sum(G.out_degree(node) for node in human_nodes)/len(human_nodes)
otd1 = sum(G.out_degree(node) for node in bot_nodes)/len(bot_nodes)
print("Ave in-degree of human ({:.4f}) and bot ({:.4f})".format(ind0, ind1))
print("Ave out-degree of human ({:.4f}) and bot ({:.4f})".format(otd0, otd1))

Node degree stat:        Degree  Node Count
0          0           0
1          1       53027
2          2       37302
3          3       20313
4          4       14295
...      ...         ...
1163    1163           0
1164    1164           0
1165    1165           0
1166    1166           0
1167    1167           1

[1168 rows x 2 columns]
Ave in-degree of human (7.1490) and bot (3.0872)
Ave out-degree of human (6.9373) and bot (5.8961)


In [49]:
# 3. "a following b" graph
G_HfollowingH = G.edge_subgraph([(u,v) for u,v,d in G.edges(data=True) if G.nodes[u]['label']=='human' and G.nodes[v]['label']=='human'])
G_HfollowingB = G.edge_subgraph([(u,v) for u,v,d in G.edges(data=True) if G.nodes[u]['label']=='human' and G.nodes[v]['label']=='bot'])
G_BfollowingH = G.edge_subgraph([(u,v) for u,v,d in G.edges(data=True) if G.nodes[u]['label']=='bot' and G.nodes[v]['label']=='human'])
G_BfollowingB = G.edge_subgraph([(u,v) for u,v,d in G.edges(data=True) if G.nodes[u]['label']=='bot' and G.nodes[v]['label']=='bot'])
print("Human following Human: ", G_HfollowingH.number_of_edges())
print("Human following Bot: ", G_HfollowingB.number_of_edges())
print("Bot following Human: ", G_BfollowingH.number_of_edges())
print("Bot following Bot: ", G_BfollowingB.number_of_edges())

Human following Human:  1222850
Human following Bot:  38282
Bot following Human:  76766
Bot following Bot:  4016
