In [1]:
import pandas as pd
import numpy as np

In [35]:
followers = pd.read_csv('followers.csv')
following = pd.read_csv('following.csv')
label = pd.read_csv('twi22/label.csv')

## Select the interested topics (hashtags)

In [36]:
# select the topic
#tweet = pd.read_csv('topic_tweet1_covid.csv')
tweet = pd.read_csv('topic_tweet2_war.csv') 
#tweet = pd.read_csv('topic_tweet3_climate.csv') 

tweet['uid'] = 'u'+tweet['uid'].astype(str)
tweet.head()

Unnamed: 0,uid,tweet
0,u2838934563,"RT @Everton: The Club, players and fans are to..."
1,u2838934563,RT @TheAthleticUK: Everton players line up dra...
2,u763596433534496768,@IgorBrigadir @yoavgo I think this is more abo...
3,u1291883446307037187,https://t.co/gBgXkNDcEd\n\nThe self-proclaimed...
4,u1291883446307037187,https://t.co/Naqs2n5bx0\nThe wider region is l...


In [37]:
select_user_id = set(tweet['uid'])&set(label['id'])
print(len(select_user_id))
select_user_id = pd.DataFrame({'select_uid': list(select_user_id)})
select_user_id.head()

277467


Unnamed: 0,select_uid
0,u1343522204043128832
1,u137424427
2,u15995577
3,u93946167
4,u1122773861228060672


In [38]:
# 只保留网络中讨论过这个话题的用户
select_followers = pd.merge(followers, select_user_id, left_on='source_id', right_on='select_uid', how='right')
select_followers.dropna(axis=0, inplace=True)
select_followers = pd.merge(select_followers, select_user_id, left_on='target_id', right_on='select_uid', how='right')
select_followers.dropna(axis=0, inplace=True)
select_followers = select_followers[['source_id', 'target_id']]

select_following = pd.merge(following, select_user_id, left_on='source_id', right_on='select_uid', how='right')
select_following.dropna(axis=0, inplace=True)
select_following = pd.merge(select_following, select_user_id, left_on='target_id', right_on='select_uid', how='right')
select_following.dropna(axis=0, inplace=True)
select_following = select_following[['source_id', 'target_id']]
# print(len(select_followers), len(select_following))

# 转化为(2, Num_edges)格式的numpy array, 其中第0行节点follow第1行节点
edges_1 = np.array([select_followers['target_id'].to_list(), select_followers['source_id'].to_list()])
edges_2 = np.array([select_following['source_id'].to_list(), select_following['target_id'].to_list()])
edges = np.concatenate([edges_1, edges_2], axis=1)
print("Check num of edges: ", edges.shape)

# 相应主题下有bot label和relation的uid（用于最终GNN training和effect estimatin）
final_uid = (set(select_user_id['select_uid'])&set(edges[0]))|(set(select_user_id['select_uid'])&set(edges[1]))
print("Num of user selected: ", len(final_uid))

Check num of edges:  (2, 800554)
Num of user selected:  157029


## Network Analysis

In [39]:
# Using networkx for network data analysis
import networkx as nx
from collections import Counter

G = nx.DiGraph()
# add the edges (directed)
for e in range(len(edges[0])):
    G.add_edge(edges[0][e], edges[1][e])
print("Num nodes: ", len(G.nodes), "Num edges: ", len(G.edges))

# add the node label (bot or human)
label.set_index('id', inplace=True)
for node_id in G.nodes():
    G.add_nodes_from([(node_id, {'label': label.loc[node_id, 'label']})])
print("Num nodes: ", len(G.nodes), "Num edges: ", len(G.edges))

label_counts = Counter(nx.get_node_attributes(G, 'label').values())

print(dict(label_counts))

Num nodes:  157029 Num edges:  796236
Num nodes:  157029 Num edges:  796236
{'human': 150595, 'bot': 6434}


In [40]:
# data analysis
import matplotlib.pyplot as plt
# 1. check the node degree
degree_hist = nx.degree_histogram(G)
degree_stat = pd.DataFrame({'Degree': range(len(degree_hist)), 'Node Count': degree_hist})
print("Node degree stat: ", degree_stat)
# 2. check the average in/out degree for human/bot
node_labels = nx.get_node_attributes(G, 'label')
human_nodes = [node for node, label in node_labels.items() if label=='human']
bot_nodes = [node for node, label in node_labels.items() if label=='bot']
ind0 = sum(G.in_degree(node) for node in human_nodes)/len(human_nodes)
ind1 = sum(G.in_degree(node) for node in bot_nodes)/len(bot_nodes)
otd0 = sum(G.out_degree(node) for node in human_nodes)/len(human_nodes)
otd1 = sum(G.out_degree(node) for node in bot_nodes)/len(bot_nodes)
print("Ave in-degree of human ({:.4f}) and bot ({:.4f})".format(ind0, ind1))
print("Ave out-degree of human ({:.4f}) and bot ({:.4f})".format(otd0, otd1))

Node degree stat:        Degree  Node Count
0          0           0
1          1       48816
2          2       32292
3          3       17487
4          4       12072
...      ...         ...
1035    1035           0
1036    1036           0
1037    1037           1
1038    1038           0
1039    1039           1

[1040 rows x 2 columns]
Ave in-degree of human (5.1664) and bot (2.8283)
Ave out-degree of human (5.0497) and bot (5.5609)


In [41]:
# 3. "a following b" graph
G_HfollowingH = G.edge_subgraph([(u,v) for u,v,d in G.edges(data=True) if G.nodes[u]['label']=='human' and G.nodes[v]['label']=='human'])
G_HfollowingB = G.edge_subgraph([(u,v) for u,v,d in G.edges(data=True) if G.nodes[u]['label']=='human' and G.nodes[v]['label']=='bot'])
G_BfollowingH = G.edge_subgraph([(u,v) for u,v,d in G.edges(data=True) if G.nodes[u]['label']=='bot' and G.nodes[v]['label']=='human'])
G_BfollowingB = G.edge_subgraph([(u,v) for u,v,d in G.edges(data=True) if G.nodes[u]['label']=='bot' and G.nodes[v]['label']=='bot'])
print("Human following Human: ", G_HfollowingH.number_of_edges())
print("Human following Bot: ", G_HfollowingB.number_of_edges())
print("Bot following Human: ", G_BfollowingH.number_of_edges())
print("Bot following Bot: ", G_BfollowingB.number_of_edges())

Human following Human:  743707
Human following Bot:  16750
Bot following Human:  34332
Bot following Bot:  1447


In [42]:
#nx.write_gpickle(G, 'topic_tweet1_covid.pkl')
nx.write_gpickle(G, 'topic_tweet2_war.pkl') 
#nx.write_gpickle(G, 'topic_tweet3_climate.pkl')