In [7]:
libs <- c(
  "dplyr",
  "arrow",
  "igraph"
)
new.packages <- libs[!(libs %in% installed.packages()[,"Package"])]
if(length(new.packages)) install.packages(new.packages)
invisible(lapply(libs, library, character.only = TRUE))

Find the users that are common to the replies, votes, and follows edge lists.

In [8]:
# Load the three directed edge lists. In each one the source-specific count
# column is renamed to `weight`.
votes <- read_parquet(
  "data/df_edge_list_directed_users_votes_to_postings_net.parquet"
) %>%
  rename(weight = count_votes_to_postings_net)

replies <- read_parquet(
  "data/df_edge_list_directed_users_postings_replies.parquet"
) %>%
  rename(weight = count_posting_replies)

# The follows edges come from the combined edge list: keep only the rows that
# carry at least one follow connection.
follows <- read_parquet(
  "data/df_edge_list_directed_users_combined_postings_replies_and_votes_to_postings_net_and_follow_connections.parquet"
) %>%
  filter(count_follow_connection > 0) %>%
  rename(weight = count_follow_connection)

# Edge lists keyed by interaction type.
els <- list(votes = votes, replies = replies, follows = follows)

# All user IDs that occur in an edge list, as target or as source.
endpoint_ids <- function(el) {
  c(el$ID_CommunityIdentity_Target, el$ID_CommunityIdentity_Source)
}

vote_ids <- endpoint_ids(votes)
reply_ids <- endpoint_ids(replies)
follow_ids <- endpoint_ids(follows)

# Keep the users that are present in all three datasets.
users <- unique(c(vote_ids, reply_ids, follow_ids))
in_all_three <- users %in% vote_ids &
  users %in% reply_ids &
  users %in% follow_ids
users <- users[in_all_three]

We extract a subgraph of an appropriate size from a base graph using the Louvain modularity-maximisation community detection algorithm, then extract the same community's nodes from the other two graphs. The base graph is meant to be the sparsest one; note that the code below sets `base` to the follows graph (not the replies graph).

In [9]:
# Subgraphs keyed by edge-list type; filled in below and in the next cell.
gs <- list()
# Edge list whose community structure defines the node set.
base <- 'follows'

# Undirected view of the base edge list, restricted to the users that appear
# in all three edge lists. Vertex names are converted with as.integer()
# before being matched against `users`.
g_undir <- graph_from_data_frame(els[[base]], directed = FALSE)
nodes <- V(g_undir)$name
nodes <- nodes[as.integer(nodes) %in% users]
g_undir <- induced_subgraph(g_undir, nodes)

# Accepted community size, exclusive on both ends.
lower <- 240
upper <- 260
# Upper bound on clustering runs so the search cannot spin forever when no
# partition ever contains a community of the requested size.
max_attempts <- 1000L

# Use modularity maximisation (Louvain) on the undirected base graph and
# retry until some community falls inside (lower, upper).
# NOTE(review): retrying only helps if cluster_louvain() returns different
# partitions across runs; no seed is set here, so the selected community is
# not reproducible between sessions.
idx <- NA_integer_
for (attempt in seq_len(max_attempts)) {
  comm <- cluster_louvain(g_undir)
  comm_sizes <- sizes(comm)
  in_range <- which(lower < comm_sizes & comm_sizes < upper)
  if (length(in_range) > 0) {
    # Take exactly one community. Matching on size alone can yield several
    # indices when communities tie, and `[[` with a vector index would then
    # index recursively instead of selecting a community.
    idx <- in_range[[1]]
    break
  }
}
if (is.na(idx)) {
  stop(
    "No community with size between ", lower, " and ", upper,
    " found after ", max_attempts, " Louvain runs on the '", base, "' graph",
    call. = FALSE
  )
}
# Size of the selected community (used later in the output file names).
n <- as.integer(comm_sizes[[idx]])

# Cut the selected community out of the directed base graph.
g_base <- graph_from_data_frame(els[[base]], directed = TRUE)
gs[[base]] <- induced_subgraph(g_base, communities(comm)[[idx]])

Extract the same nodes as in the base graph from the other graphs.

In [10]:
types <- c("votes", "replies", "follows")
# Vertex names of the base subgraph; the same vertices are cut from the
# remaining graphs.
nodes <- V(gs[[base]])$name

# The base subgraph is already stored in `gs`, so only the others are built.
for (edge_type in setdiff(c("replies", "votes", "follows"), base)) {
  g <- graph_from_data_frame(els[[edge_type]], directed = TRUE)
  gs[[edge_type]] <- induced_subgraph(g, vids = nodes)
}

# Summary table: vertex and edge counts per subgraph.
shown <- c("replies", "votes", "follows")
count_each <- function(counter) {
  unlist(lapply(shown, function(k) counter(gs[[k]])), use.names = FALSE)
}
data.frame(
  type = shown,
  nodes = count_each(vcount),
  edges = count_each(ecount)
)


type,nodes,edges
<chr>,<dbl>,<dbl>
replies,255,2210
votes,255,7372
follows,255,330


In [6]:
# Write every subgraph in `gs` to graphs/<type>-<n>.graphml.
# write_graph() does not create missing directories, so make sure the output
# directory exists first (a no-op when it is already there).
dir.create("graphs", showWarnings = FALSE, recursive = TRUE)
for (t in types) {
  write_graph(gs[[t]], paste0('graphs/',t,'-',n,'.graphml'), format = 'graphml')
}