In [1]:
import json
import pandas as pd
from infomap import Infomap

In [2]:
df = pd.read_json('dataset/user_timeline.json')

# the following inputs is generated by the notebooks `Content Sim` and `Retweet Sim`
content_sim_df = pd.read_csv('outputs/high_content_sim_07_df.csv')
retweet_sim_df = pd.read_csv('outputs/retweet_author_sim_df.csv')

## Construct the user network

In [3]:
content_sim_threshold = .8
retweet_sim_threshold = .5

content_sim_df = content_sim_df.loc[content_sim_df['sim'] > content_sim_threshold]
retweet_sim_df = retweet_sim_df.loc[retweet_sim_df['sim'] > retweet_sim_threshold]

In [4]:
df['author'] = df['author'].apply(lambda v: (json.loads(v, strict=False)) if v else None)

In [5]:
retweet_sim_df = retweet_sim_df.astype({'user_i':'int', 'user_j':'int'})
content_sim_df = content_sim_df.astype({'user_i':'int', 'user_j':'int'})

In [6]:
id_list = list(set(retweet_sim_df['user_i'].tolist() + content_sim_df['user_i'].tolist() + content_sim_df['user_j'].tolist()))

id_to_code = {j: i for i, j in enumerate(id_list)}
code_to_id = {i: j for i, j in enumerate(id_list)}

In [7]:
retweet_sim_df['retweet_code_i'] = retweet_sim_df['user_i'].apply(lambda v: id_to_code[v])
retweet_sim_df['retweet_code_j'] = retweet_sim_df['user_j'].apply(lambda v: id_to_code[v])

content_sim_df['useri_code'] = content_sim_df['user_i'].apply(lambda v: id_to_code[v])
content_sim_df['userj_code'] = content_sim_df['user_j'].apply(lambda v: id_to_code[v])

In [8]:
retweet_list = [(i, j) for i, j in zip(retweet_sim_df['retweet_code_i'], retweet_sim_df['retweet_code_j'])]
content_list = [(i, j) for i, j in zip(content_sim_df['useri_code'], content_sim_df['userj_code'])]
links = tuple(set(retweet_list) | set(content_list))

## Community Detection

In [9]:
im = Infomap("--two-level --directed")
im.add_links(links)
im.run()

In [10]:
infomap_group_df = {
    'code':[],
    'group':[],
}

for node in im.nodes:
    infomap_group_df['code'].append(node.node_id)
    infomap_group_df['group'].append(node.module_id)

infomap_group_df = pd.DataFrame(infomap_group_df)
infomap_group_df['user_id'] = infomap_group_df['code'].apply(lambda x: code_to_id[x])

In [11]:
infomap_group_df.head()

Unnamed: 0,code,group,user_id
0,30,1,1287075452
1,22,1,1073634875050213381
2,92,1,421921631
3,26,1,522321484
4,7,1,1319193888507179008


In [12]:
infomap_group_df.to_csv("outputs/user_clusters.csv", index=False)