In [1]:
import csv
import json
import os
import random
import time
from collections import Counter
from datetime import datetime
from itertools import combinations
from pathlib import Path

import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as st
import seaborn as sns
from scipy.special import comb, perm
from igraph import *

In [2]:
# Analysis parameters.
# `condition` (the label embedded in output folder/file names) and `gap`
# (the same window expressed in seconds) are both derived from `gap_minutes`
# so the two can never drift apart.
year = 2021
gap_minutes = 15
condition = f'{gap_minutes}m'
gap = gap_minutes * 60

In [3]:
# Root directory that holds every input/output folder used below.
# Defaults to the original external-volume location; set the PTT_DATA_ROOT
# environment variable to run the notebook against another location.
data_root = Path(os.environ.get('PTT_DATA_ROOT', '/Volumes/SP PHD U3'))
raw_data_dir = data_root / f'{year}_PTT_raw_data'

root_path = raw_data_dir / 'covid-19'
output_path = raw_data_dir / f'covid-19_{condition}'      # filtered comments are written here
root_path_csv = raw_data_dir / 'covid-19_csv'             # monthly input CSV files are read from here
clustering_path = data_root / f'{year}_covid-19_clustering'  # similarity / community outputs

# Step 1: gather data (15min, pos)

In [4]:
def count_time_gap(time_a, time_c):
    """Return the number of whole seconds from ``time_a`` to ``time_c``.

    Both arguments are timestamp strings in the form ``'%Y/%m/%d %H:%M:%S'``.
    The result is negative when ``time_c`` is earlier than ``time_a``.

    The difference is computed between naive datetimes (plain wall-clock
    arithmetic), so the result does not depend on the time zone or DST rules
    of the machine that runs the notebook.

    Raises:
        ValueError: if either string does not match the expected format.
    """
    time_format = '%Y/%m/%d %H:%M:%S'
    moment_a = datetime.strptime(time_a, time_format)
    moment_c = datetime.strptime(time_c, time_format)
    return int((moment_c - moment_a).total_seconds())

In [31]:
# CSV version: filter every monthly input file and collect the kept rows in
# a single output CSV.
data_range = 'covid'
df_file_name = f'{year}_{data_range}_comment_gos_{condition}_pos'
header = ['id', 'author_id', 'from_id', 'reaction', 'post_time']

# Rows are kept only when their `time` value sorts before this timestamp.
# The comparison is a plain string comparison, which assumes the column is
# always in the fixed-width, zero-padded '%Y/%m/%d %H:%M:%S' form.
cutoff_time = '2021/05/20 00:00:00'
# Buffered rows are flushed to disk every `batch_size` input rows.
batch_size = 100000

# The file is opened once for the whole run. UTF-8 is requested explicitly
# because the rows contain non-ASCII text ('推') and pandas reads the file
# back as UTF-8 by default.
with open(output_path / f'{df_file_name}.csv', 'w', newline='', encoding='utf-8') as written_file:
    writeCsv = csv.writer(written_file, delimiter=',')
    writeCsv.writerow(header)

    for file in root_path_csv.glob(f'*{year}*Gossiping_{data_range}*'):
        # Skip files whose stem contains '._'.
        if '._' in file.stem:
            continue

        print(f'opening {file.stem} at {time.ctime()}')
        comment_lst = []
        raw_data = pd.read_csv(file)
        last_index = None  # stays None when the input file has no rows

        for rows in raw_data.itertuples():
            time_a = rows.time
            time_c = rows.post_time
            time_gap = count_time_gap(time_a, time_c)

            # Filter: within the time window, reaction is '推', and before the cutoff.
            if time_gap <= gap and rows.reaction == '推' and rows.time < cutoff_time:
                comment_lst.append([rows.id, rows.author_id, rows.from_id,
                                    rows.reaction, rows.post_time])

            last_index = rows.Index
            if rows.Index % batch_size == 0:
                print(f'saving {rows.Index} at {time.ctime()}')
                writeCsv.writerows(comment_lst)
                comment_lst = []

        # Flush whatever is left after the last full batch. Guarded so that an
        # empty input file neither fails nor reuses state from a previous file.
        if last_index is not None and last_index % batch_size != 0:
            print(f'saving {last_index} at {time.ctime()}')
            writeCsv.writerows(comment_lst)

print('done')

opening iii_202102_Gossiping_covid at Sun Jul 11 17:48:34 2021
saving 0 at Sun Jul 11 17:48:35 2021
saving 78121 at Sun Jul 11 17:48:37 2021
opening iii_202101_Gossiping_covid at Sun Jul 11 17:48:37 2021
saving 0 at Sun Jul 11 17:48:38 2021
saving 100000 at Sun Jul 11 17:48:40 2021
saving 200000 at Sun Jul 11 17:48:42 2021
saving 231456 at Sun Jul 11 17:48:42 2021
opening iii_202103_Gossiping_covid at Sun Jul 11 17:48:42 2021
saving 0 at Sun Jul 11 17:48:43 2021
saving 54006 at Sun Jul 11 17:48:44 2021
opening iii_202104_Gossiping_covid at Sun Jul 11 17:48:44 2021
saving 0 at Sun Jul 11 17:48:45 2021
saving 73481 at Sun Jul 11 17:48:46 2021
opening iii_202105_Gossiping_covid at Sun Jul 11 17:48:46 2021
saving 0 at Sun Jul 11 17:48:56 2021
saving 100000 at Sun Jul 11 17:48:58 2021
saving 200000 at Sun Jul 11 17:49:00 2021
saving 300000 at Sun Jul 11 17:49:02 2021
saving 400000 at Sun Jul 11 17:49:04 2021
saving 500000 at Sun Jul 11 17:49:06 2021
saving 600000 at Sun Jul 11 17:49:08 2021

In [32]:
# Load the filtered comment file back from disk and report its (rows, columns) shape.
comment_gos_csv = output_path / f'{year}_{data_range}_comment_gos_{condition}_pos.csv'
comment_gos_df = pd.read_csv(comment_gos_csv)
print(comment_gos_df.shape)

(184898, 5)


In [33]:
# Map every `from_id` value to the set of `id` values it occurs with
# (presumably: commenting account -> ids of the articles it commented on).
id_dict = {}

for comment in comment_gos_df.itertuples():
    id_dict.setdefault(comment.from_id, set()).add(comment.id)

In [34]:
url_threshold = 0  # minimum number of articles an account must appear in to be kept
acc_over_url_threshold = set()  # accounts that meet url_threshold (filled by a later cell)

In [35]:
# Keep every account whose id set has at least `url_threshold` entries.
acc_over_url_threshold.update(
    account
    for account, article_ids in id_dict.items()
    if len(article_ids) >= url_threshold
)

In [36]:
# Sanity-check the number of accounts and estimate the pairwise workload.
n_selected = len(acc_over_url_threshold)
n_pairs = comb(n_selected, 2)

print(f'all accounts: {len(set(comment_gos_df.from_id))}')
print(f'above url_threshold: {n_selected}')
print(f'combination: {n_pairs}')
# Rough runtime estimate: the constants work out to 13 seconds per
# 10,000,000 pairs, converted to minutes.
print(f'time: {n_pairs*13/10000000/60} minutes')

all accounts: 26029
above url_threshold: 26029
combination: 338741406.0
time: 7.33939713 minutes


# Step 2: calculate Jaccard Similarity (edge weight)

In [37]:
# Thresholds applied when scoring account pairs.
len_url_inter_threshold = 2   # minimum number of shared ids
similarity_threshold = 0.1    # minimum Jaccard similarity

# Similarity threshold with the decimal point removed (0.1 -> '01'), used in file names.
ss = ''.join(str(similarity_threshold).split('.'))
sim_file_name = f'{year}_{data_range}_gos_{condition}_pos_inter{len_url_inter_threshold}_sim{ss}'
print(sim_file_name)

2021_covid_gos_15m_pos_inter2_sim01


In [38]:
def jc_sim(a, b):
    """Score one account pair and record it when it passes both thresholds.

    Looks up the id sets of accounts ``a`` and ``b`` in the module-level
    ``id_dict`` and computes their Jaccard similarity
    (size of intersection / size of union). When the intersection size is at
    least ``len_url_inter_threshold`` and the similarity is at least
    ``similarity_threshold``, the tuple
    ``(a, b, similarity, intersection_size, union_size)`` is added to the
    module-level ``outcome_similarity`` set.

    Returns:
        The (possibly updated) module-level ``outcome_similarity`` set.

    Raises:
        KeyError: if ``a`` or ``b`` is not a key of ``id_dict``.
    """
    urls_a = id_dict[a]
    urls_b = id_dict[b]
    len_url_inter = len(urls_a.intersection(urls_b))

    if len_url_inter >= len_url_inter_threshold:
        # |A or B| = |A| + |B| - |A and B|: same value as len(A.union(B))
        # without building the union set.
        len_url_union = len(urls_a) + len(urls_b) - len_url_inter
        similarity = len_url_inter / len_url_union
        if similarity >= similarity_threshold:
            # Record the pair from this function's own arguments rather than
            # from whatever a caller's loop variable happens to hold.
            outcome_similarity.add((a, b, similarity, len_url_inter, len_url_union))

    return outcome_similarity

In [39]:
# Score every unordered pair of selected accounts and stream the pairs that
# pass the thresholds into one CSV file.
outcome_similarity = set()
save_every = 10000000  # flush the buffered rows after this many pairs

header = ['source', 'target', 'weight', 'inter', 'union']
idx = -1  # stays -1 when there are no pairs at all

# The file is opened once for the whole run instead of once per flush.
with open(clustering_path / f'{sim_file_name}.csv', 'w', newline='', encoding='utf-8') as written_file:
    writeCsv = csv.writer(written_file, delimiter=',')
    writeCsv.writerow(header)

    # The current pair is bound to the module-level name `c`; keep that name
    # stable in case other cells read it.
    for idx, c in enumerate(combinations(acc_over_url_threshold, 2)):
        outcome_similarity = jc_sim(c[0], c[1])

        if idx % save_every == 0:
            print(idx, time.ctime())
            writeCsv.writerows(outcome_similarity)
            # Rebind the module-level buffer so jc_sim starts a fresh batch.
            outcome_similarity = set()

    # Flush the rows collected since the last full batch (skipped when the
    # last pair already triggered a flush, or when there were no pairs).
    if idx >= 0 and idx % save_every != 0:
        print(idx, time.ctime())
        writeCsv.writerows(outcome_similarity)

print('done')

0 Sun Jul 11 17:50:22 2021
10000000 Sun Jul 11 17:50:32 2021
20000000 Sun Jul 11 17:50:42 2021
30000000 Sun Jul 11 17:50:52 2021
40000000 Sun Jul 11 17:51:02 2021
50000000 Sun Jul 11 17:51:12 2021
60000000 Sun Jul 11 17:51:22 2021
70000000 Sun Jul 11 17:51:32 2021
80000000 Sun Jul 11 17:51:43 2021
90000000 Sun Jul 11 17:51:53 2021
100000000 Sun Jul 11 17:52:03 2021
110000000 Sun Jul 11 17:52:13 2021
120000000 Sun Jul 11 17:52:22 2021
130000000 Sun Jul 11 17:52:32 2021
140000000 Sun Jul 11 17:52:42 2021
150000000 Sun Jul 11 17:52:52 2021
160000000 Sun Jul 11 17:53:02 2021
170000000 Sun Jul 11 17:53:12 2021
180000000 Sun Jul 11 17:53:22 2021
190000000 Sun Jul 11 17:53:31 2021
200000000 Sun Jul 11 17:53:41 2021
210000000 Sun Jul 11 17:53:51 2021
220000000 Sun Jul 11 17:54:01 2021
230000000 Sun Jul 11 17:54:12 2021
240000000 Sun Jul 11 17:54:22 2021
250000000 Sun Jul 11 17:54:32 2021
260000000 Sun Jul 11 17:54:41 2021
270000000 Sun Jul 11 17:54:51 2021
280000000 Sun Jul 11 17:55:00 2021
29

In [40]:
# Load the similarity edge list back from disk and report its (rows, columns) shape.
sim_csv = clustering_path / f'{sim_file_name}.csv'
sim_df = pd.read_csv(sim_csv)
print(sim_df.shape)

(8517, 5)


In [41]:
# sim_df[['weight', 'inter', 'union']].describe()

In [42]:
# Upper-tail quantiles of the edge weight: each label `a` is paired with the
# (1 - a) quantile.
for tail_label, quantile_level in (('0.1', 0.90), ('0.05', 0.95), ('0.01', 0.99)):
    print(f'a={tail_label}, weight={sim_df.weight.quantile(q=quantile_level)}')

a=0.1, weight=0.3333333333333333
a=0.05, weight=0.5
a=0.01, weight=0.9600000000000364


# Step 3: build graph and detect communities

In [43]:
# Re-read the similarity edge list from disk (same file that Step 2 wrote).
sim_df = pd.read_csv(clustering_path / f'{sim_file_name}.csv')

In [44]:
# Build an undirected graph from the first three columns of every row
# (source, target, weight); the third value becomes the 'weight' edge attribute.
weighted_edges = [edge_row[:3] for edge_row in sim_df.to_numpy()]
g = Graph.TupleList(weighted_edges, edge_attrs=['weight'], directed=False)

In [45]:
# Print igraph's one-line summary of the graph (vertex/edge counts and attributes).
summary(g)

IGRAPH UNW- 4055 8517 -- 
+ attr: name (v), weight (e)


In [46]:
# Infomap is a stochastic algorithm. python-igraph draws its random numbers
# from Python's `random` module by default, so seeding that module right
# before the call makes the detected partition reproducible across runs.
infomap_seed = 42
random.seed(infomap_seed)

print(time.ctime())
communities_ifm = g.community_infomap(edge_weights='weight')
print(time.ctime())

Sun Jul 11 17:55:55 2021
Sun Jul 11 17:55:57 2021


In [47]:
print(time.ctime())

# A community is kept only when its size lies strictly between these bounds
# (both exclusive): more than 2 members and fewer than 300.
community_size_lower = 2
community_size_upper = 300

# One [size, member names] record per kept community; vertex ids are
# translated to their 'name' attribute.
gos_pos_communities = [
    [len(community), [g.vs[vertex_id]['name'] for vertex_id in community]]
    for community in communities_ifm
    if community_size_upper > len(community) > community_size_lower
]

gos_pos_communities_df = pd.DataFrame(gos_pos_communities, columns=['num', 'member'])
print(time.ctime())

Sun Jul 11 17:55:57 2021
Sun Jul 11 17:55:57 2021


In [48]:
# Order communities from largest to smallest and renumber the rows 0..n-1.
gos_pos_communities_df = (
    gos_pos_communities_df
    .sort_values(by=['num'], ascending=False)
    .reset_index(drop=True)
)
# Label each community by its rank: 'gp_1' is the largest. Assigned in one
# step instead of one `.loc` write per row.
gos_pos_communities_df['group_index'] = [
    f'gp_{rank + 1}' for rank in range(len(gos_pos_communities_df))
]
gos_pos_communities_df = gos_pos_communities_df.reindex(columns=['group_index', 'num', 'member'])
gos_pos_communities_df.to_csv(clustering_path / f'{sim_file_name}_com.csv', index=False)

In [53]:
# Preview the first ten rows (the frame is sorted by `num`, largest first).
gos_pos_communities_df.head(10)

Unnamed: 0,group_index,num,member
0,gp_1,60,"[exarawdon, acer12356, blue1232, YocoVodka, hz..."
1,gp_2,55,"[mathew12310, wel1103, mybapu, sin31429, yanke..."
2,gp_3,49,"[jillene, a3050909, yamason, zenar, vdml, s900..."
3,gp_4,42,"[legendd, xyz530, Imgoodjob, wobbuffet, Degfxl..."
4,gp_5,36,"[fish31, durarara2020, dambosrx, wanhlily, sha..."
5,gp_6,29,"[FutuReStronG, cwuwang, ImHoluCan, hllmayday, ..."
6,gp_7,29,"[s87069, littlemame, bart102617, dlter, rusa, ..."
7,gp_8,26,"[luckyhsin199, erichha, jhoo53640, bryantiswil..."
8,gp_9,26,"[vincentrufus, sam60609797, barry910543, f8605..."
9,gp_10,26,"[despair78214, sony1256, bwichiro, KingJamesS,..."


In [50]:
# Flatten the community table into one [member, group label] record per member.
gos_pos_communities_gpid = [
    [account, community_row.group_index]
    for community_row in gos_pos_communities_df.itertuples()
    for account in community_row.member
]

gpid_columns = ['id', 'group_index']
gos_pos_communities_gpid_df = pd.DataFrame(gos_pos_communities_gpid, columns=gpid_columns)
# group_index holds strings, so this ordering is lexicographic ('gp_10' sorts before 'gp_2').
gos_pos_communities_gpid_df = gos_pos_communities_gpid_df.sort_values(by='group_index')
gos_pos_communities_gpid_df.to_csv(clustering_path / f'{sim_file_name}_gpid.csv', index=False)

In [51]:
# Keep only the edges whose two endpoints both belong to a kept community.
node_set = set(gos_pos_communities_gpid_df['id'])

# Vectorised membership test instead of a Python loop over every row.
both_in_community = sim_df['source'].isin(node_set) & sim_df['target'].isin(node_set)
edge_df = (
    sim_df.loc[both_in_community, ['source', 'target', 'weight', 'inter', 'union']]
    .astype({'weight': 'float64', 'inter': 'int64', 'union': 'int64'})
    .reset_index(drop=True)
)
print(edge_df.shape)

(8161, 5)


In [52]:
# Write the filtered edge list next to the other clustering outputs (no index column).
edge_df.to_csv(clustering_path / f'{sim_file_name}_edge.csv', index=False)