In [19]:
import pandas as pd
import csv
from pathlib import Path
import json
from sklearn.metrics.pairwise import cosine_similarity
from itertools import combinations
from itertools import permutations
from igraph import *
import matplotlib.pyplot as plt
import numpy as np
from datetime import datetime
import time
import seaborn as sns
import io
import requests
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import AgglomerativeClustering
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
import scipy.sparse
from collections import Counter
import re
import math
from opencc import OpenCC 
import string

In [20]:
root_path = ###deleted for security reasons###
out_path = ###deleted for security reasons###

In [21]:
# stop_words = open (r'/Users/ycchen/Desktop/dict/stop_words.txt','r', encoding='utf8')

In [22]:
# ['id', 'text', 'createdAt', 'stats', 'articleReplies', 'url', 'segment']
cofacts_segment = {}
tformat = "%Y-%m-%d"
with open(root_path/'cofacts_20220319-20220513.json' , 'r', encoding='big5') as reader:
    data = json.loads(reader.read())
    for idx, article in enumerate(data):
        for d in article['stats']:
            ori_time = datetime.strptime(d['date'].split('T')[0], tformat)
            if ori_time >= datetime.strptime('2022-03-19', tformat) and ori_time <= datetime.strptime('2022-05-13', tformat):
                cofacts_segment[idx] = {'article_id':data[idx]['id'],
                                        'createdAt': data[idx]['createdAt'].split('T')[0],
                                        'text': data[idx]['text'], 
                                        'segment': data[idx]['segment']}

In [23]:
print(len(cofacts_segment))

22525


In [24]:
#simplified chinese to traditional chinese
cc = OpenCC('s2t')

for k in cofacts_segment.keys():
    cofacts_segment[k]['segment_t'] = [cc.convert(s) for s in cofacts_segment[k]['segment']]

# functions

In [25]:
def is_all_chinese(strs):
    for _char in strs:
        if not '\u4e00' <= _char <= '\u9fa5':
            return False
    return True

In [26]:
def contains_chinese(strs):
    for _char in strs:
        if '\u4e00' <= _char <= '\u9fa5':
            return True
    return False

In [27]:
def contains_url(text):
    url_elements = ['http', 'www']
    result = 'no'
    if any(ue in text and ue for ue in url_elements)==True:
        result = 'yes'
    return result

In [28]:
def contains_url_format(text):
    url_elements = ['http', 'www']
    check = list([np.sign(text.find(ue)) for ue in url_elements])
    
    result = 'both'
    if check==[-1, -1]:
        result = 'no_url'

    elif len(re.split('\n| |　', text)) == 1:
        if check==[0,-1] or check==[0,1]:
            result = 'only_url'
    else:
        pass
        
    return result

In [29]:
def catagorize(text, key_word_list):
    result = 'no'
    if any(kw in text and kw for kw in key_word_list)==True:
        return 'yes'
    else:
        return 'no'

In [30]:
def creat_new_dict(old_dict, new_dict, category):
    new_dict = {
        'article_id':old_dict[k]['article_id'],
        'createdAt':old_dict[k]['createdAt'],
#         'text':old_dict[k]['text'],
        'clean_segment':old_dict[k]['clean_segment'],
        'category':category
        }
    return new_dict

# catagorize

In [31]:
punctuation = string.punctuation
for k in cofacts_segment.keys():
    clean_segment = []
    segments = cofacts_segment[k]['segment_t']
    
    for w in segments:
        if is_all_chinese(w)==True and len(w)>1 and w not in punctuation:
            clean_segment.append(w.replace('\n',''))
    cofacts_segment[k]['clean_segment'] = clean_segment

In [32]:
key_words_1 = ['疫情', '隔離', '疫苗', '防疫' ,'病毒', '新冠' , '肺炎', '快篩']
key_words_2 = ['共存', '清零']
key_words_3 = ['疫苗']
key_words_4 = ['快篩', 'pcr', 'PCR', 'Pcr']

In [44]:
start = time.process_time()
end = time.process_time()

covid_dict = {}
coexist_dict = {}
vaccine_dict = {}
rapid_test_dict = {}

for k in cofacts_segment.keys():
    text = cofacts_segment[k]['text']
    covid_text = catagorize(text, key_words_1)
    coexist_text = catagorize(text, key_words_2)
    vaccine_text = catagorize(text, key_words_3)
    rapid_test_text = catagorize(text, key_words_4)
    
    if covid_text == 'yes':
        covid_dict[k] = creat_new_dict(cofacts_segment, covid_dict, 'covid')
        if coexist_text == 'yes':
            coexist_dict[k] = creat_new_dict(cofacts_segment, coexist_dict, 'coexist')
        if vaccine_text == 'yes':
            vaccine_dict[k] = creat_new_dict(cofacts_segment, vaccine_dict, 'vaccine')
        if rapid_test_text == 'yes':
            rapid_test_dict[k] = creat_new_dict(cofacts_segment, rapid_test_dict, 'rapid_test')

print("This time is being calculated")
print(end - start)  

This time is being calculated
7.300000000043383e-05


In [45]:
coexist_set = set()
for k in coexist_dict.keys():
    coexist_set.add(coexist_dict[k]['article_id'])

In [46]:
rapid_test_set = set()
for k in rapid_test_dict.keys():
    rapid_test_set.add(rapid_test_dict[k]['article_id'])

In [54]:
len(covid_dict)

6509

In [48]:
with open(root_path/"cofacts_covid_20220319-20220513.json", "w") as f:
    json.dump(covid_dict, f, indent = 4)
    
with open(root_path/"cofacts_coexist_20220319-20220513.json", "w") as f:
    json.dump(coexist_dict, f, indent = 4)
    
with open(root_path/"cofacts_vaccine_20220319-20220513.json", "w") as f:
    json.dump(vaccine_dict, f, indent = 4)
    
with open(root_path/"cofacts_rapid_test_20220319-20220513.json", "w") as f:
    json.dump(rapid_test_dict, f, indent = 4)

# url_format

In [17]:
# start = time.process_time()# end = time.process_time()

# for k in cofacts_segment.keys():
#     text = cofacts_segment[k]['text'].lower()
#     cofacts_segment[k]['contains_url'] = contains_url(text)
#     cofacts_segment[k]['contains_url_format'] = contains_url_format(text)
    
# print("This time is being calculated")
# print(end - start)  

In [18]:
# for k in cofacts_segment.keys():
#     clean_segment = []
#     segments = cofacts_segment[k]['segment_t']
    
# #     if data[k]['contains_url_format'] != 'only_url':          
#     for w in segments:
#         if is_all_chinese(w)==True and len(w)>1:
#             clean_segment.append(w.replace('\n',''))
#     cofacts_segment[k]['clean_segment'] = clean_segment