In [1]:
import pandas as pd
import csv
from pathlib import Path
import json
from sklearn.metrics.pairwise import cosine_similarity
from itertools import combinations
from itertools import permutations
from igraph import *
import matplotlib.pyplot as plt
import numpy as np
from datetime import datetime
import time
import seaborn as sns
import io
import requests
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import AgglomerativeClustering
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
import scipy.sparse
from collections import Counter
import re
from zhon.hanzi import punctuation
from zhon.hanzi import characters
import math
from opencc import OpenCC 
# Shared OpenCC converter; 's2t' is OpenCC's Simplified -> Traditional Chinese configuration.
# Created once here so the cleaning loop can reuse it for every token.
cc = OpenCC('s2t')

In [2]:
# NOTE(review): the actual values were redacted, so this cell is not runnable as-is.
# Both names are later combined with the `/` operator (e.g. root_path/'....json'),
# so they presumably have to be pathlib.Path objects -- confirm before restoring them.
# root_path: directory holding the input JSON and the intermediate cleaned CSV.
# out_path:  directory the clustering label CSV is written to.
root_path = #deleted for security reasons
out_path = #deleted for security reasons

# Categorize

In [3]:
def is_all_chinese(strs):
    """Return True when every element of ``strs`` lies in the range U+4E00..U+9FA5.

    ``strs`` is iterated element by element (character by character for a
    string).  An empty input has no offending element and therefore yields True.
    """
    return all('\u4e00' <= ch <= '\u9fa5' for ch in strs)

In [4]:
def catagorize(segment, key_word_list):
    """Flag whether ``segment`` contains at least one non-empty keyword.

    Membership is tested with ``in``: an exact element match when ``segment``
    is a list of tokens, a substring match when it is a string.

    Returns the string 'yes' on the first hit, otherwise 'no'.
    """
    for keyword in key_word_list:
        # Empty keywords never count as a hit, even if they are "in" the segment.
        if keyword in segment and keyword:
            return 'yes'
    return 'no'

In [5]:
# Keyword lists used with catagorize().
# key_words_1: COVID-related terms (epidemic, quarantine, vaccine, epidemic prevention,
#              virus, novel coronavirus, pneumonia).
key_words_1 = ['疫情', '隔離', '疫苗', '防疫' ,'病毒', '新冠' , '肺炎']
# key_words_2: policy terms ("coexist" and "clear to zero", i.e. zero-COVID).
key_words_2 = ['共存', '清零']

In [11]:
# Clean the pre-segmented articles, tag them by keyword and save the result as CSV.
start = time.process_time()

# Load the file that has already been word-segmented.
print('loading...')
with open(root_path/'cofacts_wordseg_20220322-20220422.json' , 'r', encoding='big5') as reader:
    data = json.load(reader)
# The `with` block has closed the file at this point; no explicit close() is needed.
print(len(data))

print('cleaning...')
for doc in data:
    # clean_segment: convert each token to Traditional Chinese and keep only
    # tokens that are longer than one character and purely Chinese.
    clean_segment = []
    for s in doc['segment']:
        clean_s = cc.convert(s.replace('\n',''))
        if is_all_chinese(clean_s) and len(clean_s) > 1:
            clean_segment.append(clean_s)

    doc['clean_segment'] = clean_segment

    # Categorize: the second keyword list is only checked for articles that
    # already matched the first one; all other articles get None.
    covid_text = catagorize(clean_segment, key_words_1)
    if covid_text == 'yes':
        coexist_text = catagorize(clean_segment, key_words_2)
    else:
        coexist_text = None
    doc['covid'] = covid_text
    doc['coexist'] = coexist_text

data_df = pd.DataFrame(data)
data_df.to_csv(root_path/'cofacts_clean_wordseg_20220322-20220422.csv')

# Take the end timestamp only after all the work is done; taking it right after
# `start` (as before) always reported an elapsed time of ~0.
end = time.process_time()

print("This time is being calculated")
print(end - start)  

# HAC+KNN

In [6]:
# Build the document-term matrix from tokenised documents.
def vectorize(tokenList):
    """Turn tokenised documents into a sparse document-term matrix.

    Each document's tokens are joined with single spaces and passed to a
    ``CountVectorizer`` configured with ``binary=True`` (presence/absence
    instead of counts), ``min_df=1`` and ``max_df=0.9``.

    Returns whatever ``CountVectorizer.fit_transform`` returns for the joined
    documents.
    """
    documents = []
    for tokens in tokenList:
        documents.append(" ".join(tokens))

    vectorizer = CountVectorizer(binary=True, min_df=1, max_df=0.9)
    return vectorizer.fit_transform(documents)

In [7]:
def calculate_similarity(mat):
    """Pairwise row similarity: shared features divided by the larger row size.

    For rows i and j the result is ``(mat @ mat.T)[i, j] / max(nnz_i, nnz_j)``
    where ``nnz_k`` is the number of stored entries in row k.  With a binary
    (0/1) matrix the numerator is the number of features the two rows share.
    NOTE(review): non-binary input makes the numerator a dot product rather
    than an overlap count -- callers are assumed to pass binary matrices.

    Parameters
    ----------
    mat : array-like or scipy sparse matrix, shape (n_rows, n_features)

    Returns
    -------
    numpy.ndarray, shape (n_rows, n_rows)
        Dense similarity matrix.
    """
    # Normalise to a float CSR matrix: the bookkeeping below reads `.indices`
    # as column indices, which is only true for the CSR layout.
    mat = scipy.sparse.csr_matrix(mat).astype(float)

    # `@` is matrix multiplication for both sparse matrices and sparse arrays
    # (`*` is element-wise for the latter).
    intrsct = (mat @ mat.T).tocsr()

    # For every stored entry (i, j) of the product, look up the sizes of row i and row j.
    row_sums = mat.getnnz(axis=1)
    nnz_i = np.repeat(row_sums, intrsct.getnnz(axis=1))
    nnz_j = row_sums[intrsct.indices]

    intrsct.data = intrsct.data / np.maximum(nnz_i, nnz_j)
    # `.toarray()` rather than the `.A` shortcut, which newer SciPy releases no longer provide.
    return intrsct.toarray()

In [8]:
def HAC(tokenList=None, wordVector=None, distance_threshold=0.6, linkage='average'):
    """Hierarchical agglomerative clustering on a precomputed distance matrix.

    The distance between two documents is ``1 - calculate_similarity(...)``.

    Parameters
    ----------
    tokenList : iterable of token lists, optional
        Only used (via ``vectorize``) when ``wordVector`` is not given.
    wordVector : matrix, optional
        Document-term matrix, one row per document.
    distance_threshold : float
        Passed to ``AgglomerativeClustering``; ``n_clusters`` is left as None.
    linkage : str
        Linkage criterion passed to ``AgglomerativeClustering``.

    Returns
    -------
    numpy.ndarray
        One integer cluster label per document.  With fewer than two documents
        no clustering is possible, so every document (if any) gets label 0.

    Raises
    ------
    TypeError
        If neither ``tokenList`` nor ``wordVector`` is provided.
    """
    if wordVector is None:
        if tokenList is None:
            raise TypeError("HAC() needs either tokenList or wordVector")
        wordVector = vectorize(tokenList)

    # AgglomerativeClustering refuses inputs with fewer than two samples;
    # answer those trivial cases directly instead of crashing.
    n_samples = np.shape(wordVector)[0]
    if n_samples < 2:
        return np.zeros(n_samples, dtype=int)

    distanceMatrix = 1 - calculate_similarity(wordVector)

    # scikit-learn renamed `affinity` to `metric` (deprecated in 1.2, removed in 1.4).
    # Try the current name first and fall back for older installations, whose
    # constructor rejects the unknown keyword with a TypeError.
    try:
        model = AgglomerativeClustering(distance_threshold=distance_threshold,
                                        n_clusters=None,
                                        metric="precomputed",
                                        linkage=linkage)
    except TypeError:
        model = AgglomerativeClustering(distance_threshold=distance_threshold,
                                        n_clusters=None,
                                        affinity="precomputed",
                                        linkage=linkage)

    return model.fit_predict(distanceMatrix)

In [9]:
def _assign_cluster_labels(wv, training_size, distance_threshold, n_neighbors, rng):
    """Label every row of ``wv``: HAC on a random sample, KNN for the rest, HAC on leftovers."""
    n_articles = wv.shape[0]
    # -1 marks "not (yet) part of a multi-document cluster".
    out = np.full(n_articles, -1)

    # HAC needs at least two documents; with a smaller sample the training/KNN
    # stage is skipped and every document is handled by the leftover pass below.
    if training_size >= 2:
        uid_train = np.sort(rng.choice(n_articles, training_size, replace=False))
        uid_test = np.setdiff1d(np.arange(n_articles), uid_train)
        wv_train = wv[uid_train, :]

        # Hierarchical clustering produces the training labels;
        # clusters containing a single document are relabelled -1.
        labels = HAC(wordVector=wv_train, distance_threshold=distance_threshold)
        countsof = Counter(labels)
        y_train = np.array([x if countsof[x] > 1 else -1 for x in labels])
        out[uid_train] = y_train

        if uid_test.size:
            # KNN cannot use more neighbours than there are training documents.
            knn = KNeighborsClassifier(n_neighbors=min(n_neighbors, training_size),
                                       weights='distance')
            knn.fit(wv_train, y_train)
            out[uid_test] = knn.predict(wv[uid_test, :])

    # Second pass: cluster everything still labelled -1 and shift the new
    # labels past the largest label assigned so far.
    uid_leftover = np.where(out == -1)[0]
    if uid_leftover.size >= 2:
        labels_leftover = HAC(wordVector=wv[uid_leftover, :],
                              distance_threshold=distance_threshold)
    else:
        labels_leftover = np.zeros(uid_leftover.size, dtype=int)
    out[uid_leftover] = (np.max(out) + 1) + labels_leftover

    return out


def clustering(clustering_data, train_portion, random_state=None,
               distance_threshold=0.6, n_neighbors=10):
    """Cluster documents with HAC on a random sample plus KNN label propagation.

    Parameters
    ----------
    clustering_data : dict
        Maps a document id to a dict holding the document's tokens under the
        key ``'clean_segment'``.
    train_portion : float
        Fraction (0..1) of the documents clustered directly with HAC; the
        remaining documents are labelled by a KNN classifier trained on them.
    random_state : int, optional
        Seed for the training-sample draw.  When None, NumPy's global random
        state is used.
    distance_threshold : float
        Distance threshold forwarded to ``HAC``.
    n_neighbors : int
        Upper bound on the KNN neighbour count (capped at the training size).

    Returns
    -------
    pandas.DataFrame
        The input records plus a ``doc_id`` column (the dict keys) and a
        ``label`` column with the assigned cluster id.

    Raises
    ------
    ValueError
        If ``train_portion`` lies outside [0, 1].
    """
    if not 0 <= train_portion <= 1:
        raise ValueError(f"train_portion must be within [0, 1], got {train_portion!r}")

    n_articles = len(clustering_data)

    print(f'n_articles: {n_articles}')
    print(f'train_portion: {train_portion}')

    if n_articles == 0:
        # Nothing to cluster: return an empty frame with the usual columns.
        return pd.DataFrame(columns=['doc_id', 'clean_segment', 'label'])

    if n_articles == 1:
        # A single document is its own cluster; vectorising it would fail
        # because max_df=0.9 cannot be satisfied by one document.
        out = np.zeros(1, dtype=int)
    else:
        # Build the binary document-term matrix (sparse).
        tokenList = [x['clean_segment'] for x in clustering_data.values()]
        stringifyToks = [" ".join(t) for t in tokenList]
        cv = CountVectorizer(binary=True, min_df=1, max_df=0.9, analyzer='word',token_pattern=u"(?u)\\b\\w+\\b")
        wv = cv.fit_transform(stringifyToks)

        rng = np.random if random_state is None else np.random.RandomState(random_state)
        training_size = int(n_articles * train_portion)
        out = _assign_cluster_labels(wv, training_size, distance_threshold, n_neighbors, rng)

    id2label = dict(zip(clustering_data.keys(), out))

    label_df = pd.DataFrame.from_dict(clustering_data, orient='index')
    label_df['label'] = label_df.index.to_series().map(id2label)
    label_df = label_df.reset_index()
    label_df = label_df.rename(columns={'index':'doc_id'})

    return label_df

In [None]:
# Cluster the articles tagged coexist == 'yes' and save one label per article.
import ast  # local import: only needed here to parse the stringified token lists from the CSV

RANDOM_SEED = 42
clean_csv_path = root_path/'cofacts_clean_wordseg_20220322-20220422.csv'

start = time.process_time()

print('loading...')
# The original printed `file.stem`, but no `file` variable is defined in this
# notebook (NameError on a fresh kernel); print the stem of the input path instead.
print(clean_csv_path.stem)

try:
    data_df = pd.read_csv(clean_csv_path)
except pd.errors.ParserError:
    # Retry with an explicit line terminator when the default parser fails;
    # any other error (e.g. a missing file) is allowed to surface.
    data_df = pd.read_csv(clean_csv_path, lineterminator='\n')

clustering_data = {}
for rows in data_df[data_df.coexist=='yes'].itertuples():
    tokens = rows.clean_segment
    # to_csv stored each token list as its string repr ("['疫情', '共存']").
    # Turn it back into a real list; otherwise " ".join(...) inside clustering()
    # would split the text into single characters instead of words.
    if isinstance(tokens, str):
        tokens = ast.literal_eval(tokens)
    clustering_data[rows.doc_id] = {'clean_segment': tokens}

print('clustering...')
# clustering() draws its training sample from NumPy's global random state;
# seed it so that a re-run reproduces the same labels.
np.random.seed(RANDOM_SEED)
label_df = clustering(clustering_data, 0.2)

label_df.to_csv(out_path/'cofacts_clustering_label_20220322-20220422.csv', index=False)

# Take the end timestamp after the work; taking it right after `start` always printed ~0.
end = time.process_time()

print("This time is being calculated")
print(end - start)  