In [1]:
import pandas as pd
import csv
from pathlib import Path
import json
from sklearn.metrics.pairwise import cosine_similarity
from itertools import combinations
from itertools import permutations
from igraph import *
import matplotlib.pyplot as plt
import numpy as np
from datetime import datetime
import time
import seaborn as sns
import io
import requests
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import AgglomerativeClustering
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
import scipy.sparse
from collections import Counter
import re
import math
from opencc import OpenCC 
cc = OpenCC('s2t')

In [2]:
root_path = ###deleted for security reasons###
out_path = ###deleted for security reasons###

In [3]:
cofacts_truth_score = pd.read_csv(out_path/'cofacts_truth_score_20220319-20220513.csv')

# HAC+KNN

In [4]:
def vectorize(tokenList):
    stringifyToks = [" ".join(t) for t in tokenList]
    cv = CountVectorizer(binary=True, min_df=1, max_df=0.9)
    wordVector_csr = cv.fit_transform(stringifyToks) #計算詞頻
    return wordVector_csr

In [5]:
def calculate_similarity(mat):

    if not scipy.sparse.issparse(mat):
        mat = scipy.sparse.csr_matrix(mat)
    mat = mat.astype(float)

    intrsct = mat * mat.T

    # for rows
    row_sums = mat.getnnz(axis=1)
    nnz_i = np.repeat(row_sums, intrsct.getnnz(axis=1))
    nnz_j = row_sums[intrsct.indices]

    intrsct.data = intrsct.data / np.maximum(nnz_i, nnz_j)
    return intrsct.A

In [6]:
def HAC(tokenList=None, wordVector=None, distance_threshold=0.6, linkage='average'):
    if wordVector is None:
        wordVector = vectorize(tokenList)

    similarityMatrix = calculate_similarity(wordVector)
    distanceMatrix = 1 - similarityMatrix

    del similarityMatrix

    model = AgglomerativeClustering(distance_threshold=distance_threshold,
                                    n_clusters=None,
                                    affinity="precomputed",
                                    linkage=linkage)
    labels = model.fit_predict(distanceMatrix)

    return labels

In [7]:
def clustering(clustering_data, train_portion):
    n_articles = len(clustering_data)
    out = np.full(n_articles, -100)  # init

    tokenList = np.array([x['clean_segment'] for x in data.values()], dtype=object) 

    print(f'n_articles: {n_articles}')
    print(f'train_portion: {train_portion}')
    
    stringifyToks = [" ".join(t) for t in tokenList]
    cv = CountVectorizer(binary=True, min_df=1, max_df=0.9, analyzer='word',token_pattern=u"(?u)\\b\\w+\\b")
    wv = cv.fit_transform(stringifyToks) 

    training_size = int(n_articles * train_portion)
    uid_train = np.random.choice(n_articles, training_size, replace=False)
    uid_train = np.sort(uid_train)
    uid_test = np.array(list(set(range(n_articles)) - set(uid_train)))
    
    wv_train = wv[uid_train, :]
    wv_test = wv[uid_test, :]
    
    distance_threshold = 0.6
    labels = HAC(wordVector=wv_train, distance_threshold=distance_threshold)
    
    countsof = Counter(labels)
    y_train = np.array([x if countsof[x] > 1 else -1 for x in labels]) #取value
    out[uid_train] = y_train
    
    knn = KNeighborsClassifier(n_neighbors=10, weights='distance')
    knn.fit(wv_train, y_train)

    y_pred = knn.predict(wv_test)
    out[uid_test] = y_pred
    
    uid_leftover = np.where(out == -1)[0]
    wv_leftover = wv[uid_leftover, :]

    labels_leftover = HAC(wordVector=wv_leftover, distance_threshold=distance_threshold)
    labels_leftover = (np.max(out) + 1) + labels_leftover

    out[uid_leftover] = labels_leftover
    id2label = dict(zip(clustering_data.keys(), out))
    
    label_df = pd.DataFrame.from_dict(data, orient='index')
    label_df['label'] = label_df.index.to_series().apply(lambda x: id2label.get(x))
#     label_df = label_df.reset_index()
#     label_df = label_df.rename(columns={'index':'doc_id'})
    label_df = label_df[label_df['label'].isna()==False]

    return label_df

In [14]:
start = time.process_time()
end = time.process_time()

sub = ['covid', 'coexist', 'vaccine', 'rapid_test']
print('loading...')
for file in root_path.glob(f'*cofacts*'):
    for x in sub:
        if x in file.stem:
            print(file.stem)
            with open(file , 'r', encoding='big5') as reader:
                data = json.loads(reader.read())

            print('clustering...')
            label_df = clustering(data, 0.1)
            label_df.label = label_df.label.astype(str)
            label_df.label = label_df.category + '_' + label_df.label
            label_df = label_df.merge(cofacts_truth_score[['article_id', 'article_type_count']], how='left')
            label_df.to_csv(out_path/f'{file.stem}_clustering.csv', index=False)

print("This time is being calculated")
print(end - start)  

loading...
cofacts_covid_20220319-20220513
clustering...
n_articles: 6509
train_portion: 0.1
cofacts_vaccine_20220319-20220513
clustering...
n_articles: 2776
train_portion: 0.1
cofacts_coexist_20220319-20220513
clustering...
n_articles: 103
train_portion: 0.1
cofacts_rapid_test_20220319-20220513
clustering...
n_articles: 573
train_portion: 0.1
This time is being calculated
7.000000000090267e-05


In [10]:
label_df

Unnamed: 0,article_id,createdAt,clean_segment,category,label,article_type_count
0,1ewp9hkii5mcy,2020-01-28,"[補充, 湖北省, 武漢, 華中, 科技, 大學, 同濟, 醫學院, 附屬, 同濟, 醫院,...",rapid_test,rapid_test_508,謠言或個人意見
1,c4rj2rxep739,2020-01-23,"[武漢, 肺炎, 傳染力, 潛伏期, 尚未, 發病, 看似, 症狀, 狀態, 已經, 傳染,...",rapid_test,rapid_test_373,謠言或個人意見
2,20ewmsj8w05j2,2020-01-19,"[武漢, 肺炎, 證實, 新型, 目前, 泰國, 日本, 越南, 確診, 病例, 中國, 政...",rapid_test,rapid_test_318,謠言或個人意見
3,3cpvcucef9sgv,2020-02-24,"[耳鼻喉科, 醫師, 羣組, 裡面, 大部分, 開業, 醫師, 認為, 這些, 臺灣, 最多...",rapid_test,rapid_test_399,謠言或個人意見
4,2vpo1u9e1fat5,2020-02-24,"[共同, 開發, 武漢, 肺炎, 疫苗, 共同, 中央社, 記者, 陳韻聿, 臺北, 臺灣,...",rapid_test,rapid_test_491,不是謠言
...,...,...,...,...,...,...
568,1brwzm6np570e,2022-05-01,"[不要, 相信, 鼻用, 篩戳, 喉嚨, 或者, 鼻孔, 酒精, 不會, 確診, 網路, 謠...",rapid_test,rapid_test_43,不是謠言
569,2bgsy3fwz0s6h,2022-05-01,"[無能, 還是, 人謀不臧, 國內, 篩試劑, 國際, 衛福部長, 陳時中, 因為, 臺灣,...",rapid_test,rapid_test_111,謠言或個人意見
570,34emnzhqxrmq9,2022-05-01,"[自己, 快篩, 功能, 測試, 住呼吸, 多少, 正常, 超強]",rapid_test,rapid_test_42,謠言或個人意見
571,f5vbbwot1z8m,2022-05-01,"[唾液, 覈准, 進口, 扯上, 高端, 陳時中, 高端, 什麼, 關係, 只要, 顏色, ...",rapid_test,rapid_test_60,
