In [1]:
import pandas as pd
import csv
from pathlib import Path
import json
from sklearn.metrics.pairwise import cosine_similarity
from itertools import combinations
from itertools import permutations
from igraph import *
import matplotlib.pyplot as plt
import numpy as np
from datetime import datetime
import time
import statistics
from scipy import stats
import seaborn as sns

In [2]:
# NOTE: both values were redacted from the shared notebook and must be restored
# before anything below can run. Later cells build file paths with
# `root_path/'...'` and `out_path/'...'`, so each value has to support the `/`
# operator with a string on the right-hand side (e.g. a pathlib.Path).
root_path = ###deleted for security reasons###
out_path = ###deleted for security reasons###

# import data

In [3]:
# Build one [article_id, date, views] record per article per day that falls
# inside the study window (2022-03-19 .. 2022-05-13, both ends inclusive).
cofacts_daily_views = []
tformat = "%Y-%m-%d"

# Parse the window bounds once instead of on every loop iteration.
window_start = datetime.strptime('2022-03-19', tformat)
window_end = datetime.strptime('2022-05-13', tformat)

# NOTE(review): JSON is normally UTF-8; the 'big5' encoding is kept from the
# original cell -- confirm it matches how the dump was written.
with open(root_path/'cofacts_20220319-20220513.json', 'r', encoding='big5') as reader:
    data = json.load(reader)

# The file is fully loaded above, so it no longer needs to stay open here.
for article in data:
    for d in article['stats']:
        day = d['date'].split('T')[0]  # keep only the YYYY-MM-DD part
        if window_start <= datetime.strptime(day, tformat) <= window_end:
            # Every value except the last one in the stats entry is summed as
            # a view counter, with missing (None) counters counted as 0.
            # NOTE(review): the skipped last value is presumably the 'date'
            # field -- verify against the JSON schema.
            daily_counts = list(d.values())[:-1]
            cofacts_daily_views.append([
                article['id'],
                day,
                sum(0 if value is None else value for value in daily_counts),
            ])

In [4]:
# Long-format table: one row per collected [article_id, date, views] record.
cofacts_daily_views_df = pd.DataFrame(
    data=cofacts_daily_views,
    columns=['article_id', 'date', 'views'],
)

In [5]:
# Wide-format table: one row per article, one column per date.
# Article/date combinations without a record are filled with 0 views.
cofacts_daily_views_pivot = (
    cofacts_daily_views_df
    .pivot(index='article_id', columns='date', values='views')
    .reset_index()
    .fillna(0)
)

# background - COVID

In [6]:
# Clustering result for the COVID article set; used below as the background.
cofacts_covid = pd.read_csv(
    out_path / 'cofacts_covid_20220319-20220513_clustering.csv'
)

In [7]:
# Attach the per-date view columns to every COVID article.
# The join key is stated explicitly so the merge cannot silently start joining
# on additional columns if the two frames ever share another column name.
# Articles without any view record keep NaN in the date columns (left join).
cofacts_covid_report = cofacts_covid[['label', 'article_id', 'article_type_count']].merge(
    cofacts_daily_views_pivot,
    how='left',
    on='article_id',
)

### sum all

In [8]:
# Background series: per-date view totals summed over all COVID articles,
# as a plain list ordered like the date columns.
cofacts_covid_report_bg_array = (
    cofacts_covid_report
    .drop(columns=['label', 'article_id', 'article_type_count'])
    .sum()
    .to_list()
)

### group by label

In [9]:
# Per-label daily view totals: column 0 is the label, every other column a date.
# The non-daily columns are dropped before aggregating: summing `article_id`
# (string concatenation on recent pandas) or `article_type_count` would add
# columns that cofacts_stats() would wrongly treat as daily view counts.
cofacts_covid_report_label = (
    cofacts_covid_report
    .drop(columns=['article_id', 'article_type_count'])
    .groupby(by='label', as_index=False)
    .sum()
)

# subset

In [10]:
# Load the clustering results of the three topic subsets in one pass.
cofacts_coexist, cofacts_vaccine, cofacts_rapid_test = (
    pd.read_csv(out_path / csv_name)
    for csv_name in (
        'cofacts_coexist_20220319-20220513_clustering.csv',
        'cofacts_vaccine_20220319-20220513_clustering.csv',
        'cofacts_rapid_test_20220319-20220513_clustering.csv',
    )
)

In [11]:
# Number of distinct article ids in the COVID set.
len(set(cofacts_covid['article_id']))

6509

In [12]:
# Total views per article over the whole window.
# The original cell repeated this identical statement three times; once is
# enough. Only `views` is aggregated so the string `date` column is never
# summed (recent pandas would concatenate the date strings).
cofacts_daily_views_count = (
    cofacts_daily_views_df
    .groupby(by='article_id', as_index=False)['views']
    .sum()
)

In [13]:
# Attach the per-date view columns to every article of each topic subset.
# The join key is explicit so the merges cannot silently pick up extra keys.
cofacts_coexist_report = cofacts_coexist[['label', 'article_id', 'article_type_count']].merge(
    cofacts_daily_views_pivot, how='left', on='article_id')
cofacts_vaccine_report = cofacts_vaccine[['label', 'article_id', 'article_type_count']].merge(
    cofacts_daily_views_pivot, how='left', on='article_id')
cofacts_rapid_test_report = cofacts_rapid_test[['label', 'article_id', 'article_type_count']].merge(
    cofacts_daily_views_pivot, how='left', on='article_id')

### group by label

In [14]:
# Per-label daily view totals for each topic subset (column 0 = label, the
# remaining columns = dates). `article_id` and `article_type_count` are dropped
# first so that only daily view columns are summed; otherwise cofacts_stats()
# would treat their aggregates as additional daily values.
non_daily_columns = ['article_id', 'article_type_count']
cofacts_coexist_report_label = cofacts_coexist_report.drop(columns=non_daily_columns)\
                               .groupby(by='label', as_index=False).sum()
cofacts_vaccine_report_label = cofacts_vaccine_report.drop(columns=non_daily_columns)\
                               .groupby(by='label', as_index=False).sum()
cofacts_rapid_test_report_label = cofacts_rapid_test_report.drop(columns=non_daily_columns)\
                                  .groupby(by='label', as_index=False).sum()

In [15]:
def cofacts_stats(df, bg_array=None):
    """Compare each label's daily view series with a background series.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per label. Column 0 holds the label; every remaining column
        must be a numeric daily view total.
    bg_array : sequence of numbers, optional
        Background daily series, with one value per daily column of `df`.
        Defaults to the notebook-level `cofacts_covid_report_bg_array`,
        which is what the original implementation always used.

    Returns
    -------
    pandas.DataFrame
        One row per label with the columns `label`, `label_views_sum`
        (total views), `label_corr` (Pearson correlation with the background),
        `label_cos_sim` (cosine similarity with the background) and
        `label_kurt` (kurtosis from `pandas.Series.kurt`).
    """
    if bg_array is None:
        bg_array = cofacts_covid_report_bg_array
    # Use float64 for the background everywhere; the earlier int64 cast in the
    # correlation would fail on NaN and truncate any non-integer value.
    background = np.asarray(bg_array, dtype=np.float64)

    # Work purely by position. The previous loop fed `df.index` labels into
    # `.iloc`, which is only correct when the index is a default RangeIndex.
    labels = df.iloc[:, 0].tolist()
    daily_matrix = df.iloc[:, 1:].to_numpy(dtype=np.float64)

    label_stats = []
    for label, daily in zip(labels, daily_matrix):
        label_cos_sim = cosine_similarity([daily], [background])[0][0]
        label_corr = np.corrcoef(daily, background)[0, 1]
        label_stats.append([label,
                            float(daily.sum()),
                            label_corr,
                            label_cos_sim,
                            pd.Series(daily).kurt()])

    label_stats = pd.DataFrame(label_stats, columns=['label', 'label_views_sum', 'label_corr',
                                                     'label_cos_sim', 'label_kurt'])
    return label_stats

In [16]:
# Per-label statistics for the full COVID set.
cofacts_covid_report_label_sim = cofacts_stats(df=cofacts_covid_report_label)

In [17]:
# Per-label statistics for each topic subset, computed in the same order as
# the names on the left-hand side.
(
    cofacts_coexist_report_label_sim,
    cofacts_vaccine_report_label_sim,
    cofacts_rapid_test_report_label_sim,
) = map(
    cofacts_stats,
    (
        cofacts_coexist_report_label,
        cofacts_vaccine_report_label,
        cofacts_rapid_test_report_label,
    ),
)

# merge back

In [18]:
# article_sum
# Total views per article, exposed as `article_views`. Only `views` is
# aggregated so the string `date` column is never summed (recent pandas would
# concatenate the date strings into an extra column).
cofacts_daily_views_count = (
    cofacts_daily_views_df
    .groupby(by='article_id', as_index=False)['views']
    .sum()
    .rename(columns={'views': 'article_views'})
)

In [19]:
# text
# Re-read the article dump and keep only each article's id and text.
tformat = "%Y-%m-%d"  # carried over from the original cell; not used here
with open(root_path/'cofacts_20220319-20220513.json' , 'r', encoding='big5') as reader:
    data = json.load(reader)
cofacts_text = [[article['id'], article['text']] for article in data]
cofacts_text_df = pd.DataFrame(data=cofacts_text, columns=['article_id', 'text'])

In [20]:
def merge_back(df):
    df_stats = df.merge(cofacts_text_df, how='left')\
                 .merge(cofacts_daily_views_count, how='left')\
                 .merge(cofacts_covid_report_label_sim, how='left')
    return df_stats

In [21]:
cofacts_covid_stats = merge_back(cofacts_covid)
cofacts_covid_stats.to_csv(out_path/'cofacts_covid_20220319-20220513_clustering_stats.csv', index=False)

cofacts_coexist_stats = merge_back(cofacts_coexist)
cofacts_coexist_stats.to_csv(out_path/'cofacts_coexist_20220319-20220513_clustering_stats.csv', index=False)

cofacts_vaccine_stats = merge_back(cofacts_vaccine)
cofacts_vaccine_stats.to_csv(out_path/'cofacts_vaccine_20220319-20220513_clustering_stats.csv', index=False)

cofacts_rapid_test_stats = merge_back(cofacts_rapid_test)
cofacts_rapid_test_stats.to_csv(out_path/'cofacts_rapid_tset_20220319-20220513_clustering_stats.csv', index=False)