In [1]:
import pandas as pd
import csv
from pathlib import Path
import json
import numpy as np
from datetime import datetime
import time
import os

In [2]:
root_path = ###deleted for security reasons###
out_path = ###deleted for security reasons###

In [3]:
# ['id', 'text', 'createdAt', 'stats', 'articleReplies', 'url', 'segment']
cofacts_reply = []
article_id_set = set()
with open(root_path/'cofacts_20220319-20220513.json' , 'r', encoding='big5') as reader:
    data = json.loads(reader.read())
    for idx, article in enumerate(data):
        article_id_set.add(article['id'])
        
        if not article['articleReplies'] == []:
            for reply in article['articleReplies']:

                cofacts_reply.append([
                    article['id'],
                    reply['reply']['id'],
                    reply['reply']['type'],
                    reply['positiveFeedbackCount'],
                    reply['negativeFeedbackCount']
                ])
        
cofacts_reply_df = pd.DataFrame(cofacts_reply, 
                                columns=['article_id', 'reply_id', 'reply_type',
                                         'positiveFeedbackCount', 'negativeFeedbackCount'])

In [4]:
#all reports
len(article_id_set)

23079

In [5]:
#fact-check reports
replied_article_id_set = set(cofacts_reply_df['article_id'])
print(len(replied_article_id_set))

12326


In [6]:
#non-fact-check reports
len(article_id_set) - len(replied_article_id_set)

10753

In [7]:
#fact-check proportion
len(replied_article_id_set)/len(article_id_set)

0.5340785995927033

In [8]:
#num of fact-check replies per report
reply_count = cofacts_reply_df[['article_id', 'reply_id']]\
.groupby(by='article_id', as_index=False).count()\
.sort_values(by='reply_id', ascending=False)\
.rename(columns={'reply_id':'reply_id_size'})

In [9]:
cofacts_reply_df = cofacts_reply_df.merge(reply_count, how='right')

In [10]:
set(cofacts_reply_df.reply_type)

{'NOT_ARTICLE', 'NOT_RUMOR', 'OPINIONATED', 'RUMOR'}

# encoding

In [11]:
def reply_type_score(reply_type):
    if reply_type == 'RUMOR':
        return -1
    if reply_type == 'OPINIONATED':
        return -1
    if reply_type == 'NOT_RUMOR':
        return 1
    if reply_type == 'NOT_ARTICLE':
        return 0
    else:
        return 0

In [12]:
reply_type_score('NOT_ARTICLE')

0

In [13]:
for rows in cofacts_reply_df.itertuples():
    cofacts_reply_df.loc[rows.Index, 'type_score'] = reply_type_score(cofacts_reply_df.loc[rows.Index, 'reply_type'])

In [14]:
cofacts_reply_df.columns

Index(['article_id', 'reply_id', 'reply_type', 'positiveFeedbackCount',
       'negativeFeedbackCount', 'reply_id_size', 'type_score'],
      dtype='object')

In [15]:
cofacts_reply_df = cofacts_reply_df.reindex(columns=['article_id', 'reply_id', 
                                                     'reply_id_size', 'type_score', 'reply_type', 
                                                     'positiveFeedbackCount', 'negativeFeedbackCount'])

In [16]:
cofacts_reply_df

Unnamed: 0,article_id,reply_id,reply_id_size,type_score,reply_type,positiveFeedbackCount,negativeFeedbackCount
0,2o1zve8awcgvn,aiZOMoABvUvLpBdgBzEO,12,1.0,NOT_RUMOR,0,1
1,2o1zve8awcgvn,wyXqon8BvUvLpBdgAat7,12,-1.0,RUMOR,4,1
2,2o1zve8awcgvn,5SZ3J4ABvUvLpBdgXSUn,12,1.0,NOT_RUMOR,0,1
3,2o1zve8awcgvn,rSaQJoABvUvLpBdgfyTR,12,1.0,NOT_RUMOR,0,1
4,2o1zve8awcgvn,zCVEnX8BvUvLpBdgBKZH,12,1.0,NOT_RUMOR,2,3
...,...,...,...,...,...,...,...
14266,2cxbm4zpv2kr2,pSYQqYABvUvLpBdgirzW,1,-1.0,RUMOR,2,0
14267,2cxq2samivaqi,w6Sd_30BnX5-aOa4eJoi,1,1.0,NOT_RUMOR,5,1
14268,2cy7jwh1ad7a7,RqN5Y3UB9w1KR1IkMWt9,1,0.0,NOT_ARTICLE,1,0
14269,2cy8agsbhdn7w,YgPy3nABrhVJn3LNfVih,1,-1.0,RUMOR,1,0


# article_truth_score
## count

In [17]:
def count_truth_score(type_score, positiveFeedbackCount, negativeFeedbackCount):
    return type_score*positiveFeedbackCount - type_score*negativeFeedbackCount

In [18]:
start = time.process_time()
end = time.process_time()

cofacts_reply_df['reply_truth_score'] = cofacts_reply_df.apply(lambda r:count_truth_score(r['type_score'],
                                                               r['positiveFeedbackCount'],
                                                               r['negativeFeedbackCount']), axis=1)

print("This time is being calculated")
print(end - start)  

This time is being calculated
4.7999999999603915e-05


In [19]:
reply_article_truth_score = cofacts_reply_df[['article_id', 'reply_truth_score']]\
                            .groupby(by='article_id', as_index=False).sum()\
                            .rename(columns={'reply_truth_score':'article_truth_score_count'})

In [20]:
reply_type_count=[]

for idx in replied_article_id_set:
    reply_type_set = set(cofacts_reply_df[cofacts_reply_df.article_id==idx]['reply_type'])
    reply_type_count.append((idx, len(reply_type_set), str(reply_type_set)))
    
article_reply_type_set = pd.DataFrame(reply_type_count, 
                                      columns=['article_id', 'reply_type_count', 'reply_type_set'])

In [21]:
reply_size = cofacts_reply_df[['article_id', 'reply_id']].groupby(by='article_id', as_index=False).size()

In [22]:
article_truth_score = article_reply_type_set.merge(reply_size).merge(reply_article_truth_score)

In [23]:
def article_type_decision(article_truth_score_count):
    if article_truth_score_count > 0:
        return 'not rumor' 
    elif article_truth_score_count == 0:
        return 'cannot be fact-checked' 
    elif article_truth_score_count < 0:
        return 'rumor or opinion'

In [24]:
article_truth_score['article_type_count'] = article_truth_score.apply(lambda r:
                                            article_type_decision(r['article_truth_score_count']), 
                                            axis=1)

In [25]:
article_truth_score.sort_values(by=['article_truth_score_count', 'reply_type_count'], ascending=True)

Unnamed: 0,article_id,reply_type_count,reply_type_set,size,article_truth_score_count,article_type_count
9918,3uzlh41rh4oqm,1,{'RUMOR'},2,-1006.0,rumor or opinion
4999,1qhjmled1mv2w,1,{'RUMOR'},2,-502.0,rumor or opinion
11556,oikfzsvt011y,2,"{'RUMOR', 'NOT_ARTICLE'}",4,-310.0,rumor or opinion
2401,119qngktbf3wm,1,{'RUMOR'},3,-277.0,rumor or opinion
11405,wn5wrmpxylxn,1,{'RUMOR'},1,-276.0,rumor or opinion
...,...,...,...,...,...,...
10848,3p3sunew3un5j,1,{'NOT_RUMOR'},1,78.0,not rumor
6536,2lnupn6kprnz5,1,{'NOT_RUMOR'},1,79.0,not rumor
2195,3ppeiqj59yffi,1,{'NOT_RUMOR'},1,111.0,not rumor
7988,1frt07jdgygwy,1,{'NOT_RUMOR'},1,112.0,not rumor


In [41]:
article_truth_score.to_csv(out_path/'cofacts_truth_score_20220319-20220513.csv', index=False)