In [1]:
import pandas as pd
import json
import numpy as np
from os import listdir
import requests
from tqdm import tqdm
from datetime import datetime
from datetime import timedelta

## Functions

In [2]:
def load_misinformation():
    misinformations = pd.read_excel('/Users/cblanesg/misinformation_socialmedia/data/2-misinformation/newspapers/0-misinformation/misinformation_newspapers.xlsx').drop('Unnamed: 0', axis = 1)
    return(misinformations)

In [3]:
def load_engagements():
    path = '../../../../data/2-misinformation/newspapers/1-engagements/1-scrapes/'
    data = []
    for i in listdir(path):
        try:
            data.append(pd.read_json(path + i))
        except:
            print(i)
    data = pd.concat(data)
    return(data)

In [4]:
def clean_engagements():
    engagements = load_engagements()
    misinformations = load_misinformation()
    
    engagements['date_publication'] = engagements['date'].apply(lambda x: x.date())
    engagements = engagements[['id_desinformacion',
                               'id',
                               'date_publication',
                               'history',
                               'postUrl']].rename(columns = {'id':'id_post_desinformacion', 
                                             'postUrl':'link_post_desinformacion'})
    
    all_data = pd.merge(left = engagements, 
            right = misinformations, 
            on = 'id_desinformacion', 
            how = 'left')
    return(all_data)

## Apply Functions

In [7]:
raw = load_misinformation()

In [9]:
len(raw.id_desinformacion.unique())

1967

In [5]:
misinformation = clean_engagements()

.DS_Store
newspapers_link_checker.json


In [25]:
def counter_history(misinformation):
    list_dummy_history = []
    for i in range(0, len(misinformation)):
        if len(misinformation.loc[i]['history']) > 1:
            list_dummy_history.append(1)
        else:
            list_dummy_history.append(0)
    return(list_dummy_history)

In [26]:
misinformation['dummy_timeseries'] = counter_history(misinformation)

In [28]:
misinformation['dummy_factcheck'] = np.where(pd.isna(misinformation['date_factcheck_final']), 0, 1)

In [29]:
misinformation[['id_desinformacion','dummy_factcheck', 'dummy_timeseries']].groupby(['dummy_factcheck', 'dummy_timeseries']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,id_desinformacion
dummy_factcheck,dummy_timeseries,Unnamed: 2_level_1
0,0,518
0,1,1268
1,0,30688
1,1,36910


In [41]:
misinformation[['id_desinformacion','dummy_factcheck', 'dummy_timeseries']].drop_duplicates().groupby(['dummy_factcheck', 'dummy_timeseries']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,id_desinformacion
dummy_factcheck,dummy_timeseries,Unnamed: 2_level_1
0,0,27
0,1,29
1,0,1180
1,1,1237


In [None]:
613 + 

In [30]:
subset = misinformation[misinformation['dummy_factcheck'] == 1]
subset = subset[subset['dummy_timeseries'] == 1]

In [31]:
len(subset.id_desinformacion.unique())

1237

In [13]:
len(misinformation.id_post_desinformacion), len(misinformation.id_desinformacion.unique())

(69384, 1532)

In [21]:
len(raw[raw['date_factcheck_final'].isna()]), len(raw[~raw['date_factcheck_final'].isna()])

(23, 1960)

In [37]:
subset[['id_desinformacion', 'label_desinformacion']].groupby(['label_desinformacion']).count()

Unnamed: 0_level_0,id_desinformacion
label_desinformacion,Unnamed: 1_level_1
fake,20387
misleading,7082
true,9395
true,23


In [38]:
9395 + 23

9418

In [36]:
subset[['id_desinformacion', 'label_desinformacion']].drop_duplicates().groupby(['label_desinformacion']).count()

Unnamed: 0_level_0,id_desinformacion
label_desinformacion,Unnamed: 1_level_1
fake,613
misleading,145
true,475
true,1


In [48]:
misinformation[['id_desinformacion', 'id_post_desinformacion']].groupby(['id_desinformacion']).count().reset_index().id_post_desinformacion.mean()

45.28981723237598

In [49]:
misinformation[['id_desinformacion', 'id_post_desinformacion']].groupby(['id_desinformacion']).count().reset_index().id_post_desinformacion.max()

1000

In [51]:
misinformation[['id_desinformacion', 'id_post_desinformacion']].groupby(['id_desinformacion']).count().reset_index().id_post_desinformacion.min()

1

In [62]:
counts_df = misinformation[['id_desinformacion', 'id_post_desinformacion']].groupby(['id_desinformacion']).count().reset_index().rename(columns = {'id_post_desinformacion':'counts'})

In [52]:
len(misinformation.id_desinformacion.unique())

1532

In [54]:
raw = load_misinformation()

In [55]:
len(raw.id_desinformacion.unique())

1967

In [67]:
no_posts = raw[~raw['id_desinformacion'].isin(counts_df.id_desinformacion)][['id_desinformacion']]

In [68]:
no_posts['counts'] = 0

In [69]:
complete_counts = pd.concat([counts_df, no_posts])

In [72]:
complete_counts.counts.mean(), complete_counts.counts.median()

(34.901408450704224, 5.0)