In [224]:
import pandas as pd
import json
import numpy as np
from os import listdir
import requests
from tqdm import tqdm
from datetime import datetime
from datetime import timedelta

## Functions

In [225]:
def load_misinformation():
    misinformations = pd.read_excel('/Users/cblanesg/misinformation_socialmedia/data/2-misinformation/newspapers/0-misinformation/misinformation_newspapers.xlsx').drop('Unnamed: 0', axis = 1)
    return(misinformations)

In [226]:
def load_engagements():
    path = '../../../../data/2-misinformation/newspapers/1-engagements/1-scrapes/'
    data = []
    for i in listdir(path):
        try:
            data.append(pd.read_json(path + i))
        except:
            print(i)
    data = pd.concat(data)
    return(data)

In [227]:
def obtain_missing_desinformations():
    misinformations = load_misinformation()
    engagements = load_engagements()
    
    missinf = misinformations[~misinformations['id_desinformacion'].isin(engagements.id_desinformacion)]
    return(missinf)

In [251]:
def clean_engagements():
    engagements = load_engagements()
    misinformations = load_misinformation()
    
    engagements['date_publication'] = engagements['date'].apply(lambda x: x.date())
    engagements = engagements[['id_desinformacion',
                               'id',
                               'date_publication',
                               'history',
                               'postUrl']].rename(columns = {'id':'id_post_desinformacion', 
                                             'postUrl':'link_post_desinformacion'})
    
    all_data = pd.merge(left = engagements, 
            right = misinformations, 
            on = 'id_desinformacion', 
            how = 'left')
    return(all_data)

def add_missing_timesteps(temp):
    
    for i in list(range(-16, 16)):
        if i not in temp.days_since_factcheck.unique():
            row =[None, 
             None,
             None,
             None,
             None,
             None,
             None,
             None,
             None,
             None
            ]
            timestep = list(temp.date_factcheck)[0] + timedelta(i)
            row.append(timestep)
            row.append(list(temp.id_desinformacion)[0])
            row.append(list(temp.date_publication)[0])
            row.append((timestep - list(temp.date_publication)[0]).days)
            row.append(list(temp.date_factcheck)[0])
            row.append(i)
            row.append(list(temp.id_post_desinformacion)[0])
            row.append(list(temp.id_factcheck)[0])
            row.append(list(temp.link_desinformacion)[0])
            row.append(list(temp.facebook_partnership_date)[0])
            row.append(list(temp.organizacion)[0])
            row.append(list(temp.pais)[0])
    
    temp.append(pd.DataFrame([row], columns = list(temp.columns)))
    #temp = temp[temp['days_since_publication'] >= 0]
    return(temp)

## Notebook to obtain growth virality timesteps

In [24]:
missing_data = obtain_missing_desinformations()

.DS_Store
newspapers_link_checker.json


In [25]:
len(missing_data),len(load_misinformation())

(1448, 1983)

## Scrape Missing Data

In [26]:
api_dashboard = 'V6lMawCCbCP2rShqtU3TmYCY3Em1Osd8I2DLYxwr'

In [45]:
posts = []

In [47]:
for i, id_ in tqdm(zip(list(missing_data.link_desinformacion)[1250:], 
                      list(missing_data.id_desinformacion)[1250:]), total = len(list(missing_data.id_desinformacion)[1250:])):
    URL_BASE = "https://api.crowdtangle.com/links"
    PARAMS = {'link': i, 'count': 1000,'token': api_dashboard, 'platforms': 'facebook',
         'includeHistory':'true'}

    r = requests.get(url = URL_BASE, params=PARAMS)
    data = r.json()
    out = data['result']['posts']
    if len(out) >= 1:
        df = pd.DataFrame(out)
        df['id_desinformacion'] = id_
        posts.append(df)
    else:
        pass

100%|██████████| 198/198 [16:46<00:00,  5.08s/it] 


In [48]:
df_out = pd.concat(posts)

In [62]:
df_out = df_out.reset_index().drop('index', axis = 1)

In [63]:
df_out.to_json('../../../../data/2-misinformation/newspapers/1-engagements/1-scrapes/missing_misinformations.json')

## Load Again Engagements

In [64]:
missing_data = obtain_missing_desinformations()

.DS_Store
newspapers_link_checker.json


In [67]:
len(missing_data)

456

In [68]:
misinformations = load_misinformation()
engagements = load_engagements()

.DS_Store
newspapers_link_checker.json


In [278]:
df_all = []

In [279]:
engagements = engagements.reset_index().drop('index', axis = 1)

for i in tqdm(range(0, 100)):
    post = engagements.loc[i]
    if pd.isna(post['date_factcheck_final']):
        pass
    else:

        df_history = pd.DataFrame(post['history'])
        df_history['date_timestep'] = df_history['date'].apply(lambda x: datetime.strptime(x.split(' ')[0], '%Y-%m-%d').date())
        df_history = df_history.reset_index().drop('index', axis = 1)

        df_eng_post = []
        for h in range(0, len(df_history)):
            columns = list(df_history.loc[h]['actual'].keys())
            values = list(df_history.loc[h]['actual'].values())
            df_temp = pd.DataFrame([values], columns = columns)
            df_temp['date_timestep'] = df_history.loc[h]['date_timestep']
            df_temp['id_desinformacion'] = engagements.loc[i]['id_desinformacion']
            df_temp = df_temp.drop_duplicates('date_timestep', keep = 'first')
            df_temp['date_publication'] = post['date_publication']
            df_temp['days_since_publication'] = df_temp.apply(lambda x: (x['date_timestep'] - x['date_publication']).days, axis = 1)
            df_temp['date_factcheck'] = datetime.strptime(post['date_factcheck_final'], '%Y-%m-%d').date()
            df_temp['days_since_factcheck'] = df_temp.apply(lambda x: (x['date_timestep'] - x['date_factcheck']).days, axis = 1)
            df_temp['id_post_desinformacion'] = post['id_post_desinformacion']
            df_temp['id_factcheck'] = post['id_factcheck']
            df_temp['link_desinformacion'] = post['link_desinformacion']
            df_temp['facebook_partnership_date'] = post['facebook_partnership_date']
            df_temp['organizacion'] = post['organizacion']
            df_temp['pais'] = post['pais']
            df_eng_post.append(df_temp)
        temp = pd.concat(df_eng_post).drop_duplicates('days_since_publication', keep = 'first')
        panel = add_missing_timesteps(temp)
        df_all.append(panel)

100%|██████████| 100/100 [00:34<00:00,  2.94it/s]


In [280]:
df_final_aggregated = pd.concat(df_all)

In [287]:
add_missing_timesteps(temp)

Unnamed: 0,likeCount,shareCount,commentCount,loveCount,wowCount,hahaCount,sadCount,angryCount,thankfulCount,careCount,...,date_publication,days_since_publication,date_factcheck,days_since_factcheck,id_post_desinformacion,id_factcheck,link_desinformacion,facebook_partnership_date,organizacion,pais
0,18,6,6,1,0,0,0,2,0,0,...,2017-12-15,1055,2010-01-30,3931,10148087|1893788223974284,31c6ea36-ad57-377e-8e4d-3d7b1d63c2ee,http://www.placasrojas.tv/2417914-700-ooo-legi...,2018-05-01,chequeado,argentina


In [288]:
    missing_rows = []
    for i in list(range(-16, 16)):
        if i not in temp.days_since_factcheck.unique():
            row =[None, 
             None,
             None,
             None,
             None,
             None,
             None,
             None,
             None,
             None]
            timestep = list(temp.date_factcheck)[0] + timedelta(i)
            row.append(timestep)
            row.append(list(temp.id_desinformacion)[0])
            row.append(list(temp.date_publication)[0])
            row.append((timestep - list(temp.date_publication)[0]).days)
            row.append(list(temp.date_factcheck)[0])
            row.append(i)
            row.append(list(temp.id_post_desinformacion)[0])
            row.append(list(temp.id_factcheck)[0])
            row.append(list(temp.link_desinformacion)[0])
            row.append(list(temp.facebook_partnership_date)[0])
            row.append(list(temp.organizacion)[0])
            row.append(list(temp.pais)[0])
            m_row = pd.DataFrame([row], columns = list(temp.columns))
            missing_rows.append(m_row)

In [292]:
temp_clean = pd.concat([temp, pd.concat(missing_rows)])
    

In [301]:
add_missing_timesteps(temp)

Unnamed: 0,likeCount,shareCount,commentCount,loveCount,wowCount,hahaCount,sadCount,angryCount,thankfulCount,careCount,...,date_publication,days_since_publication,date_factcheck,days_since_factcheck,id_post_desinformacion,id_factcheck,link_desinformacion,facebook_partnership_date,organizacion,pais
0,18,6,6,1,0,0,0,2,0,0,...,2017-12-15,1055,2010-01-30,3931,10148087|1893788223974284,31c6ea36-ad57-377e-8e4d-3d7b1d63c2ee,http://www.placasrojas.tv/2417914-700-ooo-legi...,2018-05-01,chequeado,argentina


In [300]:
def add_missing_timesteps(temp):
    missing_rows = []
    for i in list(range(-16, 16)):
        if i not in temp.days_since_factcheck.unique():
            row =[None, 
             None,
             None,
             None,
             None,
             None,
             None,
             None,
             None,
             None]
            timestep = list(temp.date_factcheck)[0] + timedelta(i)
            row.append(timestep)
            row.append(list(temp.id_desinformacion)[0])
            row.append(list(temp.date_publication)[0])
            row.append((timestep - list(temp.date_publication)[0]).days)
            row.append(list(temp.date_factcheck)[0])
            row.append(i)
            row.append(list(temp.id_post_desinformacion)[0])
            row.append(list(temp.id_factcheck)[0])
            row.append(list(temp.link_desinformacion)[0])
            row.append(list(temp.facebook_partnership_date)[0])
            row.append(list(temp.organizacion)[0])
            row.append(list(temp.pais)[0])
            m_row = pd.DataFrame([row], columns = list(temp.columns))
            missing_rows.append(m_row)
    temp_clean = pd.concat([temp, pd.concat(missing_rows)])
    temp_clean = temp_clean[temp_clean['days_since_publication'] >= 0]
    return(temp_clean)

In [223]:
import pyreadr
pyreadr.write_rdata("/Users/cblanesg/misinformation_socialmedia/data/2-misinformation/newspapers/3-clean_misinformation/newspapers.RData", df_final_aggregated, df_name="misinformation_newspapers_panel")