In [136]:
import pandas as pd
import json
import numpy as np
from os import listdir
import requests
from tqdm import tqdm
from datetime import datetime
from datetime import timedelta

## Functions

In [154]:
def load_misinformation():
    misinformations = pd.read_excel('/Users/cblanesg/misinformation_socialmedia/data/2-misinformation/newspapers/0-misinformation/misinformation_newspapers.xlsx').drop('Unnamed: 0', axis = 1)
    return(misinformations)

In [155]:
def load_engagements():
    path = '../../../../data/2-misinformation/newspapers/1-engagements/1-scrapes/'
    data = []
    for i in listdir(path):
        try:
            data.append(pd.read_json(path + i))
        except:
            print(i)
    data = pd.concat(data)
    return(data)

In [156]:
def obtain_missing_desinformations():
    misinformations = load_misinformation()
    engagements = load_engagements()
    
    missinf = misinformations[~misinformations['id_desinformacion'].isin(engagements.id_desinformacion)]
    return(missinf)

In [157]:
def clean_engagements():
    engagements = load_engagements()
    misinformations = load_misinformation()
    
    engagements['date_publication'] = engagements['date'].apply(lambda x: x.date())
    engagements = engagements[['id_desinformacion',
                               'id',
                               'date_publication',
                               'history',
                               'postUrl']].rename(columns = {'id':'id_post_desinformacion', 
                                             'postUrl':'link_post_desinformacion'})
    
    all_data = pd.merge(left = engagements, 
            right = misinformations, 
            on = 'id_desinformacion', 
            how = 'left')
    return(all_data)

def add_missing_timesteps(temp):
    missing_rows = []
    for i in list(range(-16, 16)):
        if i not in temp.days_since_factcheck.unique():
            row =[np.nan, 
             np.nan,
             np.nan,
             np.nan,
             np.nan,
             np.nan,
             np.nan,
             np.nan,
             np.nan,
             np.nan]
            timestep = list(temp.date_factcheck)[0] + timedelta(i)
            row.append(timestep)
            row.append(list(temp.id_desinformacion)[0])
            row.append(list(temp.date_publication)[0])
            row.append((timestep - list(temp.date_publication)[0]).days)
            row.append(list(temp.date_factcheck)[0])
            row.append(i)
            row.append(list(temp.id_post_desinformacion)[0])
            row.append(list(temp.id_factcheck)[0])
            row.append(list(temp.link_desinformacion)[0])
            row.append(list(temp.facebook_partnership_date)[0])
            row.append(list(temp.organizacion)[0])
            row.append(list(temp.pais)[0])
            m_row = pd.DataFrame([row], columns = list(temp.columns))
            missing_rows.append(m_row)
    temp_clean = pd.concat([temp, pd.concat(missing_rows)])
    temp_clean = temp_clean[temp_clean['days_since_publication'] >= 0]
    return(temp_clean)

## Notebook to obtain growth virality timesteps

In [141]:
missing_data = obtain_missing_desinformations()

.DS_Store
newspapers_link_checker.json


In [142]:
len(missing_data),len(load_misinformation())

(456, 1983)

## Scrape Missing Data

In [26]:
api_dashboard = 'V6lMawCCbCP2rShqtU3TmYCY3Em1Osd8I2DLYxwr'

In [45]:
posts = []

In [47]:
for i, id_ in tqdm(zip(list(missing_data.link_desinformacion)[1250:], 
                      list(missing_data.id_desinformacion)[1250:]), total = len(list(missing_data.id_desinformacion)[1250:])):
    URL_BASE = "https://api.crowdtangle.com/links"
    PARAMS = {'link': i, 'count': 1000,'token': api_dashboard, 'platforms': 'facebook',
         'includeHistory':'true'}

    r = requests.get(url = URL_BASE, params=PARAMS)
    data = r.json()
    out = data['result']['posts']
    if len(out) >= 1:
        df = pd.DataFrame(out)
        df['id_desinformacion'] = id_
        posts.append(df)
    else:
        pass

100%|██████████| 198/198 [16:46<00:00,  5.08s/it] 


In [48]:
df_out = pd.concat(posts)

In [62]:
df_out = df_out.reset_index().drop('index', axis = 1)

In [63]:
df_out.to_json('../../../../data/2-misinformation/newspapers/1-engagements/1-scrapes/missing_misinformations.json')

## Load Again Engagements

In [159]:
engagements = clean_engagements()

.DS_Store
newspapers_link_checker.json


In [160]:
df_all = []

In [150]:
    for i in tqdm(range(0, len(engagements))):
    
        post = engagements.loc[i]
        if pd.isna(post['date_factcheck_final']):
            pass
        else:

            df_history = pd.DataFrame(post['history'])
            df_history['date_timestep'] = df_history['date'].apply(lambda x: datetime.strptime(x.split(' ')[0], '%Y-%m-%d').date())
            df_history = df_history.reset_index().drop('index', axis = 1)

            df_eng_post = []
            for h in range(0, len(df_history)):
                columns = list(df_history.loc[h]['actual'].keys())
                values = list(df_history.loc[h]['actual'].values())
                df_temp = pd.DataFrame([values], columns = columns)
                df_temp['date_timestep'] = df_history.loc[h]['date_timestep']
                df_temp['id_desinformacion'] = engagements.loc[i]['id_desinformacion']
                df_temp = df_temp.drop_duplicates('date_timestep', keep = 'first')
                df_temp['date_publication'] = post['date_publication']
                df_temp['days_since_publication'] = df_temp.apply(lambda x: (x['date_timestep'] - x['date_publication']).days, axis = 1)
                df_temp['date_factcheck'] = datetime.strptime(post['date_factcheck_final'], '%Y-%m-%d').date()
                df_temp['days_since_factcheck'] = df_temp.apply(lambda x: (x['date_timestep'] - x['date_factcheck']).days, axis = 1)
                df_temp['id_post_desinformacion'] = post['id_post_desinformacion']
                df_temp['id_factcheck'] = post['id_factcheck']
                df_temp['link_desinformacion'] = post['link_desinformacion']
                df_temp['facebook_partnership_date'] = post['facebook_partnership_date']
                df_temp['organizacion'] = post['organizacion']
                df_temp['pais'] = post['pais']
                df_eng_post.append(df_temp)
            temp = pd.concat(df_eng_post).drop_duplicates('days_since_publication', keep = 'first')
            panel = add_missing_timesteps(temp)
            df_all.append(panel)

  0%|          | 0/68933 [00:00<?, ?it/s]


KeyError: 'date_factcheck_final'

In [161]:
engagements = engagements.reset_index().drop('index', axis = 1)

for i in tqdm(range(0, len(engagements))):
    try:
        post = engagements.loc[i]
        if pd.isna(post['date_factcheck_final']):
            pass
        else:

            df_history = pd.DataFrame(post['history'])
            df_history['date_timestep'] = df_history['date'].apply(lambda x: datetime.strptime(x.split(' ')[0], '%Y-%m-%d').date())
            df_history = df_history.reset_index().drop('index', axis = 1)

            df_eng_post = []
            for h in range(0, len(df_history)):
                columns = list(df_history.loc[h]['actual'].keys())
                values = list(df_history.loc[h]['actual'].values())
                df_temp = pd.DataFrame([values], columns = columns)
                df_temp['date_timestep'] = df_history.loc[h]['date_timestep']
                df_temp['id_desinformacion'] = engagements.loc[i]['id_desinformacion']
                df_temp = df_temp.drop_duplicates('date_timestep', keep = 'first')
                df_temp['date_publication'] = post['date_publication']
                df_temp['days_since_publication'] = df_temp.apply(lambda x: (x['date_timestep'] - x['date_publication']).days, axis = 1)
                df_temp['date_factcheck'] = datetime.strptime(post['date_factcheck_final'], '%Y-%m-%d').date()
                df_temp['days_since_factcheck'] = df_temp.apply(lambda x: (x['date_timestep'] - x['date_factcheck']).days, axis = 1)
                df_temp['id_post_desinformacion'] = post['id_post_desinformacion']
                df_temp['id_factcheck'] = post['id_factcheck']
                df_temp['link_desinformacion'] = post['link_desinformacion']
                df_temp['facebook_partnership_date'] = post['facebook_partnership_date']
                df_temp['organizacion'] = post['organizacion']
                df_temp['pais'] = post['pais']
                df_eng_post.append(df_temp)
            temp = pd.concat(df_eng_post).drop_duplicates('days_since_publication', keep = 'first')
            panel = add_missing_timesteps(temp)
            df_all.append(panel)
    except:
        continue

100%|██████████| 69384/69384 [5:22:18<00:00,  3.59it/s]   


In [163]:
df_final_aggregated = pd.concat(df_all)

In [164]:
import pyreadr
pyreadr.write_rdata("/Users/cblanesg/misinformation_socialmedia/data/2-misinformation/newspapers/3-clean_misinformation/newspapers.RData", df_final_aggregated, df_name="misinformation_newspapers_panel")

## Interpolate data

In [165]:
from tqdm import tqdm
tqdm.pandas()

In [6]:
import pyreadr

result = pyreadr.read_r('/Users/cblanesg/misinformation_socialmedia/data/2-misinformation/newspapers/3-clean_misinformation/newspapers.RData')

# done! let's see what we got
print(result.keys()) # let's check what objects we got

odict_keys(['misinformation_newspapers_panel'])


In [60]:
misinformation_newspapers_panel = result["misinformation_newspapers_panel"] # extract the pandas

In [166]:
misinformation_newspapers_pane = df_final_aggregated

In [168]:
misinformation_newspapers_panel  = misinformation_newspapers_panel.sort_values(['id_post_desinformacion',
                                            'days_since_publication']).reset_index()#.drop('rownames', axis = 1)

In [169]:
misinformation_newspapers_panel['loveCount'] = pd.to_numeric(misinformation_newspapers_panel.loveCount)
misinformation_newspapers_panel['hahaCount'] = pd.to_numeric(misinformation_newspapers_panel.hahaCount)
misinformation_newspapers_panel['wowCount'] = pd.to_numeric(misinformation_newspapers_panel.wowCount)
misinformation_newspapers_panel['sadCount'] = pd.to_numeric(misinformation_newspapers_panel.sadCount)
misinformation_newspapers_panel['angryCount'] = pd.to_numeric(misinformation_newspapers_panel.angryCount)
misinformation_newspapers_panel['thankfulCount'] = pd.to_numeric(misinformation_newspapers_panel.thankfulCount)
misinformation_newspapers_panel['careCount'] = pd.to_numeric(misinformation_newspapers_panel.careCount)

In [170]:
misinformation_newspapers_panel['reactions'] = misinformation_newspapers_panel.apply(
    lambda x: x['loveCount'] + x['hahaCount'] + x['wowCount'] + x['sadCount'] + x['angryCount'] + 
    x['thankfulCount'] + x['careCount'], axis = 1)

In [171]:
misinformation_newspapers_panel['likes'] = pd.to_numeric(misinformation_newspapers_panel.likeCount)
misinformation_newspapers_panel['shares'] = pd.to_numeric(misinformation_newspapers_panel.shareCount)
misinformation_newspapers_panel['comments'] = pd.to_numeric(misinformation_newspapers_panel.commentCount)

In [172]:
panel_input = misinformation_newspapers_panel[['date_timestep', 'id_desinformacion', 'date_publication',
       'days_since_publication', 'date_factcheck', 'days_since_factcheck',
       'id_post_desinformacion', 'id_factcheck', 'link_desinformacion',
       'facebook_partnership_date', 'organizacion', 'pais', 
                                'likes', 'shares', 'comments', 'reactions']]

In [173]:
panel_input = panel_input.sort_values(['id_post_desinformacion','days_since_publication'])

In [174]:
panel_interpolate = []
for i in tqdm(panel_input.id_post_desinformacion.unique()):
    df = panel_input[panel_input['id_post_desinformacion'] == i].sort_values('days_since_publication')
    df['approx_likes'] = df.likes.interpolate()
    df['approx_shares'] = df.shares.interpolate()
    df['approx_comments'] = df.comments.interpolate()
    df['approx_reactions'] = df.reactions.interpolate()
    df['approx_interactions'] = df.apply(lambda x: x['approx_likes'] + 
                                        x['approx_shares'] + x['approx_comments'] + 
                                        x['approx_reactions'], axis = 1)
    panel_interpolate.append(df)   

100%|██████████| 62755/62755 [1:42:40<00:00, 10.19it/s] 


In [175]:
df_interpolate = pd.concat(panel_interpolate)

In [176]:
df_interpolate = df_interpolate[abs(df_interpolate['days_since_factcheck']) <=15]

In [177]:
import pyreadr

In [178]:
df_interpolate.to_csv('/Users/cblanesg/misinformation_socialmedia/data/4-panel_data/newspapers/panel_interpolate.csv')

### Include Growth

In [179]:
df_interpolate['growth_likes'] = df_interpolate.groupby(['id_post_desinformacion']).approx_likes.pct_change()
df_interpolate['growth_shares'] = df_interpolate.groupby(['id_post_desinformacion']).approx_shares.pct_change()
df_interpolate['growth_reactions'] = df_interpolate.groupby(['id_post_desinformacion']).approx_reactions.pct_change()
df_interpolate['growth_comments'] = df_interpolate.groupby(['id_post_desinformacion']).approx_comments.pct_change()
df_interpolate['growth_interactions'] = df_interpolate.groupby(['id_post_desinformacion']).approx_interactions.pct_change()

In [None]:
'../../../../data/'

In [182]:
pyreadr.write_rdata('../../../../data/4-panel_data/newspaper/panel_interpolate.RData', df_interpolate, df_name="misinformation_newspaper_panel")

KeyError: 0

In [None]:
df_interpolate.to_csv('/Users/cblanesg/misinformation_socialmedia/data/4-panel_data/newspapers/panel_interpolate.csv')