# Creating a table of news, corresponding tweets, and their re-tweets

This table contains only IDs (news, tweet, and retweet) and is used to build the propagation network of each news article.

In [1]:
import pandas as pd
import numpy as np
import json
import os
import time
import math
import configs

In [None]:
# Run settings are read from the configs.py file.

dataset = configs.data_source  # 'politifact' or 'gossipcop'
label = configs.fake_or_real  # 'fake' or 'real'

# Directory containing the downloaded dataset and Twitter files
# (layout: <dic_source>/<dataset>/<label>/).
source = f'{configs.dic_source}/{dataset}/{label}/'

In [2]:
# Keep only the news folders that contain all three required entries:
# the article content file, a 'tweets' folder and a 'retweets' folder.
news_list = [
    entry
    for entry in os.listdir(source)
    if all(
        os.path.exists(os.path.join(source, entry, required))
        for required in ('news content.json', 'tweets', 'retweets')
    )
]

In [24]:
# Build the news -> tweet -> retweet table.
#
# Every tweet contributes one row holding only the tweet fields, followed by
# one row per retweet that repeats the tweet fields and fills the 're_*'
# columns. A tweet is therefore listed even when it has no retweets.
df_columns = ['id_news', 'created_at', 'id_str', 're_created_at',
              're_id_str', 're_text', 're_user_id_str',
              're_retweet_count', 're_favorite_count']

# Rows are collected in a plain list and converted to a DataFrame once at the
# end: calling pd.concat for every single row copies the whole frame each time,
# which makes the loop quadratic in the number of rows.
records = []

for news in news_list:
    tweets_dir = os.path.join(source, news, 'tweets')
    retweets_dir = os.path.join(source, news, 'retweets')

    for tweet in os.listdir(tweets_dir):
        # Extract the required features of each tweet. The context manager
        # closes the file even if parsing or a key lookup fails.
        # JSON files are read as UTF-8 (the JSON default) instead of the
        # platform-dependent locale encoding.
        with open(os.path.join(tweets_dir, tweet), encoding='utf-8') as f:
            data = json.load(f)

        tweet_record = {
            'id_news': news,
            'created_at': data['created_at'],
            'id_str': data['id_str'],
        }

        # Append the tweet itself, no matter whether it has retweets or how
        # many it has.
        records.append(tweet_record)

        # The retweets file carries the same file name as the tweet file.
        # (The original author noted that this file was always present in
        # their data; the check is kept as a guard.)
        retweets_path = os.path.join(retweets_dir, tweet)
        if not os.path.exists(retweets_path):
            continue

        with open(retweets_path, encoding='utf-8') as f:
            data_retweet = json.load(f)

        # Append one row per retweet: the tweet fields plus the retweet fields.
        # A fresh dict is built for every row so rows never share state.
        for retweet in data_retweet['retweets']:
            records.append({
                **tweet_record,
                're_created_at': retweet['created_at'],
                're_id_str': retweet['id_str'],
                're_text': retweet['text'],
                're_user_id_str': retweet['user']['id_str'],
                're_retweet_count': retweet['retweet_count'],
                're_favorite_count': retweet['favorite_count'],
            })

# dtype=object keeps every value exactly as it was read from JSON (e.g. counts
# stay integers even though tweet-only rows have missing 're_*' values).
df = pd.DataFrame(records, columns=df_columns, dtype=object)

### Storing `df` in a JSON file. The file name is built from the dataset and label; change it below if needed.

In [25]:
# Write the table to '<dataset>_<label>_counting.json' in pandas' 'table'
# JSON layout, leaving the row index out of the file.
dataset_name = f'{dataset}_{label}_counting.json'
df.to_json(
    dataset_name,
    orient='table',
    index=False,
    compression='infer',
)