In [1]:
import pandas as pd
import numpy as np
import json
import os
import time
import math
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import configs

In [None]:
# settings from configs.py file

dataset = configs.data_source # 'politifact' or 'gossipcop'
label_or_class = configs.fake_or_real # 'fake' or 'real'

hours = configs.detection_deadline # detection time in hours

# This notebook creates, for each piece of news, a propagation tree with all
# cascades and without cutting their length. The result is written to:
#   "{}_{}_propagation_{}hours_free_len_shortest_first.json".format(dataset, label_or_class, hours)

In [64]:
# Load the per-retweet table for the configured dataset / label / deadline.
counting_template = '{}_{}_counting_{}hours.json'
dataset_name = counting_template.format(dataset, label_or_class, hours)
df = pd.read_json(
    dataset_name,
    orient='table',
    compression='infer',
)

In [65]:
# For every (news, tweet) pair, count its non-null retweet ids and broadcast
# that count back onto each row of the pair.
retweet_ids_by_tweet = df.groupby(['id_news', 'id_str'])['re_id_str']
df['count'] = retweet_ids_by_tweet.transform('count')

In [66]:
# (rows, columns) of df, including the 'count' column added above
df.shape

(372516, 16)

In [67]:
# t1 = 'Wed Dec 07 22:07:01 +0000 2016'
# t2 = 'Wed Dec 07 22:07:02 +0000 2016'
# t3 = '2016-12-17 14:51:06+00:00'
# t4 = '2016-12-17 14:51:07+00:00'
# print(time.mktime(time.strptime(t1, "%a %b %d %H:%M:%S +0000 %Y")))
# print(time.mktime(time.strptime(t2, "%a %b %d %H:%M:%S +0000 %Y")))
# print(time.mktime(time.strptime(t3, "%Y-%m-%d %H:%M:%S+00:00")))
# print(time.mktime(time.strptime(t4, "%Y-%m-%d %H:%M:%S+00:00")))

In [68]:
# Shape of the de-duplicated (news, tweet, retweet-count) table.
# The three needed columns are selected explicitly instead of dropping every
# other column by name, so a new upstream column cannot leak into the result.
df[['id_news', 'id_str', 'count']].drop_duplicates().shape

(338388, 3)

In [69]:
# One row per (news, tweet) pair with its retweet count.
# The three needed columns are selected explicitly instead of dropping every
# other column by name, so a new upstream column cannot leak into df_counts
# and defeat drop_duplicates().
df_counts = df[['id_news', 'id_str', 'count']].drop_duplicates()

In [70]:
# Preview the first two rows of df_counts
df_counts.head(2)

Unnamed: 0,id_news,id_str,count
0,gossipcop-846866,858030987837673472,0
1,gossipcop-846866,858032064276377600,0


In [71]:
# (rows, columns) of df_counts
df_counts.shape

(338388, 3)

In [72]:
# Distinct news ids, in order of first appearance in df_counts.
unique_news = df_counts['id_news'].drop_duplicates()
news_ids = list(unique_news)

In [73]:
# Number of distinct id_news values
len(news_ids)

14119

In [75]:
# Build one padded propagation tree per news item.
# Nothing is truncated: cascades longer than cascade_len and trees with more
# than cascade_num cascades are kept whole; only shorter ones are padded.
propagation_trees = {}
cascade_len = 16   # 2^4: minimum length of every cascade after padding
cascade_num = 256  # 2^8: minimum number of cascades per tree after padding

# Group both frames once up front instead of re-scanning them with boolean
# masks for every news item and every tweet. groupby keeps the original row
# order inside each group, so every per-group sort sees the same rows, in the
# same order, as the equivalent boolean-mask selection.
retweets_by_tweet = {
    key: list(group.sort_values(['diff'], ascending=True)['re_id_str'])
    for key, group in df.groupby(['id_news', 'id_str'], sort=False)
}
# ascending=True means the shortest branches (fewest retweets) come first
tweets_by_news = {
    key: list(group.sort_values(['count'], ascending=True)['id_str'])
    for key, group in df_counts.groupby('id_news', sort=False)
}

for news_id in news_ids:

    propagation_trees[news_id] = []
    t_ids = tweets_by_news.get(news_id, [])

    for t_id in t_ids:

        # A cascade is the source tweet followed by its retweets ordered by 'diff'.
        # NOTE(review): tweets with count == 0 presumably still have a row whose
        # re_id_str is missing; that missing value is kept in the cascade as-is
        # (it is not replaced by 'pad') -- confirm the consumer of the JSON
        # expects this.
        r_ids = [t_id] + retweets_by_tweet.get((news_id, t_id), [])

        # add 'pad' ids to the short cascades until their length becomes cascade_len
        # (a non-positive multiplier yields an empty list, so long cascades are untouched)
        r_ids += ['pad'] * (cascade_len - len(r_ids))

        propagation_trees[news_id].append(r_ids)

    # add ['pad', ..., 'pad'] cascades to the tree until its cascade number
    # becomes cascade_num; each pad cascade is its own list object so the rows
    # of a tree never alias each other
    cascade_pad = cascade_num - len(propagation_trees[news_id])
    propagation_trees[news_id] += [['pad'] * cascade_len for _ in range(cascade_pad)]


In [76]:
# save the propagation trees to a json file
propagation_trees_json = json.dumps(propagation_trees)
file_name = "{}_{}_propagation_{}hours_free_len_shortest_first.json".format(dataset, label_or_class, hours)
# the context manager closes the file even if the write fails part-way
with open(file_name, "w") as f:
    f.write(propagation_trees_json)