In [1]:
import pandas as pd
import numpy as np
import json
import os
import time
import math
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import configs

In [2]:
# settings from configs.py file

dataset = configs.data_source # 'politifact' or 'gossipcop'
label_or_class = configs.fake_or_real # 'fake' or 'real'

hours = configs.detection_deadline # detection time in hours

# Filtering tweets and retweets created in the first above hours considering the time of first tweet as the starting point
<strong>'{}_{}_counting_{}hours.json'.format(dataset, label, hours)</strong>

In [4]:
# Reading total nodes table to filter
dataset_name = '{}_{}_counting.json'.format(dataset, label_or_class)
df = pd.read_json(dataset_name, orient ='table', compression = 'infer')

In [5]:
df.shape

(818202, 9)

In [6]:
df['id_news'].drop_duplicates().shape

(14119,)

In [7]:
# generating new time field for each tweet
df['new_created_at'] = df.created_at.apply(
    lambda x: time.mktime(time.strptime(str(x), "%Y-%m-%d %H:%M:%S+00:00")))

In [8]:
# generating new time field for each retweet considering tweet time as zero
# if there is no retweets, the value is -100
df['new_re_created_at'] = df.re_created_at.apply(
    lambda x: -100 if pd.isnull(x) else 
    time.mktime(time.strptime(str(x), "%Y-%m-%d %H:%M:%S+00:00")))

In [9]:
df.shape

(818202, 11)

In [10]:
df.head()

Unnamed: 0,id_news,created_at,id_str,re_created_at,re_id_str,re_text,re_user_id_str,re_retweet_count,re_favorite_count,new_created_at,new_re_created_at
0,gossipcop-846866,2017-04-28 18:51:43+00:00,858030987837673472,NaT,,,,,,1493416000.0,-100.0
1,gossipcop-846866,2017-04-29 01:55:55+00:00,858137740566827008,NaT,,,,,,1493442000.0,-100.0
2,gossipcop-846866,2017-04-28 18:56:00+00:00,858032064276377600,NaT,,,,,,1493417000.0,-100.0
3,gossipcop-846866,2017-04-28 18:59:27+00:00,858032934699360256,NaT,,,,,,1493417000.0,-100.0
4,gossipcop-846866,2017-04-28 21:01:43+00:00,858063704407105537,NaT,,,,,,1493424000.0,-100.0


In [11]:
df['zero_time'] = df.groupby(['id_news'])['new_created_at'].transform('min')

In [12]:
df.shape

(818202, 12)

In [13]:
df['diff_t'] = df['new_created_at'] - df['zero_time']

In [14]:
df.shape

(818202, 13)

In [15]:
df['diff_r'] = df['new_re_created_at'] - df['zero_time']

In [16]:
df.shape

(818202, 14)

In [17]:
# The total number of tweets created in first 5 hours after releasing news
df = df[df['diff_t'] < (hours*60*60)]

In [18]:
df.shape

(396650, 14)

In [19]:
# The total number of retweets created in first 4 hours after releasing news
# The field below is -100 for all tweet recoreds
df = df[df['diff_r'] < (hours*60*60)]

In [20]:
df.shape

(372516, 14)

In [21]:
df.drop_duplicates().shape

(372516, 14)

In [22]:
df['diff'] = df['new_re_created_at'] - df['new_created_at']

In [23]:
df.drop_duplicates().shape

(372516, 15)

In [24]:
df['id_news'].drop_duplicates().shape

(14119,)

In [25]:
# storing data in JSON format
dataset_name = '{}_{}_counting_{}hours.json'.format(dataset, label_or_class, hours)
df.to_json(dataset_name, orient = 'table', index=False, compression = 'infer')