In [1]:
import pandas as pd
import numpy as np
import json
import os
import time
import math
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import configs

In [None]:
# settings from configs.py file

dataset = configs.data_source # 'politifact' or 'gossipcop'
label_or_class = configs.fake_or_real # 'fake' or 'real'

hours = configs.detection_deadline # detection time in hours

# Analysing the lengthes of cascades (or the number of retweets for each tweet)

In [26]:
dataset_name = '{}_{}_counting_{}hours.json'.format(dataset, label_or_class, hours)
df = pd.read_json(dataset_name, orient ='table', compression = 'infer')

In [27]:
df.shape

(372516, 15)

In [28]:
df['count'] = df.groupby(['id_news','id_str'])['re_id_str'].transform('count')

In [29]:
df.describe()

Unnamed: 0,re_retweet_count,re_favorite_count,new_re_created_at,diff_t,diff_r,diff,count
count,34128.0,34128.0,372516.0,372516.0,372516.0,372516.0,372516.0
mean,48.930702,0.0,138350600.0,3378.716616,-1370335000.0,-1370339000.0,2.266034
std,47.812826,0.0,435678400.0,3142.038968,435398900.0,435399400.0,8.91482
min,1.0,0.0,-100.0,0.0,-1543719000.0,-1543728000.0,0.0
25%,15.0,0.0,-100.0,668.0,-1517537000.0,-1517541000.0,0.0
50%,34.0,0.0,-100.0,2296.0,-1506050000.0,-1506056000.0,0.0
75%,70.0,0.0,-100.0,5713.0,-1495586000.0,-1495586000.0,0.0
max,2523.0,0.0,1543725000.0,10799.0,10799.0,10796.0,81.0


In [30]:
df.shape

(372516, 16)

In [31]:
# The number of all tweets for all news in x hours
df.drop(['created_at', 're_created_at', 're_id_str', 're_text', 're_user_id_str',
        're_retweet_count', 're_favorite_count', 'new_created_at', 'new_re_created_at', 
        'zero_time', 'diff_t', 'diff_r', 'diff'], axis=1).drop_duplicates().describe()

Unnamed: 0,count
count,338388.0
mean,0.100855
std,1.543877
min,0.0
25%,0.0
50%,0.0
75%,0.0
max,81.0


In [33]:
# The number of all tweets for all news in x hours
df.drop(['created_at', 're_created_at', 're_id_str', 're_text', 're_user_id_str',
        're_retweet_count', 're_favorite_count', 'new_created_at', 'new_re_created_at', 
        'zero_time', 'diff_t', 'diff_r', 'diff'], axis=1).drop_duplicates().shape

(338388, 3)

In [35]:
# The statics below are for the first x hours
for i in range(20):
    print('The percentage of tweets with {} retweets \
    is {}'.format(i,len(df[df['count'] == i])/len(df) * 100))

The percentage of tweets with 0 retweets     is 89.5102492241944
The percentage of tweets with 1 retweets     is 1.239678295697366
The percentage of tweets with 2 retweets     is 0.3615952066488419
The percentage of tweets with 3 retweets     is 0.26629728655950347
The percentage of tweets with 4 retweets     is 0.2536803788293657
The percentage of tweets with 5 retweets     is 0.29958444737944145
The percentage of tweets with 6 retweets     is 0.25555949274662026
The percentage of tweets with 7 retweets     is 0.2727399628472334
The percentage of tweets with 8 retweets     is 0.29716844377154267
The percentage of tweets with 9 retweets     is 0.30334267521395053
The percentage of tweets with 10 retweets     is 0.2657603968688593
The percentage of tweets with 11 retweets     is 0.22227233192668233
The percentage of tweets with 12 retweets     is 0.25824394119984106
The percentage of tweets with 13 retweets     is 0.2217354422360382
The percentage of tweets with 14 retweets     is 0.257

In [37]:
# By droping duplicates, we consider the retweet counts of each tweet once.
# The statics below are for the first 4 hours
df_counts = df.drop(['created_at', 're_created_at', 're_id_str', 're_text', 
                     're_user_id_str', 're_retweet_count', 're_favorite_count', 
                     'new_created_at', 'new_re_created_at', 'zero_time', 'diff_t', 
                     'diff_r', 'diff'], axis=1).drop_duplicates()

for i in range(20):
    print('The percentage of tweets with {} retweets \
    is {}'.format(i,len(df_counts[df_counts['count'] == i])/len(df_counts) * 100))
    
print('The percentage of tweets with more than 20 retweets \
    is {}'.format(len(df_counts[df_counts['count'] >= 20])/len(df_counts) * 100))

The percentage of tweets with 0 retweets     is 98.53777320708772
The percentage of tweets with 1 retweets     is 0.6823528021088219
The percentage of tweets with 2 retweets     is 0.132687920375427
The percentage of tweets with 3 retweets     is 0.07328865089778597
The percentage of tweets with 4 retweets     is 0.0558530444341998
The percentage of tweets with 5 retweets     is 0.05496648817333948
The percentage of tweets with 6 retweets     is 0.04019055049233424
The percentage of tweets with 7 retweets     is 0.0375308817097533
The percentage of tweets with 8 retweets     is 0.03634880669527288
The percentage of tweets with 9 retweets     is 0.03339361915907183
The percentage of tweets with 10 retweets     is 0.026596687825809425
The percentage of tweets with 11 retweets     is 0.020390793999787226
The percentage of tweets with 12 retweets     is 0.02186838776788775
The percentage of tweets with 13 retweets     is 0.01743560646358618
The percentage of tweets with 14 retweets     is 

<strong> The length of each cascade </strong>

According to the percentages above, the length of each cascade can be less than 7, and we need to consider the results of the model for each length in range [1,7].

# Creating a dataframe of the number of cascades with different lengths and then converting it into a dictionary
<strong>'{}_{}_lens_counts_{}hours.json'.format(dataset, label, hours)</strong>

<strong>'{}_{}_lens_counts_dic_11_{}hours.json'.format(dataset, label, hours)</strong>

In [39]:
df_counts.shape

(372059, 9)

In [40]:
news_list = list(df_counts['id_news'].drop_duplicates())

In [41]:
len(news_list)

14119

In [42]:
df_counts.head(1)

Unnamed: 0,id_news,id_str,new_created_at,new_re_created_at,zero_time,diff_t,diff_r,diff,count
0,gossipcop-846866,858030987837673472,2017-04-28 21:51:43,-100.0,2017-04-28 21:39:07,756.0,-1493416000.0,-1493416000.0,0


In [43]:
df_lens = pd.DataFrame(columns=['0','1','2','3','4','5','6','7','8','9','10','11','12',
                                '13','14','15','16','17','18','19','20','other'])

for news in news_list:
    # A new recored for current news
    new_record = {}
    new_record['id_news'] = news
    
    # A new dataframe consists of the only records belong to current news
    df_t = df_counts[df_counts['id_news'] == news]
    
    for i in range(21):
        this_len_count = df_t[df_t['count'] == i].shape[0]
        new_record[str(i)] = this_len_count
        
    this_len_count = df_t[df_t['count'] > i].shape[0]
    new_record['other'] = this_len_count
    
    row_df = pd.DataFrame([new_record])
    df_lens = pd.concat([df_lens, row_df], ignore_index=True)


In [44]:
df_lens.shape

(14119, 23)

In [45]:
df_lens.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,13,14,15,16,17,18,19,20,other,id_news
count,14119,14119,14119,14119,14119,14119,14119,14119,14119,14119,...,14119,14119,14119,14119,14119,14119,14119,14119,14119,14119
unique,148,20,12,9,6,7,4,4,6,4,...,4,4,3,3,5,4,5,3,80,14119
top,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,gossipcop-846866
freq,1451,12675,13776,13918,13952,13945,13987,13996,14002,14008,...,14061,14057,14077,14069,14070,14077,14089,14088,13704,1


In [46]:
# storing data in JSON format
dataset_name = '{}_{}_lens_counts_{}hours.json'.format(dataset, label_or_class, hours)
df_lens.to_json(dataset_name, orient = 'table', index=False, compression = 'infer')

## Creating a new dataset of the previous one by aggregating some last features.

<strong>'{}_{}_lens_counts_dic_11_{}hours.json'.format(dataset, label, hours)</strong>

In [47]:
# Reading the above file
dataset_name = '{}_{}_lens_counts_{}hours.json'.format(dataset, label_or_class, hours)
df_lens = pd.read_json(dataset_name, orient ='table', compression = 'infer')

In [48]:
df_lens.shape

(14119, 23)

In [49]:
df_lens.head(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,13,14,15,16,17,18,19,20,other,id_news
0,26,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,gossipcop-846866
1,19,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,gossipcop-873992


In [50]:
news_ids = list(df_lens['id_news'])
dic_lens = {}

for news in news_ids:
    dic_lens[news] = []
    news_lens = df_lens[df_lens['id_news'] == news]
    for i in range(10):
        x = news_lens.iloc[0][i]
        dic_lens[news].append(x)
        
    other = 0
    for i in range(10, 21):
        x = news_lens.iloc[0][i]
        other += x
        
    x = news_lens.iloc[0]['other']
    other += x
    dic_lens[news].append(other)

In [51]:
len(dic_lens)

14119

In [52]:
len(dic_lens[news_list[0]])

11

In [53]:
# storing data in JSON format
dic_lens_json = json.dumps(dic_lens)
file_name = '{}_{}_lens_counts_dic_11_{}hours.json'.format(dataset, label_or_class, hours)
f = open(file_name,"w")
f.write(dic_lens_json)
f.close()

# Analysing the number of cascades (or the number of tweet)

In [54]:
df_counts.shape

(372059, 9)

In [55]:
df_casecad_counts = df_counts.drop(['count'], axis=1).drop_duplicates()

In [56]:
df_casecad_counts['cascade_count'] = df_casecad_counts.groupby(['id_news'])['id_str'].transform('count')

In [60]:
df_casecad_counts.describe()

Unnamed: 0,new_re_created_at,diff_t,diff_r,diff,cascade_count
count,372059.0,372059.0,372059.0,372059.0,372059.0
mean,136671100.0,3379.740538,-1372018000.0,-1372022000.0,58.943461
std,433300300.0,3142.698076,433007800.0,433008200.0,94.544314
min,-100.0,0.0,-1543719000.0,-1543728000.0,1.0
25%,-100.0,668.0,-1517548000.0,-1517551000.0,24.0
50%,-100.0,2296.0,-1506093000.0,-1506094000.0,40.0
75%,-100.0,5717.0,-1495602000.0,-1495604000.0,54.0
max,1543725000.0,10799.0,10799.0,10796.0,1029.0


In [61]:
# The mean of Q1:Q3 cascade counts has less std than the total mean
df_casecad_counts[df_casecad_counts['cascade_count'].between(27,318)].describe()

Unnamed: 0,new_re_created_at,diff_t,diff_r,diff,cascade_count
count,258862.0,258862.0,258862.0,258862.0,258862.0
mean,163709000.0,3703.537,-1343315000.0,-1343319000.0,57.06986
std,469526600.0,3204.68588,468574800.0,468575400.0,38.619302
min,-100.0,0.0,-1532996000.0,-1533005000.0,27.0
25%,-100.0,841.0,-1515038000.0,-1515043000.0,38.0
50%,-100.0,2794.0,-1502509000.0,-1502517000.0,45.0
75%,-100.0,6261.75,-1494699000.0,-1494705000.0,60.0
max,1532967000.0,10799.0,10799.0,10796.0,291.0


<strong> The number of cascades per news </strong>

According to the percentages above, the average value for cascades count is around 3300, which means each piece of news is <strong>tweeted</strong> (only tweeted without counting retweets) almost 3300 times.

<strong> The number of cascades per news in the first 4 hours </strong>
According to the percentages above, the average value for cascades count is around 124, which means each piece of news is <strong>tweeted</strong> (only tweeted without counting retweets) almost 124 times.