# Creating a separate table for each object (tweet, retweet, user)

 
For information about the top-level <i>user</i> field and second-level <i>user</i> field in the <b>retweeted_status</b>, refer to the link below:

https://developer.twitter.com/en/docs/tweets/data-dictionary/overview/user-object.html

In [1]:
import pandas as pd
import numpy as np
import json
import os
import time
import math
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import configs

In [None]:
# Run settings, all read from the project's configs.py module.
#   dataset: 'politifact' or 'gossipcop'
#   label:   'fake' or 'real'
dataset, label = configs.data_source, configs.fake_or_real

# Directory that holds the downloaded dataset and Twitter files for the
# selected dataset/label pair.
source = '{}/{}/{}/'.format(configs.dic_source, dataset, label)

In [2]:
# Keep only the news folders that contain every entry the later cells read:
# the article content file plus the 'tweets' and 'retweets' entries.
required_entries = ('news content.json', 'tweets', 'retweets')
news_list = [
    entry for entry in os.listdir(source)
    if all(os.path.exists(os.path.join(source, entry, name))
           for name in required_entries)
]

## Creating a Dataframe for tweet table
<strong>'{dataset}_{label}_tweets.json'</strong>

In [4]:
def determine_tweet_type(tweet):
    """Classify a tweet dictionary as a reply, quote, retweet or original.

    The checks run in a fixed order and the first match wins:

    1. ``in_reply_to_status_id`` is not ``None``  -> "Reply Tweet"
    2. ``is_quote_status`` is ``True`` and the text does not start with
       "RT" (so a retweet of a quote is not counted as a quote)
                                                  -> "Quote Tweet"
    3. the text starts with "RT" and a ``retweeted_status`` entry exists
                                                  -> "Retweet"
    4. anything else                              -> "Original Tweet"

    Args:
        tweet: Mapping with at least the keys ``in_reply_to_status_id``,
            ``is_quote_status`` and ``text``; ``retweeted_status`` is optional.

    Returns:
        One of the four label strings listed above.
    """
    # A reply indicator takes precedence over everything else.
    if tweet["in_reply_to_status_id"] is not None:
        return "Reply Tweet"

    # Quote flag set, and the text is not itself a retweet of a quote.
    if tweet["is_quote_status"] is True and not tweet["text"].startswith("RT"):
        return "Quote Tweet"

    # A retweet needs both indicators: the "RT" prefix and the embedded status.
    is_retweet = (tweet["text"].startswith("RT")
                  and tweet.get("retweeted_status") is not None)
    return "Retweet" if is_retweet else "Original Tweet"

In [6]:
# Build the tweet table: one row per tweet JSON file found in the 'tweets'
# directory of every news item in news_list.
tweet_columns = ['news_id', 'created_at', 'tweet_id_str', 'text',
                 'sentiment_compound', 'pos_count', 'neg_count',
                 'hashtag_count', 'mention_count',
                 'user_id_str', 'retweet_count', 'favorite_count',
                 'tweet_type']

# Sentiment analyzer used to compute the 'compound' score of each tweet's
# text. (Retweets, which can be quoted tweets or replies, are scored the same
# way when the retweet table is built.)
sid_obj = SentimentIntensityAnalyzer()

# Load the lists of positive and negative words used to count how many
# positive / negative words each tweet contains. The context managers make
# sure both files are closed again.
with open("positive.txt", 'r') as file_positives:
    positives = file_positives.read().replace("\n", " ").lower().split()
with open("negative.txt", 'r') as file_negatives:
    negatives = file_negatives.read().replace("\n", " ").lower().split()

# Sets give constant-time membership tests; the lists above are kept so the
# names 'positives' / 'negatives' still hold the same values as before.
positive_words = set(positives)
negative_words = set(negatives)

# Rows are collected in a plain list and turned into a DataFrame once at the
# end. Concatenating a one-row frame per tweet copies the whole table on every
# iteration, which is quadratic in the number of tweets.
tweet_records = []

for news in news_list:
    tweets_dir = os.path.join(source, news, 'tweets')
    tweets = os.listdir(tweets_dir)
    for tweet in tweets:
        # Load the tweet JSON as a dictionary; the file is closed even if
        # parsing fails.
        with open(os.path.join(tweets_dir, tweet)) as f:
            data = json.load(f)

        text = data['text']

        # Count positive and negative words. A token found in the positive
        # list is never also counted as negative.
        pos_count = 0
        neg_count = 0
        for token in text.split():
            word = token.strip().lower()
            if word in positive_words:
                pos_count += 1
            elif word in negative_words:
                neg_count += 1

        tweet_records.append({
            'news_id': news,
            'created_at': data['created_at'],
            'tweet_id_str': data['id_str'],
            'text': text,
            # Compound sentiment value of the tweet text
            'sentiment_compound': sid_obj.polarity_scores(text)['compound'],
            'pos_count': pos_count,
            'neg_count': neg_count,
            'hashtag_count': len(data['entities']['hashtags']),
            'mention_count': len(data['entities']['user_mentions']),
            'user_id_str': data['user']['id_str'],
            'retweet_count': data['retweet_count'],
            'favorite_count': data['favorite_count'],
            'tweet_type': determine_tweet_type(data),
        })

# Passing the column list keeps the column order fixed and yields an empty
# frame with the right columns when no tweets were found. Column dtypes are
# inferred from the collected records.
df = pd.DataFrame(tweet_records, columns=tweet_columns)


In [8]:
df.shape

(686225, 13)

In [9]:
df.head()

Unnamed: 0,news_id,created_at,tweet_id_str,text,sentiment_compound,pos_count,neg_count,hashtag_count,mention_count,user_id_str,retweet_count,favorite_count,tweet_type
0,gossipcop-846866,Fri Apr 28 18:51:43 +0000 2017,858030987837673472,"Babies, Movies, Music and More: One Direction'...",0.0,0,0,0,0,85646589,0,0,Original Tweet
1,gossipcop-846866,Sat Apr 29 01:55:55 +0000 2017,858137740566827008,"Babies, Movies, Music and More: One Direction'...",0.0,0,0,0,0,52493415,0,0,Original Tweet
2,gossipcop-846866,Fri Apr 28 18:56:00 +0000 2017,858032064276377600,"Babies, Movies, Music and More: One Direction’...",0.0,0,0,0,0,20587567,0,0,Original Tweet
3,gossipcop-846866,Fri Apr 28 18:59:27 +0000 2017,858032934699360256,"#entertainment Babies, Movies, Music and More:...",0.4215,0,0,1,0,501117970,0,0,Original Tweet
4,gossipcop-846866,Fri Apr 28 21:01:43 +0000 2017,858063704407105537,"Babies, Movies, Music and More: One Direction'...",0.0,0,0,0,0,833861800336695296,0,0,Original Tweet


### Storing df in a json file. Set the name of the file.

In [10]:
# Write the tweet table to '<dataset>_<label>_tweets.json' in pandas'
# 'table' orientation, without the row index.
dataset_name = '{}_{}_tweets.json'.format(dataset, label)
df.to_json(
    dataset_name,
    orient='table',
    index=False,
    compression='infer',
)

## Creating a Dataframe for retweet table
<strong>'{dataset}_{label}_retweets.json'</strong>

In [11]:
# Build the retweet table: one row per retweet listed in the 'retweets' file
# that shares its file name with a tweet file of the same news item.
retweet_columns = ['id_str', 're_created_at', 're_id_str', 're_text',
                   're_sentiment_compound', 're_pos_count', 're_neg_count',
                   're_hashtag_count', 're_mention_count',
                   're_user_id_str', 're_retweet_count', 're_favorite_count',
                   'retweet_type']

# Sentiment analyzer used to compute the 'compound' score of each retweet's
# text. Retweets can be quoted tweets or replies.
sid_obj = SentimentIntensityAnalyzer()

# Load the lists of positive and negative words used to count how many
# positive / negative words each retweet contains. The context managers make
# sure both files are closed again.
with open("positive.txt", 'r') as file_positives:
    positives = file_positives.read().replace("\n", " ").lower().split()
with open("negative.txt", 'r') as file_negatives:
    negatives = file_negatives.read().replace("\n", " ").lower().split()

# Sets give constant-time membership tests; the lists above are kept so the
# names 'positives' / 'negatives' still hold the same values as before.
positive_words = set(positives)
negative_words = set(negatives)

# Rows are collected in a plain list and turned into a DataFrame once at the
# end. Concatenating a one-row frame per retweet copies the whole table on
# every iteration, which is quadratic in the number of retweets.
retweet_records = []

for news in news_list:
    tweets = os.listdir(os.path.join(source, news, 'tweets'))
    for tweet in tweets:
        # The parent tweet is loaded only for its id, which links every
        # retweet row back to the tweet it belongs to.
        with open(os.path.join(source, news, 'tweets', tweet)) as f:
            data_tweet = json.load(f)

        # Retweets are stored under the same file name in 'retweets';
        # tweets without such a file contribute no rows.
        retweets = os.path.join(source, news, 'retweets', tweet)
        if not os.path.exists(retweets):
            continue

        with open(retweets) as f:
            data_retweet = json.load(f)

        # An empty 'retweets' list simply produces no rows.
        for retweet in data_retweet['retweets']:
            text = retweet['text']

            # Count positive and negative words. A token found in the
            # positive list is never also counted as negative.
            pos_count = 0
            neg_count = 0
            for token in text.split():
                word = token.strip().lower()
                if word in positive_words:
                    pos_count += 1
                elif word in negative_words:
                    neg_count += 1

            retweet_records.append({
                'id_str': data_tweet['id_str'],
                're_created_at': retweet['created_at'],
                're_id_str': retweet['id_str'],
                're_text': text,
                # Compound sentiment value of the retweet text
                're_sentiment_compound': sid_obj.polarity_scores(text)['compound'],
                're_pos_count': pos_count,
                're_neg_count': neg_count,
                're_hashtag_count': len(retweet['entities']['hashtags']),
                're_mention_count': len(retweet['entities']['user_mentions']),
                're_user_id_str': retweet['user']['id_str'],
                're_retweet_count': retweet['retweet_count'],
                're_favorite_count': retweet['favorite_count'],
                'retweet_type': determine_tweet_type(retweet),
            })

# Passing the column list keeps the column order fixed and yields an empty
# frame with the right columns when no retweets were found. Column dtypes are
# inferred from the collected records.
df = pd.DataFrame(retweet_records, columns=retweet_columns)


In [12]:
df.shape

(131977, 13)

In [15]:
df.head()

Unnamed: 0,id_str,re_created_at,re_id_str,re_text,re_sentiment_compound,re_pos_count,re_neg_count,re_hashtag_count,re_mention_count,re_user_id_str,re_retweet_count,re_favorite_count,retweet_type
0,979747841303130112,Sat Mar 31 09:23:36 +0000 2018,980012726536867840,"RT @AnthoniaOrji: Corey Feldman Reveals ""World...",0.0,0,1,0,1,967297531964219392,2,0,Retweet
1,979747841303130112,Fri Mar 30 16:00:07 +0000 2018,979750125517058048,"RT @AnthoniaOrji: Corey Feldman Reveals ""World...",0.0,0,1,0,1,3182284246,2,0,Retweet
2,935721849627045888,Wed Nov 29 04:14:55 +0000 2017,935723722626957323,RT @Mystery_Girl14: Meghan Markle Is Quizzed o...,-0.4754,0,1,0,1,2409582901,2,0,Retweet
3,935721849627045888,Wed Nov 29 04:08:24 +0000 2017,935722085548285955,RT @Mystery_Girl14: Meghan Markle Is Quizzed o...,-0.4754,0,1,0,1,2350847191,2,0,Retweet
4,1018225954970271744,Sat Jul 14 20:12:04 +0000 2018,1018226646468395008,RT @abdiJosssS: Serena Williams Gets Emotional...,-0.25,0,1,0,0,2771504463,2,0,Retweet


### Storing df in a json file. Set the name of the file.

In [16]:
# Write the retweet table to '<dataset>_<label>_retweets.json' in pandas'
# 'table' orientation, without the row index.
dataset_name = '{}_{}_retweets.json'.format(dataset, label)
df.to_json(
    dataset_name,
    orient='table',
    index=False,
    compression='infer',
)

## Creating a Dataframe for user table
<strong>'{dataset}_{label}_users.json'</strong>

In [17]:
# Build the user table: one row for the author of every tweet and one row for
# the author of every retweet (duplicates are removed in a later cell).

# Keys read from each 'user' object; every table column is the key name
# prefixed with 'user_'.
user_fields = ['id_str', 'followers_count', 'friends_count', 'listed_count',
               'created_at', 'favourites_count', 'verified', 'statuses_count']
user_columns = ['user_' + field for field in user_fields]


def _user_record(user):
    """Return the user-table row for one 'user' dictionary.

    Raises KeyError if any of the keys in user_fields is missing.
    """
    return {'user_' + field: user[field] for field in user_fields}


# Rows are collected in a plain list and turned into a DataFrame once at the
# end. Concatenating a one-row frame per user copies the whole table on every
# iteration, which is quadratic in the number of rows.
user_records = []

for news in news_list:
    tweets = os.listdir(os.path.join(source, news, 'tweets'))
    for tweet in tweets:
        # Author of the tweet itself
        with open(os.path.join(source, news, 'tweets', tweet)) as f:
            data = json.load(f)
        user_records.append(_user_record(data['user']))

        # Authors of the tweet's retweets, stored under the same file name
        # in 'retweets'; tweets without such a file add nothing further.
        retweets = os.path.join(source, news, 'retweets', tweet)
        if not os.path.exists(retweets):
            continue

        with open(retweets) as f:
            data_retweet = json.load(f)

        # An empty 'retweets' list simply produces no rows.
        for retweet in data_retweet['retweets']:
            user_records.append(_user_record(retweet['user']))

# Passing the column list keeps the column order fixed and yields an empty
# frame with the right columns when no users were found. Column dtypes are
# inferred from the collected records.
df = pd.DataFrame(user_records, columns=user_columns)

In [18]:
df.shape

(818202, 8)

In [19]:
# Remove rows that are exact duplicates across all columns, keeping the first
# occurrence; the frame is modified in place.
# NOTE(review): rows that share a user_id_str but differ in any count column
# are not identical and therefore remain — confirm whether one row per user
# is intended.
df.drop_duplicates(inplace=True)

In [20]:
df.shape

(182136, 8)

In [21]:
df.head()

Unnamed: 0,user_id_str,user_followers_count,user_friends_count,user_listed_count,user_created_at,user_favourites_count,user_verified,user_statuses_count
0,85646589,2969,1931,44,Tue Oct 27 19:51:59 +0000 2009,9,False,181925
1,52493415,2143,5001,60,Tue Jun 30 19:32:33 +0000 2009,22885,False,564140
2,20587567,1522,1647,837,Wed Feb 11 12:48:59 +0000 2009,1,False,590040
3,501117970,4449,41,4,Thu Feb 23 20:21:19 +0000 2012,2761,False,28871
4,833861800336695296,111,0,4,Tue Feb 21 02:12:00 +0000 2017,64,False,163893


### Storing df in a json file. Set the name of the file.

In [22]:
# Write the user table to '<dataset>_<label>_users.json' in pandas'
# 'table' orientation, without the row index.
dataset_name = '{}_{}_users.json'.format(dataset, label)
df.to_json(
    dataset_name,
    orient='table',
    index=False,
    compression='infer',
)

## A small dataset for testing

In [307]:
# Toy frame for trying out the group-by logic: 17 rows, one per
# (news, tweet, retweet) combination. NaN in 'retweet_id' marks a row
# without a retweet.
toy_columns = {
    'news_id': [1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 4, 4, 4, 4, 4],
    'tweet_id': [5, 5, 5, 6, 7, 5, 6, 6, 5, 6, 7, 7, 6, 7, 7, 6, 7],
    'retweet_id': [1, 9, 1, 1, np.nan, 1, 1, 1, np.nan, 1, 1, np.nan,
                   1, 1, np.nan, 1, 1],
    're_user_id': [13, 15, -100, 14, 11, 21, 16, 16, 19, 22, 17, 23,
                   12, 17, 11, 18, 19],
}
df_temp = pd.DataFrame(toy_columns)

In [392]:
df_temp

Unnamed: 0,news_id,tweet_id,retweet_id,re_user_id,count
0,1,5,1.0,13,3
1,1,5,9.0,15,3
2,1,5,1.0,-100,3
3,1,6,1.0,14,1
4,1,7,,11,0
5,2,5,1.0,21,1
6,2,6,1.0,16,3
7,2,6,1.0,16,3
8,2,5,,19,1
9,2,6,1.0,22,3


In [412]:
# Show only the rows that belong to tweet 5 of news item 1.
is_news_1 = df_temp['news_id'] == 1
is_tweet_5 = df_temp['tweet_id'] == 5
df_temp[is_news_1 & is_tweet_5]

Unnamed: 0,news_id,tweet_id,retweet_id,re_user_id,count
0,1,5,1.0,13,3
1,1,5,9.0,15,3
2,1,5,1.0,-100,3


In [None]:
# Smallest tweet_id within each news item, repeated on every row of that
# item so it can be subtracted row by row.
tweet_ids_by_news = df_temp.groupby(['news_id'])['tweet_id']
df_temp['zero_time'] = tweet_ids_by_news.transform('min')

In [None]:
df_temp

In [None]:
# Offset of each row's tweet_id from the smallest tweet_id of its news item.
zero_time = df_temp['zero_time']
df_temp['diff'] = df_temp['tweet_id'] - zero_time

In [None]:
df_temp

In [None]:
# Toy version of "total number of retweets created in the first 15 minutes
# after tweeting": select the rows whose 'diff' lies between 1 and 2
# (both ends included) and show the shape of that selection.
in_window = df_temp['diff'].between(1, 2)
df_temp[in_window].shape

In [309]:
# Number of non-missing retweet_id values for each (news_id, tweet_id) pair,
# repeated on every row of that pair.
retweet_ids_by_tweet = df_temp.groupby(['news_id', 'tweet_id'])['retweet_id']
df_temp['count'] = retweet_ids_by_tweet.transform('count')

In [310]:
df_temp

Unnamed: 0,news_id,tweet_id,retweet_id,re_user_id,count
0,1,5,1.0,13,3
1,1,5,9.0,15,3
2,1,5,1.0,-100,3
3,1,6,1.0,14,1
4,1,7,,11,0
5,2,5,1.0,21,1
6,2,6,1.0,16,3
7,2,6,1.0,16,3
8,2,5,,19,1
9,2,6,1.0,22,3


In [311]:
# Rows of news item 1, ordered by tweet_id and then count, both descending.
news_1_rows = df_temp[df_temp['news_id'] == 1]
news_1_rows.sort_values(by=['tweet_id', 'count'], ascending=False)

Unnamed: 0,news_id,tweet_id,retweet_id,re_user_id,count
4,1,7,,11,0
3,1,6,1.0,14,1
0,1,5,1.0,13,3
1,1,5,9.0,15,3
2,1,5,1.0,-100,3


In [None]:
# Up to three rows per news item with the highest retweet 'count': sort by
# news_id and count (both descending), then keep the first three rows of
# every news_id group. The grouping key is 'news_id' because df_temp has no
# 'job' column, so sorting or grouping on 'job' raises a KeyError.
df_temp.sort_values(['news_id', 'count'], ascending=False).groupby('news_id').head(3)