In [126]:
import pandas as pd
import matplotlib.pyplot as plt
import dataframe_image as dfi

In [127]:
# Input CSV locations, keyed first by dataset name ('fake' / 'real') and then by
# purpose; 'read' is the path the dataset is loaded from.
file_paths = dict(
    fake=dict(read='data/processed/full_dataset/fake_dataset.csv'),
    real=dict(read='data/processed/full_dataset/real_dataset.csv'),
)

In [128]:
# Load the fake and real tweet datasets from the CSV paths configured in file_paths.
fake_df = pd.read_csv(file_paths['fake']['read'], encoding='utf8')
real_df = pd.read_csv(file_paths['real']['read'], encoding='utf8')

# Restrict to English only: keep rows whose 'language' column equals 'en'.
# .copy() makes each filtered frame an independent object rather than a slice of
# the frame it was selected from, so later column assignments on fake_df /
# real_df do not raise pandas' SettingWithCopyWarning.
fake_df = fake_df[fake_df['language'] == 'en'].copy()
real_df = real_df[real_df['language'] == 'en'].copy()

### Plot Tweet Numerical Column Distributions

In [129]:
# Histogram the like, reply and retweet counts of each dataset and save one
# image per (column, dataset) pair under results/plots/.
#
# Each entry is (column to plot, title prefix, output file stem).
tweet_count_plots = [
    ('likes_count', 'Like Count', 'like_count'),
    ('replies_count', 'Reply Count', 'reply_count'),
    ('retweets_count', 'Retweet Count', 'retweet_count'),
]

# The loop variable is deliberately not called `type`, which would shadow the
# builtin for the rest of the notebook session.
for df, dataset_name in [(fake_df, 'fake'), (real_df, 'real')]:
    for column, title_prefix, file_stem in tweet_count_plots:
        counts = df[column]
        # Only values strictly between 10 and 100 are plotted.
        in_range = counts[(counts < 100) & (counts > 10)]

        # A dedicated figure per plot, closed after saving, so no empty
        # figure is left behind as the cell's output.
        fig, ax = plt.subplots()
        ax.hist(in_range, bins=25)
        ax.set_title('{} Distribution - {}'.format(title_prefix, dataset_name))
        fig.savefig('results/plots/{}_distribution_{}.jpg'.format(file_stem, dataset_name))
        plt.close(fig)

<Figure size 432x288 with 0 Axes>

### Get Summary Stats for Tweet Numerical Columns

In [130]:
# Export a describe() summary table of the like, reply and retweet counts for
# each dataset as an image under results/stats/.
# The loop variable is deliberately not called `type`, which would shadow the
# builtin for the rest of the notebook session.
for df, dataset_name in [(fake_df, 'fake'), (real_df, 'real')]:
    stats = df[['likes_count','replies_count','retweets_count']].describe()
    dfi.export(stats, 'results/stats/summary_{}.jpg'.format(dataset_name))

### Plot User Data Numerical Columns

In [131]:
# Histogram the follower, following and per-user tweet counts of each dataset
# and save one image per (column, dataset) pair under results/plots/.
#
# Each entry is (column to plot, exclusive upper bound, title prefix,
# output file stem).
user_count_plots = [
    ('user_followers', 10000, 'Follower Count', 'follower_count'),
    ('user_following', 10000, 'Following Count', 'following_count'),
    ('user_tweet_count', 100000, 'User Tweet Count', 'user_tweet_count'),
]

# The loop variable is deliberately not called `type`, which would shadow the
# builtin for the rest of the notebook session.
for df, dataset_name in [(fake_df, 'fake'), (real_df, 'real')]:
    for column, upper_bound, title_prefix, file_stem in user_count_plots:
        counts = df[column]
        # Only values strictly between 10 and the column's upper bound are plotted.
        in_range = counts[(counts < upper_bound) & (counts > 10)]

        # A dedicated figure per plot, closed after saving, so no empty
        # figure is left behind as the cell's output.
        fig, ax = plt.subplots()
        ax.hist(in_range, bins=25)
        ax.set_title('{} Distribution - {}'.format(title_prefix, dataset_name))
        fig.savefig('results/plots/{}_distribution_{}.jpg'.format(file_stem, dataset_name))
        plt.close(fig)

<Figure size 432x288 with 0 Axes>

### Get User Data Stats

In [121]:
# Export a describe() summary table of the follower, following and per-user
# tweet counts for each dataset as an image under results/stats/.
# The loop variable is deliberately not called `type`, which would shadow the
# builtin for the rest of the notebook session.
for df, dataset_name in [(fake_df, 'fake'), (real_df, 'real')]:
    stats = df[['user_followers','user_following','user_tweet_count']].describe()
    dfi.export(stats, 'results/stats/user_summary_{}.jpg'.format(dataset_name))

### Plot Time Series

In [125]:
# For each dataset, scatter-plot per-day counts of (1) tweets posted, (2) user
# account creations over the whole history, and (3) user account creations
# restricted to the date span covered by the tweets.
# The loop variable is deliberately not called `type`, which would shadow the
# builtin for the rest of the notebook session.
for df, dataset_name in [(fake_df, 'fake'), (real_df, 'real')]:
    # Number of rows per calendar date, for tweet creation and account creation.
    tweet_ts = pd.to_datetime(df['created_at']).dt.date.value_counts()
    full_user_ts = pd.to_datetime(df['user_timestamp']).dt.date.value_counts()

    # Truncate the account-creation series to the first..last tweet date (inclusive).
    start_date = tweet_ts.index.min()
    end_date = tweet_ts.index.max()
    user_ts = full_user_ts[(full_user_ts.index >= start_date) & (full_user_ts.index <= end_date)]

    # Each entry is (daily counts, y-axis label, title prefix, output file stem).
    time_series_plots = [
        (tweet_ts, 'Number of Tweets Posted',
         'Time Series of Tweets Posted', 'tweet_time_series'),
        (full_user_ts, 'Number of Users Created',
         'Full Time Series of User Account Creations', 'full_user_time_series'),
        (user_ts, 'Number of Users Created',
         'Truncated Time Series of User Account Creations', 'user_time_series'),
    ]

    for daily_counts, y_label, title_prefix, file_stem in time_series_plots:
        # A dedicated figure per plot, closed after saving, so no empty
        # figure is left behind as the cell's output.
        fig, ax = plt.subplots()
        ax.scatter(daily_counts.index, daily_counts, s=3)
        ax.set_xlabel('Date')
        ax.set_ylabel(y_label)
        ax.tick_params(axis='x', labelrotation=-45)
        ax.set_title('{} - {}'.format(title_prefix, dataset_name))
        fig.tight_layout()
        # NOTE(review): these paths carry no file extension, unlike the '.jpg'
        # histogram outputs, so matplotlib picks its default savefig format.
        # Paths are left unchanged here; confirm whether '.jpg' was intended.
        fig.savefig('results/plots/{}_{}'.format(file_stem, dataset_name))
        plt.close(fig)

<Figure size 432x288 with 0 Axes>

### Feature Extraction

In [137]:
# Add four derived feature columns to each dataset frame in place:
#   time_diff          - hours between account creation and the tweet
#   char_count         - number of non-whitespace characters in the tweet
#   special_char_count - number of characters that are not ASCII letters,
#                        digits or whitespace
#   user_freq          - how many rows of the same frame share the row's user_id
# The dataset name is not needed here, so only the frames are iterated (the
# previous unused `type` loop variable also shadowed the builtin).
for df in (fake_df, real_df):
    # created_at - user_timestamp (in hours); both parsed as UTC so the
    # subtraction is between timezone-aware values.
    tweet_time = pd.to_datetime(df['created_at'], utc=True)
    account_time = pd.to_datetime(df['user_timestamp'], utc=True)
    df['time_diff'] = (tweet_time - account_time) / pd.Timedelta(hours=1)

    # Raw string literals keep `\s` a regex escape rather than an invalid
    # Python string escape; the compiled patterns are unchanged.
    # tweet non-whitespace character count
    df['char_count'] = df['tweet'].str.replace(r'[\s]', '', regex=True).str.len()
    # tweet number of special characters
    df['special_char_count'] = df['tweet'].str.replace(r'[a-zA-Z0-9\s]', '', regex=True).str.len()

    # user value counts
    df['user_freq'] = df['user_id'].map(df['user_id'].value_counts())

In [138]:
# Histogram the four extracted features of each dataset and save one image per
# (feature, dataset) pair under results/plots/.
#
# Each entry is (column to plot, exclusive upper bound or None for no filter,
# title prefix). The column name doubles as the output file stem.
extracted_feature_plots = [
    ('time_diff', None, 'Account Creation Time - Tweet Time Distribution (hours)'),
    ('char_count', 400, 'Character Count Distribution'),
    ('special_char_count', 40, 'Special Character Count Distribution'),
    ('user_freq', 15, 'User Dataset Frequencies Distribution'),
]

# The loop variable is deliberately not called `type`, which would shadow the
# builtin for the rest of the notebook session.
for df, dataset_name in [(fake_df, 'fake'), (real_df, 'real')]:
    for column, upper_bound, title_prefix in extracted_feature_plots:
        values = df[column]
        if upper_bound is not None:
            # Plot only the values strictly below the bound.
            values = values[values < upper_bound]

        # A dedicated figure per plot, closed after saving, so no empty
        # figure is left behind as the cell's output.
        fig, ax = plt.subplots()
        ax.hist(values, bins=25)
        ax.set_title('{} - {}'.format(title_prefix, dataset_name))
        fig.savefig('results/plots/{}_distribution_{}.jpg'.format(column, dataset_name))
        plt.close(fig)

<Figure size 432x288 with 0 Axes>

In [139]:
# Record statistics: export a describe() summary table of the four extracted
# features for each dataset as an image under results/stats/.
# The loop variable is deliberately not called `type`, which would shadow the
# builtin for the rest of the notebook session.
for df, dataset_name in [(fake_df, 'fake'), (real_df, 'real')]:
    stats = df[['time_diff','char_count','special_char_count', 'user_freq']].describe()
    dfi.export(stats, 'results/stats/extracted_features_summary_{}.jpg'.format(dataset_name))

In [140]:
# Tag every row with the dataset it came from (adds a 'label' column in place).
for frame, tag in ((fake_df, 'fake'), (real_df, 'real')):
    frame['label'] = tag

# Columns carried into the modelling dataset: the raw tweet/user counts, the
# extracted features, and the label.
model_columns = [
    'likes_count',
    'replies_count',
    'retweets_count',
    'user_followers',
    'user_following',
    'user_tweet_count',
    'time_diff',
    'char_count',
    'special_char_count',
    'user_freq',
    'label',
]
fake_df_model = fake_df.loc[:, model_columns]
real_df_model = real_df.loc[:, model_columns]

# Stack the fake rows on top of the real rows.
df_model = pd.concat([fake_df_model, real_df_model])

# Persist the combined dataset, without the index column.
df_model.to_csv('data/processed/model_data/dataset.csv', index=False)