In [1]:
import pandas as pd
from pandas import DataFrame
import numpy as np
import networkx
import itertools
import time

Load Data

In [2]:
# --- Social context: one row per tweet that shared a news item -------------
CSV_FILE_PATH = "../dataset/news_tweets.csv"
df_social = pd.read_csv(CSV_FILE_PATH)
# Treat a missing follower / friend count as zero so later statistics
# never see NaN in these two columns.
for count_col in ('followers_count', 'friends_count'):
    df_social[count_col] = df_social[count_col].fillna(0)

# --- News content: one row per news item -----------------------------------
CSV_FILE_PATH = "../dataset/recovery-news-data.csv"
df_content = pd.read_csv(CSV_FILE_PATH)

# News ids, in the row order of the content table.
nids = df_content['news_id'].to_numpy().astype(int)
# Label is the flipped `reliability` value: reliability 1 -> 0, reliability 0 -> 1.
labels = (1 - df_content['reliability'].to_numpy()).astype(int)

print(nids[:5])
print(labels[:5])

[0 1 2 3 4]
[0 0 0 1 0]


Number of Tweets

In [3]:
# Number of distinct tweets per news item, aligned position-by-position with `nids`.
#
# One grouped pass over df_social replaces a full-table boolean scan per news id.
#  - dropna=False: a missing tweet_id still counts as one distinct value, which
#    is how Series.unique() behaved in the per-id loop this replaces.
#  - reindex(..., fill_value=0): news ids with no rows in df_social get 0.
tweets_per_news = df_social.groupby('news_id')['tweet_id'].nunique(dropna=False)
tweet_nums = tweets_per_news.reindex(nids, fill_value=0).to_numpy().astype(int)
print(tweet_nums[:5])

[1260    8  338    0   38]


Number of Spreaders

In [4]:
# Number of distinct users (spreaders) per news item, aligned position-by-position
# with `nids`.
#
# One grouped pass over df_social replaces a full-table boolean scan per news id.
#  - dropna=False: a missing user_id still counts as one distinct value, which
#    is how Series.unique() behaved in the per-id loop this replaces.
#  - reindex(..., fill_value=0): news ids with no rows in df_social get 0.
spreaders_per_news = df_social.groupby('news_id')['user_id'].nunique(dropna=False)
spreader_nums = spreaders_per_news.reindex(nids, fill_value=0).to_numpy().astype(int)
print(spreader_nums[:5])

[1088    8  278    0   37]


Mean / Median Number of Followers

In [7]:
# Mean and median `followers_count` over all df_social rows of each news item,
# aligned position-by-position with `nids`.
#
# Both statistics come from a single grouped aggregation instead of two loops
# that each re-filter the whole table once per news id.
# News ids with no rows in df_social get 0 through the reindex.
# Note: the statistics are taken over rows (tweets), so a user with several
# tweets on the same news item contributes once per tweet.
follower_stats = (
    df_social.groupby('news_id')['followers_count']
    .agg(['mean', 'median'])
    .reindex(index=nids, fill_value=0)
)

# Casting to int drops the fractional part, matching storage in an integer array.
follower_nums_mean = follower_stats['mean'].to_numpy().astype(int)
print(follower_nums_mean[:5])

follower_nums_median = follower_stats['median'].to_numpy().astype(int)
print(follower_nums_median[:5])

[78430  2498 20585     0  3691]
[ 817 1708  748    0  482]


Mean / Median Number of Friends

In [8]:
# Mean and median `friends_count` over all df_social rows of each news item,
# aligned position-by-position with `nids`.
#
# Both statistics come from a single grouped aggregation instead of two loops
# that each re-filter the whole table once per news id.
# News ids with no rows in df_social get 0 through the reindex.
# Note: the statistics are taken over rows (tweets), so a user with several
# tweets on the same news item contributes once per tweet.
friend_stats = (
    df_social.groupby('news_id')['friends_count']
    .agg(['mean', 'median'])
    .reindex(index=nids, fill_value=0)
)

# Casting to int drops the fractional part, matching storage in an integer array.
friend_nums_mean = friend_stats['mean'].to_numpy().astype(int)
print(friend_nums_mean[:5])

friend_nums_median = friend_stats['median'].to_numpy().astype(int)
print(friend_nums_median[:5])


[2793 1746 3309    0 1924]
[ 883 1283  694    0  407]


Save Feature Matrix to File

In [9]:
# Column name -> per-news feature array (one value per news item).
# 'in_degree_*' carries the follower statistics and 'degree_*' the friend
# statistics.
feature_columns = {
    'tweet_num': tweet_nums,
    'spreader_num': spreader_nums,
    'in_degree_avg': follower_nums_mean,
    'in_degree_med': follower_nums_median,
    'degree_avg': friend_nums_mean,
    'degree_med': friend_nums_median,
}
social_features = DataFrame(feature_columns)

# Write the feature matrix to CSV without the row index.
# Only the features are written here; the labels are not saved by this cell.
social_features.to_csv('../feature/social-features.csv', index=False)