In [11]:
import pandas as pd
from pandas import DataFrame
import numpy as np
import networkx
import itertools
import time

Load Data

In [12]:
# Location of the news CSV, given relative to the notebook's working directory.
CSV_FILE_PATH = "../dataset/recovery-news-data.csv"
# Load the whole CSV into `df`; the feature cells below read their columns from it.
df = pd.read_csv(CSV_FILE_PATH) 

Timestamp of Publish Dates

In [13]:
# Publish dates as text. After the dtype=str conversion a missing date
# appears as the literal string 'nan'.
dates = np.array(df['publish_date'].values, dtype=str)

# One timestamp (seconds since the epoch) per article; -1 marks a missing date.
# NOTE: time.mktime interprets the parsed date in the local timezone of the
# machine running the notebook, so the absolute values depend on that timezone.
fea_time_stamps = np.array(
    [
        -1 if date_text == 'nan'
        else time.mktime(time.strptime(date_text, "%Y-%m-%d"))
        for date_text in dates
    ],
    dtype=float,
)

print(dates[:5])
print(fea_time_stamps[:5])

['2020-01-21' '2020-01-22' '2020-01-23' '2020-01-24' '2020-01-24']
[1.5795828e+09 1.5796692e+09 1.5797556e+09 1.5798420e+09 1.5798420e+09]


Number of Authors

In [14]:
# Raw author field of every article as text, e.g. "['A', 'B']" or "[]".
# The dtype=str conversion turns a missing value into the literal string 'nan'.
authors = np.array(df['author'].values, dtype=str)
fea_author_nums = np.zeros(authors.shape, dtype=int)
for idx, author in enumerate(authors):
    # Both an empty list and a missing value mean "no known author".
    # Without the 'nan' check, 'nan'[1:-1] == 'a' would be counted as one author.
    if author in ("[]", "nan"):
        fea_author_nums[idx] = 0
    else:
        # Drop the enclosing brackets, then count the comma-separated names.
        author_names = author[1:-1].split(',')
        fea_author_nums[idx] = len(author_names)

print(authors[:5])
print(fea_author_nums[:5])

["['Knvul Sheikh', 'Roni Caryn Rabin']" "['Emily Feng']"
 "['Nicole Wetsman']" '[]'
 "['Nicole Wetsman', 'Zoe Schiffer', 'Jay Peters', 'Sean OKane', 'Kim Lyons', 'Elizabeth Lopatto', 'Josh Dzieza', 'Nick Statt', 'James Vincent']"]
[2 1 1 0 9]


Mean / Median Number of Collaborators of Authors

In [15]:
def get_list_authors(authors):
    """Parse one raw author field into a list of author names.

    Parameters
    ----------
    authors : str
        Text form of a list of names, e.g. "['A', 'B']" or "[]".

    Returns
    -------
    list of str
        The names with enclosing brackets, single quotes and surrounding
        whitespace removed. Empty entries are dropped, so "[]" yields [].
        A non-string input (e.g. a NaN for a missing value) also yields [].
    """
    if not isinstance(authors, str):
        return []
    # Drop the enclosing brackets, then the quotes around each name.
    string_auth = authors[1:len(authors) - 1].strip().replace('\'', '')
    # Split on the bare comma and strip each piece, so the result does not
    # depend on the exact spacing after the separators.
    stripped_names = (name.strip() for name in string_auth.split(','))
    return [name for name in stripped_names if name]

def _clean_author_names(author_field):
    """Return the whitespace-stripped, non-empty names parsed from one author field."""
    stripped_names = (name.strip() for name in get_list_authors(author_field))
    return [name for name in stripped_names if name]


# ---- 1. Parse every distinct author field into a group of co-authors ----
authors = df['author'].dropna().unique()

all_authors = []
for author in authors:
    names = _clean_author_names(author)
    # Skip fields without any name (e.g. "[]") by content, not by their
    # position in `authors`, so this keeps working when the data order changes.
    if names:
        all_authors.append(names)

# ---- 2. Give every distinct author a 1-based node id, in order of first appearance ----
author_ids = {}        # author name -> node id
list_auth_dict = []    # per group: {node id: author name}
for group in all_authors:
    num_auth_dict = {}
    for each_author in group:
        # setdefault keeps an existing id and otherwise assigns the next free one.
        node_id = author_ids.setdefault(each_author, len(author_ids) + 1)
        num_auth_dict[node_id] = each_author
    list_auth_dict.append(num_auth_dict)

authors_list = list(author_ids)   # authors_list[i] has node id i + 1
count = len(authors_list)

# ---- 3. Build the co-authorship graph: one edge per pair of authors sharing a field ----
G = networkx.Graph()
for num_auth_dict in list_auth_dict:
    node_ids = list(num_auth_dict)
    G.add_nodes_from(node_ids)
    G.add_edges_from(itertools.combinations(node_ids, 2))

# Look degrees up by node id instead of relying on the graph's node order.
degree_by_id = dict(G.degree())
degrees = [degree_by_id[author_ids[name]] for name in authors_list]

print(len(authors_list), authors_list[:5])
print(len(degrees), degrees[:5])

# ---- 4. Per-article mean / median number of collaborators of its authors ----
authors = df['author'].values

fea_avg_degrees = np.zeros(authors.shape, dtype=float)
# Float storage: the median of an even number of degrees can end in .5 and
# would be silently truncated in an integer array.
fea_med_degrees = np.zeros(authors.shape, dtype=float)

for idx, author in enumerate(authors):
    if pd.isna(author):
        continue  # missing author field: both features stay 0
    degree_list = [
        degree_by_id[author_ids[name]] for name in _clean_author_names(author)
    ]
    if degree_list:  # an empty author list ("[]") also leaves both features at 0
        fea_avg_degrees[idx] = np.mean(degree_list)
        fea_med_degrees[idx] = np.median(degree_list)

print(len(fea_avg_degrees), fea_avg_degrees[:5])
print(len(fea_med_degrees), fea_med_degrees[:5])

1095 ['Knvul Sheikh', 'Roni Caryn Rabin', 'Emily Feng', 'Nicole Wetsman', 'Zoe Schiffer']
1095 [3, 1, 1, 8, 8]
2029 [2.         1.         8.         0.         8.33333333]
2029 [2 1 8 0 8]


Number of Words in Title / Body Text / Title + Body Text

In [16]:
def count_space_separated_words(texts):
    """Count the single-space-separated tokens of every string in `texts`.

    Parameters
    ----------
    texts : numpy.ndarray of str
        One text per article, as produced by np.array(..., dtype=str).

    Returns
    -------
    numpy.ndarray of int
        Token count per text. A missing text, which the dtype=str conversion
        turns into the literal 'nan', gets 0 instead of being counted as the
        one-word text "nan".
    """
    word_nums = np.zeros(texts.shape, dtype=int)
    for idx, text in enumerate(texts):
        if text != 'nan':
            word_nums[idx] = len(text.split(' '))
    return word_nums


titles = np.array(df['title'].values, dtype=str)
fea_title_word_nums = count_space_separated_words(titles)
print(fea_title_word_nums[:5])

bodies = np.array(df['body_text'].values, dtype=str)
fea_body_word_nums = count_space_separated_words(bodies)
print(fea_body_word_nums[:5])

# Total length of an article is its title length plus its body length.
fea_word_nums = fea_title_word_nums + fea_body_word_nums
print(fea_word_nums[:5])

[8 9 8 7 8]
[1794  868 2485 1299  195]
[1802  877 2493 1306  203]


Number of Images (0 or 1: whether the article has an image URL)
    

In [17]:
# Image field of every article as text; a missing value becomes the literal 'nan'.
images = np.array(df['image'].values, dtype=str)

# 1 where the article has an image entry, 0 where the field is missing.
has_image = images != 'nan'
fea_image_nums = has_image.astype(int)

print(images[:5])
print(fea_image_nums[:5])


['https://static01.nyt.com/images/2020/03/12/science/26VIRUS-EXPLAINER-update1/26VIRUS-EXPLAINER-update1-facebookJumbo.jpg'
 'https://media.npr.org/include/images/facebook-default-wide.jpg?s=1400'
 'https://cdn.vox-cdn.com/thumbor/a9_Oz7cvSBKyalibjq3yKtypMqc=/0x153:2130x1268/fit-in/1200x630/cdn.vox-cdn.com/uploads/chorus_asset/file/19581722/VRG_ILLO_1777_AK_vaccine.0.jpg'
 'https://www.worldhealth.net/media/original_images/virus_breath.jpg'
 'https://cdn.vox-cdn.com/thumbor/t2gt1SmEni4McrANA0ptgYRMJVg=/0x146:2040x1214/fit-in/1200x630/cdn.vox-cdn.com/uploads/chorus_asset/file/19933222/acastro_200428_1777_coronavirus_0001.0.jpg']
[1 1 1 1 1]


Save Feature Matrix to File

In [19]:
########## 1. construct the overall feature matrix ##########
# One column per engineered feature; all arrays hold one entry per article.
feature_columns = {
    'publish_date': fea_time_stamps,
    'author_num': fea_author_nums,
    'degree_avg': fea_avg_degrees,
    'degree_med': fea_med_degrees,
    'title_word_num': fea_title_word_nums,
    'body_word_num': fea_body_word_nums,
    'word_num': fea_word_nums,
    'image_num': fea_image_nums,
}
content_features = DataFrame(feature_columns)

########## 2. construct ground-truth label matrix ##########
# Invert the reliability flag (label = 1 - reliability).
# Presumably reliability is coded 0/1, as the recorded output shows labels [0 1].
reliability = df['reliability'].values
is_unreliable = 1 - reliability
print(np.unique(is_unreliable))

labels = DataFrame({'is_unreliable': is_unreliable})
print(labels['is_unreliable'][:5])

########## 3. save features and labels to csv files ##########
content_features.to_csv('../feature/content-features.csv', index=False)
labels.to_csv('../feature/labels.csv', index=False)




[0 1]
0    0
1    0
2    0
3    1
4    0
Name: is_unreliable, dtype: int64
