Manually extract machine learning features based on content attributes



In [24]:
import os
import pandas as pd
from pandas import DataFrame
import numpy as np
import networkx
import itertools

import math
import random
import time
import datetime
import matplotlib.pyplot as plt
import matplotlib as mlt
import seaborn as sns

Load CSV files
    - Combine all CSV files into one DataFrame

In [25]:
# Label convention for the 'unreliability' column (stored as strings):
#   '0' is reliable
#   '1' is unreliable

# Project root. Defaults to the original location; set MIS_COV19_DIR to run
# the notebook elsewhere. os.path.join(..., "") guarantees a trailing
# separator because other cells build paths by string concatenation.
CSV_FILE_DIR_HEAD = os.path.join(
    os.environ.get("MIS_COV19_DIR", "/Volumes/MySSD/PycharmProjects/MIS-COV19/"),
    "",
)

# Only files whose name starts with this prefix are loaded.
CSV_FILE_PREFIX = "news-dataset"

# Rows published on these dates are dropped.
# NOTE(review): '2010-07-06' may be a typo for a 2020 date -- confirm.
EXCLUDED_PUBLISH_DATES = ['2010-07-06', '2020-07-24']


def _load_labeled_news(csv_dir, file_names, label):
    """Read every 'news-dataset*' CSV in `csv_dir` into one labelled frame.

    Args:
        csv_dir: directory that contains the CSV files.
        file_names: names of the entries in `csv_dir` (non-matching names,
            e.g. hidden OS files, are ignored).
        label: value written to the new 'unreliability' column.

    Returns:
        A DataFrame with the rows of all matching files (original per-file
        indices are kept, as before), minus rows whose 'publish_date' is in
        EXCLUDED_PUBLISH_DATES, plus the 'unreliability' column.

    Raises:
        FileNotFoundError: if no file name starts with CSV_FILE_PREFIX.
    """
    frames = [
        pd.read_csv(os.path.join(csv_dir, file_name))
        for file_name in file_names
        if file_name.startswith(CSV_FILE_PREFIX)
    ]
    if not frames:
        raise FileNotFoundError(
            "No '%s*' CSV files found in %s" % (CSV_FILE_PREFIX, csv_dir)
        )
    # Concatenate once (instead of growing a frame inside the loop) and
    # filter once, after at least one file has been read. Previously the
    # filter ran on every listed entry and raised KeyError when a
    # non-matching file was listed first.
    combined = pd.concat(frames)
    keep = ~combined['publish_date'].isin(EXCLUDED_PUBLISH_DATES)
    return combined[keep].assign(unreliability=label)


# NOTE: os.listdir returns entries in arbitrary order, so the row order of
# the combined frame follows whatever order the file system reports.
CSV_FILE_DIR0 = CSV_FILE_DIR_HEAD + "dataset/reliable"
CSV_FILE_NAMES0 = os.listdir(CSV_FILE_DIR0)
dfs0 = _load_labeled_news(CSV_FILE_DIR0, CSV_FILE_NAMES0, '0')

CSV_FILE_DIR1 = CSV_FILE_DIR_HEAD + "dataset/unreliable"
CSV_FILE_NAMES1 = os.listdir(CSV_FILE_DIR1)
dfs1 = _load_labeled_news(CSV_FILE_DIR1, CSV_FILE_NAMES1, '1')

# Reliable rows first, then unreliable rows.
dfs = pd.concat([dfs0, dfs1])

Publisher ID

In [26]:
# Cast to str (as the political-bias and country cells do) so that a missing
# publisher becomes the literal 'nan' instead of a float NaN, which np.unique
# cannot sort together with strings.
publishers = np.array(dfs['publisher'].values, dtype=str)

# pub_uni: sorted distinct publisher names.
# pub_codes: for every row, the position of its publisher inside pub_uni.
pub_uni, pub_codes = np.unique(publishers, return_inverse=True)
print("# Unique publishers: %d" % pub_uni.shape[0])

# Publisher ID = 1-based rank of the name in the sorted list of names.
pub_dict = {str(name): idx + 1 for idx, name in enumerate(pub_uni)}

# Preview; slicing keeps this safe when there are fewer than five rows.
print(publishers[:5])
print([pub_dict[name] for name in publishers[:5]])

# Same IDs as pub_dict, computed for all rows at once.
fea_publisher = (pub_codes + 1).astype(int)

print(fea_publisher[:5])

# Unique publishers: 55
['FiveThirtyEight' 'FiveThirtyEight' 'The Mercury News' 'The Mercury News'
 'The Mercury News']
[17, 17, 44, 44, 44]
[17 17 44 44 44]


Publish year / month / day

In [27]:
def _split_publish_date(date_text):
    """Return (year, month, day) as ints for a 'YYYY-MM-DD' style string.

    The literal 'nan' (a missing date after the str cast below) maps to
    (0, 0, 0). Any other value must consist of exactly three '-'-separated
    integer fields, otherwise ValueError is raised.
    """
    if date_text == 'nan':
        return 0, 0, 0
    year, month, day = date_text.split('-')
    return int(year), int(month), int(day)


# Missing dates turn into the string 'nan' through the str cast.
dates = dfs['publish_date'].to_numpy().astype(str)

# One (year, month, day) row per article; reshape keeps the (n, 3) layout
# even when there are no rows at all.
date_parts = np.array(
    [_split_publish_date(date_text) for date_text in dates], dtype=int
).reshape(-1, 3)

fea_pub_yy = date_parts[:, 0].copy()
fea_pub_mm = date_parts[:, 1].copy()
fea_pub_dd = date_parts[:, 2].copy()

print(dates[:5])
print(fea_pub_yy[:5])
print(fea_pub_mm[:5])
print(fea_pub_dd[:5])

['2020-05-15' '2020-04-22' '2020-05-18' '2020-03-04' '2020-05-18']
[2020 2020 2020 2020 2020]
[5 4 5 3 5]
[15 22 18  4 18]


The number of authors

In [28]:
def _count_authors(author_repr):
    """Count the authors in a list-like string such as "['A', 'B']".

    Returns 0 for an empty list ("[]") and for a missing value, which shows
    up as the literal 'nan' after the str cast below. (Previously 'nan' was
    sliced to 'a' and counted as one author.)
    The count is the number of comma-separated fields between the brackets.
    """
    if author_repr in ("[]", "nan"):
        return 0
    return len(author_repr[1:-1].split(','))


# Missing authors turn into the string 'nan' through the str cast.
authors = np.array(dfs['author'].values, dtype=str)
fea_author_num = np.array(
    [_count_authors(author_repr) for author_repr in authors], dtype=int
)

print(authors[:5])
print(fea_author_num[:5])

["['Dhrumil Mehta']" "['Likhitha Butchireddygari']" "['Lisa M. Krieger']"
 "['Alejandra Armstrong', 'Harriet Blair Rowan']" "['Paul Rogers']"]
[1 1 1 2 1]


The mean / median degree of authors in the co-author network

In [29]:
def get_list_authors(authors):
    """Parse a list-like author string into a list of clean author names.

    Args:
        authors: string such as "['Alice Doe', 'Bob Roe']".

    Returns:
        List of names in their original order. Surrounding brackets are
        removed, every single quote is deleted (so "O'Kane" becomes
        "OKane"), names are split on commas and stripped of surrounding
        whitespace, and empty names are dropped. "[]" therefore yields [].
    """
    text = authors.strip()
    # Drop the enclosing brackets only when they are actually present.
    if text.startswith('[') and text.endswith(']'):
        text = text[1:-1]
    text = text.replace('\'', '')
    # Split on ',' and strip each piece so irregular spacing after commas
    # cannot produce names that differ only by whitespace.
    names = (piece.strip() for piece in text.split(','))
    return [name for name in names if name]

# ---- 1. Build the co-author network from the full corpus ----
df = pd.read_csv("../dataset/recovery-news-data.csv")

# One cleaned name list per distinct author string. Empty author lists are
# skipped explicitly.
# NOTE(review): this replaces `del all_authors[3]`, which removed whatever
# happened to be the 4th distinct author string -- presumably the "[]"
# entry; confirm against the data.
all_authors = []
for author_repr in df['author'].dropna().unique():
    names = [name.strip() for name in get_list_authors(author_repr)]
    names = [name for name in names if name]
    if names:
        all_authors.append(names)

# Give every distinct author a 1-based node ID in order of first appearance.
author_ids = {}       # name -> node ID
authors_list = []     # authors_list[node ID - 1] -> name
list_auth_dict = []   # per author list: {node ID: name}
for names in all_authors:
    ids_to_names = {}
    for name in names:
        if name not in author_ids:
            authors_list.append(name)
            author_ids[name] = len(authors_list)
        ids_to_names[author_ids[name]] = name
    list_auth_dict.append(ids_to_names)
count = len(authors_list)

# Authors that share an author list are pairwise connected.
G = networkx.Graph()
for ids_to_names in list_auth_dict:
    G.add_nodes_from(ids_to_names)
    G.add_edges_from(itertools.combinations(ids_to_names, 2))

# degrees[i] is the degree of authors_list[i]; indexing by node ID makes the
# alignment explicit instead of relying on node insertion order.
degrees = [G.degree[node_id] for node_id in range(1, count + 1)]

print(len(authors_list), authors_list[:10])
print(len(degrees), degrees[:10])

# ---- 2. Per-article degree features ----
# Computed over the rows of `dfs`, like every other feature and the labels.
# The recorded outputs show that `df` and `dfs` start with different
# articles, so iterating over `df` would misalign these two columns.
authors = dfs['author'].values

fea_deg_avg = np.zeros(authors.shape, dtype=float)
fea_deg_med = np.zeros(authors.shape, dtype=int)

unknown_authors = set()
for idx, author in enumerate(authors):
    # Missing or empty author list: keep the 0 defaults.
    if pd.isna(author) or author == "[]":
        continue
    degree_list = []
    for auth_name in get_list_authors(author):
        auth_name = auth_name.strip()
        if not auth_name:
            continue
        node_id = author_ids.get(auth_name)
        if node_id is None:
            # Author absent from the network: no known co-authors.
            unknown_authors.add(auth_name)
            degree_list.append(0)
        else:
            degree_list.append(degrees[node_id - 1])
    if degree_list:
        fea_deg_avg[idx] = np.mean(degree_list)
        # fea_deg_med is an int array, so a fractional median is truncated.
        fea_deg_med[idx] = np.median(degree_list)

if unknown_authors:
    print("WARNING: %d author(s) not found in the co-author network; "
          "their degree was set to 0" % len(unknown_authors))

1095 ['Knvul Sheikh', 'Roni Caryn Rabin', 'Emily Feng', 'Nicole Wetsman', 'Zoe Schiffer', 'Jay Peters', 'Sean OKane', 'Kim Lyons', 'Elizabeth Lopatto', 'Josh Dzieza']
1095 [3, 1, 1, 8, 8, 8, 8, 8, 10, 8]


The number of words in a news title / body-text / overall

In [30]:
def _count_words(text_column):
    """Return the number of whitespace-separated words for every row.

    Args:
        text_column: pandas Series of texts (may contain missing values).

    Returns:
        Integer numpy array with one count per row. Missing text counts as
        0 words. Splitting on any run of whitespace means repeated spaces
        add no empty "words" and newlines/tabs separate words as well.
    """
    return (
        text_column.fillna('')
        .astype(str)
        .str.split()
        .str.len()
        .to_numpy(dtype=int)
    )


# String views of the raw columns (missing values become 'nan').
titles = dfs['title'].values
titles = np.array(titles, dtype=str)
fea_title_word_nums = _count_words(dfs['title'])
print(fea_title_word_nums[:5])


bodies = dfs['body_text'].values
bodies = np.array(bodies, dtype=str)
fea_body_word_nums = _count_words(dfs['body_text'])
print(fea_body_word_nums[:5])


# Overall length = title words + body words.
fea_word_nums = fea_title_word_nums + fea_body_word_nums
print(fea_word_nums[:5])

[17 12 14 11  9]
[1489 1124  866  305  492]
[1506 1136  880  316  501]


Does the article have a head/main/top image?
    

In [31]:
# Image URLs as strings; a missing URL becomes the literal 'nan'.
images = dfs['image'].to_numpy().astype(str)

# 1 when the article has an image URL, 0 when the URL is missing.
fea_image_nums = np.where(images == 'nan', 0, 1).astype(int)

print(images[:5])
print(fea_image_nums[:5])

['https://fivethirtyeight.com/wp-content/uploads/2020/05/0515_POLLA-16x9-1.png?w=575'
 'https://fivethirtyeight.com/wp-content/uploads/2020/04/AP_1203290115592-16x9-1.jpg?w=575'
 'https://www.mercurynews.com/wp-content/uploads/2020/03/browning2.jpeg?w=1024&h=683'
 'https://www.mercurynews.com/wp-content/uploads/2020/03/Bay-Area-covid-19-map.png?w=1024&h=576'
 'https://www.mercurynews.com/wp-content/uploads/2020/04/SJM-L-BEACHCLOSED-0XXX-2b.jpg?w=1024&h=679']
[1 1 1 1 1]


Political Bias ID

In [32]:
# Political-bias labels as strings; a missing label becomes 'nan' and is
# encoded like any other category.
biases = dfs['political_bias'].to_numpy().astype(str)
bias_uni = np.unique(biases)
print("# Unique political biases: %d" % bias_uni.shape[0])

# Bias ID = 1-based position of the label in the sorted unique labels.
bias_dict = {label: rank for rank, label in enumerate(bias_uni, start=1)}

print(biases[:5])
print([bias_dict[biases[row]] for row in range(5)])

# Encode every row through the lookup table.
fea_bias = np.array([bias_dict[label] for label in biases], dtype=int)

print(fea_bias[:5])


# Unique political biases: 8
['Center' 'Center' 'Left-center' 'Left-center' 'Left-center']
[1, 1, 5, 5, 5]
[1 1 5 5 5]


Country ID
    

In [33]:
# Country names as strings; a missing country becomes 'nan' and is encoded
# like any other category.
countries = dfs['country'].to_numpy().astype(str)
ctr_uni = np.unique(countries)
print("# Unique countries: %d" % ctr_uni.shape[0])

# Country ID = 1-based position of the name in the sorted unique names.
ctr_dict = dict(zip(ctr_uni, range(1, len(ctr_uni) + 1)))

print(countries[:5])
print([ctr_dict[countries[row]] for row in range(5)])

# Encode every row through the lookup table.
fea_country = np.fromiter(
    (ctr_dict[name] for name in countries), dtype=int, count=len(countries)
)

print(fea_country[:5])


# Unique countries: 7
['USA' 'USA' 'USA' 'USA' 'USA']
[6, 6, 6, 6, 6]
[6 6 6 6 6]


Preparation for Unreliable News Prediction

In [34]:
########## 1. construct the overall feature matrix ##########
# Column name -> per-article feature array. Publisher, bias and country IDs
# are currently left out of the matrix.
# NOTE(review): 'publisher_month' / 'publisher_day' hold the publish month
# and day; the names are kept because they define the saved CSV header.
feature_columns = {
    # 'publisher': fea_publisher,
    'publish_year': fea_pub_yy,
    'publisher_month': fea_pub_mm,
    'publisher_day': fea_pub_dd,
    'author_num': fea_author_num,
    'degree_avg': fea_deg_avg,
    'degree_med': fea_deg_med,
    'title_word_num': fea_title_word_nums,
    'body_word_num': fea_body_word_nums,
    'word_num': fea_word_nums,
    'image_num': fea_image_nums,
    # 'bias': fea_bias,
    # 'country': fea_country
    }

# Every feature must have one value per row of `dfs`, because the labels
# below come from `dfs` and are matched to the features by position.
mismatched = {
    name: len(values)
    for name, values in feature_columns.items()
    if len(values) != len(dfs)
}
if mismatched:
    raise ValueError(
        "Feature lengths differ from the %d rows of dfs: %s"
        % (len(dfs), mismatched)
    )

content_features = DataFrame(feature_columns)

########## 2. construct ground-truth label matrix ##########
is_unreliable = dfs['unreliability'].values
print(np.unique(is_unreliable))

labels = DataFrame({ 'is_unreliable': is_unreliable })
print(labels['is_unreliable'][:5])

# Save features and labels to csv files
# Create the output directory first; to_csv does not create missing folders.
feature_dir = os.path.join(CSV_FILE_DIR_HEAD, 'feature')
os.makedirs(feature_dir, exist_ok=True)
content_features.to_csv(os.path.join(feature_dir, 'content-features.csv'), index=False)
labels.to_csv(os.path.join(feature_dir, 'labels.csv'), index=False)

['0' '1']
0    0
1    0
2    0
3    0
4    0
Name: is_unreliable, dtype: object
