# Dataset description | recovery-news-data.csv

*CS 539 - Social Media Mining | Francesca Spezzano*

*Computer Science | Boise State University*

*11.05.2022 | Fall 2022*

*Aida Gomezbueno Berezo | aidagomezbuenobe@u.boisestate.edu*

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd

#### Download & prepare dataset

In [None]:
#Download
# Read the CSV from the current working directory (read_csv already returns a
# DataFrame, so the pd.DataFrame(...) call below only re-wraps it).
data = pd.read_csv(r'recovery-news-data.csv')
df = pd.DataFrame(data)
# Rename every column positionally. This assumes the file has exactly these
# twelve columns in this order -- TODO confirm against the CSV header.
df.columns = ['index', 'news_ID', 'url', 'publisher', 'publish_date', 'author', 'title', 'image', 'body_text', 'political_bias', 'country', 'reliability']
# Columns used as features; 'index', 'news_ID' and 'image' are left out.
feature_cols = ['url', 'publisher', 'publish_date', 'author', 'title', 'body_text', 'political_bias', 'country']
# 'reliability' is the label column: 1 - real, 0 - fake.
label = ['reliability']
# X holds the feature columns; Y is a one-column DataFrame because `label` is a list.
X = df[feature_cols]
Y = df[label]
# Preview the first rows (shown when this is the last expression of a notebook cell).
df.head()

#### Dataset description

In [None]:
# Count how many news items each publisher contributed.
# Rows without a publisher are skipped, so a missing value is never reported
# as a "publisher" with zero news (a zero count would make any later
# per-publisher ratio divide by zero).
publishers = df['publisher'].dropna()
unique_vals = pd.unique(publishers)  # distinct publishers, first-appearance order
n_publishers = unique_vals.size

# value_counts tallies every publisher in one pass instead of rescanning the
# whole column once per publisher.
publisher_counts = publishers.value_counts()
# Keep first-appearance order and plain Python ints as values.
news_dict = {publisher: int(publisher_counts.loc[publisher]) for publisher in unique_vals}
glob_count = sum(news_dict.values())

for publisher, n_news in news_dict.items():
    print("Publisher: ", publisher, " -> ", n_news, "news.")

print("\nNumber of publishers: ", n_publishers)
print("\nTotal news: ", glob_count)

In [None]:
# Count the real news items (reliability == 1) of every publisher and report
# how many publishers have at least one real item.
def _percentage(part, whole):
    """Return part/whole as a percentage, or 0.0 when whole is zero or missing."""
    return (part / whole) * 100 if whole else 0.0


# Tally the real items per publisher in one pass over the filtered rows.
real_counts = df.loc[df['reliability'] == 1, 'publisher'].value_counts()
# Publishers with no real item are absent from real_counts, hence the default 0.
r_news_dict = {publisher: int(real_counts.get(publisher, 0)) for publisher in unique_vals}

n_reliable_publishers = 0
for publisher, n_real in r_news_dict.items():
    print("Publisher: ", publisher, " -> ", _percentage(n_real, news_dict.get(publisher)), " % reliability -", n_real, " real news.")
    # A publisher counts as reliable as soon as it has one real news item.
    if n_real != 0:
        n_reliable_publishers += 1
n_non_reliable = n_publishers - n_reliable_publishers
print("\nNumber of reliable publishers: ", n_reliable_publishers, ". ", _percentage(n_reliable_publishers, n_publishers), "% over the total.")
print("Therefore, number of unreliable publishers: ", n_non_reliable, ".", _percentage(n_non_reliable, n_publishers), "% over the total.")


In [None]:
#DataFrame statistics
# One row per publisher: its name, its number of news items and a 0/1 flag
# that is 1 when the publisher has at least one real news item.
df_stat = pd.DataFrame({
    'Publisher': list(news_dict.keys()),
    'Number_of_news': list(news_dict.values()),
    'Reliable': [0 if int(n_real) == 0 else 1 for n_real in r_news_dict.values()],
})
# Totals shown in the first bar chart.
df_stat_dict = {
    'Reliable': n_reliable_publishers,
    'Unreliable': n_non_reliable,
}

colors = ['#feebe2', '#fbb4b9', '#f768a1', '#c51b8a', '#7a0177']

# Chart 1: number of reliable vs. unreliable publishers.
fig, ax = plt.subplots(figsize=(5, 5))
ax.bar(df_stat_dict.keys(), df_stat_dict.values(), color=colors)
ax.set_ylabel("Number of reliable/unreliable publishers")
plt.show()

# Chart 2: number of news items per publisher.
fig, ax = plt.subplots(figsize=(15, 5))
ax.bar(df_stat['Publisher'], df_stat['Number_of_news'], color=colors)
ax.set_xlabel("Publisher")
ax.set_ylabel("Number of news published")
plt.xticks(rotation=90)
plt.show()

In [None]:
#REL VS NONREL
# Split the per-publisher statistics into reliable (flag == 1) and all other
# publishers, then draw one bar chart of news counts for each group.
is_reliable = df_stat['Reliable'] == 1

rel = df_stat.loc[is_reliable, 'Publisher'].tolist()
rel_news = df_stat.loc[is_reliable, 'Number_of_news'].tolist()
non_rel = df_stat.loc[~is_reliable, 'Publisher'].tolist()
non_rel_news = df_stat.loc[~is_reliable, 'Number_of_news'].tolist()

df_rels = pd.DataFrame({'Publisher': rel, 'Number_of_news': rel_news})
df_non_rels = pd.DataFrame({'Publisher': non_rel, 'Number_of_news': non_rel_news})

colors = ['#feebe2', '#fbb4b9', '#f768a1', '#c51b8a', '#7a0177']

# Both charts share the same layout; only the data and the title change.
for group_frame, group_title in (
    (df_rels, "Reliable publishers"),
    (df_non_rels, "Unreliable publishers"),
):
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.bar(group_frame['Publisher'], group_frame['Number_of_news'], color=colors)
    ax.set_title(group_title)
    ax.set_ylabel("Number of news")
    plt.xticks(rotation=90)
    plt.show()

In [None]:
#FEATURE: AUTHOR
# For every individual author, collect the number of news items they are
# credited on and the IDs of those items, then report the most and the least
# prolific author.
df['author'] = df['author'].fillna("0")  # "0" marks rows without author information


def _split_authors(raw):
    """Split one raw 'author' cell into individual author names.

    The cell is handled as the text form of a list (e.g. "['A', 'B']"):
    brackets and single quotes are stripped and the rest is split on ", ".
    """
    cleaned = str(raw).replace("[", "").replace("]", "").replace("'", "")
    return cleaned.split(", ")


# Single pass over the rows. Names are matched exactly, so a short name is not
# credited with the articles of every author whose name merely contains it.
authors_news = {}
for news_id, raw_authors in zip(df['news_ID'], df['author']):
    # dict.fromkeys de-duplicates while keeping order: an author listed twice
    # on the same item is still counted once for that item.
    for name in dict.fromkeys(_split_authors(raw_authors)):
        authors_news.setdefault(name, []).append(news_id)

# Drop the placeholders for "empty author list" ('') and "missing author"
# ('0'); pop() with a default does not fail when a placeholder never occurred.
for placeholder in ('', '0'):
    authors_news.pop(placeholder, None)

authors_col_dict = {name: len(news_ids) for name, news_ids in authors_news.items()}

df_author = pd.DataFrame({
    'Author': list(authors_col_dict.keys()),
    'Contributions': list(authors_col_dict.values()),
    'Rel_news': list(authors_news.values()),
})

n_authors = len(authors_col_dict)
print("Number of authors contributing: ", n_authors)

# idxmax/idxmin raise on an empty column, so only report when there is data.
if n_authors:
    # Access the row fields by label; positional access on a labelled Series
    # is deprecated in recent pandas versions.
    max_contr = df_author.loc[df_author['Contributions'].idxmax()]
    print("The author involved in max num of articles/news is: ", max_contr['Author'], "\n\tNumber of news: ", max_contr['Contributions'], "\n\tNews ID in which the author is involved: ", max_contr['Rel_news'])

    min_contr = df_author.loc[df_author['Contributions'].idxmin()]
    print("The author involved in min num of articles/news is: ", min_contr['Author'], "\n\tNumber of news: ", min_contr['Contributions'], "\n\tNews ID in which the author is involved: ", min_contr['Rel_news'])

In [None]:
# Count how many news items come from each country (rows without a country
# are ignored) and plot the distribution.
countries = df['country'].dropna()
unique_vals = pd.unique(countries)  # distinct countries, first-appearance order
n_countries = unique_vals.size

# value_counts tallies every country in one pass instead of rescanning the
# whole column once per country.
country_counts = countries.value_counts()
count_dict = {country: int(country_counts.loc[country]) for country in unique_vals}

df_countries = pd.DataFrame({
    'Country': list(count_dict.keys()),
    'Contributions': list(count_dict.values()),
})

for country, n_news in count_dict.items():
    print("Country: ", country, " -> ", n_news, "news.")
print("\nNumber of countries: ", n_countries)

colors = ['#feebe2', '#fbb4b9', '#f768a1', '#c51b8a', '#7a0177']

fig, ax = plt.subplots(figsize=(10, 5))
plt.bar(df_countries['Country'], df_countries['Contributions'], color=colors)
# The bars show news counts (see the y-label), not publisher counts.
plt.title("News per Country")
plt.ylabel("Number of news")
plt.xticks(rotation=90)
plt.show()