# Setup


In [1]:
# ! pip install datasets
# ! pip install pandas
# ! python.exe -m pip install --upgrade pip
# ! pip install matplotlib
# ! pip install openpyxl

In [2]:
from datasets import load_dataset
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
def pd_set_see_full_dataframe():
    # Permanently changes the pandas settings
    pd.set_option('display.max_rows', None)
    pd.set_option('display.max_columns', None)
    pd.set_option('display.width', None)
    # pd.set_option('display.max_colwidth', -1)

def pd_reset_options():
    # Resets the options
    pd.reset_option('all')

def display_whole_df(df):
    pd_set_see_full_dataframe()
    display(df)
    pd_reset_options()

# Imports and Merge 


## Covid-19 Fake News Dataset


In [4]:
df_covid_fake_news_dataset = pd.read_excel('../data/COVID-19 Fake News Dataset/fake_new_dataset.xlsx')

In [5]:
df_covid_fake_news_dataset.head()
# df_covid_fake_news_dataset['label'].value_counts()

Unnamed: 0.1,Unnamed: 0,title,text,subcategory,label
0,0,FACEBOOK DELETES MICHIGAN ANTI-LOCKDOWN GROUP ...,Facebook has shuttered a popular group for Mic...,false news,0
1,1,Other Viewpoints: COVID-19 is worse than the flu,We can now officially put to rest all comparis...,true,1
2,2,Bermuda's COVID-19 cases surpass 100,The Ministry of Health in Bermuda has confirme...,true,1
3,3,Purdue University says students face 'close to...,"Purdue University President Mitch Daniels, the...",partially false,0
4,4,THE HIGH COST OF LOCKING DOWN AMERICA: “WE’VE ...,Locking down much of the country may have help...,false news,0


In [6]:
df_covid_fake_news_dataset = df_covid_fake_news_dataset[['text', 'label', 'title']].assign(metadata=df_covid_fake_news_dataset.iloc[:,3:].agg(dict,1))

# df_covid_fake_news_dataset.rename(columns={'title': 'metadata'}, inplace=True)

In [7]:
df_covid_fake_news_dataset.head()

Unnamed: 0,text,label,title,metadata
0,Facebook has shuttered a popular group for Mic...,0,FACEBOOK DELETES MICHIGAN ANTI-LOCKDOWN GROUP ...,"{'subcategory': 'false news', 'label': 0}"
1,We can now officially put to rest all comparis...,1,Other Viewpoints: COVID-19 is worse than the flu,"{'subcategory': 'true', 'label': 1}"
2,The Ministry of Health in Bermuda has confirme...,1,Bermuda's COVID-19 cases surpass 100,"{'subcategory': 'true', 'label': 1}"
3,"Purdue University President Mitch Daniels, the...",0,Purdue University says students face 'close to...,"{'subcategory': 'partially false', 'label': 0}"
4,Locking down much of the country may have help...,0,THE HIGH COST OF LOCKING DOWN AMERICA: “WE’VE ...,"{'subcategory': 'false news', 'label': 0}"


In [8]:
df_covid_fake_news_dataset['label'] = 1 - df_covid_fake_news_dataset['label']

In [9]:
df_covid_fake_news_dataset.head()

Unnamed: 0,text,label,title,metadata
0,Facebook has shuttered a popular group for Mic...,1,FACEBOOK DELETES MICHIGAN ANTI-LOCKDOWN GROUP ...,"{'subcategory': 'false news', 'label': 0}"
1,We can now officially put to rest all comparis...,0,Other Viewpoints: COVID-19 is worse than the flu,"{'subcategory': 'true', 'label': 1}"
2,The Ministry of Health in Bermuda has confirme...,0,Bermuda's COVID-19 cases surpass 100,"{'subcategory': 'true', 'label': 1}"
3,"Purdue University President Mitch Daniels, the...",1,Purdue University says students face 'close to...,"{'subcategory': 'partially false', 'label': 0}"
4,Locking down much of the country may have help...,1,THE HIGH COST OF LOCKING DOWN AMERICA: “WE’VE ...,"{'subcategory': 'false news', 'label': 0}"


## FNIR

In [10]:
df_FNIR_true = pd.read_csv('../data/COVID19-FNIR/trueNews.csv')
df_FNIR_false = pd.read_csv('../data/COVID19-FNIR/fakeNews.csv')

In [11]:
df_FNIR_false['Binary Label'] = 1
df_FNIR_true['Binary Label'] = 0

In [12]:
df_FNIR_false.head()

Unnamed: 0,Date Posted,Link,Text,Region,Country,Explanation,Origin,Origin_URL,Fact_checked_by,Poynter_Label,Binary Label
0,2/7/20,https://www.poynter.org/?ifcn_misinformation=t...,Tencent revealed the real number of deaths.\t\t,Europe,France,The screenshot is questionable.,Twitter,https://www.liberation.fr/checknews/2020/02/07...,CheckNews,Misleading,1
1,2/7/20,https://www.poynter.org/?ifcn_misinformation=t...,Taking chlorine dioxide helps fight coronavir...,Europe,Germany,Chlorine dioxide does guard against the coron...,Website,https://correctiv.org/faktencheck/medizin-und-...,Correctiv,FALSE,1
2,2/7/20,https://www.poynter.org/?ifcn_misinformation=t...,This video shows workmen uncovering a bat-inf...,India,India,A video shows bats nesting in the roof; howev...,Facebook,https://factcheck.afp.com/video-shows-workmen-...,AFP,MISLEADING,1
3,2/7/20,https://www.poynter.org/?ifcn_misinformation=t...,The Asterix comic books and The Simpsons pred...,India,India,Coronavirus has been around since the 1960s. ...,Twitter,https://www.boomlive.in/health/did-the-simpson...,BOOM FactCheck,Misleading,1
4,2/7/20,https://www.poynter.org/?ifcn_misinformation=c...,Chinese President Xi Jinping visited a mosque...,India,India,Chinese President Xi Jinping's visit to the m...,Facebook,http://newsmobile.in/articles/2020/02/07/chine...,NewsMobile,FALSE,1


In [13]:
df_FNIR_true.head()

Unnamed: 0,Date Posted,Link,Text,Region,Username,Publisher,Label,Binary Label
0,2/11/20,https://twitter.com/the_hindu/status/122725962...,Just in: Novel coronavirus named 'Covid-19': U...,India,the_hindu,The Hindu,1,0
1,2/12/20,https://twitter.com/ndtv/status/12274908434742...,WHO officially names #coronavirus as Covid-19....,India,ndtv,NDTV,1,0
2,2/12/20,https://twitter.com/the_hindu/status/122744471...,"The #UN #health agency announced that ""COVID-1...",India,the_hindu,The Hindu,1,0
3,2/14/20,https://twitter.com/IndiaToday/status/12282764...,The Indian Embassy in Tokyo has said that one ...,India,indiatoday,IndiaToday,1,0
4,2/15/20,https://twitter.com/the_hindu/status/122854247...,Ground Zero | How Kerala used its experience i...,India,the_hindu,The Hindu,1,0


In [14]:
df_FNIR_true = df_FNIR_true[['Text', 'Binary Label', 'Link', 'Date Posted']]
df_FNIR_false = df_FNIR_false[['Text', 'Binary Label', 'Origin_URL', 'Date Posted']]

In [15]:
df_FNIR_true.rename(columns={'Text': 'text', 'Binary Label': 'label', 'Link': 'link', 'Date Posted': 'date'}, inplace=True)

df_FNIR_false.rename(columns={'Text': 'text', 'Binary Label': 'label', 'Origin_URL': 'link', 'Date Posted': 'date'}, inplace=True)

In [16]:
df_FNIR = pd.concat([df_FNIR_true, df_FNIR_false], axis=0, ignore_index=True)

In [17]:
df_FNIR = df_FNIR[['text', 'label']].assign(metadata=df_FNIR.iloc[:,3:].agg(dict,1))

In [18]:
df_FNIR.head()

Unnamed: 0,text,label,metadata
0,Just in: Novel coronavirus named 'Covid-19': U...,0,{'date': '2/11/20'}
1,WHO officially names #coronavirus as Covid-19....,0,{'date': '2/12/20'}
2,"The #UN #health agency announced that ""COVID-1...",0,{'date': '2/12/20'}
3,The Indian Embassy in Tokyo has said that one ...,0,{'date': '2/14/20'}
4,Ground Zero | How Kerala used its experience i...,0,{'date': '2/15/20'}


## COVID19FN

In [19]:
df_covid_claims = pd.read_csv('../data/COVID19FN/COVID19FN.csv')

In [20]:
df_covid_claims.head()

Unnamed: 0.1,Unnamed: 0,Title,Text,country,Label,URL,len_sentences,source,text,date,avg_word_len
0,0,A video shows a fortune teller predicting the...,Circulating on social networks a video that sh...,Portugal,False,https://observador.pt/factchecks/fact-check-um...,83,https://observador.pt,circulating on social networks video that show...,08-04-2020,4.795181
1,1,Internet sensation and the worldâ€™s cutest ba...,Internet sensation and the world’s cutest baby...,India,False,https://www.newschecker.in/article/news-detail...,516,https://www.newschecker.in,internet sensation and the world cutest baby a...,17-04-2020,4.858527
2,2,A video has been viewed hundreds of thousands ...,A video has been viewed hundreds of thousands ...,Indonesia,False,https://factcheck.afp.com/video-shows-us-presi...,570,https://factcheck.afp.com,a video has been viewed hundreds of thousands ...,09-04-2020,5.010526
3,3,"Treasury is depositing Kshs 45, 000 to the mob...",A Facebook post claiming that the National Tre...,Kenya,False,https://pesacheck.org/false-treasury-is-not-se...,551,https://pesacheck.org,a facebook post claiming that the national tre...,11-04-2020,5.377495
4,4,Hunagrian authorities are capturing men 50 or ...,Moves on Facebook and Twitter a video showing ...,Mexico,False,https://www.animalpolitico.com/elsabueso/hungr...,368,https://www.animalpolitico.com,moves on facebook and twitter video showing po...,11-04-2020,5.032609


In [21]:
df_covid_claims['Label'].value_counts()

Label
False    1591
True     1230
Name: count, dtype: int64

In [22]:
df_covid_claims = df_covid_claims.rename(columns={'Title ': 'title', 'Label': 'label'})


In [23]:
df_covid_claims.head()

Unnamed: 0.1,Unnamed: 0,title,Text,country,label,URL,len_sentences,source,text,date,avg_word_len
0,0,A video shows a fortune teller predicting the...,Circulating on social networks a video that sh...,Portugal,False,https://observador.pt/factchecks/fact-check-um...,83,https://observador.pt,circulating on social networks video that show...,08-04-2020,4.795181
1,1,Internet sensation and the worldâ€™s cutest ba...,Internet sensation and the world’s cutest baby...,India,False,https://www.newschecker.in/article/news-detail...,516,https://www.newschecker.in,internet sensation and the world cutest baby a...,17-04-2020,4.858527
2,2,A video has been viewed hundreds of thousands ...,A video has been viewed hundreds of thousands ...,Indonesia,False,https://factcheck.afp.com/video-shows-us-presi...,570,https://factcheck.afp.com,a video has been viewed hundreds of thousands ...,09-04-2020,5.010526
3,3,"Treasury is depositing Kshs 45, 000 to the mob...",A Facebook post claiming that the National Tre...,Kenya,False,https://pesacheck.org/false-treasury-is-not-se...,551,https://pesacheck.org,a facebook post claiming that the national tre...,11-04-2020,5.377495
4,4,Hunagrian authorities are capturing men 50 or ...,Moves on Facebook and Twitter a video showing ...,Mexico,False,https://www.animalpolitico.com/elsabueso/hungr...,368,https://www.animalpolitico.com,moves on facebook and twitter video showing po...,11-04-2020,5.032609


In [24]:
df_covid_claims = df_covid_claims[['text', 'label', 'title']].assign(metadata=df_covid_claims.iloc[:,3:].agg(dict,1))


In [25]:
df_covid_claims.head()

Unnamed: 0,text,label,title,metadata
0,circulating on social networks video that show...,False,A video shows a fortune teller predicting the...,"{'country': ' Portugal', 'label': False, 'URL'..."
1,internet sensation and the world cutest baby a...,False,Internet sensation and the worldâ€™s cutest ba...,"{'country': ' India', 'label': False, 'URL': '..."
2,a video has been viewed hundreds of thousands ...,False,A video has been viewed hundreds of thousands ...,"{'country': ' Indonesia', 'label': False, 'URL..."
3,a facebook post claiming that the national tre...,False,"Treasury is depositing Kshs 45, 000 to the mob...","{'country': ' Kenya', 'label': False, 'URL': '..."
4,moves on facebook and twitter video showing po...,False,Hunagrian authorities are capturing men 50 or ...,"{'country': ' Mexico', 'label': False, 'URL': ..."


In [26]:
df_covid_claims['label'].value_counts()

label
False    1591
True     1230
Name: count, dtype: int64

In [27]:
df_covid_claims['label'] = df_covid_claims['label'].replace(False, -1)
df_covid_claims['label'] = df_covid_claims['label'].replace(True, 0)
df_covid_claims['label'] = df_covid_claims['label'].replace(-1, 1)



In [28]:
df_covid_claims['label'].value_counts()

label
1    1591
0    1230
Name: count, dtype: int64

In [29]:
df_covid_claims.head()

Unnamed: 0,text,label,title,metadata
0,circulating on social networks video that show...,1,A video shows a fortune teller predicting the...,"{'country': ' Portugal', 'label': False, 'URL'..."
1,internet sensation and the world cutest baby a...,1,Internet sensation and the worldâ€™s cutest ba...,"{'country': ' India', 'label': False, 'URL': '..."
2,a video has been viewed hundreds of thousands ...,1,A video has been viewed hundreds of thousands ...,"{'country': ' Indonesia', 'label': False, 'URL..."
3,a facebook post claiming that the national tre...,1,"Treasury is depositing Kshs 45, 000 to the mob...","{'country': ' Kenya', 'label': False, 'URL': '..."
4,moves on facebook and twitter video showing po...,1,Hunagrian authorities are capturing men 50 or ...,"{'country': ' Mexico', 'label': False, 'URL': ..."


# Output

In [30]:
df_covid_fake_news_dataset.to_feather('../data/covid_fake_news_dataset.feather')
df_covid_claims.to_feather('../data/covid_claims.feather')
df_FNIR.to_feather('../data/covid_FNIR.feather')