In [None]:
# Test different training & evaluation splits
# Perform fine-tuning on BERT
# Label all non-English entries as fake
# BERT only
# BERT + SVM

# Downloads & Installs

In [1]:
!pip install langdetect



In [2]:
import pandas as pd
from langdetect import detect, DetectorFactory
from langdetect.lang_detect_exception import LangDetectException

# Data

In [3]:
# Load the WELFake dataset and replace every missing value with an empty
# string so the title/text columns can be handled as plain strings.
data = pd.read_csv("WELFake_Dataset.csv").fillna('')

In [4]:
# Preview the first rows of the raw dataset.
# Note: in the label column, 0 = real and 1 = fake.
data.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
1,1,,Did they post their votes for Hillary already?,1
2,2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1
3,3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0
4,4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1


## Labeling Issue (NYT & Reuters)

In [19]:
# Select the articles whose title ends with "The New York Times" and count
# how often each label occurs among them.
nyt_articles = data.loc[
    data['title'].str.contains("The New York Times$", na=False)
]
nyt_label_counts = nyt_articles['label'].value_counts()

In [20]:
# Label distribution of the NYT-titled articles; in the recorded run all but
# one of them carry label 0.
print(nyt_label_counts)

0    6223
1       1
Name: label, dtype: int64


In [23]:
# Label distribution over the whole dataset.
total_label_counts = data['label'].value_counts()
print(total_label_counts)
# The data card states: "72,134 news articles with 35,028 real and 37,106 fake news".
# According to the authors, 1 = real and 0 = fake, yet label 1 occurs 37,106 times
# and label 0 occurs 35,028 times -- the reverse of the stated real/fake totals.
# The labels should therefore be inverted: 0 = real, 1 = fake.

1    37106
0    35028
Name: label, dtype: int64


## Different Languages

In [5]:
# Seed langdetect's detector before the first detection (per the langdetect
# README, this makes the detection results reproducible between runs).
DetectorFactory.seed = 0

def detect_language(text):
    """Return the language code langdetect assigns to ``text``.

    Args:
        text: The text to classify. Expected to be a string; any other value
            (e.g. ``None`` or a NaN coming from pandas) is treated as
            undetectable instead of raising.

    Returns:
        The code returned by ``langdetect.detect`` (e.g. ``"en"``), or the
        string ``"unknown"`` when the input is not a string, is blank, or
        langdetect raises ``LangDetectException``.
    """
    # Non-string values would raise a TypeError inside langdetect, and blank
    # strings have no features to detect; both map to "unknown" up front.
    if not isinstance(text, str) or not text.strip():
        return "unknown"
    try:
        return detect(text)
    except LangDetectException:
        return "unknown"

# Join title and body with a single space into one 'combined' column so the
# language detection sees both parts of each article.
data['combined'] = data['title'] + " " + data['text']

# One detect_language call per article, stored next to the text.
data['language'] = [detect_language(article) for article in data['combined']]

# Number of articles per detected language code.
language_counts = data['language'].value_counts()

In [6]:
# Articles per detected language; in the recorded run 71,540 articles are
# detected as English and 10 could not be classified ("unknown").
print(language_counts)

en         71540
ru           156
es           141
de           132
fr            72
ar            19
unknown       10
pt             9
it             7
tr             7
id             5
so             4
hr             4
nl             4
no             3
ro             3
pl             2
da             2
el             2
cy             2
ca             1
hu             1
zh-cn          1
sv             1
et             1
tl             1
sw             1
vi             1
fi             1
lt             1
Name: language, dtype: int64


In [7]:
# Cross-tabulate detected language against label: one row per language, one
# column per label, with 0 where a language/label combination never occurs.
label_language_counts = (
    data.groupby('language')['label']
    .value_counts()
    .unstack()
    .fillna(0)
)

In [8]:
# Label counts per detected language. In the recorded run every article
# detected as non-English or "unknown" carries label 1, apart from a single
# 'fr' article with label 0.
print(label_language_counts)

label           0        1
language                  
ar            0.0     19.0
ca            0.0      1.0
cy            0.0      2.0
da            0.0      2.0
de            0.0    132.0
el            0.0      2.0
en        35027.0  36513.0
es            0.0    141.0
et            0.0      1.0
fi            0.0      1.0
fr            1.0     71.0
hr            0.0      4.0
hu            0.0      1.0
id            0.0      5.0
it            0.0      7.0
lt            0.0      1.0
nl            0.0      4.0
no            0.0      3.0
pl            0.0      2.0
pt            0.0      9.0
ro            0.0      3.0
ru            0.0    156.0
so            0.0      4.0
sv            0.0      1.0
sw            0.0      1.0
tl            0.0      1.0
tr            0.0      7.0
unknown       0.0     10.0
vi            0.0      1.0
zh-cn         0.0      1.0


In [9]:
# Keep only the rows detected as English and write them to
# english_articles.csv without the DataFrame index.
en_data = data.loc[data['language'] == 'en']
en_data.to_csv('english_articles.csv', index=False)

In [10]:
# Reload the English-only subset. Empty strings are written to the CSV as
# empty fields, which read_csv parses back as NaN, so fill them with ''
# again to keep the title/text columns as plain strings.
en_data = pd.read_csv('english_articles.csv').fillna('')

In [11]:
# Preview the reloaded English-only frame, which now also carries the
# 'combined' and 'language' columns.
en_data.head()

Unnamed: 0.1,Unnamed: 0,title,text,label,combined,language
0,0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,en
1,1,,Did they post their votes for Hillary already?,1,Did they post their votes for Hillary already?,en
2,2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,en
3,3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0,"Bobby Jindal, raised Hindu, uses story of Chri...",en
4,4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1,SATAN 2: Russia unvelis an image of its terrif...,en


## Cleaning & Preprocessing

In [None]:
# inconsistent spacing