In [872]:
# import librerie
import pandas as pd
import numpy as np

In [873]:
# import dataset
dataset = pd.read_csv("./dataset/corona_fake.csv")

### Pre-processing dataset1

In [874]:
# formattazione
dataset.loc[dataset['label'] == 'Fake', ['label']] = 'FAKE'
dataset.loc[dataset['label'] == 'fake', ['label']] = 'FAKE'
dataset.loc[dataset['source'] == 'facebook', ['source']] = 'https://facebook.com'

# assegnazione esplicita delle label in seguito ad accertamenti
dataset.loc[5]['label'] = 'FAKE'
dataset.loc[15]['label'] = 'TRUE'
dataset.loc[43]['label'] = 'FAKE'
dataset.loc[131]['label'] = 'TRUE'
dataset.loc[242]['label'] = 'FAKE'

dataset.text.fillna(dataset.title, inplace=True)
dataset = dataset.sample(frac=1).reset_index(drop=True)

# replace dei NaN
dataset.title.fillna('missing', inplace=True)
dataset.source.fillna('missing', inplace=True)

# Exploration

In [875]:
#%pip install plotly.express
#%pip install plotly.figure_factory
#%pip install plotly.graph_objects

In [876]:
# import
import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objects as go

## Capital Letters in Title
### • Count the number of capital letters in each title.
### • Compute the percentage of capital letters in each article body rather than simply counting the number, because the length of the articles are very different.

In [877]:
dataset['title_num_uppercase'] = dataset['title'].str.count(r'[A-Z]')
dataset['text_num_uppercase'] = dataset['text'].str.count(r'[A-Z]')
dataset['text_len'] = dataset['text'].str.len()
dataset['text_pct_uppercase'] = dataset.text_num_uppercase.div(dataset.text_len)

x1 = dataset.loc[dataset['label']=='TRUE']['title_num_uppercase']
x2 = dataset.loc[dataset['label'] == 'FAKE']['title_num_uppercase']
group_labels = ['TRUE', 'FAKE']
colors = ['rgb(0, 0, 100)', 'rgb(0, 200, 200)']

fig = ff.create_distplot(
    [x1, x2], group_labels,colors=colors)

fig.update_layout(title_text='Distribution of Uppercase in title', template="plotly_white")
fig.show()

In [878]:
fig = go.Figure()
fig.add_trace(go.Box(y=x1, name='TRUE',
                marker_color = 'rgb(0, 0, 100)'))
fig.add_trace(go.Box(y=x2, name = 'FAKE',
                marker_color = 'rgb(0, 200, 200)'))
fig.update_layout(title_text='Box plot of Capital Letter in title', template="plotly_white")
fig.show()

On average, fake news have way more words that appear in capital letters in the title. 
This makes me to think that fake news is targeting audiences who are likely to be influenced by titles.


## Stop Words in Title
### • Count the number of stop words in each title.
### • Compute the percentage of stop words in each article body rather than simply counting the number, because the length of the articles are very different.

In [879]:
#%pip install nltk

In [880]:
from nltk.corpus import stopwords    
stop_words = set(stopwords.words('english'))

In [881]:
dataset['title_num_stop_words'] = dataset['title'].str.split().apply(lambda x: len(set(x) & stop_words))
dataset['text_num_stop_words'] = dataset['text'].str.split().apply(lambda x: len(set(x) & stop_words))
dataset['text_word_count'] = dataset['text'].apply(lambda x: len(str(x).split()))
dataset['text_pct_stop_words'] = dataset['text_num_stop_words'] / dataset['text_word_count']

x1 = dataset.loc[dataset['label']=='TRUE']['title_num_stop_words']
x2 = dataset.loc[dataset['label'] == 'FAKE']['title_num_stop_words']
group_labels = ['TRUE', 'FAKE']
colors = ['rgb(0, 0, 100)', 'rgb(0, 200, 200)']

fig = ff.create_distplot(
    [x1, x2], group_labels,colors=colors)

fig.update_layout(title_text='Distribution of Stop Words in title', template="plotly_white")
fig.show()

In [882]:
fig = go.Figure()
fig.add_trace(go.Box(y=x1, name='TRUE',
                marker_color = 'rgb(0, 0, 100)'))
fig.add_trace(go.Box(y=x2, name = 'FAKE',
                marker_color = 'rgb(0, 200, 200)'))
fig.update_layout(title_text='Box plot of Stop Words in title', template="plotly_white")
fig.show()

Fake news titles have fewer stop-words than those of real news.


## Proper Noun in Title
### • Count number of proper nouns (NNP) in each title.

In [883]:
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
from nltk.tag import pos_tag
from nltk import word_tokenize
from collections import Counter

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/alessandro/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/alessandro/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [884]:
dataset.drop(['text_num_uppercase', 'text_len', 'text_num_stop_words', 'text_word_count'], axis=1, inplace=True)

dataset['token'] = dataset.apply(lambda row: nltk.word_tokenize(row['title']), axis=1)
dataset['pos_tags'] = dataset.apply(lambda row: nltk.pos_tag(row['token']), axis=1)

tag_count_dataset = pd.DataFrame(dataset['pos_tags'].map(lambda x: Counter(tag[1] for tag in x)).to_list())
dataset = pd.concat([dataset, tag_count_dataset], axis=1).fillna(0).drop(['pos_tags', 'token'], axis=1)

dataset = dataset[['title', 'text', 'source', 'label', 'title_num_uppercase', 'text_pct_uppercase', 'title_num_stop_words', 'text_pct_stop_words', 'NNP']].rename(columns={'NNP': 'NNP_title'})

x1 = dataset.loc[dataset['label']=='TRUE']['NNP_title']
x2 = dataset.loc[dataset['label'] == 'FAKE']['NNP_title']

group_labels = ['TRUE', 'FAKE']

colors = ['rgb(0, 0, 100)', 'rgb(0, 200, 200)']

fig = ff.create_distplot(
    [x1, x2], group_labels,colors=colors)

fig.update_layout(title_text='Number of Proper nouns in title', template="plotly_white")
fig.show()

In [885]:
fig = go.Figure()
fig.add_trace(go.Box(y=x1, name='TRUE',
                marker_color = 'rgb(0, 0, 100)'))
fig.add_trace(go.Box(y=x2, name = 'FAKE',
                marker_color = 'rgb(0, 200, 200)'))
fig.update_layout(title_text='Box plot of Proper nouns in title', template="plotly_white")
fig.show()

Fake news titles have more proper nouns. Apparently the use of proper nouns in titles are very significant in differentiating fake from real.


## Take Away from Analysis on Article Titles
#### Overall, these results suggest that the writers of fake news are attempting to attracting attention by using all capitalized words in titles, and squeeze as much substance into the titles as possible by skipping stop-words and increase proper nouns. We will find out whether these apply to article bodies as well shortly.
#### Here is an example of fake news title vs. real news title.
#### Fake news title: “FULL TRANSCRIPT OF “SMOKING GUN” BOMBSHELL INTERVIEW: PROF. FRANCES BOYLE EXPOSES THE BIOWEAPONS ORIGINS OF THE COVID-19 CORONAVIRUS”
#### Real news title: “Why outbreaks like coronavirus spread exponentially, and how to ‘flatten the curve’”

## Capital Letters in Article Body


In [886]:
x1 = dataset.loc[dataset['label']=='TRUE']['text_pct_uppercase']
x2 = dataset.loc[dataset['label'] == 'FAKE']['text_pct_uppercase']

group_labels = ['TRUE', 'FAKE']

colors = ['rgb(0, 0, 100)', 'rgb(0, 200, 200)']

fig = ff.create_distplot(
    [x1, x2], group_labels,colors=colors)

fig.update_layout(title_text='Percentage of Capital Letter in Article body', template="plotly_white")
fig.show()

On average, fake news have more words that appear in capital letters in the article body than those of real news.


### Stop Words in Article Body


In [887]:
x1 = dataset.loc[dataset['label']=='TRUE']['text_pct_stop_words']
x2 = dataset.loc[dataset['label'] == 'FAKE']['text_pct_stop_words']

group_labels = ['TRUE', 'FAKE']

colors = ['rgb(0, 0, 100)', 'rgb(0, 200, 200)']

fig = ff.create_distplot(
    [x1, x2], group_labels,colors=colors)

fig.update_layout(title_text='Percentage of Stop Words in Text', template="plotly_white")
fig.show()

It seems there isn’t a significant difference on the percentage of stop words in article text between fake news and real news.


### NNP (Proper noun, singular) in Article Body


In [888]:
dataset.sample(3)

Unnamed: 0,title,text,source,label,title_num_uppercase,text_pct_uppercase,title_num_stop_words,text_pct_stop_words,NNP_title
93,What is the risk of getting COVID-19 while ex...,Exercising poses a potential risk from SARS-Co...,https://www.ecdc.europa.eu,TRUE,6,0.021448,4,0.147059,1.0
1096,How deadly is coronavirus?,The proportion dying from the disease is likel...,https://www.bbc.com/,TRUE,1,0.018727,1,0.186047,0.0
242,"TO MAINTAIN WORLD DOMINATION, A GROUP OF PEOPL...","In front of us, apparently, is the ideal crime...",missing,FAKE,72,0.010823,0,0.3125,14.0


## Harvard Health Publishing vs. Natural News
### Remember, Natural News is a far-right conspiracy theory and fake news website. All the news articles I collected from there are labeled as fake news.

In [889]:
x1 = dataset.loc[dataset['source']=='https://www.health.harvard.edu/']['text_pct_stop_words']
x2 = dataset.loc[dataset['source']=='https://www.naturalnews.com/']['text_pct_stop_words']

group_labels = ['Health Harvard', 'Natural News']

colors = ['rgb(0, 0, 100)', 'rgb(0, 200, 200)']

fig = ff.create_distplot(
    [x1, x2], group_labels,colors=colors)

fig.update_layout(title_text='Percentage of Stop Words in Article Bodies', template="plotly_white")
fig.show()

Within the expectation, Natural News articles use a lot less stop words than Harvard Health Publishing.


In [890]:
dataset.sample(2)

Unnamed: 0,title,text,source,label,title_num_uppercase,text_pct_uppercase,title_num_stop_words,text_pct_stop_words,NNP_title
634,I have a chronic medical condition that puts m...,You can take steps to lower your risk of getti...,https://www.health.harvard.edu/,True,9,0.015683,13,0.170588,1.0
1156,What is coronavirus?,Coronaviruses are an extremely common cause of...,https://www.health.harvard.edu/,True,1,0.01087,1,0.384615,0.0


## Features
### To study fake and real news articles, we compute many content based features on the article bodies. They are:

• Use part-of-speech tagger and keep a count of how many times each tag appears in the article.


In [891]:
dataset['token'] = dataset.apply(lambda row: nltk.word_tokenize(row['text']), axis=1)
dataset['pos_tags'] = dataset.apply(lambda row: nltk.pos_tag(row['token']), axis=1)

tag_count_dataset = pd.DataFrame(dataset['pos_tags'].map(lambda x: Counter(tag[1] for tag in x)).to_list())

dataset = pd.concat([dataset, tag_count_dataset], axis=1).fillna(0).drop(['pos_tags', 'token'], axis=1)

• Number of negations, interrogatives in the article body.

In [892]:
dataset['num_negation'] = dataset['text'].str.lower().str.count("no|not|never|none|nothing|nobody|neither|nowhere|hardly|scarcely|barely|doesn’t|isn’t|wasn’t|shouldn’t|wouldn’t|couldn’t|won’t|can't|don't")

dataset['num_interrogatives_title'] = dataset['title'].str.lower().str.count("what|who|when|where|which|why|how")
dataset['num_interrogatives_text'] = dataset['text'].str.lower().str.count("what|who|when|where|which|why|how")

## Training del modello

In [893]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix  
from sklearn.metrics import accuracy_score
from sklearn.svm import LinearSVC
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score

In [894]:
train, test = train_test_split(dataset, test_size = 0.2, random_state = 0)
X_train, y_train = train.drop(['title', 'text', 'source', 'label'], axis = 1), train['label']
X_test, y_test = test.drop(['title', 'text', 'source', 'label'], axis = 1), test['label']

scaler = StandardScaler()
scaler.fit(X_train)
scaler.fit(X_test)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [895]:
from sklearn.naive_bayes import GaussianNB

model = GaussianNB()
model = model.fit(X_train, y_train)
pred = model.predict(X_test)
print("Accuracy: {:.2f}%".format(accuracy_score(pred, y_test) * 100))

Accuracy: 66.95%


### Support Vector Machine

In [896]:
svc = LinearSVC(dual=False)
model = svc.fit(X_train, y_train)
pred = model.predict(X_test)
print("Accuracy: {:.2f}%".format(accuracy_score(pred, y_test) * 100))

Accuracy: 85.84%
