In [None]:
# import librerie
import pandas as pd
import numpy as np
from sklearn.utils import shuffle


from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix  
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.svm import LinearSVC
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.model_selection import cross_val_score
import re
import seaborn as sns
import matplotlib.pyplot as plt

import textstat
from lexicalrichness import LexicalRichness

In [None]:
# Load the dataset from the local CSV file into a DataFrame.
dataset = pd.read_csv("./dataset/corona_fake.csv")
# Bare expression on the last line of the cell: the notebook renders the frame.
dataset

### Pre-processing dataset

In [None]:
# Formatting: normalise the casing so that equal values compare equal.
dataset['label'] = dataset['label'].str.upper()    # whole label column to upper case
dataset['source'] = dataset['source'].str.lower()  # whole source column to lower case

# Replace bare social-network names with a URL, like the other sources.
dataset.loc[dataset['source'] == 'facebook', ['source']] = 'https://facebook.com/'
dataset.loc[dataset['source'] == 'twitter', ['source']] = 'https://twitter.com/'
dataset.loc[dataset['source'] == 'youtube', ['source']] = 'https://youtube.com/'

# Explicit label assignment after manual verification.
# A single .loc[row, column] indexer is required here: the chained form
# dataset.loc[row]['label'] = ... assigns into an intermediate Series and is
# not guaranteed to write back into the frame (never under copy-on-write).
dataset.loc[5, 'label'] = 'FAKE'
dataset.loc[15, 'label'] = 'TRUE'
dataset.loc[43, 'label'] = 'FAKE'
dataset.loc[131, 'label'] = 'TRUE'
dataset.loc[242, 'label'] = 'FAKE'

# Articles without a body fall back to their title. Assign the result back
# instead of calling fillna(inplace=True) on a column selection, which may
# operate on a temporary object.
dataset['text'] = dataset['text'].fillna(dataset['title'])

# Shuffle the rows and rebuild a 0..n-1 index. The shuffle is seeded (same seed
# as the train/test split of this notebook) so that Restart & Run All
# reproduces the same row order.
dataset = dataset.sample(frac=1, random_state=0).reset_index(drop=True)

# Replace the remaining NaN values with a placeholder.
dataset['title'] = dataset['title'].fillna('missing')
dataset['source'] = dataset['source'].fillna('missing')

#dataset.label.value_counts()
dataset

# Analisi

In [None]:
#%pip install plotly.express
#%pip install plotly.figure_factory
#%pip install plotly.graph_objects
#%pip install nbformat

In [None]:
# import
import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objects as go

## Lettere maiuscole nel titolo
### • Contiamo il numero di lettere maiuscole in ogni titolo.
### • Calcoliamo la percentuale di lettere maiuscole nel corpo di ogni articolo anziché contarne il numero, a causa della diversa lunghezza degli articoli.

In [None]:
# Upper-case features: a raw count for the title, and for the body the count
# divided by the text length in characters.
dataset['title_num_uppercase'] = dataset['title'].str.count(r'[A-Z]')
dataset['text_num_uppercase'] = dataset['text'].str.count(r'[A-Z]')
dataset['text_len'] = dataset['text'].str.len()
dataset['text_pct_uppercase'] = dataset['text_num_uppercase'] / dataset['text_len']

# One sample per class for the title feature (x1/x2 are reused by the box plot cell).
x1 = dataset.loc[dataset['label'] == 'TRUE', 'title_num_uppercase']
x2 = dataset.loc[dataset['label'] == 'FAKE', 'title_num_uppercase']
group_labels = ['TRUE', 'FAKE']
colors = ['rgb(0, 0, 100)', 'rgb(0, 200, 200)']

# Both class distributions on a single figure.
fig = ff.create_distplot(hist_data=[x1, x2], group_labels=group_labels, colors=colors)
fig.update_layout(
    title_text='Distribuzione delle lettere maiuscole nel titolo',
    template="plotly_white",
)
fig.show()

In [None]:
# Box plot of the title upper-case counts, one box per class (x1 = TRUE, x2 = FAKE).
fig = go.Figure(
    data=[
        go.Box(y=x1, name='TRUE', marker_color='rgb(0, 0, 100)'),
        go.Box(y=x2, name='FAKE', marker_color='rgb(0, 200, 200)'),
    ]
)
fig.update_layout(
    title_text='Box plot delle lettere maiuscole nel titolo',
    template="plotly_white",
)
fig.show()

In media, le fake news presentano un maggior numero di lettere maiuscole nel titolo.
Questo fa pensare che le fake news si rivolgano a un pubblico che potrebbe essere influenzato dai titoli.



## Stop Words nel titolo
### • Contiamo il numero di stop words in ogni titolo.
### • Calcoliamo la percentuale di stop words nel corpo di ogni articolo anziché contarne il numero, a causa della diversa lunghezza degli articoli.

In [None]:
#%pip install nltk
import nltk
# Download the NLTK 'stopwords' corpus (read by the next cell through nltk.corpus.stopwords).
nltk.download('stopwords')

In [None]:
from nltk.corpus import stopwords    
# English stop words, stored as a set for fast membership tests.
stop_words = set(stopwords.words('english'))

In [None]:
def _count_stop_words(words):
    """Return how many tokens in ``words`` are English stop words.

    Every occurrence is counted and the comparison is case-insensitive. The
    previous ``len(set(words) & stop_words)`` missed capitalised words such as
    "The" (the stop-word list is lower case) and counted each distinct stop
    word only once, so dividing it by the total word count did not give the
    share of stop words in the text.
    """
    return sum(word.lower() in stop_words for word in words)


# Stop-word features: a raw count for the title, and for the body the count
# divided by the number of whitespace-separated words.
dataset['title_num_stop_words'] = dataset['title'].str.split().apply(_count_stop_words)
dataset['text_num_stop_words'] = dataset['text'].str.split().apply(_count_stop_words)
dataset['text_word_count'] = dataset['text'].apply(lambda x: len(str(x).split()))
dataset['text_pct_stop_words'] = dataset['text_num_stop_words'] / dataset['text_word_count']

# One sample per class for the title feature (x1/x2 are reused by the box plot cell).
x1 = dataset.loc[dataset['label']=='TRUE']['title_num_stop_words']
x2 = dataset.loc[dataset['label'] == 'FAKE']['title_num_stop_words']
group_labels = ['TRUE', 'FAKE']
colors = ['rgb(0, 0, 100)', 'rgb(0, 200, 200)']

fig = ff.create_distplot(
    [x1, x2], group_labels,colors=colors)

fig.update_layout(title_text='Distribuzione delle Stop Words nel titolo', template="plotly_white")
fig.show()

In [None]:
# Box plot of the title stop-word counts, one box per class (x1 = TRUE, x2 = FAKE).
fig = go.Figure(
    data=[
        go.Box(y=x1, name='TRUE', marker_color='rgb(0, 0, 100)'),
        go.Box(y=x2, name='FAKE', marker_color='rgb(0, 200, 200)'),
    ]
)
fig.update_layout(
    title_text='Box plot delle Stop Words nel titolo',
    template="plotly_white",
)
fig.show()

I titoli delle fake news hanno meno stop-words rispetto alle real-news.


## Nomi propri nel titolo
### • Contiamo il numero di nomi propri (NNP) in ogni titolo.

In [None]:
# Download the NLTK resources named 'punkt' and 'averaged_perceptron_tagger'
# (presumably the models behind word_tokenize and pos_tag — confirm against the NLTK version in use).
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
from nltk.tag import pos_tag
from nltk import word_tokenize
# Counter is used below to tally the POS tags of each document.
from collections import Counter

In [None]:
# Drop the intermediate columns: only the derived percentages are kept.
dataset = dataset.drop(
    columns=['text_num_uppercase', 'text_len', 'text_num_stop_words', 'text_word_count']
)

# Tokenise every title and tag each token with its part of speech.
dataset['token'] = dataset['title'].map(nltk.word_tokenize)
dataset['pos_tags'] = dataset['token'].map(nltk.pos_tag)

# One row per title, one column per POS tag, values = number of occurrences.
tag_count_dataset = pd.DataFrame(
    [Counter(tag for _, tag in tagged) for tagged in dataset['pos_tags']]
)

# Append the tag counts (a tag absent from a title becomes 0) and drop the helper columns.
dataset = (
    pd.concat([dataset, tag_count_dataset], axis=1)
    .fillna(0)
    .drop(columns=['pos_tags', 'token'])
)

# Keep the features built so far plus the proper-noun count, renamed for the title.
kept_columns = ['title', 'text', 'source', 'label', 'title_num_uppercase', 'text_pct_uppercase', 'title_num_stop_words', 'text_pct_stop_words', 'NNP']
dataset = dataset[kept_columns].rename(columns={'NNP': 'NNP_title'})

# One sample per class (x1/x2 are reused by the box plot cell).
x1 = dataset.loc[dataset['label'] == 'TRUE', 'NNP_title']
x2 = dataset.loc[dataset['label'] == 'FAKE', 'NNP_title']
group_labels = ['TRUE', 'FAKE']
colors = ['rgb(0, 0, 100)', 'rgb(0, 200, 200)']

fig = ff.create_distplot(hist_data=[x1, x2], group_labels=group_labels, colors=colors)
fig.update_layout(
    title_text='Numero di nomi propri nel titolo',
    template="plotly_white",
)
fig.show()

In [None]:
# Box plot of the proper-noun counts in the title, one box per class (x1 = TRUE, x2 = FAKE).
fig = go.Figure(
    data=[
        go.Box(y=x1, name='TRUE', marker_color='rgb(0, 0, 100)'),
        go.Box(y=x2, name='FAKE', marker_color='rgb(0, 200, 200)'),
    ]
)
fig.update_layout(
    title_text='Box plot dei nomi propri nel titolo',
    template="plotly_white",
)
fig.show()

I titoli delle fake-news presentano più nomi propri. 


In [None]:
#pip install wordcloud 

In [None]:
# Final word cloud after all the cleaning and pre-processing
# import matplotlib.pyplot as plt
# from wordcloud import WordCloud, STOPWORDS
# comment_words = ' '
# stopwords = set(STOPWORDS) 

# iterate through the csv file 
# for val in df.comment: 

   # typecaste each val to string 
   # val = str(val) 

   # split the value 
   # tokens = val.split() 

# Converts each token into lowercase 
# for i in range(len(tokens)): 
#    tokens[i] = tokens[i].lower() 

# for words in tokens: 
#    comment_words = comment_words + words + ' '


# wordcloud = WordCloud(width = 800, height = 800, 
#            background_color ='white', 
#            stopwords = stopwords, 
#            min_font_size = 10).generate(comment_words) 

# plot the WordCloud image                        
# plt.figure(figsize = (8, 8), facecolor = None) 
# plt.imshow(wordcloud) 
# plt.axis("off") 
# plt.tight_layout(pad = 0) 

# plt.show() 

#### Nel complesso, questi risultati suggeriscono che gli autori di fake-news cercano di attirare l'attenzione utilizzando le parole in maiuscolo nei titoli e concentrando quante più key-words possibili nei titoli saltando le stop-word e aumentando i nomi propri. Analizziamo se lo stesso avviene anche nei corpi degli articoli.

## Lettere maiuscole nel corpo degli articoli


In [None]:
# Share of upper-case letters in the article body, split by class.
x1 = dataset.loc[dataset['label'] == 'TRUE', 'text_pct_uppercase']
x2 = dataset.loc[dataset['label'] == 'FAKE', 'text_pct_uppercase']
group_labels = ['TRUE', 'FAKE']
colors = ['rgb(0, 0, 100)', 'rgb(0, 200, 200)']

fig = ff.create_distplot(hist_data=[x1, x2], group_labels=group_labels, colors=colors)
fig.update_layout(
    title_text='Percentuale di lettere maiuscole nel corpo degli articoli',
    template="plotly_white",
)
fig.show()

In media, le fake news presentano un maggior numero di lettere maiuscole nel corpo degli articoli.



## Stop Words nel corpo degli articoli


In [None]:
# Share of stop words in the article body, split by class.
x1 = dataset.loc[dataset['label'] == 'TRUE', 'text_pct_stop_words']
x2 = dataset.loc[dataset['label'] == 'FAKE', 'text_pct_stop_words']
group_labels = ['TRUE', 'FAKE']
colors = ['rgb(0, 0, 100)', 'rgb(0, 200, 200)']

fig = ff.create_distplot(hist_data=[x1, x2], group_labels=group_labels, colors=colors)
fig.update_layout(
    title_text='Percentuale di Stop Words nel corpo degli articoli',
    template="plotly_white",
)
fig.show()

Non ci sono differenze significative tra le percentuali di stop word nelle fake e nelle real news


In [None]:
# Display three randomly drawn rows as a quick sanity check of the features built so far.
dataset.sample(3)

## Harvard Health Publishing vs. Natural News
#### Natural News è un sito di notizie false.

In [None]:
# Compare the two sources on three features:
# x1/x2 = stop-word share, x3/x4 = upper-case share, x5/x6 = proper nouns in the title.
x1 = dataset.loc[dataset['source'] == 'https://www.health.harvard.edu/', 'text_pct_stop_words']
x2 = dataset.loc[dataset['source'] == 'https://www.naturalnews.com/', 'text_pct_stop_words']

x3 = dataset.loc[dataset['source'] == 'https://www.health.harvard.edu/', 'text_pct_uppercase']
x4 = dataset.loc[dataset['source'] == 'https://www.naturalnews.com/', 'text_pct_uppercase']

x5 = dataset.loc[dataset['source'] == 'https://www.health.harvard.edu/', 'NNP_title']
x6 = dataset.loc[dataset['source'] == 'https://www.naturalnews.com/', 'NNP_title']

group_labels = ['Health Harvard', 'Natural News']
colors = ['rgb(0, 0, 100)', 'rgb(0, 200, 200)']

# Stop-word share in the body.
fig1 = ff.create_distplot(hist_data=[x1, x2], group_labels=group_labels, colors=colors)
fig1.update_layout(
    title_text='Percentuale di Stop Words nel corpo degli articoli',
    template="plotly_white",
)
fig1.show()

# Upper-case share in the body.
fig2 = ff.create_distplot(hist_data=[x3, x4], group_labels=group_labels, colors=colors)
fig2.update_layout(
    title_text='Percentuale di lettere maiuscole nel corpo degli articoli',
    template="plotly_white",
)
fig2.show()

# Proper nouns in the title.
fig3 = ff.create_distplot(hist_data=[x5, x6], group_labels=group_labels, colors=colors)
fig3.update_layout(
    title_text='Numero di nomi propri nel titolo degli articoli',
    template="plotly_white",
)
fig3.show()

Come volevasi dimostrare, gli articoli di Natural News usano molte meno stop words rispetto a quelli di Harvard Health Publishing.

## Features
### Per analizzare in modo approfondito gli articoli fake e real, calcoliamo alcune features basate sui corpi degli articoli:

• Usiamo un part-of-speech tagger e contiamo il numero di volte in cui ogni tag compare nell'articolo.

In [None]:
# Tokenise every article body and tag each token with its part of speech.
dataset['token'] = dataset['text'].map(nltk.word_tokenize)
dataset['pos_tags'] = dataset['token'].map(nltk.pos_tag)

# One row per article, one column per POS tag, values = number of occurrences.
tag_count_dataset = pd.DataFrame(
    [Counter(tag for _, tag in tagged) for tagged in dataset['pos_tags']]
)

# Append the tag counts (a tag absent from an article becomes 0) and drop the helper columns.
dataset = (
    pd.concat([dataset, tag_count_dataset], axis=1)
    .fillna(0)
    .drop(columns=['pos_tags', 'token'])
)

• Numero di forme negative e interrogative nel corpo degli articoli.

In [None]:
# Whole-word patterns. Without the \b anchors the alternation matches inside
# longer words ("no" in "know", "how" in "show", "who" in "whole"), and since
# "no" is tried first the entries "not", "none", "nothing"... could never match
# on their own. Contractions accept both the straight (') and the typographic (’)
# apostrophe; the original list mixed the two.
negation_pattern = (
    r"\b(?:no|not|never|none|nothing|nobody|neither|nowhere|hardly|scarcely|barely"
    r"|doesn['’]t|isn['’]t|wasn['’]t|shouldn['’]t|wouldn['’]t|couldn['’]t"
    r"|won['’]t|can['’]t|don['’]t)\b"
)
interrogative_pattern = r"\b(?:what|who|when|where|which|why|how)\b"

# Number of negation words in the body of each article.
dataset['num_negation'] = dataset['text'].str.lower().str.count(negation_pattern)

# Number of interrogative words in the title and in the body of each article.
dataset['num_interrogatives_title'] = dataset['title'].str.lower().str.count(interrogative_pattern)
dataset['num_interrogatives_text'] = dataset['text'].str.lower().str.count(interrogative_pattern)

## Training del modello

In [None]:
# Readability metrics computed with textstat on the raw body of every article.
# Each list holds one value per row, in row order.
reading_ease = [textstat.flesch_reading_ease(doc) for doc in dataset['text']]
smog = [textstat.smog_index(doc) for doc in dataset['text']]
kincaid_grade = [textstat.flesch_kincaid_grade(doc) for doc in dataset['text']]
liau_index = [textstat.coleman_liau_index(doc) for doc in dataset['text']]
readability_index = [textstat.automated_readability_index(doc) for doc in dataset['text']]
readability_score = [textstat.dale_chall_readability_score(doc) for doc in dataset['text']]
difficult_words = [textstat.difficult_words(doc) for doc in dataset['text']]
write_formula = [textstat.linsear_write_formula(doc) for doc in dataset['text']]
gunning_fog = [textstat.gunning_fog(doc) for doc in dataset['text']]
text_standard = [textstat.text_standard(doc) for doc in dataset['text']]

# Store every metric as a column.
dataset['flesch_reading_ease'] = reading_ease
dataset['smog_index'] = smog
dataset['flesch_kincaid_grade'] = kincaid_grade
# The Coleman-Liau index was computed but never stored, so it was silently
# missing from the feature set.
dataset['coleman_liau_index'] = liau_index
dataset['automated_readability_index'] = readability_index
dataset['dale_chall_readability_score'] = readability_score
dataset['difficult_words'] = difficult_words
dataset['linsear_write_formula'] = write_formula
dataset['gunning_fog'] = gunning_fog
# NOTE(review): this column is dropped before training in the split cell.
dataset['text_standard'] = text_standard

In [None]:
# Type-token ratio of every article body, taken from LexicalRichness(...).ttr.
ttr = [LexicalRichness(doc).ttr for doc in dataset['text']]

dataset['ttr'] = ttr

In [None]:
# Lexicon-based counts on the lower-cased body. The patterns are plain regex
# alternations, so every entry also matches as a substring of a longer word.
text_lower = dataset['text'].str.lower()  # lower-case once, reuse for every lexicon

dataset['num_powerWords_text'] = text_lower.str.count('improve|trust|immediately|discover|profit|learn|know|understand|powerful|best|win|more|bonus|exclusive|extra|you|free|health|guarantee|new|proven|safety|money|now|today|results|protect|help|easy|amazing|latest|extraordinary|how to|worst|ultimate|hot|first|big|anniversary|premiere|basic|complete|save|plus|create')
# NOTE(review): the column name says "casual" but the word list looks like causal connectives — confirm before renaming.
dataset['num_casualWords_text'] = text_lower.str.count('make|because|how|why|change|use|since|reason|therefore|result')
dataset['num_tentativeWords_text'] = text_lower.str.count('may|might|can|could|possibly|probably|it is likely|it is unlikely|it is possible|it is probable|tends to|appears to|suggests that|seems to')
# 'dreadful' was corrupted into 'dreadatasetul' and could never match.
dataset['num_emotionWords_text'] = text_lower.str.count('ordeal|outrageous|provoke|repulsive|scandal|severe|shameful|shocking|terrible|tragic|unreliable|unstable|wicked|aggravate|agony|appalled|atrocious|corruption|damage|disastrous|disgusted|dreadful|eliminate|harmful|harsh|inconsiderate|enraged|offensive|aggressive|frustrated|controlling|resentful|anger|sad|fear|malicious|infuriated|critical|violent|vindictive|furious|contrary|condemning|sarcastic|poisonous|jealous|retaliating|desperate|alienated|unjustified|violated')

In [None]:
def cleantext(string, stop_word_set=None):
    """Normalise a raw string into lower-case alphanumeric tokens.

    Steps: lower-case, collapse whitespace, drop URLs (``http...`` / ``www...``),
    drop the HTML entity ``&amp;``, spell a bare ``&`` as "and", replace every
    run of non-alphanumeric characters with a space and remove stop words.

    Parameters
    ----------
    string : str
        Text to clean. Any non-string value (e.g. ``None`` or NaN) yields ``""``.
    stop_word_set : collection of str, optional
        Words to remove. Defaults to the notebook-level ``stop_words`` set.

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    if not isinstance(string, str):
        return ""
    if stop_word_set is None:
        stop_word_set = stop_words

    text = " ".join(string.lower().split())
    text = re.sub(r"http(\S)+", ' ', text)
    text = re.sub(r"www(\S)+", ' ', text)
    # The entity must be handled BEFORE the bare ampersand: the other way round
    # "&amp;" became " and amp;" and left a spurious "amp" token.
    text = re.sub(r"&amp;?", ' ', text)
    text = re.sub(r"&", ' and ', text)
    text = re.sub(r"[^0-9a-zA-Z]+", ' ', text)
    return " ".join(w for w in text.split() if w not in stop_word_set)

In [None]:
# Clean the three free-text columns element by element with cleantext.
dataset['text'] = dataset['text'].map(cleantext)
dataset['title'] = dataset['title'].map(cleantext)
dataset['source'] = dataset['source'].map(cleantext)

In [None]:
# Encode the target: TRUE -> 1, FAKE -> 0.
classes = {"TRUE":1,"FAKE":0}
# Assign the result back: replace(..., inplace=True) on a column selection may
# act on a temporary object and leave the frame unchanged. infer_objects()
# turns the column into a numeric dtype when every value has been mapped.
dataset["label"] = dataset["label"].replace(classes).infer_objects()

In [None]:
def correlation(dataset, threshold):
    """Drop, in place, columns that are highly correlated with an earlier column.

    For every pair (j, i) with j < i in the correlation matrix, column i is
    deleted from ``dataset`` when its correlation with column j is at least
    ``threshold`` and column j has not itself been dropped.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame to prune; it is modified in place.
    threshold : float
        Minimum (signed) correlation that triggers a drop.

    Returns
    -------
    set
        Names of the dropped columns.
    """
    col_corr = set()
    # Restrict to numeric/boolean columns: DataFrame.corr() raises on string
    # columns in recent pandas versions.
    corr_matrix = dataset.select_dtypes(include=['number', 'bool']).corr()
    columns = corr_matrix.columns
    for i in range(len(columns)):
        for j in range(i):
            if corr_matrix.iloc[i, j] >= threshold and columns[j] not in col_corr:
                colname = columns[i]
                col_corr.add(colname)
                if colname in dataset.columns:
                    del dataset[colname]
                # Column i is gone; further j values could only repeat the same drop.
                break
    return col_corr

In [None]:
# Hold out 20% of the rows for evaluation (fixed seed for a reproducible split).
train, test = train_test_split(dataset, test_size = 0.2, random_state = 0)

# Features = everything except the free-text columns, the target and the
# string-valued text_standard column.
X_train, y_train = train.drop(['title', 'text', 'source', 'label', 'text_standard'], axis = 1), train['label']
X_test, y_test = test.drop(['title', 'text', 'source', 'label', 'text_standard'], axis = 1), test['label']

scaler = StandardScaler()

# Fit the scaler on the training set only and reuse its mean/variance for the
# test set. Calling fit_transform on the test set re-estimated the statistics
# from test data, so train and test features ended up on different scales.
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
def print_metrices(pred, true):
    """Print accuracy, precision, recall and F1 of ``pred`` against ``true``.

    Parameters
    ----------
    pred : array-like
        Predicted labels.
    true : array-like
        Ground-truth labels.

    The scikit-learn metric functions take ``(y_true, y_pred)`` in that order.
    Passing the predictions first, as was done before, exchanges precision and
    recall (accuracy and F1 are symmetric and were unaffected).
    """
    print("Accuracy : ", accuracy_score(true, pred))
    print("Precison : ", precision_score(true, pred, pos_label=1))
    print("Recall : ", recall_score(true, pred))
    print("F1 : ", f1_score(true, pred))

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree with the entropy split criterion. random_state is fixed so that
# the fitted tree, and therefore the printed metrics, are reproducible.
model = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
model = model.fit(X_train, y_train)
pred = model.predict(X_test)
print_metrices(pred, y_test)

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with the iteration cap set to 1000.
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)
pred = model.predict(X_test)
print_metrices(pred, y_test)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest neighbours with k = 3.
model = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)
pred = model.predict(X_test)
print_metrices(pred, y_test)

In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes with default parameters.
model = GaussianNB().fit(X_train, y_train)
pred = model.predict(X_test)
print_metrices(pred, y_test)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with default hyper-parameters. random_state is fixed so that
# the bootstrap samples and feature draws, and therefore the printed metrics,
# are reproducible.
model = RandomForestClassifier(random_state = 0)
model = model.fit(X_train, y_train)
pred = model.predict(X_test)
print_metrices(pred, y_test)

In [None]:
# Linear SVM (LinearSVC with dual=False).
svc = LinearSVC(dual=False)
svc.fit(X_train, y_train)
model = svc  # fit() returns the estimator itself
pred = model.predict(X_test)
print_metrices(pred, y_test)

In [None]:
from sklearn import svm

# Support vector classifier with a linear kernel and C = 1.0.
clf = svm.SVC(kernel='linear', C=1.0)
clf.fit(X_train, y_train)
model = clf  # fit() returns the estimator itself
pred = model.predict(X_test)
print_metrices(pred, y_test)

In [None]:
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost with 50 estimators and learning rate 1. random_state is fixed so
# that the fitted ensemble, and therefore the printed metrics, are reproducible.
abc = AdaBoostClassifier(n_estimators=50, learning_rate=1, random_state=0)
model = abc.fit(X_train, y_train)
pred = model.predict(X_test)
print_metrices(pred, y_test)