In [288]:
# import librerie
import pandas as pd
import numpy as np
from sklearn.utils import shuffle


from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix  
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.svm import LinearSVC
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.model_selection import cross_val_score
import re
import seaborn as sns
import matplotlib.pyplot as plt

import textstat
from lexicalrichness import LexicalRichness

In [289]:
# import dataset
dataset = pd.read_csv("./dataset/corona_fake.csv")
dataset

Unnamed: 0,title,text,source,label
0,Due to the recent outbreak for the Coronavirus...,"You just need to add water, and the drugs and ...",coronavirusmedicalkit.com,Fake
1,,Hydroxychloroquine has been shown to have a 10...,RudyGiuliani,Fake
2,,Fact: Hydroxychloroquine has been shown to hav...,CharlieKirk,Fake
3,,The Corona virus is a man made virus created i...,JoanneWrightForCongress,Fake
4,,Doesn’t @BillGates finance research at the Wuh...,JoanneWrightForCongress,Fake
...,...,...,...,...
1159,Could the Power of the Sun Slow the Coronavirus?,A study suggests that ultraviolet rays could s...,https://www.nytimes.com/,TRUE
1160,Key evidence for coronavirus spread is flawed ...,"Last week, a medical journal reported that a b...",https://www.nytimes.com/,TRUE
1161,Summer Heat May Not Diminish Coronavirus Strength,"A new report, sent to the White House science ...",https://www.nytimes.com/,TRUE
1162,How Long Will a Vaccine Really Take?,A vaccine would be the ultimate weapon against...,https://www.nytimes.com/,TRUE


### Pre-processing dataset

In [290]:
# formattazione
dataset['label'] = dataset['label'].str.upper()  #trasforma tutta la colonna label in maiuscolo
dataset['source'] = dataset['source'].str.lower()  # trasorma tutta la colonna source in minuscolo
dataset.loc[dataset['source'] == 'facebook', ['source']] = 'https://facebook.com/'
dataset.loc[dataset['source'] == 'twitter', ['source']] = 'https://twitter.com/'
dataset.loc[dataset['source'] == 'youtube', ['source']] = 'https://youtube.com/'

# assegnazione esplicita delle label in seguito ad accertamenti
dataset.loc[5]['label'] = 'FAKE'
dataset.loc[15]['label'] = 'TRUE'
dataset.loc[43]['label'] = 'FAKE'
dataset.loc[131]['label'] = 'TRUE'
dataset.loc[242]['label'] = 'FAKE'

dataset.text.fillna(dataset.title, inplace=True)
dataset = dataset.sample(frac=1).reset_index(drop=True)

# replace dei NaN
dataset.title.fillna('missing', inplace=True)
dataset.source.fillna('missing', inplace=True)

#dataset.label.value_counts()
dataset

Unnamed: 0,title,text,source,label
0,COVID-19 Pandemic,A pandemic is a global outbreak of disease. Pa...,https://www.cdc.gov/,TRUE
1,CORONAVIRUS OUTBREAK IN EUROPE: CRIMINAL NEGLI...,There is little doubt that the coronavirus thr...,southfront.org,FAKE
2,Corona Tyranny – and Death by Famine,By the end of 2020 more people will have died ...,https://journal-neo.org/,FAKE
3,missing,"Obama Coronavirus kills Americans. In 2015, it...",tin woodman,FAKE
4,CORONAVIRUS IS A US TOOL TO DISRUPT CHINESE PR...,"Coronavirus, Uyghur, and Huawei are a triangle...",https://katehon.com/,FAKE
...,...,...,...,...
1159,missing,For all you dummies that have absolutely no id...,missing,FAKE
1160,"After Recovery From the Coronavirus, Most Peop...",A new study adds to evidence of immunity among...,https://www.nytimes.com/,TRUE
1161,Study Shows Direct Correlation between 5G Netw...,"At last, the first study has emerged regarding...",https://smombiegate.org/,FAKE
1162,What’s happening with a vaccine?,A vaccine for Covid-19 isn’t around the corner...,https://www.wired.co.uk/,TRUE


In [291]:
dataset.sample(15)

Unnamed: 0,title,text,source,label
550,"Coronavirus explained: Symptoms, lockdowns and...",The coronavirus pandemic has completely change...,https://www.cnet.com/,TRUE
56,False claim: Bill Gates wants to microchip peo...,Posts on social media claim that Bill Gates al...,https://www.reuters.com/,TRUE
474,Murder Most Foul: The Perps Behind COVID-19,“I am not saying that China deliberately relea...,https://www.organicconsumers.org/,FAKE
33,How could contact tracing help slow the spread...,Anyone who comes into close contact with someo...,https://www.health.harvard.edu/,TRUE
136,COVID and the Terror of Uncertainty,"There are two worlds, in a way never before im...",https://journal-neo.org/,FAKE
49,The Long Road to Reopening,Americans are eagerly awaiting the reopening o...,https://www.globalhealthnow.org/,TRUE
198,Has COVID-19 subverted global health?,For the first time in the post-war history of ...,https://www.thelancet.com/,TRUE
902,How contagious is the virus?,"It seems to spread very easily, making contain...",https://www.nytimes.com/,TRUE
582,"The Slums Of Southern Europe, The New Slave Ma...",The southern parts of Italy and Spain are beco...,http://oneworld.press/,FAKE
284,Is It Time to Launch an Investigation Into the...,"Occasionally, humanity is confronted with a se...",https://www.strategic-culture.org/,FAKE


# Analisi

In [292]:
#%pip install plotly.express
#%pip install plotly.figure_factory
#%pip install plotly.graph_objects
#%pip install nbformat

In [293]:
# import
import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objects as go

## Lettere maiuscole nel titolo
### • Contiamo il numero di lettere maiuscole in ogni titolo.
### • Calcoliamo la percentuale di lettere maiuscole nel corpo di ogni articolo anzichè contarne il numero , a causa della diversa lunghezza degli articoli.

In [294]:
dataset['title_num_uppercase'] = dataset['title'].str.count(r'[A-Z]')
dataset['text_num_uppercase'] = dataset['text'].str.count(r'[A-Z]')
dataset['text_len'] = dataset['text'].str.len()
dataset['text_pct_uppercase'] = dataset.text_num_uppercase.div(dataset.text_len)

x1 = dataset.loc[dataset['label']=='TRUE']['title_num_uppercase']
x2 = dataset.loc[dataset['label'] == 'FAKE']['title_num_uppercase']
group_labels = ['TRUE', 'FAKE']
colors = ['rgb(0, 0, 100)', 'rgb(0, 200, 200)']

fig = ff.create_distplot([x1, x2], group_labels,colors=colors)

fig.update_layout(title_text='Distribuzione delle lettere maiuscole nel titolo', template="plotly_white")
fig.show()

In [295]:
fig = go.Figure()
fig.add_trace(go.Box(y=x1, name='TRUE',
                marker_color = 'rgb(0, 0, 100)'))
fig.add_trace(go.Box(y=x2, name = 'FAKE',
                marker_color = 'rgb(0, 200, 200)'))
fig.update_layout(title_text='Box plot delle lettere maiuscole nel titolo', template="plotly_white")
fig.show()

In media, le fake news presentano un maggior numero di lettere maiuscole nel titolo.
Questo fa pensare che le fake news si rivolgono a un pubblico che potrebbe essere influenzato dai titoli.



## Stop Words nel titolo
### • Contiamo il numero di stop words in ogni titolo.
### • Calcoliamo la percentuale di stop words nel corpo di ogni articolo anzichè contarne il numero , a causa della diversa lunghezza degli articoli.

In [296]:
#%pip install nltk
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\alexr\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [297]:
from nltk.corpus import stopwords    
stop_words = set(stopwords.words('english'))

In [298]:
dataset['title_num_stop_words'] = dataset['title'].str.split().apply(lambda x: len(set(x) & stop_words))
dataset['text_num_stop_words'] = dataset['text'].str.split().apply(lambda x: len(set(x) & stop_words))
dataset['text_word_count'] = dataset['text'].apply(lambda x: len(str(x).split()))
dataset['text_pct_stop_words'] = dataset['text_num_stop_words'] / dataset['text_word_count']

x1 = dataset.loc[dataset['label']=='TRUE']['title_num_stop_words']
x2 = dataset.loc[dataset['label'] == 'FAKE']['title_num_stop_words']
group_labels = ['TRUE', 'FAKE']
colors = ['rgb(0, 0, 100)', 'rgb(0, 200, 200)']

fig = ff.create_distplot(
    [x1, x2], group_labels,colors=colors)

fig.update_layout(title_text='Distribuzione delle Stop Words nel titolo', template="plotly_white")
fig.show()

In [299]:
fig = go.Figure()
fig.add_trace(go.Box(y=x1, name='TRUE', marker_color = 'rgb(0, 0, 100)'))
fig.add_trace(go.Box(y=x2, name = 'FAKE', marker_color = 'rgb(0, 200, 200)'))
fig.update_layout(title_text='Box plot delle Stop Words nel titolo', template="plotly_white")
fig.show()

I titoli delle fake news hanno meno stop-words rispetto alle real-news.


## Nomi propri nel titolo
### • Contiamo il numero di nomi prorpri (NNP) in ogni titolo.

In [300]:
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
from nltk.tag import pos_tag
from nltk import word_tokenize
from collections import Counter

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\alexr\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\alexr\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [301]:
dataset.drop(['text_num_uppercase', 'text_len', 'text_num_stop_words', 'text_word_count'], axis=1, inplace=True)

dataset['token'] = dataset.apply(lambda row: nltk.word_tokenize(row['title']), axis=1)
dataset['pos_tags'] = dataset.apply(lambda row: nltk.pos_tag(row['token']), axis=1)

tag_count_dataset = pd.DataFrame(dataset['pos_tags'].map(lambda x: Counter(tag[1] for tag in x)).to_list())
dataset = pd.concat([dataset, tag_count_dataset], axis=1).fillna(0).drop(['pos_tags', 'token'], axis=1)

dataset = dataset[['title', 'text', 'source', 'label', 'title_num_uppercase', 'text_pct_uppercase', 'title_num_stop_words', 'text_pct_stop_words', 'NNP']].rename(columns={'NNP': 'NNP_title'})

x1 = dataset.loc[dataset['label']=='TRUE']['NNP_title']
x2 = dataset.loc[dataset['label'] == 'FAKE']['NNP_title']

group_labels = ['TRUE', 'FAKE']

colors = ['rgb(0, 0, 100)', 'rgb(0, 200, 200)']

fig = ff.create_distplot(
    [x1, x2], group_labels,colors=colors)

fig.update_layout(title_text='Numero di nomi propri nel titolo', template="plotly_white")
fig.show()

In [302]:
fig = go.Figure()
fig.add_trace(go.Box(y=x1, name='TRUE',
                marker_color = 'rgb(0, 0, 100)'))
fig.add_trace(go.Box(y=x2, name = 'FAKE',
                marker_color = 'rgb(0, 200, 200)'))
fig.update_layout(title_text='Box plot dei nomi propri nel titolo', template="plotly_white")
fig.show()

I titoli delle fake-news presentano più nomi propri. 


In [303]:
#pip install wordcloud 

#Final word cloud after all the cleaning and pre-processing
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS
comment_words = ' '
stopwords = set(STOPWORDS) 

# iterate through the csv file 
for val in df.comment: 

   # typecaste each val to string 
   val = str(val) 

   # split the value 
   tokens = val.split() 

# Converts each token into lowercase 
for i in range(len(tokens)): 
    tokens[i] = tokens[i].lower() 

for words in tokens: 
    comment_words = comment_words + words + ' '


wordcloud = WordCloud(width = 800, height = 800, 
            background_color ='white', 
            stopwords = stopwords, 
            min_font_size = 10).generate(comment_words) 

# plot the WordCloud image                        
plt.figure(figsize = (8, 8), facecolor = None) 
plt.imshow(wordcloud) 
plt.axis("off") 
plt.tight_layout(pad = 0) 

plt.show() 

#### Nel complesso, questi risultati suggeriscono che gli autori di fake-news cercano di attirare l'attenzione utilizzando le parole in maiuscolo nei titoli e concentrando quante più key-words possibili nei titoli saltando le stop-word e aumentando i nomi propri. Analizziamo se lo stesso avviene anche nei corpi degli articoli.

## Lettere maiuscole nel corpo degli articoli


In [304]:
x1 = dataset.loc[dataset['label']=='TRUE']['text_pct_uppercase']
x2 = dataset.loc[dataset['label'] == 'FAKE']['text_pct_uppercase']

group_labels = ['TRUE', 'FAKE']

colors = ['rgb(0, 0, 100)', 'rgb(0, 200, 200)']

fig = ff.create_distplot(
    [x1, x2], group_labels,colors=colors)

fig.update_layout(title_text='Percentuale di lettere maiuscole nel corpo degli articoli', template="plotly_white")
fig.show()

In media, le fake news presentano un maggior numero di lettere maiuscole nel corpo degli articoli.



## Stop Words nel corpo degli articoli


In [305]:
x1 = dataset.loc[dataset['label']=='TRUE']['text_pct_stop_words']
x2 = dataset.loc[dataset['label'] == 'FAKE']['text_pct_stop_words']

group_labels = ['TRUE', 'FAKE']

colors = ['rgb(0, 0, 100)', 'rgb(0, 200, 200)']

fig = ff.create_distplot(
    [x1, x2], group_labels,colors=colors)

fig.update_layout(title_text='Percentuale di Stop Words nel corpo degli articoli', template="plotly_white")
fig.show()

Non ci sono differenze significative tra le percentuali di stop word nelle fake e nelle real news


In [306]:
dataset.sample(3)

Unnamed: 0,title,text,source,label,title_num_uppercase,text_pct_uppercase,title_num_stop_words,text_pct_stop_words,NNP_title
421,Corona virus developed in Canada and stolen by...,A researcher with ties to China was recently e...,https://perma.cc/,FAKE,3,0.043745,3,0.176471,3.0
932,PHENOMENA OF CORONAVIRUS CRISIS,"As of midday on March 16th, the number of tota...",https://web.archive.org/,FAKE,28,0.020103,0,0.058673,4.0
28,What Is Coronavirus?,COVID-19 is the disease caused by the new coro...,https://www.hopkinsmedicine.org/,TRUE,3,0.04559,0,0.086066,1.0


## Harvard Health Publishing vs. Natural News
#### Natural News è un sito di notizie false.

In [307]:
x1 = dataset.loc[dataset['source']=='https://www.health.harvard.edu/']['text_pct_stop_words']
x2 = dataset.loc[dataset['source']=='https://www.naturalnews.com/']['text_pct_stop_words']

x3 = dataset.loc[dataset['source']=='https://www.health.harvard.edu/']['text_pct_uppercase']
x4 = dataset.loc[dataset['source']=='https://www.naturalnews.com/']['text_pct_uppercase']

x5 = dataset.loc[dataset['source']=='https://www.health.harvard.edu/']['NNP_title']
x6 = dataset.loc[dataset['source']=='https://www.naturalnews.com/']['NNP_title']



group_labels = ['Health Harvard', 'Natural News']

colors = ['rgb(0, 0, 100)', 'rgb(0, 200, 200)']

fig1 = ff.create_distplot([x1, x2], group_labels,colors=colors)
fig1.update_layout(title_text='Percentuale di Stop Words nel corpo degli articoli', template="plotly_white")
fig1.show()

fig2 = ff.create_distplot([x3, x4], group_labels,colors=colors)
fig2.update_layout(title_text='Percentuale di lettere maiuscole nel corpo degli articoli', template="plotly_white")
fig2.show()

fig3 = ff.create_distplot([x5, x6], group_labels,colors=colors)
fig3.update_layout(title_text='Numero di nomi propri nel titolo degli articoli', template="plotly_white")
fig3.show()

Come volevasi dimostrare, gli articoli di Natural News usano molte meno stop words rispetto a Healt Publishing.

In [308]:
dataset.sample(2)

Unnamed: 0,title,text,source,label,title_num_uppercase,text_pct_uppercase,title_num_stop_words,text_pct_stop_words,NNP_title
397,breathing in hot air from a hair dryer or in a...,In view of the dire situation we currently fac...,https://perma.cc/,FAKE,5,0.011047,5,0.138211,1.0
450,Video: Coronavirus Treatment: New York Doctor ...,This is a “potential” slap in the face to the ...,https://www.globalresearch.ca/,FAKE,15,0.037491,2,0.125541,12.0


## Features
### Per analizzare in modo approfondito gli articoli fake e real, calcoliamo alcune features basate sui corpi degli articoli:

• Usiamo un part-of-speech tagger e contiamo il numero di volte in cui ogni tag compare nell'articolo.

In [309]:
dataset['token'] = dataset.apply(lambda row: nltk.word_tokenize(row['text']), axis=1)
dataset['pos_tags'] = dataset.apply(lambda row: nltk.pos_tag(row['token']), axis=1)

tag_count_dataset = pd.DataFrame(dataset['pos_tags'].map(lambda x: Counter(tag[1] for tag in x)).to_list())

dataset = pd.concat([dataset, tag_count_dataset], axis=1).fillna(0).drop(['pos_tags', 'token'], axis=1)

• Numero di forme negative e interrogative nel corpo degli articoli.

In [310]:
dataset['num_negation'] = dataset['text'].str.lower().str.count("no|not|never|none|nothing|nobody|neither|nowhere|hardly|scarcely|barely|doesn’t|isn’t|wasn’t|shouldn’t|wouldn’t|couldn’t|won’t|can't|don't")

dataset['num_interrogatives_title'] = dataset['title'].str.lower().str.count("what|who|when|where|which|why|how")
dataset['num_interrogatives_text'] = dataset['text'].str.lower().str.count("what|who|when|where|which|why|how")

## Training del modello

In [311]:
dataset.sample(3)

Unnamed: 0,title,text,source,label,title_num_uppercase,text_pct_uppercase,title_num_stop_words,text_pct_stop_words,NNP_title,DT,...,``,'',PDT,WP$,SYM,FW,#,num_negation,num_interrogatives_title,num_interrogatives_text
504,Exposing yourself to the sun or to temperature...,"You can catch COVID-19, no matter how sunny or...",https://www.who.int/emergencies/diseases/novel...,TRUE,14,0.051793,5,0.261905,3.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2,0,1
285,Information for Healthcare Professionals: COVI...,"Patients at higher risk for infection, severe ...",https://www.cdc.gov/,TRUE,10,0.034571,2,0.121739,1.0,8.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3,0,4
942,missing,Bill Gates either predicted or planned the cor...,instagram,FAKE,0,0.044053,0,0.25,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0


In [312]:
reading_ease = []
for doc in dataset['text']:
    reading_ease.append(textstat.flesch_reading_ease(doc))
    
smog = []
for doc in dataset['text']:
    smog.append(textstat.smog_index(doc))
    
kincaid_grade = []
for doc in dataset['text']:
    kincaid_grade.append(textstat.flesch_kincaid_grade(doc))
    
liau_index = []
for doc in dataset['text']:
    liau_index.append(textstat.coleman_liau_index(doc))
    
readability_index = []
for doc in dataset['text']:
    readability_index.append(textstat.automated_readability_index(doc))

readability_score = []
for doc in dataset['text']:
    readability_score.append(textstat.dale_chall_readability_score(doc))
    
difficult_words = []
for doc in dataset['text']:
    difficult_words.append(textstat.difficult_words(doc))

write_formula = []
for doc in dataset['text']:
    write_formula.append(textstat.linsear_write_formula(doc))

gunning_fog = []
for doc in dataset['text']:
    gunning_fog.append(textstat.gunning_fog(doc))

text_standard = []
for doc in dataset['text']:
    text_standard.append(textstat.text_standard(doc))
    
dataset['flesch_reading_ease'] = reading_ease
dataset['smog_index'] = smog
dataset['flesch_kincaid_grade'] = kincaid_grade
dataset['automated_readability_index'] = readability_index
dataset['dale_chall_readability_score'] = readability_score
dataset['difficult_words'] = difficult_words
dataset['linsear_write_formula'] = write_formula
dataset['gunning_fog'] = gunning_fog
dataset['text_standard'] = text_standard

In [313]:
ttr = []
for doc in dataset['text']:
    lex = LexicalRichness(doc)
    ttr.append(lex.ttr)

dataset['ttr'] = ttr

In [314]:
dataset['num_powerWords_text'] = dataset['text'].str.lower().str.count('improve|trust|immediately|discover|profit|learn|know|understand|powerful|best|win|more|bonus|exclusive|extra|you|free|health|guarantee|new|proven|safety|money|now|today|results|protect|help|easy|amazing|latest|extraordinary|how to|worst|ultimate|hot|first|big|anniversary|premiere|basic|complete|save|plus|create')
dataset['num_casualWords_text'] = dataset['text'].str.lower().str.count('make|because|how|why|change|use|since|reason|therefore|result')
dataset['num_tentativeWords_text'] = dataset['text'].str.lower().str.count('may|might|can|could|possibly|probably|it is likely|it is unlikely|it is possible|it is probable|tends to|appears to|suggests that|seems to')
dataset['num_emotionWords_text'] = dataset['text'].str.lower().str.count('ordeal|outrageous|provoke|repulsive|scandal|severe|shameful|shocking|terrible|tragic|unreliable|unstable|wicked|aggravate|agony|appalled|atrocious|corruption|damage|disastrous|disgusted|dreadatasetul|eliminate|harmful|harsh|inconsiderate|enraged|offensive|aggressive|frustrated|controlling|resentful|anger|sad|fear|malicious|infuriated|critical|violent|vindictive|furious|contrary|condemning|sarcastic|poisonous|jealous|retaliating|desperate|alienated|unjustified|violated')


In [315]:
def cleantext(string):
    text = string.lower().split()
    text = " ".join(text)
    text = re.sub(r"http(\S)+",' ',text)    
    text = re.sub(r"www(\S)+",' ',text)
    text = re.sub(r"&",' and ',text)  
    text = text.replace('&amp',' ')
    text = re.sub(r"[^0-9a-zA-Z]+",' ',text)
    text = text.split()
    text = [w for w in text if not w in stop_words]
    text = " ".join(text)
    return text


In [316]:
dataset['text'] = dataset['text'].map(lambda x: cleantext(x))
dataset['title'] = dataset['title'].map(lambda x: cleantext(x))
dataset['source'] = dataset['source'].map(lambda x: cleantext(x))

In [317]:
classes = {"TRUE":1,"FAKE":0}
dataset["label"].replace(classes, inplace=True)

In [318]:
def correlation(dataset, threshold):
    col_corr = set()
    corr_matrix = dataset.corr()
    for i in range(len(corr_matrix.columns)):
        for j in range(i):
            if (corr_matrix.iloc[i,j] >= threshold and (corr_matrix.columns[j] not in col_corr)):
                colname = corr_matrix.columns[i]
                col_corr.add(colname)
                if colname in dataset.columns:
                    del dataset[colname]

#correlation(dataset, 0.9)

train, test = train_test_split(dataset, test_size = 0.2, random_state = 0)
X_train, y_train = train, train['label']
X_test, y_test = test, test['label']

le = LabelEncoder()
scaler = StandardScaler()

for column_name in X_train.columns:
    if X_train[column_name].dtype == object:
        X_train[column_name] = le.fit_transform(X_train[column_name])

for column_name in X_test.columns:
    if X_test[column_name].dtype == object:
        X_test[column_name] = le.fit_transform(X_test[column_name])

X_train.loc[:, ~X_train.columns.isin(['title','text','source'])] = scaler.fit_transform(X_train.loc[:, ~X_train.columns.isin(['title','text','source'])])


In [319]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 931 entries, 303 to 684
Data columns (total 46 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   title                         931 non-null    object 
 1   text                          931 non-null    object 
 2   source                        931 non-null    object 
 3   label                         931 non-null    int64  
 4   title_num_uppercase           931 non-null    int64  
 5   text_pct_uppercase            931 non-null    float64
 6   title_num_stop_words          931 non-null    int64  
 7   text_pct_stop_words           931 non-null    float64
 8   NNP_title                     931 non-null    float64
 9   DT                            931 non-null    float64
 10  NNP                           931 non-null    float64
 11  VBD                           931 non-null    float64
 12  CD                            931 non-null    float64
 13  VBP

In [320]:
train, test = train_test_split(dataset, test_size = 0.2, random_state = 0)
X_train, y_train = train.drop(['title', 'text', 'source', 'label', 'text_standard'], axis = 1), train['label']
X_test, y_test = test.drop(['title', 'text', 'source', 'label', 'text_standard'], axis = 1), test['label']

scaler = StandardScaler()

X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

In [321]:
def print_metrices(pred, true):
    print("Accuracy : ", accuracy_score(pred, true))
    print("Precison : ", precision_score(pred, true, pos_label=1))
    print("Recall : ", recall_score(pred, true))
    print("F1 : ", f1_score(pred, true))

In [322]:
y_train

303     0
294     1
1116    0
848     0
97      0
       ..
1033    1
763     1
835     0
559     1
684     1
Name: label, Length: 931, dtype: int64

In [323]:
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier(criterion = 'entropy')
model = model.fit(X_train, y_train)
pred = model.predict(X_test)
print_metrices(pred, y_test)
#print("Accuracy: {:.2f}%".format(accuracy_score(pred, y_test) * 100))


Accuracy :  0.6995708154506438
Precison :  0.7166666666666667
Recall :  0.7049180327868853
F1 :  0.7107438016528925


In [324]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(max_iter=1000)
model = model.fit(X_train, y_train)
pred = model.predict(X_test)
print_metrices(pred, y_test)
#print("Accuracy: {:.2f}%".format(accuracy_score(pred, y_test) * 100))

Accuracy :  0.8240343347639485
Precison :  0.8
Recall :  0.8495575221238938
F1 :  0.8240343347639485


In [325]:
from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier(n_neighbors=3)
model = model.fit(X_train, y_train)
pred = model.predict(X_test)
print_metrices(pred, y_test)
#print("Accuracy: {:.2f}%".format(accuracy_score(pred, y_test) * 100))

Accuracy :  0.8068669527896996
Precison :  0.9
Recall :  0.7659574468085106
F1 :  0.8275862068965516


In [326]:
from sklearn.naive_bayes import GaussianNB

model = GaussianNB()
model = model.fit(X_train, y_train)
pred = model.predict(X_test)
print_metrices(pred, y_test)
#print("Accuracy: {:.2f}%".format(accuracy_score(pred, y_test) * 100))

Accuracy :  0.6137339055793991
Precison :  0.825
Recall :  0.5892857142857143
F1 :  0.6875000000000001


In [327]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier()
model = model.fit(X_train, y_train)
pred = model.predict(X_test)
print_metrices(pred, y_test)
#print("Accuracy: {:.2f}%".format(accuracy_score(pred, y_test)*100))

Accuracy :  0.8197424892703863
Precison :  0.8166666666666667
Recall :  0.8305084745762712
F1 :  0.8235294117647058


In [328]:
svc = LinearSVC(dual=False)
model = svc.fit(X_train, y_train)
pred = model.predict(X_test)
print_metrices(pred, y_test)
#print("Accuracy: {:.2f}%".format(accuracy_score(pred, y_test) * 100))

Accuracy :  0.8326180257510729
Precison :  0.8083333333333333
Recall :  0.8584070796460177
F1 :  0.832618025751073


In [329]:
from sklearn import svm
clf = svm.SVC(kernel='linear', C = 1.0)
model = clf.fit(X_train,y_train)
pred = model.predict(X_test)
print_metrices(pred, y_test)
#print("Accuracy: {:.2f}%".format(accuracy_score(pred, y_test) * 100))

Accuracy :  0.8283261802575107
Precison :  0.7916666666666666
Recall :  0.8636363636363636
F1 :  0.8260869565217391


In [330]:
from sklearn.ensemble import AdaBoostClassifier

abc = AdaBoostClassifier(n_estimators=50, learning_rate=1)
model = abc.fit(X_train, y_train)  
pred = model.predict(X_test)
print_metrices(pred, y_test)
#print("Accuracy: {:.2f}%".format(accuracy_score(pred, y_test) * 100))

Accuracy :  0.8283261802575107
Precison :  0.8166666666666667
Recall :  0.8448275862068966
F1 :  0.8305084745762712
