In [636]:
# import librerie
import pandas as pd
import numpy as np
from sklearn.utils import shuffle


from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix  
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.svm import LinearSVC
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.model_selection import cross_val_score
import re

In [637]:
# import dataset
dataset = pd.read_csv("./dataset/corona_fake.csv")
dataset

Unnamed: 0,title,text,source,label
0,Due to the recent outbreak for the Coronavirus...,"You just need to add water, and the drugs and ...",coronavirusmedicalkit.com,Fake
1,,Hydroxychloroquine has been shown to have a 10...,RudyGiuliani,Fake
2,,Fact: Hydroxychloroquine has been shown to hav...,CharlieKirk,Fake
3,,The Corona virus is a man made virus created i...,JoanneWrightForCongress,Fake
4,,Doesn’t @BillGates finance research at the Wuh...,JoanneWrightForCongress,Fake
...,...,...,...,...
1159,Could the Power of the Sun Slow the Coronavirus?,A study suggests that ultraviolet rays could s...,https://www.nytimes.com/,TRUE
1160,Key evidence for coronavirus spread is flawed ...,"Last week, a medical journal reported that a b...",https://www.nytimes.com/,TRUE
1161,Summer Heat May Not Diminish Coronavirus Strength,"A new report, sent to the White House science ...",https://www.nytimes.com/,TRUE
1162,How Long Will a Vaccine Really Take?,A vaccine would be the ultimate weapon against...,https://www.nytimes.com/,TRUE


### Pre-processing dataset

In [638]:
# formattazione
dataset['label'] = dataset['label'].str.upper()  #trasforma tutta la colonna label in maiuscolo
dataset['source'] = dataset['source'].str.lower()  # trasorma tutta la colonna source in minuscolo
dataset.loc[dataset['source'] == 'facebook', ['source']] = 'https://facebook.com/'
dataset.loc[dataset['source'] == 'twitter', ['source']] = 'https://twitter.com/'
dataset.loc[dataset['source'] == 'youtube', ['source']] = 'https://youtube.com/'

# assegnazione esplicita delle label in seguito ad accertamenti
dataset.loc[5]['label'] = 'FAKE'
dataset.loc[15]['label'] = 'TRUE'
dataset.loc[43]['label'] = 'FAKE'
dataset.loc[131]['label'] = 'TRUE'
dataset.loc[242]['label'] = 'FAKE'

dataset.text.fillna(dataset.title, inplace=True)
dataset = dataset.sample(frac=1).reset_index(drop=True)

# replace dei NaN
dataset.title.fillna('missing', inplace=True)
dataset.source.fillna('missing', inplace=True)

#dataset.label.value_counts()
dataset

Unnamed: 0,title,text,source,label
0,Can a person who has had coronavirus get infec...,"While we don't know the answer yet, most peopl...",https://www.health.harvard.edu/,TRUE
1,How could contact tracing help slow the spread...,Anyone who comes into close contact with someo...,https://www.health.harvard.edu/,TRUE
2,How could contact tracing help slow the spread...,Anyone who comes into close contact with someo...,https://www.health.harvard.edu/,TRUE
3,What is the risk of infection in pregnant wome...,There is limited scientific evidence on the se...,https://www.ecdc.europa.eu/,TRUE
4,EU left Italy ‘practically alone' to fight cor...,The EU’s initial response to the massive outbr...,https://www.rt.com/,FAKE
...,...,...,...,...
1159,Coronavirus: what do scientists know about Cov...,Medical researchers have been studying everyth...,https://www.theguardian.com/,TRUE
1160,What are the symptoms of COVID-19?,Some people infected with the virus have no sy...,https://www.health.harvard.edu/,TRUE
1161,What we know about potential coronavirus vacci...,Although physicians still have no vaccine or c...,https://www.cnn.com/,TRUE
1162,My child has mild cold or flu symptoms. Should...,"No. Coronavirus symptoms can include fever, dr...",https://www.nytimes.com/,TRUE


In [639]:
dataset.sample(15)

Unnamed: 0,title,text,source,label
639,"The COVID-19 pandemic could last for 2 years, ...",A new report from researchers at the Center fo...,https://www.weforum.org/,TRUE
746,A Shocking Update. Did The Virus Originate in ...,virus known as COVID-19 originated outside Chi...,https://www.globalresearch.ca/,FAKE
774,Breaking news: China will admit coronavirus co...,As the “novel” coronavirus originated in Wuhan...,https://gnews.org/,FAKE
524,Prevention,You can take measures to reduce your risk of c...,https://bestpractice.bmj.com/,TRUE
1039,Nutritional Treatment of Coronavirus,Abundant clinical evidence confirms vitamin C'...,http://orthomolecular.org/,FAKE
348,Digital handshake: Can contact tracing deliver...,As countries around the world ease the lockdow...,https://www.reuters.com/,TRUE
561,How can I avoid getting infected?,"The virus enters your body via your eyes, nose...",https://www.ecdc.europa.eu,TRUE
632,How reliable is the test for COVID-19?,"In the US, the most common test for the COVID-...",https://www.health.harvard.edu/,TRUE
477,How 5G conspiracy theories used covid-19 to go...,FOR AS LONG as there have been mobile networks...,https://www.economist.com,TRUE
666,Robert F. Kennedy Jr. warns that Anthony Fauci...,During a recent episode of the Thomas Paine Po...,https://www.naturalnews.com/,FAKE


# Analisi

In [640]:
#%pip install plotly.express
#%pip install plotly.figure_factory
#%pip install plotly.graph_objects
#%pip install nbformat

In [641]:
# import
import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objects as go

## Lettere maiuscole nel titolo
### • Contiamo il numero di lettere maiuscole in ogni titolo.
### • Calcoliamo la percentuale di lettere maiuscole nel corpo di ogni articolo anzichè contarne il numero , a causa della diversa lunghezza degli articoli.

In [642]:
dataset['title_num_uppercase'] = dataset['title'].str.count(r'[A-Z]')
dataset['text_num_uppercase'] = dataset['text'].str.count(r'[A-Z]')
dataset['text_len'] = dataset['text'].str.len()
dataset['text_pct_uppercase'] = dataset.text_num_uppercase.div(dataset.text_len)

x1 = dataset.loc[dataset['label']=='TRUE']['title_num_uppercase']
x2 = dataset.loc[dataset['label'] == 'FAKE']['title_num_uppercase']
group_labels = ['TRUE', 'FAKE']
colors = ['rgb(0, 0, 100)', 'rgb(0, 200, 200)']

fig = ff.create_distplot([x1, x2], group_labels,colors=colors)

fig.update_layout(title_text='Distribuzione delle lettere maiuscole nel titolo', template="plotly_white")
fig.show()

In [643]:
fig = go.Figure()
fig.add_trace(go.Box(y=x1, name='TRUE',
                marker_color = 'rgb(0, 0, 100)'))
fig.add_trace(go.Box(y=x2, name = 'FAKE',
                marker_color = 'rgb(0, 200, 200)'))
fig.update_layout(title_text='Box plot delle lettere maiuscole nel titolo', template="plotly_white")
fig.show()

In media, le fake news presentano un maggior numero di lettere maiuscole nel titolo.
Questo fa pensare che le fake news si rivolgono a un pubblico che potrebbe essere influenzato dai titoli.



## Stop Words nel titolo
### • Contiamo il numero di stop words in ogni titolo.
### • Calcoliamo la percentuale di stop words nel corpo di ogni articolo anzichè contarne il numero , a causa della diversa lunghezza degli articoli.

In [644]:
#%pip install nltk
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\alexr\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [645]:
from nltk.corpus import stopwords    
stop_words = set(stopwords.words('english'))

In [646]:
dataset['title_num_stop_words'] = dataset['title'].str.split().apply(lambda x: len(set(x) & stop_words))
dataset['text_num_stop_words'] = dataset['text'].str.split().apply(lambda x: len(set(x) & stop_words))
dataset['text_word_count'] = dataset['text'].apply(lambda x: len(str(x).split()))
dataset['text_pct_stop_words'] = dataset['text_num_stop_words'] / dataset['text_word_count']

x1 = dataset.loc[dataset['label']=='TRUE']['title_num_stop_words']
x2 = dataset.loc[dataset['label'] == 'FAKE']['title_num_stop_words']
group_labels = ['TRUE', 'FAKE']
colors = ['rgb(0, 0, 100)', 'rgb(0, 200, 200)']

fig = ff.create_distplot(
    [x1, x2], group_labels,colors=colors)

fig.update_layout(title_text='Distribuzione delle Stop Words nel titolo', template="plotly_white")
fig.show()

In [647]:
fig = go.Figure()
fig.add_trace(go.Box(y=x1, name='TRUE', marker_color = 'rgb(0, 0, 100)'))
fig.add_trace(go.Box(y=x2, name = 'FAKE', marker_color = 'rgb(0, 200, 200)'))
fig.update_layout(title_text='Box plot delle Stop Words nel titolo', template="plotly_white")
fig.show()

I titoli delle fake news hanno meno stop-words rispetto alle real-news.


## Nomi propri nel titolo
### • Contiamo il numero di nomi prorpri (NNP) in ogni titolo.

In [648]:
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
from nltk.tag import pos_tag
from nltk import word_tokenize
from collections import Counter

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\alexr\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\alexr\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [649]:
dataset.drop(['text_num_uppercase', 'text_len', 'text_num_stop_words', 'text_word_count'], axis=1, inplace=True)

dataset['token'] = dataset.apply(lambda row: nltk.word_tokenize(row['title']), axis=1)
dataset['pos_tags'] = dataset.apply(lambda row: nltk.pos_tag(row['token']), axis=1)

tag_count_dataset = pd.DataFrame(dataset['pos_tags'].map(lambda x: Counter(tag[1] for tag in x)).to_list())
dataset = pd.concat([dataset, tag_count_dataset], axis=1).fillna(0).drop(['pos_tags', 'token'], axis=1)

dataset = dataset[['title', 'text', 'source', 'label', 'title_num_uppercase', 'text_pct_uppercase', 'title_num_stop_words', 'text_pct_stop_words', 'NNP']].rename(columns={'NNP': 'NNP_title'})

x1 = dataset.loc[dataset['label']=='TRUE']['NNP_title']
x2 = dataset.loc[dataset['label'] == 'FAKE']['NNP_title']

group_labels = ['TRUE', 'FAKE']

colors = ['rgb(0, 0, 100)', 'rgb(0, 200, 200)']

fig = ff.create_distplot(
    [x1, x2], group_labels,colors=colors)

fig.update_layout(title_text='Numero di nomi propri nel titolo', template="plotly_white")
fig.show()

In [650]:
fig = go.Figure()
fig.add_trace(go.Box(y=x1, name='TRUE',
                marker_color = 'rgb(0, 0, 100)'))
fig.add_trace(go.Box(y=x2, name = 'FAKE',
                marker_color = 'rgb(0, 200, 200)'))
fig.update_layout(title_text='Box plot dei nomi propri nel titolo', template="plotly_white")
fig.show()

I titoli delle fake-news presentano più nomi propri. 


In [651]:
#pip install wordcloud 

#Final word cloud after all the cleaning and pre-processing
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS
comment_words = ' '
stopwords = set(STOPWORDS) 

# iterate through the csv file 
for val in df.comment: 

   # typecaste each val to string 
   val = str(val) 

   # split the value 
   tokens = val.split() 

# Converts each token into lowercase 
for i in range(len(tokens)): 
    tokens[i] = tokens[i].lower() 

for words in tokens: 
    comment_words = comment_words + words + ' '


wordcloud = WordCloud(width = 800, height = 800, 
            background_color ='white', 
            stopwords = stopwords, 
            min_font_size = 10).generate(comment_words) 

# plot the WordCloud image                        
plt.figure(figsize = (8, 8), facecolor = None) 
plt.imshow(wordcloud) 
plt.axis("off") 
plt.tight_layout(pad = 0) 

plt.show() 

#### Nel complesso, questi risultati suggeriscono che gli autori di fake-news cercano di attirare l'attenzione utilizzando le parole in maiuscolo nei titoli e concentrando quante più key-words possibili nei titoli saltando le stop-word e aumentando i nomi propri. Analizziamo se lo stesso avviene anche nei corpi degli articoli.

## Lettere maiuscole nel corpo degli articoli


In [652]:
x1 = dataset.loc[dataset['label']=='TRUE']['text_pct_uppercase']
x2 = dataset.loc[dataset['label'] == 'FAKE']['text_pct_uppercase']

group_labels = ['TRUE', 'FAKE']

colors = ['rgb(0, 0, 100)', 'rgb(0, 200, 200)']

fig = ff.create_distplot(
    [x1, x2], group_labels,colors=colors)

fig.update_layout(title_text='Percentuale di lettere maiuscole nel corpo degli articoli', template="plotly_white")
fig.show()

In media, le fake news presentano un maggior numero di lettere maiuscole nel corpo degli articoli.



## Stop Words nel corpo degli articoli


In [653]:
x1 = dataset.loc[dataset['label']=='TRUE']['text_pct_stop_words']
x2 = dataset.loc[dataset['label'] == 'FAKE']['text_pct_stop_words']

group_labels = ['TRUE', 'FAKE']

colors = ['rgb(0, 0, 100)', 'rgb(0, 200, 200)']

fig = ff.create_distplot(
    [x1, x2], group_labels,colors=colors)

fig.update_layout(title_text='Percentuale di Stop Words nel corpo degli articoli', template="plotly_white")
fig.show()

Non ci sono differenze significative tra le percentuali di stop word nelle fake e nelle real news


In [654]:
dataset.sample(3)

Unnamed: 0,title,text,source,label,title_num_uppercase,text_pct_uppercase,title_num_stop_words,text_pct_stop_words,NNP_title
1141,Dr. Vladimir Zelenko has now treated 699 coron...,April 11 Update: A new research study reveals ...,https://techstartups.com/,FAKE,8,0.042354,4,0.105392,7.0
571,missing,The elite are behind the Corona Virus and othe...,https://facebook.com/,FAKE,0,0.028687,0,0.091696,0.0
1033,Here are the innovations we need to reopen the...,It’s entirely understandable that the national...,https://www.washingtonpost.com/opinions,TRUE,1,0.012547,4,0.06563,0.0


## Harvard Health Publishing vs. Natural News
#### Natural News è un sito di notizie false.

In [655]:
x1 = dataset.loc[dataset['source']=='https://www.health.harvard.edu/']['text_pct_stop_words']
x2 = dataset.loc[dataset['source']=='https://www.naturalnews.com/']['text_pct_stop_words']

x3 = dataset.loc[dataset['source']=='https://www.health.harvard.edu/']['text_pct_uppercase']
x4 = dataset.loc[dataset['source']=='https://www.naturalnews.com/']['text_pct_uppercase']

x5 = dataset.loc[dataset['source']=='https://www.health.harvard.edu/']['NNP_title']
x6 = dataset.loc[dataset['source']=='https://www.naturalnews.com/']['NNP_title']



group_labels = ['Health Harvard', 'Natural News']

colors = ['rgb(0, 0, 100)', 'rgb(0, 200, 200)']

fig1 = ff.create_distplot([x1, x2], group_labels,colors=colors)
fig1.update_layout(title_text='Percentuale di Stop Words nel corpo degli articoli', template="plotly_white")
fig1.show()

fig2 = ff.create_distplot([x3, x4], group_labels,colors=colors)
fig2.update_layout(title_text='Percentuale di lettere maiuscole nel corpo degli articoli', template="plotly_white")
fig2.show()

fig3 = ff.create_distplot([x5, x6], group_labels,colors=colors)
fig3.update_layout(title_text='Numero di nomi propri nel titolo degli articoli', template="plotly_white")
fig3.show()

Come volevasi dimostrare, gli articoli di Natural News usano molte meno stop words rispetto a Healt Publishing.

In [656]:
dataset.sample(2)

Unnamed: 0,title,text,source,label,title_num_uppercase,text_pct_uppercase,title_num_stop_words,text_pct_stop_words,NNP_title
812,"I have asthma. If I get COVID-19, am I more li...","Yes, asthma may increase your risk of getting ...",https://www.health.harvard.edu/,True,9,0.019048,4,0.21978,0.0
445,Can I Boost My Immune System?,Fears about coronavirus have prompted online s...,https://www.nytimes.com/,True,6,0.015152,0,0.051028,3.0


## Features
### Per analizzare in modo approfondito gli articoli fake e real, calcoliamo alcune features basate sui corpi degli articoli:

• Usiamo un part-of-speech tagger e contiamo il numero di volte in cui ogni tag compare nell'articolo.

In [657]:
dataset['token'] = dataset.apply(lambda row: nltk.word_tokenize(row['text']), axis=1)
dataset['pos_tags'] = dataset.apply(lambda row: nltk.pos_tag(row['token']), axis=1)

tag_count_dataset = pd.DataFrame(dataset['pos_tags'].map(lambda x: Counter(tag[1] for tag in x)).to_list())

dataset = pd.concat([dataset, tag_count_dataset], axis=1).fillna(0).drop(['pos_tags', 'token'], axis=1)

• Numero di forme negative e interrogative nel corpo degli articoli.

In [658]:
dataset['num_negation'] = dataset['text'].str.lower().str.count("no|not|never|none|nothing|nobody|neither|nowhere|hardly|scarcely|barely|doesn’t|isn’t|wasn’t|shouldn’t|wouldn’t|couldn’t|won’t|can't|don't")

dataset['num_interrogatives_title'] = dataset['title'].str.lower().str.count("what|who|when|where|which|why|how")
dataset['num_interrogatives_text'] = dataset['text'].str.lower().str.count("what|who|when|where|which|why|how")

## Training del modello

In [659]:
dataset.sample(3)

Unnamed: 0,title,text,source,label,title_num_uppercase,text_pct_uppercase,title_num_stop_words,text_pct_stop_words,NNP_title,IN,...,'',FW,UH,WP$,$,SYM,#,num_negation,num_interrogatives_title,num_interrogatives_text
314,"Drinking cold water, hot drinks or alcohol pro...",you should drink water every quarter of an hou...,https://facebook.com/,FAKE,1,0.0,2,0.382353,0.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,1
1148,THE VISUAL LINK BETWEEN 5G DEPLOYMENT AND CASE...,Never before have we had such a high level loc...,healingoracle.ch,FAKE,65,0.026368,0,0.043412,11.0,271.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,31,0,29
121,How can people help stop stigma related to COV...,People can fight stigma by providing social su...,https://www.cdc.gov/,TRUE,6,0.032258,2,0.25,1.0,9.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2,1,1


In [660]:
def cleantext(string):
    text = string.lower().split()
    text = " ".join(text)
    text = re.sub(r"http(\S)+",' ',text)    
    text = re.sub(r"www(\S)+",' ',text)
    text = re.sub(r"&",' and ',text)  
    text = text.replace('&amp',' ')
    text = re.sub(r"[^0-9a-zA-Z]+",' ',text)
    text = text.split()
    text = [w for w in text if not w in stop_words]
    text = " ".join(text)
    return text


In [661]:
dataset['text'] = dataset['text'].map(lambda x: cleantext(x))
dataset['title'] = dataset['title'].map(lambda x: cleantext(x))
dataset['source'] = dataset['source'].map(lambda x: cleantext(x))

In [673]:
classes = {"TRUE":1,"FAKE":0}
dataset["label"].replace(classes, inplace=True)

In [675]:
train, test = train_test_split(dataset, test_size = 0.2, random_state = 0)
X_train, y_train = train, train['label']
X_test, y_test = test, test['label']

le = LabelEncoder()
scaler = StandardScaler()

for column_name in X_train.columns:
    if X_train[column_name].dtype == object:
        X_train[column_name] = le.fit_transform(X_train[column_name])

for column_name in X_test.columns:
    if X_test[column_name].dtype == object:
        X_test[column_name] = le.fit_transform(X_test[column_name])

X_train.loc[:, ~X_train.columns.isin(['title','text','source', 'label'])] = scaler.fit_transform(X_train.loc[:, ~X_train.columns.isin(['title','text','source', 'label'])])


In [676]:
dataset.sample(3)

Unnamed: 0,title,text,source,label,title_num_uppercase,text_pct_uppercase,title_num_stop_words,text_pct_stop_words,NNP_title,IN,...,'',FW,UH,WP$,$,SYM,#,num_negation,num_interrogatives_title,num_interrogatives_text
580,human life must trump economics pandemic china...,senior fellow chongyang institute financial st...,,0,9,0.029914,7,0.048208,4.0,217.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,19,1,21
299,china western china bashing vs western bio war...,29 january director general dr tedros adhanom ...,,0,6,0.033738,0,0.054448,7.0,173.0,...,0.0,0.0,0.0,0.0,2.0,0.0,0.0,33,0,16
601,cure go hospital unless breathing problem,coronavirus patients need hospitalized vast ma...,,1,1,0.031223,7,0.157303,1.0,18.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2,1,1


In [677]:
def print_metrices(pred, true):
    print("Accuracy : ", accuracy_score(pred, true))
    print("Precison : ", precision_score(pred, true, pos_label=1))
    print("Recall : ", recall_score(pred, true))
    print("F1 : ", f1_score(pred, true))

In [678]:
y_train

303     0
294     1
1116    1
848     1
97      1
       ..
1033    1
763     1
835     0
559     1
684     0
Name: label, Length: 931, dtype: int64

In [683]:
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier(criterion = 'entropy')
model = model.fit(X_train, y_train)
pred = model.predict(X_test)
print_metrices(pred, y_test)
#print("Accuracy: {:.2f}%".format(accuracy_score(pred, y_test) * 100))


Accuracy :  1.0
Precison :  1.0
Recall :  1.0
F1 :  1.0


In [684]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(max_iter=1000)
model = model.fit(X_train, y_train)
pred = model.predict(X_test)
print_metrices(pred, y_test)
#print("Accuracy: {:.2f}%".format(accuracy_score(pred, y_test) * 100))

Accuracy :  0.9184549356223176
Precison :  0.9203539823008849
Recall :  0.9122807017543859
F1 :  0.9162995594713657


In [685]:
from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier(n_neighbors=3)
model = model.fit(X_train, y_train)
pred = model.predict(X_test)
print_metrices(pred, y_test)
#print("Accuracy: {:.2f}%".format(accuracy_score(pred, y_test) * 100))

Accuracy :  0.5064377682403434
Precison :  0.5663716814159292
Recall :  0.49230769230769234
F1 :  0.5267489711934157


In [686]:
from sklearn.naive_bayes import GaussianNB

model = GaussianNB()
model = model.fit(X_train, y_train)
pred = model.predict(X_test)
print_metrices(pred, y_test)
#print("Accuracy: {:.2f}%".format(accuracy_score(pred, y_test) * 100))

Accuracy :  0.8540772532188842
Precison :  0.6991150442477876
Recall :  1.0
F1 :  0.8229166666666666


In [687]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier()
model = model.fit(X_train, y_train)
pred = model.predict(X_test)
print_metrices(pred, y_test)
#print("Accuracy: {:.2f}%".format(accuracy_score(pred, y_test)*100))

Accuracy :  1.0
Precison :  1.0
Recall :  1.0
F1 :  1.0


In [688]:
svc = LinearSVC(dual=False)
model = svc.fit(X_train, y_train)
pred = model.predict(X_test)
print_metrices(pred, y_test)
#print("Accuracy: {:.2f}%".format(accuracy_score(pred, y_test) * 100))

Accuracy :  0.9227467811158798
Precison :  1.0
Recall :  0.8625954198473282
F1 :  0.9262295081967213


In [689]:
from sklearn import svm
clf = svm.SVC(kernel='linear', C = 1.0)
model = clf.fit(X_train,y_train)
pred = model.predict(X_test)
print_metrices(pred, y_test)
#print("Accuracy: {:.2f}%".format(accuracy_score(pred, y_test) * 100))

Accuracy :  0.9098712446351931
Precison :  0.8230088495575221
Recall :  0.9893617021276596
F1 :  0.8985507246376812


In [690]:
from sklearn.ensemble import AdaBoostClassifier

abc = AdaBoostClassifier(n_estimators=50, learning_rate=1)
model = abc.fit(X_train, y_train)  
pred = model.predict(X_test)
print_metrices(pred, y_test)
#print("Accuracy: {:.2f}%".format(accuracy_score(pred, y_test) * 100))

Accuracy :  1.0
Precison :  1.0
Recall :  1.0
F1 :  1.0
