In [129]:
import nltk
nltk.download('stopwords')
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from nltk.corpus import stopwords
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
import pandas as pd
import numpy as np
import urllib3

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\alfredo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [130]:
df = pd.read_csv("database-news.csv")

In [131]:
df.head()

Unnamed: 0.1,Unnamed: 0,noticia,label
0,0,Kátia Abreu diz que vai colocar sua expulsão e...,fake
1,1,Blog esquerdista dá a entender que reclamar de...,fake
2,2,"Alckmin diz que por ele PSDB desembarca, mas...",fake
3,3,Cara de pau não tem limites: Zé Celso aciona M...,fake
4,4,Temer resolve o problema de Luislinda: liberd...,fake


In [132]:
df.tail()

Unnamed: 0.1,Unnamed: 0,noticia,label
7195,7195,Ficou longe das notícias no fim de ano? Veja o...,True
7196,7196,A nova denúncia contra o ex-presidente Luiz I...,True
7197,7197,Como a Bahia virou uma potência mundial do mar...,True
7198,7198,"Alvo da Lava Jato, Bendine tinha passagem comp...",True
7199,7199,Chefs convidados do Encontro Mundial das Cidad...,True


In [133]:
df2 = pd.read_csv('https://raw.githubusercontent.com/ViniciusNunes0/SIRENE-news/master/noticias-sirene.csv', sep=';')

In [134]:
df2.head()

Unnamed: 0,id,noticia,classificacao
0,0,Filipe Toledo sofreu uma derrota dura em Pipel...,0
1,1,O lateral direito do PSG Daniel Alves não perd...,1
2,2,"A Portela estreia, nesta quinta-feira (12), o ...",0
3,3,"Ele nunca apareceu na mídia, apenas coberto de...",1
4,4,Em meio à grave crise na saúde que atinge o Ri...,0


In [135]:
df2 = df2[['noticia', 'classificacao']]
df2 = df2.rename(columns={"noticia": "noticia", "classificacao": "label"})
df2['label'] = df2['label'].replace({0: 'true', 1: 'fake'})
df2.head()
df = df.append(df2, ignore_index=True)

In [136]:
x_train,x_test,y_train,y_test=train_test_split(df['noticia'], df['label'], test_size=0.2, random_state=42)

In [137]:
tfidf_vectorizer = TfidfVectorizer(stop_words=stopwords.words('portuguese'),
                                analyzer='word',
                                ngram_range=(1, 1),
                                lowercase=True,
                                use_idf=True)
tfidf_train = tfidf_vectorizer.fit_transform(x_train) 
tfidf_test = tfidf_vectorizer.transform(x_test)

In [138]:
rf = RandomForestClassifier(random_state=0)
rf.fit(tfidf_train,y_train)

y_pred = rf.predict(tfidf_test)
score = accuracy_score(y_test,y_pred)
print(f'Accuracy: {round(score*100,2)}%')

Accuracy: 93.93%


In [16]:
model = SVC()
model.fit (tfidf_train,y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [18]:
predictions = model.predict(tfidf_test)

In [19]:
print(confusion_matrix(y_test, predictions))

[[1167   43]
 [  50 1128]]


In [20]:
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

        fake       0.96      0.96      0.96      1210
        true       0.96      0.96      0.96      1178

    accuracy                           0.96      2388
   macro avg       0.96      0.96      0.96      2388
weighted avg       0.96      0.96      0.96      2388



In [22]:
param_grid = {'C': [0.1, 1, 10, 100, 1000], 'kernel': ['linear']}

In [23]:
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=5)
grid.fit(tfidf_train, y_train)
grid.best_params_

Fitting 5 folds for each of 5 candidates, totalling 25 fits
[CV] C=0.1, kernel=linear ............................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ................ C=0.1, kernel=linear, score=0.914, total= 1.5min
[CV] C=0.1, kernel=linear ............................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.5min remaining:    0.0s


[CV] ................ C=0.1, kernel=linear, score=0.934, total= 1.5min
[CV] C=0.1, kernel=linear ............................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  3.0min remaining:    0.0s


[CV] ................ C=0.1, kernel=linear, score=0.924, total= 1.5min
[CV] C=0.1, kernel=linear ............................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  4.5min remaining:    0.0s


[CV] ................ C=0.1, kernel=linear, score=0.941, total= 1.5min
[CV] C=0.1, kernel=linear ............................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:  6.0min remaining:    0.0s


[CV] ................ C=0.1, kernel=linear, score=0.919, total= 1.5min
[CV] C=1, kernel=linear ..............................................
[CV] .................. C=1, kernel=linear, score=0.954, total=  54.1s
[CV] C=1, kernel=linear ..............................................
[CV] .................. C=1, kernel=linear, score=0.960, total=  54.8s
[CV] C=1, kernel=linear ..............................................
[CV] .................. C=1, kernel=linear, score=0.952, total=  55.3s
[CV] C=1, kernel=linear ..............................................
[CV] .................. C=1, kernel=linear, score=0.963, total=  55.8s
[CV] C=1, kernel=linear ..............................................
[CV] .................. C=1, kernel=linear, score=0.958, total=  54.4s
[CV] C=10, kernel=linear .............................................
[CV] ................. C=10, kernel=linear, score=0.953, total=  52.6s
[CV] C=10, kernel=linear .............................................
[CV] .

[Parallel(n_jobs=1)]: Done  25 out of  25 | elapsed: 30.3min finished


{'C': 1, 'kernel': 'linear'}

In [121]:
grid_predictions = grid.predict(tfidf_test)
print(confusion_matrix(y_test, grid_predictions))

[[1165   45]
 [  45 1133]]


In [120]:
print(classification_report(y_test,  grid_predictions))

              precision    recall  f1-score   support

        fake       0.96      0.96      0.96      1210
        true       0.96      0.96      0.96      1178

    accuracy                           0.96      2388
   macro avg       0.96      0.96      0.96      2388
weighted avg       0.96      0.96      0.96      2388



In [177]:
df3 = pd.read_csv('CoronaTest.csv')

In [178]:
df3

Unnamed: 0,noticia,classificacao
0,Café previne o coronavírus,False
1,Beber água de 15 em 15 minutos cura o coronavírus,False
2,Chá de limão com bicarbonato quente cura coron...,False
3,Café previne o coronavírus,False
4,Beber água de 15 em 15 minutos cura o coronavírus,False
5,Café não previne o coronavírus,True
6,Beber água de 15 em 15 minutos cura o coronavírus,False
7,Chá de limão com bicarbonato quente cura coron...,False
8,Café previne o coronavírus,False
9,O Brasil teve 968 novas mortes registradas em ...,True


In [179]:
tfidf_target = tfidf_vectorizer.transform(df3['noticia'])

In [180]:
result_target = rf.predict(tfidf_target)
print(result_target)

['fake' 'fake' 'fake' 'fake' 'fake' 'fake' 'fake' 'fake' 'fake' 'fake']


In [173]:
for i in range(len(df3['noticia'])):
    if result_target[i]=='fake' :
        df3['classificacao'][i] = False
    else:
        df3['classificacao'][i] = True
    
display(df3)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,noticia,classificacao
0,Café previne o coronavírus,False
1,Beber água de 15 em 15 minutos cura o coronavírus,False
2,Chá de limão com bicarbonato quente cura coron...,False
3,Café previne o coronavírus,False
4,Beber água de 15 em 15 minutos cura o coronavírus,False
5,Café não previne o coronavírus,False
6,Beber água de 15 em 15 minutos cura o coronavírus,False
7,Chá de limão com bicarbonato quente cura coron...,False
8,Café previne o coronavírus,False
9,O Brasil teve 968 novas mortes registradas em ...,False
