In [112]:
import nltk
nltk.download('stopwords')
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from nltk.corpus import stopwords
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
import pandas as pd
import numpy as np
import urllib3

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\alfredo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [56]:
df = pd.read_csv("database-news.csv")

In [57]:
df.head()

Unnamed: 0.1,Unnamed: 0,noticia,label
0,0,Kátia Abreu diz que vai colocar sua expulsão e...,fake
1,1,Blog esquerdista dá a entender que reclamar de...,fake
2,2,"Alckmin diz que por ele PSDB desembarca, mas...",fake
3,3,Cara de pau não tem limites: Zé Celso aciona M...,fake
4,4,Temer resolve o problema de Luislinda: liberd...,fake


In [58]:
df.tail()

Unnamed: 0.1,Unnamed: 0,noticia,label
7195,7195,Ficou longe das notícias no fim de ano? Veja o...,True
7196,7196,A nova denúncia contra o ex-presidente Luiz I...,True
7197,7197,Como a Bahia virou uma potência mundial do mar...,True
7198,7198,"Alvo da Lava Jato, Bendine tinha passagem comp...",True
7199,7199,Chefs convidados do Encontro Mundial das Cidad...,True


In [185]:
df2 = pd.read_csv('CoronaTest.csv')

In [186]:
df2.head()

Unnamed: 0,noticia,classificacao
0,Café previne o coronavírus,1
1,Beber água de 15 em 15 minutos cura o coronavírus,1
2,Chá de limão com bicarbonato quente cura coron...,1
3,Ministro da Saúde pede para compartilhar áudio...,1
4,"Aplicativo Coronavírus-SUS, do Governo do Bras...",1


In [187]:
df2 = df2[['noticia', 'classificacao']]
df2 = df2.rename(columns={"noticia": "noticia", "classificacao": "label"})
df2['label'] = df2['label'].replace({0: 'true', 1: 'fake'})
df2.head()
df = df.append(df2, ignore_index=True)

In [188]:
x_train,x_test,y_train,y_test=train_test_split(df['noticia'], df['label'], test_size=0.2, random_state=42)

In [189]:
tfidf_vectorizer = TfidfVectorizer(min_df=5, max_features=500, ngram_range=(1,3),
                        lowercase =True, analyzer='word', token_pattern=r'\w+',
                        use_idf=True, smooth_idf=True, sublinear_tf=True, 
                        stop_words=stopwords.words('portuguese'))
tfidf_vectorizer.fit(x_train)
tfidf_train = tfidf_vectorizer.transform(x_train)
tfidf_test = tfidf_vectorizer.transform(x_test)

In [190]:
rf = RandomForestClassifier(random_state=0)
rf.fit(tfidf_train,y_train)

y_pred = rf.predict(tfidf_test)
score = accuracy_score(y_test,y_pred)
print(f'Accuracy: {round(score*100,2)}%')

Accuracy: 95.64%


In [65]:
model = SVC(probability=True, kernel='rbf')
model.fit(tfidf_train,y_train)

KeyboardInterrupt: 

In [45]:
predictions = model.predict(tfidf_test)
score = accuracy_score(y_test,predictions)
print(f'Accuracy: {round(score*100,2)}%')

In [46]:
print(confusion_matrix(y_test, predictions))

[[3611    6]
 [  11 3596]]


In [48]:
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

        fake       1.00      1.00      1.00      3617
        true       1.00      1.00      1.00      3607

    accuracy                           1.00      7224
   macro avg       1.00      1.00      1.00      7224
weighted avg       1.00      1.00      1.00      7224



In [22]:
param_grid = {'C': [0.1, 1, 10, 100, 1000], 'kernel': ['rbf']}

In [23]:
grid = GridSearchCV(SVC(probability=True, kernel='rbf'), param_grid, refit=True, verbose=5)
grid.fit(tfidf_train, y_train)
grid.best_params_

Fitting 5 folds for each of 5 candidates, totalling 25 fits
[CV] C=0.1, kernel=rbf ...............................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


KeyboardInterrupt: 

In [328]:
grid_predictions = grid.predict(tfidf_test)
print(confusion_matrix(y_test, grid_predictions))

[[717  18]
 [ 32 678]]


In [329]:
print(classification_report(y_test,  grid_predictions))

              precision    recall  f1-score   support

        fake       0.96      0.98      0.97       735
        true       0.97      0.95      0.96       710

    accuracy                           0.97      1445
   macro avg       0.97      0.97      0.97      1445
weighted avg       0.97      0.97      0.97      1445



In [197]:
df3 = pd.read_csv('CoronaResult.csv')

In [198]:
df3.tail()

Unnamed: 0,noticia,classificacao
0,beber água quente mata o coronavírus,1
1,china desenvolveu vacina contra o coronavírus,1
2,o Brasil ultrapassa um milhão de infectados pe...,0


In [199]:
tfidf_target = tfidf_vectorizer.transform(df3['noticia'])

In [201]:
result_target = rf.predict(tfidf_target)
print(result_target)

['fake' 'true' 'fake']


In [202]:
for i in range(len(df3['noticia'])):
    if result_target[i]=='fake' :
        df3['classificacao'][i] = False
    else:
        df3['classificacao'][i] = True
    
display(df3)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


Unnamed: 0,noticia,classificacao
0,beber água quente mata o coronavírus,False
1,china desenvolveu vacina contra o coronavírus,True
2,o Brasil ultrapassa um milhão de infectados pe...,False
