In [11]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
import re
import string

### Processing the English Training Dataset

In [10]:
df = pd.read_csv("dataset2.csv")
df.drop(columns=["Headline","Authors","Date","URL","Brand"],inplace=True)
df.loc[df["Label"]=="Credible",["Label"]] = 1
df.loc[df["Label"]=="Not Credible",["Label"]] = 0
df.rename(columns={"Label":"label","Content":"article"},inplace=True)
df

Unnamed: 0,article,label
0,Pollution caused by traditional cooking fuel i...,1
1,Justice Secretary Vitaliano Aguirre 2nd and Ph...,1
2,President Rodrigo Duterte on Monday night desc...,1
3,THE militant fisher folk group Pambansang Laka...,1
4,Magdalo Rep. Gary Alejano is willing to lead t...,1
...,...,...
22453,"Indeed, everybody is shocked — just shocked! —...",0
22454,"A TOTAL of 132,259 individuals from 28,101 fam...",1
22455,Shortly after Rod Duterte announced there will...,0
22456,President Barack Obama met for the first time ...,0


##### Checking for Null data points

In [24]:
df.isnull().sum()

article    0
label      0
dtype: int64

### Extracting data for testing

### Function for cleaning up the text

In [12]:
def cleantext(text):
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub("\\W"," ",text) 
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)    
    return text

df["article"] = df["article"].apply(wordopt)

### Defining Variables

In [26]:
x = df["article"]
y = df["label"]
y=y.astype('int')

### Split to Training and Testing Sets

In [27]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25)

### Vectorizing text

In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [29]:
vectorization = TfidfVectorizer()
xv_train = vectorization.fit_transform(x_train)
xv_test = vectorization.transform(x_test)

### Creating Logistic Regression Model

In [30]:
from sklearn.linear_model import LogisticRegression

In [31]:
LR = LogisticRegression()
LR.fit(xv_train,y_train)

LogisticRegression()

In [33]:
pred_lr=LR.predict(xv_test)

In [34]:
LR.score(xv_test, y_test)

0.9506678539626002

In [35]:
print(classification_report(y_test, pred_lr))

              precision    recall  f1-score   support

           0       0.95      0.91      0.93      1935
           1       0.95      0.97      0.96      3680

    accuracy                           0.95      5615
   macro avg       0.95      0.94      0.94      5615
weighted avg       0.95      0.95      0.95      5615



### Testing the Models

In [38]:
def output_lable(n):
    if n == 0:
        return "Fake News"
    elif n == 1:
        return "Not A Fake News"
    
def manual_testing(news):
    testing_news = {"text":[news]}
    new_def_test = pd.DataFrame(testing_news)
    new_def_test["text"] = new_def_test["text"].apply(cleantext) 
    new_x_test = new_def_test["text"]
    new_xv_test = vectorization.transform(new_x_test)
    pred_LR = LR.predict(new_xv_test)

    return print("\n\nLR Prediction: {} ".format(output_lable(pred_LR[0])))

In [45]:
news = str(input())
manual_testing(news)

 "Our lawyers have verified. A warrant of arrest was issued," Kinumpirma na mismo ni Rappler CEO Maria Ressa na nailabas na umano ang warrant of arrest laban sa kanya kaugnay sa kasong tax evasion. Ayon sa kampo ni Ressa, magpa-piyansa sila agad-agad sa Lunes kung sakaling arestuhin ito ng Linggo. Nahaharap sa 4 na kasong tax evasion si Ressa. Isa pang kaso ang inihain sa Pasig court kaugnay naman sa paglabag sa Section 255 ng National Internal Revenue Code (NIRC) at ang hindi pagsusumite ng ulat tungkol sa total quarterly sales receipts na mula sa Philippine Depositary Receipts (PDR) noong 2015. Alam niyo bang naging laman ng balita ang Rappler noong Pebrero 2018 dahil sa malisyosong impormasyon ikinalat nila laban kay kay dating Special Assistant to the President Bong Go? Panoorin kung paano sinermonan ni Presidente Duterte ang reporter ng Rappler dahil sa maling impormasyon na ikinalat ng mga ito kay Go. Tila positibo naman tinanggap ng ilang netizens ang balitang ito. Source: GMA




LR Prediction: Fake News 


### Graphing Accuracy of Models

In [49]:
import matplotlib.pyplot as plt