In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
import re
import string

### Processing the English Training Dataset

In [2]:
df = pd.read_csv("RawEnglishDataset.csv")
df.drop(columns=["Headline","Authors","Date","URL","Brand"],inplace=True)
df.loc[df["Label"]=="Credible",["Label"]] = 1
df.loc[df["Label"]=="Not Credible",["Label"]] = 0
df.rename(columns={"Label":"label","Content":"article"},inplace=True)
df

Unnamed: 0,article,label
0,Pollution caused by traditional cooking fuel i...,1
1,Justice Secretary Vitaliano Aguirre 2nd and Ph...,1
2,President Rodrigo Duterte on Monday night desc...,1
3,THE militant fisher folk group Pambansang Laka...,1
4,Magdalo Rep. Gary Alejano is willing to lead t...,1
...,...,...
22453,"Indeed, everybody is shocked — just shocked! —...",0
22454,"A TOTAL of 132,259 individuals from 28,101 fam...",1
22455,Shortly after Rod Duterte announced there will...,0
22456,President Barack Obama met for the first time ...,0


##### Checking for Null data points

In [3]:
df.isnull().sum()

article    0
label      0
dtype: int64

### Extracting data for testing

In [4]:
df.shape
df = df.sample(frac = 1)
testing_df = df.tail(378)
for i in range(22457,22079,-1):
    df.drop([i], axis = 0, inplace = True)

### Function for cleaning up the text

In [5]:
def cleantext(text):
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub("\\W"," ",text) 
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)    
    return text

df["article"] = df["article"].apply(cleantext)

### Defining Variables

In [6]:
x = df["article"]
y = df["label"]
y=y.astype('int')

### Split to Training and Testing Sets

In [7]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25)

### Vectorizing text

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [9]:
vectorization = TfidfVectorizer()
xv_train = vectorization.fit_transform(x_train)
xv_test = vectorization.transform(x_test)

### Logistic Regression Model

In [10]:
from sklearn.linear_model import LogisticRegression

In [11]:
LR = LogisticRegression()
LR.fit(xv_train,y_train)

LogisticRegression()

In [12]:
pred_lr=LR.predict(xv_test)

In [13]:
LR.score(xv_test, y_test)

0.9514492753623188

In [14]:
print(classification_report(y_test, pred_lr))

              precision    recall  f1-score   support

           0       0.95      0.90      0.93      1865
           1       0.95      0.98      0.96      3655

    accuracy                           0.95      5520
   macro avg       0.95      0.94      0.95      5520
weighted avg       0.95      0.95      0.95      5520



### Decision Tree Classification

In [15]:
from sklearn.tree import DecisionTreeClassifier

In [16]:
DT = DecisionTreeClassifier()
DT.fit(xv_train, y_train)

DecisionTreeClassifier()

In [17]:
pred_dt = DT.predict(xv_test)

In [18]:
DT.score(xv_test, y_test)

0.8507246376811595

In [19]:
print(classification_report(y_test, pred_dt))

              precision    recall  f1-score   support

           0       0.78      0.78      0.78      1865
           1       0.89      0.89      0.89      3655

    accuracy                           0.85      5520
   macro avg       0.83      0.83      0.83      5520
weighted avg       0.85      0.85      0.85      5520



### Gradient Boosting Classifier

In [20]:
from sklearn.ensemble import GradientBoostingClassifier

In [21]:
GBC = GradientBoostingClassifier(random_state=0)
GBC.fit(xv_train, y_train)

GradientBoostingClassifier(random_state=0)

In [22]:
pred_gbc = GBC.predict(xv_test)

In [23]:
GBC.score(xv_test, y_test)

0.922463768115942

In [24]:
print(classification_report(y_test, pred_gbc))

              precision    recall  f1-score   support

           0       0.92      0.84      0.88      1865
           1       0.92      0.96      0.94      3655

    accuracy                           0.92      5520
   macro avg       0.92      0.90      0.91      5520
weighted avg       0.92      0.92      0.92      5520



### Random Forest Classifier

In [25]:
from sklearn.ensemble import RandomForestClassifier

In [26]:
RFC = RandomForestClassifier(random_state=0)
RFC.fit(xv_train, y_train)

RandomForestClassifier(random_state=0)

In [27]:
pred_rfc = RFC.predict(xv_test)

In [28]:
RFC.score(xv_test, y_test)

0.9222826086956522

In [29]:
print(classification_report(y_test, pred_rfc))

              precision    recall  f1-score   support

           0       0.93      0.83      0.88      1865
           1       0.92      0.97      0.94      3655

    accuracy                           0.92      5520
   macro avg       0.92      0.90      0.91      5520
weighted avg       0.92      0.92      0.92      5520



### Testing the Models

In [30]:
def output_label(n):
    if n == 0:
        return "Fake News"
    elif n == 1:
        return "Real News"
    
def LR_test(news):
    testing_news = {"text":[news]}
    new_def_test = pd.DataFrame(testing_news)
    new_def_test["text"] = new_def_test["text"].apply(cleantext) 
    new_x_test = new_def_test["text"]
    new_xv_test = vectorization.transform(new_x_test)
    pred_LR = LR.predict(new_xv_test)
    return pred_LR[0]

def DT_test(news):
    testing_news = {"text":[news]}
    new_def_test = pd.DataFrame(testing_news)
    new_def_test["text"] = new_def_test["text"].apply(cleantext) 
    new_x_test = new_def_test["text"]
    new_xv_test = vectorization.transform(new_x_test)
    pred_DT = DT.predict(new_xv_test)
    return pred_DT[0]

def GBC_test(news):
    testing_news = {"text":[news]}
    new_def_test = pd.DataFrame(testing_news)
    new_def_test["text"] = new_def_test["text"].apply(cleantext) 
    new_x_test = new_def_test["text"]
    new_xv_test = vectorization.transform(new_x_test)
    pred_GBC = GBC.predict(new_xv_test)
    return pred_GBC[0]
    
def RFC_test(news):
    testing_news = {"text":[news]}
    new_def_test = pd.DataFrame(testing_news)
    new_def_test["text"] = new_def_test["text"].apply(cleantext) 
    new_x_test = new_def_test["text"]
    new_xv_test = vectorization.transform(new_x_test)
    pred_RFC = RFC.predict(new_xv_test)
    return pred_RFC[0]

def manual_testing(news):
    return print(f'''
    
    LR Prediction: {output_label(LR_test(news))}
    DT Prediction: {output_label(DT_test(news))}
    GBC Prediction: {output_label(GBC_test(news))}
    RFC Prediction: {output_label(RFC_test(news))}
    ''')
    
def classify_result(correct, prediction):
    if correct == 1 and prediction == 1:
        return "True Positive"
    elif correct == 1 and prediction == 0:
        return "False Negative"
    elif correct == 0 and prediction == 0:
        return "True Negative"
    elif correct == 0 and prediction == 1:
        return "False Positive"
    

In [40]:
analysis_df = testing_df.copy()

In [43]:
analysis_df.loc[:,["LR"]] = analysis_df["article"].apply(LR_test)
analysis_df.loc[:,["LR"]] = np.vectorize(classify_result)(analysis_df["label"],analysis_df["LR"])

analysis_df.loc[:,["DT"]] = analysis_df["article"].apply(DT_test)
analysis_df.loc[:,["DT"]] = np.vectorize(classify_result)(analysis_df["label"],analysis_df["DT"])

analysis_df.loc[:,["GBC"]] = analysis_df["article"].apply(GBC_test)
analysis_df.loc[:,["GBC"]] = np.vectorize(classify_result)(analysis_df["label"],analysis_df["GBC"])

analysis_df.loc[:,["RFC"]] = analysis_df["article"].apply(RFC_test)
analysis_df.loc[:,["RFC"]] = np.vectorize(classify_result)(analysis_df["label"],analysis_df["RFC"])

analysis_df.to_csv("test_run.csv", index=False)

In [46]:
print(analysis_df['LR'].str.contains('True Positive').value_counts()[True])
print(analysis_df['LR'].str.contains('True Negative').value_counts()[True])
print(analysis_df['LR'].str.contains('False Positive').value_counts()[True])
print(analysis_df['LR'].str.contains('False Negative').value_counts()[True])

256
114
5
3


In [47]:
print(analysis_df['DT'].str.contains('True Positive').value_counts()[True])
print(analysis_df['DT'].str.contains('True Negative').value_counts()[True])
print(analysis_df['DT'].str.contains('False Positive').value_counts()[True])
print(analysis_df['DT'].str.contains('False Negative').value_counts()[True])

248
110
9
11


In [48]:
print(analysis_df['GBC'].str.contains('True Positive').value_counts()[True])
print(analysis_df['GBC'].str.contains('True Negative').value_counts()[True])
print(analysis_df['GBC'].str.contains('False Positive').value_counts()[True])
print(analysis_df['GBC'].str.contains('False Negative').value_counts()[True])

253
108
11
6


In [49]:
print(analysis_df['RFC'].str.contains('True Positive').value_counts()[True])
print(analysis_df['RFC'].str.contains('True Negative').value_counts()[True])
print(analysis_df['RFC'].str.contains('False Positive').value_counts()[True])
print(analysis_df['RFC'].str.contains('False Negative').value_counts()[True])

257
117
2
2


In [32]:
news = str(input())
manual_testing(news)

 hi



    
    LR Prediction: Fake News
    DT Prediction: Fake News
    GBC Prediction: Fake News
    RFC Prediction: Fake News
    


In [33]:
testing_df

Unnamed: 0,article,label,LR
9987,Heavily armed rebels of the communist-led New...,1,True Positive
10430,53 out of the 310 alleged rouge policemen brav...,0,False Positive
20684,How can some Filipinos profess to be Catholics...,1,True Positive
12707,The Philippine Red Cross (PRC) said it had act...,1,True Positive
19571,The sister-in-law of the elusive Gerardo Limli...,1,True Positive
...,...,...,...
14421,The Philippines started monitoring suspected b...,1,True Positive
21833,Over 200 sacks of rice and assorted relief goo...,1,True Positive
4907,"Only 160 out of 2,446 barangays in Central Vis...",1,True Positive
10303,"About 50,000 farmers stand to become new owner...",1,True Positive


### Graphing Accuracy of Models

In [34]:
import matplotlib.pyplot as plt

In [35]:
#classify_result(testing_df['label'],output_label(LR_test(testing_df['article'])))

In [36]:
def manual_testing(news):
    testing_news = {"text":[news]}
    new_def_test = pd.DataFrame(testing_news)
    new_def_test["text"] = new_def_test["text"].apply(cleantext) 
    new_x_test = new_def_test["text"]
    new_xv_test = vectorization.transform(new_x_test)
    pred_LR = LR.predict(new_xv_test)
    pred_DT = DT.predict(new_xv_test)
    pred_GBC = GBC.predict(new_xv_test)
    pred_RFC = RFC.predict(new_xv_test)

    return print("\n\nLR Prediction: {} \nDT Prediction: {} \nGBC Prediction: {} \nRFC Prediction: {}".format(output_label(pred_LR[0]), 
                                                                                                              output_label(pred_DT[0]), 
                                                                                                              output_label(pred_GBC[0]), 
                                                                                                              output_label(pred_RFC[0])))       