<h2>Dependecy Imports</h2>

In [20]:
import numpy as np
import pandas as pd
import re
import string

import joblib

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report

<h2>Import data</h2>

In [21]:
real = pd.read_csv("./data/True.csv")
fake = pd.read_csv("./data/Fake.csv")

# add label "0" for fake "1" for real

real["label"] = 1
fake["label"] = 0

# combine and shuffle data

data = pd.concat([fake, real], axis=0)
data = data.sample(frac=1)
data.reset_index(inplace=True) 
data.drop(["index"], axis=1, inplace=True) 
data



Unnamed: 0,title,text,subject,date,label
0,U.S. Congressman asks DoD to investigate Trans...,(Reuters) - U.S. Congressman Ro Khanna asked t...,politicsNews,"March 21, 2017",1
1,Trump’s EPA OKs Pesticide That Causes Brain D...,Farmworkers were pulled from fields on Friday ...,News,"May 15, 2017",0
2,FEDERAL JUDGE STEPS IN To Review Legroom On Co...,Firebrand conservative Ann Coulter exposed Del...,politics,"Jul 31, 2017",0
3,U.S. House Speaker Ryan: Meeting with Trump 'e...,WASHINGTON (Reuters) - U.S. House Speaker Paul...,politicsNews,"May 12, 2016",1
4,Scalia death a blow to Obamacare contraception...,WASHINGTON (Reuters) - Christian groups asking...,politicsNews,"March 20, 2016",1
...,...,...,...,...,...
44893,France criticizes Russian stance on Syria toxi...,PARIS (Reuters) - France criticized Russia on ...,worldnews,"October 19, 2017",1
44894,AWESOME! WATCH DONALD TRUMP GIVE THE PERFECT A...,There s something about a politician who s not...,politics,"Jul 12, 2015",0
44895,Corporate AMT likely will not be in final U.S....,WASHINGTON (Reuters) - The chairman of the U.S...,politicsNews,"December 6, 2017",1
44896,Kansas asks U.S. appeals court to reinstate st...,DENVER (Reuters) - Kansas on Tuesday asked a U...,politicsNews,"August 23, 2016",1


<h2>Cleaning and Preprocessing</h2>

In [22]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r'https?://S+ | www\.\S+', '', text)
    text = re.sub(r'\n', '', text)
    text = re.sub('\[.*?\]', '', text)
    words=[]
    for i in text:
        if i not in string.punctuation:
            words.append(i)
    return ''.join(words)
    
data['title'] = data['title'].apply(clean_text)
data['text'] = data['text'].apply(clean_text)

data.shape


(44898, 5)

<h2>Test train split</h2>

In [23]:
X,y = data["text"], data["label"]
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2)

<h2>Vectorize text for model training</h2>

In [24]:
vectorizer = TfidfVectorizer(stop_words = "english", max_df =0.7)
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

In [25]:
#save vectorizer
joblib.dump(vectorizer, 'server/vectorizer.pkl')

['server/vectorizer.pkl']

<h2> Train and evaluate classifier models</h2>

In [26]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score

In [27]:
classifiers=[
    LogisticRegression(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    LinearSVC()
]
for classifier in classifiers:
    classifier_name=classifier.__class__.__name__
    print(f"Training {classifier_name}...")
    classifier.fit(X_train_vectorized, y_train)
    y_pred=classifier.predict(X_test_vectorized)
    accuracy=accuracy_score(y_test, y_pred)
    print(f"\n {classifier_name} \n ")
    print(classification_report(y_test, y_pred))


Training LogisticRegression...

 LogisticRegression 
 
              precision    recall  f1-score   support

           0       0.99      0.98      0.99      4698
           1       0.98      0.98      0.98      4282

    accuracy                           0.98      8980
   macro avg       0.98      0.98      0.98      8980
weighted avg       0.98      0.98      0.98      8980

Training DecisionTreeClassifier...

 DecisionTreeClassifier 
 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      4698
           1       1.00      1.00      1.00      4282

    accuracy                           1.00      8980
   macro avg       1.00      1.00      1.00      8980
weighted avg       1.00      1.00      1.00      8980

Training RandomForestClassifier...

 RandomForestClassifier 
 
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      4698
           1       0.99      0.99      0.99      4282

 




 LinearSVC 
 
              precision    recall  f1-score   support

           0       1.00      0.99      0.99      4698
           1       0.99      0.99      0.99      4282

    accuracy                           0.99      8980
   macro avg       0.99      0.99      0.99      8980
weighted avg       0.99      0.99      0.99      8980



In [28]:
from joblib import dump, load
model = classifiers[3]
dump(model, 'server/model.pkl') 

['server/model.pkl']