<h2>Dependency Imports</h2>

In [11]:
import numpy as np
import pandas as pd
import re
import string

from sklearn.decomposition import TruncatedSVD
import seaborn as sns

import matplotlib.pyplot as plt
import joblib

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report

<h2>Import data</h2>

In [2]:
# Load the two source files and tag each row with its class:
# label 1 = real article, label 0 = fake article.
real = pd.read_csv("./data/True.csv").assign(label=1)
fake = pd.read_csv("./data/Fake.csv").assign(label=0)

# Combine both classes and shuffle the rows.
# random_state pins the shuffle so a fresh "Restart & Run All" reproduces the
# same row order; reset_index(drop=True) discards the old per-file index
# directly instead of materialising it as a column and dropping it afterwards.
data = (
    pd.concat([fake, real], axis=0)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)
data



Unnamed: 0,title,text,subject,date,label
0,An OBAMA “LOW LEVEL OFFENDER” Gets Early Relea...,According to the mother of the 35 year old mur...,politics,"Mar 4, 2016",0
1,Myriad of ways the CIA tried (and failed) to a...,RTThe father of the Cuban Revolution remains u...,Middle-east,"November 27, 2016",0
2,U.S. belatedly begins to comply with Russia sa...,WASHINGTON (Reuters) - The U.S. State Departme...,politicsNews,"October 26, 2017",1
3,Trump Literally Just Said He’d Take Guns Away...,In discussing shootings and crime in the first...,News,"September 26, 2016",0
4,There’s Something Hokey About Ted,"21st Century Wire says At some point, the poli...",US_News,"March 21, 2016",0
...,...,...,...,...,...
44893,"ASPIRATIONS: Young Chinese seize the day, seiz...",(Reuters) - The world is this generation s oys...,worldnews,"October 17, 2017",1
44894,HILARIOUS! Look Who LIBERAL Middlebury Profess...,"Two weeks ago at Middlebury College, Charles M...",politics,"Mar 14, 2017",0
44895,Senate to vote later on Wednesday to work with...,WASHINGTON (Reuters) - U.S. Senate Majority Le...,politicsNews,"December 6, 2017",1
44896,Trump pulls nearly even with Clinton after Rep...,NEW YORK (Reuters) - Republican presidential ...,politicsNews,"July 22, 2016",1


<h2>Cleaning and Preprocessing</h2>

In [3]:
def clean_text(text):
    """Normalise a raw article string for vectorisation.

    Steps, in order:
      1. lower-case the text;
      2. remove URLs (``http://...``, ``https://...`` and ``www. ...`` tokens);
      3. remove newline characters;
      4. remove square-bracketed spans such as ``[video]``;
      5. remove every character listed in ``string.punctuation``.

    Parameters
    ----------
    text : str
        The raw text to clean.

    Returns
    -------
    str
        The cleaned text. Whitespace surrounding removed tokens is kept.
    """
    text = text.lower()
    # `\S+` (not a literal "S+") consumes the rest of the URL, and the
    # alternation has no surrounding spaces, so a URL is matched wherever it
    # appears, including at the start or end of the string.
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    # NOTE: newlines are deleted rather than replaced by a space, so words
    # separated only by a line break are joined together.
    text = re.sub(r'\n', '', text)
    # Raw string avoids invalid-escape warnings for `\[` and `\]`.
    text = re.sub(r'\[.*?\]', '', text)
    # Delete ASCII punctuation in a single pass.
    return text.translate(str.maketrans('', '', string.punctuation))
    
# Run the same cleaning routine over both free-text columns.
for column in ('title', 'text'):
    data[column] = data[column].apply(clean_text)

# Display the frame's dimensions as the cell output.
data.shape


(44898, 5)

<h2>Test train split</h2>

In [4]:
# Features are the cleaned article bodies; the target is the real/fake label.
X, y = data["text"], data["label"]

# Hold out 20% of the rows for evaluation. random_state fixes the split so the
# reported metrics and the saved artefacts are reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

<h2>Vectorize text for model training</h2>

In [5]:
# TF-IDF features with English stop words removed and max_df set to 0.7.
vectorizer = TfidfVectorizer(
    max_df=0.7,
    stop_words="english",
)

# Fit on the training texts only, then reuse the fitted vectorizer to
# transform the held-out texts.
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

In [6]:
# Persist the fitted vectorizer to disk with joblib.
vectorizer_path = 'server/vectorizer.pkl'
joblib.dump(vectorizer, vectorizer_path)

['server/vectorizer.pkl']

<h2> Train and evaluate classifier models</h2>

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score

In [8]:
# Candidate models, each trained on the same TF-IDF features.
# The stochastic estimators are seeded so their reports are reproducible.
# The list order is significant: a later cell selects a model by position.
classifiers = [
    LogisticRegression(),
    DecisionTreeClassifier(random_state=42),
    RandomForestClassifier(random_state=42),
    LinearSVC(random_state=42),
]

for classifier in classifiers:
    classifier_name = classifier.__class__.__name__
    print(f"Training {classifier_name}...")
    classifier.fit(X_train_vectorized, y_train)

    # Evaluate on the held-out split. classification_report already includes
    # overall accuracy, so no separate accuracy value is computed.
    y_pred = classifier.predict(X_test_vectorized)
    print(f"\n {classifier_name} \n ")
    print(classification_report(y_test, y_pred))


Training LogisticRegression...

 LogisticRegression 
 
              precision    recall  f1-score   support

           0       0.99      0.98      0.98      4670
           1       0.98      0.98      0.98      4310

    accuracy                           0.98      8980
   macro avg       0.98      0.98      0.98      8980
weighted avg       0.98      0.98      0.98      8980

Training DecisionTreeClassifier...

 DecisionTreeClassifier 
 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      4670
           1       1.00      1.00      1.00      4310

    accuracy                           1.00      8980
   macro avg       1.00      1.00      1.00      8980
weighted avg       1.00      1.00      1.00      8980

Training RandomForestClassifier...

 RandomForestClassifier 
 
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      4670
           1       0.98      0.99      0.98      4310

 




 LinearSVC 
 
              precision    recall  f1-score   support

           0       1.00      0.99      1.00      4670
           1       0.99      1.00      0.99      4310

    accuracy                           1.00      8980
   macro avg       1.00      1.00      1.00      8980
weighted avg       1.00      1.00      1.00      8980



In [9]:
# NOTE(review): `load` is not used in the visible cells; the import is kept
# unchanged in case later cells rely on it.
from joblib import dump, load

# Pick the LinearSVC by type rather than by list position, so reordering or
# extending `classifiers` cannot silently persist a different model.
model = next(clf for clf in classifiers if isinstance(clf, LinearSVC))
dump(model, 'server/model.pkl')

['server/model.pkl']