In [1]:
# Import libraries
import numpy as np
import pandas as pd
import itertools
import pickle
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [2]:
# Read the labelled news articles (title, text, FAKE/REAL label) into a DataFrame.
data = pd.read_csv(filepath_or_buffer='../Resources/news.csv')

In [3]:
# Preview the first five rows; the final column ('label') marks each article as FAKE or REAL.
data.head(n=5)

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [4]:
# Drop the leftover 'Unnamed: 0' id column from the CSV; only 'text' and 'label' are used below.
# Reassigning (instead of inplace=True) together with errors="ignore" keeps this
# cell idempotent: re-running it no longer raises KeyError once the column is gone.
data = data.drop(columns="Unnamed: 0", errors="ignore")

In [5]:
# Lowercase every article body using the vectorized string accessor.
# Unlike apply(lambda x: x.lower()), .str.lower() passes missing values (NaN)
# through instead of raising AttributeError; this cell runs before the
# dropna() further down, so missing texts may still be present here.
data['text'] = data['text'].str.lower()

In [6]:
# Preview the frame again: the id column should be gone and 'text' lowercased.
data.head()

Unnamed: 0,title,text,label
0,You Can Smell Hillary’s Fear,"daniel greenfield, a shillman journalism fello...",FAKE
1,Watch The Exact Moment Paul Ryan Committed Pol...,google pinterest digg linkedin reddit stumbleu...,FAKE
2,Kerry to go to Paris in gesture of sympathy,u.s. secretary of state john f. kerry said mon...,REAL
3,Bernie supporters on Twitter erupt in anger ag...,"— kaydee king (@kaydeeking) november 9, 2016 t...",FAKE
4,The Battle of New York: Why This Primary Matters,it's primary day in new york and front-runners...,REAL


In [7]:
# Dimensions of the DataFrame as a (rows, columns) tuple.
data.shape

(6335, 3)

In [8]:
# Target vector: the FAKE/REAL label of every article.
labels = data.loc[:, 'label']
labels.head()

0    FAKE
1    FAKE
2    REAL
3    FAKE
4    REAL
Name: label, dtype: object

In [9]:
# Sanity check: the label column should contain exactly two classes, FAKE and REAL.
pd.unique(data['label'])

array(['FAKE', 'REAL'], dtype=object)

In [10]:
# Remove any rows with missing values, then count the non-null entries per column
# as a further sanity check (every column should report the same number).
data = data.dropna()
data.count()

title    6335
text     6335
label    6335
dtype: int64

In [11]:
# Split into 80% train / 20% test with a fixed seed so the split is reproducible.
# The target is read from `data` here rather than the earlier `labels` Series:
# `labels` was captured before dropna(), so if any row was dropped the features
# and target would differ in length and train_test_split would raise ValueError.
x_train, x_test, y_train, y_test = train_test_split(
    data['text'], data['label'], test_size=0.2, random_state=7
)

In [12]:
# TF-IDF features: English stop words are removed, and terms that appear in more
# than 70% of the documents (max_df=0.7) are discarded.
# Docs: https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
tfidf_vectorizer = TfidfVectorizer(max_df=0.7, stop_words='english')

# The vocabulary and IDF weights are learned from the training texts only;
# the test texts are transformed with that already-fitted vectorizer.
tfidf_train = tfidf_vectorizer.fit_transform(x_train)
tfidf_test = tfidf_vectorizer.transform(x_test)

In [13]:
# Train a PassiveAggressiveClassifier on the TF-IDF training features.
# random_state pins the shuffling of the training data between epochs; without
# it the reported metrics change from one run of the notebook to the next.
pac = PassiveAggressiveClassifier(max_iter=50, random_state=7)
pac.fit(tfidf_train, y_train)

# Predict on the held-out test set, then report overall accuracy and the
# per-class precision / recall / F1.
y_pred = pac.predict(tfidf_test)
score = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
print(f'Accuracy: {round(score*100,2)}%')
print(f"Classification Report:\n{report}")

Accuracy: 92.9%
Classification Report:
              precision    recall  f1-score   support

        FAKE       0.93      0.92      0.93       638
        REAL       0.92      0.93      0.93       629

    accuracy                           0.93      1267
   macro avg       0.93      0.93      0.93      1267
weighted avg       0.93      0.93      0.93      1267



In [14]:
# Build confusion matrix
confusion_matrix(y_test,y_pred, labels=['FAKE','REAL'])

# Rows are the true labels and columns the predicted labels, both ordered FAKE, REAL.
# Treating FAKE as the positive class, the layout is:
# [True Positives,  False Negatives],
# [False Positives, True Negatives]

array([[589,  49],
       [ 41, 588]], dtype=int64)

In [15]:
# Peek at the first test labels; the index values identify the corresponding rows of `data`.
y_test.head()

3534    REAL
6265    FAKE
3123    REAL
3940    REAL
2856    REAL
Name: label, dtype: object

In [16]:
# Show the full text of two test articles (one REAL, one FAKE).
# Set the Pandas column width not to truncate the long text lines.
pd.set_option('display.max_colwidth', None)
# 3534 and 6265 are index *labels* taken from y_test, so select them with the
# label-based .loc; positional .iloc only returns the same rows while the index
# is an unbroken 0..n-1 range (i.e. no row was ever dropped).
data.loc[[3534, 6265], :]

Unnamed: 0,title,text,label
3534,"Hillary Clinton, Bernie Sanders: King's legacy is alive","a day after the candidates squared off in a fiery debate, they came to columbia, south carolina, and largely agreed that while king's impact can still be felt today, work still needs to be done to guarantee racial equality.\r\n\r\n""yes, the challenges we face are many, but so are the quiet heroes working in every corner of america today doing their part to make our country a better place,"" said the former secretary of state. ""i for one receive much inspiration from that simple fact.""\r\n\r\nthere was symbolism in the event organized by the naacp: in front of a statehouse that flew the confederate battle flag until it was taken down last year. all three candidates noted the flag being removed.\r\n\r\n""the flag is down but we are still here because that flag was just one piece of something bigger,"" clinton said. ""dr. king died with his work unfinished and it is up to us to see through.""\r\n\r\nsanders argued that king is not just a historic figure, but someone whose moral compass should guide people today. repeating the phrase ""i think if he were here today,"" sanders argued that if king were alive today, he would be supporting many of his presidential positions. ""as we celebrate his life it is terribly important to me that we don't just look at him as a museum figure, somebody in the past,"" the vermont senator said. ""it is important to me that we look at his vision, to see the america he wanted to see."" sanders and o'malley walked in the naacp sponsored march before the event, strolling down the streets of charleston as activists chanted. o'malley, whose birthday is monday, laughed when asked what he wanted for his birthday, telling reporters that he is hoping for ""beat expectations"" in iowa for his birthday.",REAL
6265,VIDEO : FBI SOURCES SAY INDICTMENT LIKELY FOR CLINTON – TruthFeed,"video : fbi sources say indictment likely for clinton video : fbi sources say indictment likely for clinton videos by truthfeednews november 3, 2016 \r\nbret baier: here’s the deal: we talked to two separate sources with intimate knowledge of the fbi investigations. one: the clinton foundation investigation is far more expansive than anybody has reported so far… several offices separately have been doing their own investigations. \r\ntwo: the immunity deal that cheryl mills and heather samuelson, two top aides to hillary clinton, got from the justice department in which it was beleived that the laptops they had, after a narrow review for classified materials, were going to be destroyed. we have been told that those have not been destroyed — they are at the fbi field office here on washington and are being exploited. . \r\nthree: the clinton foundation investigation is so expansive, they have interviewed and re-interviewed many people. they described the evidence they have as ‘a lot of it’ and said there is an ‘avalanche coming in every day.’ wikileaks and the new emails. \r\nthey are “actively and aggressively pursuing this case.” remember the foundation case is about accusations of pay-for-play… they are taking the new information and some of them are going back to interview people for the third time. as opposed to what has been written about the clinton foundation investigation, it is expansive. \r\nthe classified e-mail investigation is being run by the national security division of the fbi. they are currently combing through anthony weiner’s laptop. they are having some success — finding what they believe to be new emaisls, not duplicates, that have been transported through hillary clinton’s server. \r\nfinally, we learned there is a confidence from these sources that her server had been hacked. 
and that it was a 99% accuracy that it had been hacked by at least five foreign intelligence agencies, and that things had been taken from that… \r\nthere has been some angst about attorney general loretta lynch — what she has done or not done. she obviously did not impanel, or go to a grand jury at the beginning. they also have a problem, these sources do, with what president obama said today and back in october of 2015… \r\ni pressed again and again on this very issue… the investigations will continue, there is a lot of evidence. and barring some obstruction in some way, they believe they will continue to likely an indictment. \r\nsupport the trump movement and help us fight liberal media bias. please like and share this story on facebook or twitter.",FAKE


In [17]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [18]:
# Create a logistic regression model as a second classifier.
# max_iter=50 caps the solver's iterations (scikit-learn's default is 100).
lr = LogisticRegression(max_iter=50)

In [19]:
# Train the model on the same TF-IDF training features and labels as the first classifier.
lr.fit(tfidf_train, y_train)


In [20]:
# Make predictions on the testing set (stored in y_pred2, so y_pred from the first model is kept).
y_pred2 = lr.predict(tfidf_test)

In [21]:
# Evaluate the model: overall accuracy plus per-class precision / recall / F1.
# Note: this rebinds `score` and `report`, replacing the first classifier's values.
score = accuracy_score(y_test, y_pred2)
report = classification_report(y_test, y_pred2)

In [22]:
# Print the logistic-regression accuracy (as a percentage) and the full report.
accuracy_pct = round(score * 100, 2)
print(f'Accuracy: {accuracy_pct}%')
print(f"Classification Report:\n{report}")

Accuracy: 91.71%
Classification Report:
              precision    recall  f1-score   support

        FAKE       0.90      0.94      0.92       638
        REAL       0.94      0.89      0.91       629

    accuracy                           0.92      1267
   macro avg       0.92      0.92      0.92      1267
weighted avg       0.92      0.92      0.92      1267



In [23]:
# Build confusion matrix
confusion_matrix(y_test,y_pred2, labels=['FAKE','REAL'])

# Rows are the true labels and columns the predicted labels, both ordered FAKE, REAL.
# Treating FAKE as the positive class, the layout is:
# [True Positives,  False Negatives],
# [False Positives, True Negatives]

array([[600,  38],
       [ 67, 562]], dtype=int64)

In [24]:
# Class balance of the test set: how many of the tested samples are FAKE and how many REAL.
# These counts correspond to the 'support' column of the classification reports.
y_test.value_counts()

label
FAKE    638
REAL    629
Name: count, dtype: int64

In [26]:
# Save the trained models and the fitted vectorizer with the pickle library.
# See https://machinelearningmastery.com/save-load-machine-learning-models-python-scikit-learn/ and https://python.hotexamples.com/examples/sklearn.linear_model/PassiveAggressiveClassifier/-/python-passiveaggressiveclassifier-class-examples.html
saved_model = '../app/model/pac_model.pkl'
saved_vectorizer = '../app/model/tfidf_vectorizer.pkl'
saved_model2 = '../app/model/lr_model2.pkl'

# Pickle requires binary mode ('wb'). Each file is opened in a `with` block so
# the handle is closed (and the bytes flushed to disk) as soon as the dump
# finishes, rather than whenever the handle happens to be garbage-collected.
for artifact, path in (
    (pac, saved_model),
    (tfidf_vectorizer, saved_vectorizer),
    (lr, saved_model2),
):
    with open(path, 'wb') as fh:
        pickle.dump(artifact, fh)
