# Classifying fake news using Supervised Learning with NLP

In [5]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

In [4]:
# Importing the dataset.
# The first CSV column is used as the DataFrame index (it appears as
# "Unnamed: 0" in the preview); the remaining columns are title, text and label.

df = pd.read_csv("fake_or_real_news.csv", index_col=[0])
df.head()

Unnamed: 0,title,text,label
8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


### Training the model

In [8]:
# Keep the target column (the FAKE / REAL label) in its own Series: y
y = df['label']
# Show the first two labels as a quick sanity check
y.head(2)

8476     FAKE
10294    FAKE
Name: label, dtype: object

In [14]:
# Split the article text and labels, holding out 33% for testing.
# The fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    y,
    test_size=0.33,
    random_state=53,
)

# Bag-of-words vectorizer that drops English stop words: count_vectorizer
count_vectorizer = CountVectorizer(stop_words='english')

# Learn the vocabulary from the training text only and encode it as token counts
count_train = count_vectorizer.fit_transform(X_train)

# Encode the test text with the vocabulary learned above (no refitting)
count_test = count_vectorizer.transform(X_test)

In [16]:
# Printing the first 50 features (vocabulary terms) of the count_vectorizer.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer get_feature_names_out() and fall back only on older releases.
count_feature_names = (
    count_vectorizer.get_feature_names_out()
    if hasattr(count_vectorizer, "get_feature_names_out")
    else count_vectorizer.get_feature_names()
)
# list(...) keeps the printed form a plain Python list in both cases
print(list(count_feature_names[:50]))

['00', '000', '0000', '00000031', '000035', '00006', '0001', '0001pt', '000ft', '000km', '001', '0011', '002', '003', '004', '006', '006s', '007', '007s', '008', '008s', '009', '0099', '00am', '00p', '00pm', '01', '010', '013', '014', '015', '016', '018', '01am', '02', '020', '022', '023', '024', '025', '027', '028', '02welcome', '03', '031', '032', '0325', '033', '034', '035']


## TfidfVectorizer for text classification

In [17]:
# Import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize a TfidfVectorizer object: tfidf_vectorizer
# English stop words are dropped; max_df=0.7 ignores terms that appear in
# more than 70% of the training documents.
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7)

# Fit on the training text and transform it: tfidf_train
tfidf_train = tfidf_vectorizer.fit_transform(X_train)

# Transform the test text with the already-fitted vectorizer: tfidf_test
tfidf_test = tfidf_vectorizer.transform(X_test)

# Print the first 10 features.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer get_feature_names_out() and fall back only on older releases.
tfidf_feature_names = (
    tfidf_vectorizer.get_feature_names_out()
    if hasattr(tfidf_vectorizer, "get_feature_names_out")
    else tfidf_vectorizer.get_feature_names()
)
print(list(tfidf_feature_names[:10]))

# Print the first 5 vectors of the tfidf training data.
# Slice the sparse matrix first so only those 5 rows are converted to a dense
# array, instead of densifying the whole matrix and then slicing.
print(tfidf_train[:5].toarray())

['00', '000', '0000', '00000031', '000035', '00006', '0001', '0001pt', '000ft', '000km']
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


## Inspecting the vectors

In [20]:
# Resolve the column labels for both vectorizers.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer get_feature_names_out() and fall back only on older releases.
count_columns = (
    count_vectorizer.get_feature_names_out()
    if hasattr(count_vectorizer, "get_feature_names_out")
    else count_vectorizer.get_feature_names()
)
tfidf_columns = (
    tfidf_vectorizer.get_feature_names_out()
    if hasattr(tfidf_vectorizer, "get_feature_names_out")
    else tfidf_vectorizer.get_feature_names()
)

# Create the CountVectorizer DataFrame: count_df
# NOTE: toarray() builds a dense copy of the full sparse matrix, which can be
# memory-hungry when the vocabulary is large.
count_df = pd.DataFrame(count_train.toarray(), columns=count_columns)

# Create the TfidfVectorizer DataFrame: tfidf_df
tfidf_df = pd.DataFrame(tfidf_train.toarray(), columns=tfidf_columns)

In [21]:
# Preview the first rows of the token-count features (one column per vocabulary term)
count_df.head()

Unnamed: 0,00,000,0000,00000031,000035,00006,0001,0001pt,000ft,000km,...,حلب,عربي,عن,لم,ما,محاولات,من,هذا,والمرضى,ยงade
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [22]:
# Preview the first rows of the TF-IDF features (one column per vocabulary term)
tfidf_df.head()

Unnamed: 0,00,000,0000,00000031,000035,00006,0001,0001pt,000ft,000km,...,حلب,عربي,عن,لم,ما,محاولات,من,هذا,والمرضى,ยงade
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [23]:
# Sanity check on the two vocabularies: collect every column name that exists
# in count_df but not in tfidf_df. An empty set means tfidf_df has at least
# all of count_df's columns (the check is one-directional).

# Columns only present in count_df: difference
difference = set(count_df.columns).difference(tfidf_df.columns)
print(difference)

set()


In [24]:
# Check whether the DataFrames are equal.
# DataFrame.equals compares the contents of the two frames; one holds integer
# counts and the other float TF-IDF weights, so this is expected to print False.
print(count_df.equals(tfidf_df))

False


# Naive Bayes Classifier

<img src="naive_b_nlp.png" width="600" height="300">

## Training and testing the "fake news" model with CountVectorizer

In [38]:
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes trained on the token counts: nb_classifier
# (fit() returns the estimator itself, so creation and fitting are chained)
nb_classifier = MultinomialNB().fit(count_train, y_train)

# Predicted labels for the held-out articles: pred
pred = nb_classifier.predict(count_test)

# Report the fraction of test articles that were labelled correctly
print(f'The accuracy score is {metrics.accuracy_score(y_test, pred)}')

# Report the confusion matrix with rows and columns ordered FAKE, REAL
print(f"The confusion matrix is \n {metrics.confusion_matrix(y_test, pred, labels=['FAKE','REAL'])}")

The accuracy score is 0.893352462936394
The confusion matrix is 
 [[ 865  143]
 [  80 1003]]


## Training and testing the "fake news" model with TfidfVectorizer

In [39]:
# Multinomial Naive Bayes trained on the TF-IDF features: nb_classifier
# (this rebinds nb_classifier, replacing the count-based model from the previous cell)
nb_classifier = MultinomialNB().fit(tfidf_train, y_train)

# Predicted labels for the held-out articles: pred
pred = nb_classifier.predict(tfidf_test)

# Fraction of test articles labelled correctly: score
score = metrics.accuracy_score(y_test, pred)
print(score)

# Confusion matrix with rows and columns ordered FAKE, REAL: cm
cm = metrics.confusion_matrix(y_test, pred, labels=['FAKE', 'REAL'])
print(cm)

0.8565279770444764
[[ 739  269]
 [  31 1052]]


# Improving the model

Testing a few different values of the smoothing parameter `alpha` with the TF-IDF vectors to see whether any of them performs better than the default used above.

In [40]:
# Creating the list of alphas: 0.0, 0.1, ..., 0.9
# Dividing an integer range by 10 gives the closest float to each decimal,
# whereas np.arange(0, 1, 0.1) builds each value as i * 0.1 and picks up
# rounding error (e.g. 0.30000000000000004 instead of 0.3).
# NOTE(review): the grid still starts at alpha=0 (no smoothing), which
# scikit-learn warns about -- consider starting at 0.1 instead.
alphas = np.arange(10) / 10

# Defining train_and_predict()
def train_and_predict(alpha):
    """Fit a MultinomialNB with the given smoothing and return its test accuracy.

    Reads the notebook-level ``tfidf_train``, ``y_train``, ``tfidf_test`` and
    ``y_test``; nothing outside the function is modified.

    Parameters
    ----------
    alpha : float
        Value forwarded to ``MultinomialNB(alpha=...)``.

    Returns
    -------
    The accuracy of the fitted model's predictions for ``tfidf_test``,
    measured against ``y_test``.
    """
    # fit() returns the estimator, so the model is built and trained in one step
    model = MultinomialNB(alpha=alpha).fit(tfidf_train, y_train)
    predictions = model.predict(tfidf_test)
    return metrics.accuracy_score(y_test, predictions)

# Sweep the alpha grid: report each alpha and the test accuracy it achieves,
# with a blank line separating consecutive results
for alpha in alphas:
    print('Alpha: ', alpha)
    print('Score: ', train_and_predict(alpha), end='\n\n')

Alpha:  0.0
Score:  0.8813964610234337

Alpha:  0.1
Score:  0.8976566236250598

Alpha:  0.2
Score:  0.8938307030129125

Alpha:  0.30000000000000004
Score:  0.8900047824007652

Alpha:  0.4
Score:  0.8857006217120995

Alpha:  0.5
Score:  0.8842659014825442

Alpha:  0.6000000000000001
Score:  0.874701099952176

Alpha:  0.7000000000000001
Score:  0.8703969392635102

Alpha:  0.8
Score:  0.8660927785748446

Alpha:  0.9
Score:  0.8589191774270684



  'setting alpha = %.1e' % _ALPHA_MIN)


### Inspecting the model

Now that I have built a "fake news" classifier, I'll investigate what it has learned. I'm going to map the important vector weights back to actual words using some simple inspection techniques.

In [47]:
# Getting the class labels (index 0 is printed as FAKE and index 1 as REAL below)
class_labels = nb_classifier.classes_

# Extracting the features.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer get_feature_names_out() and fall back only on older releases.
feature_names = (
    tfidf_vectorizer.get_feature_names_out()
    if hasattr(tfidf_vectorizer, "get_feature_names_out")
    else tfidf_vectorizer.get_feature_names()
)

# The coef_ attribute was removed from the naive Bayes estimators in
# scikit-learn 1.1. For a two-class MultinomialNB, coef_[0] was the same array
# as feature_log_prob_[1] (log-probability of each term given the second
# class), so read that attribute directly; the values are unchanged.
feature_weights = nb_classifier.feature_log_prob_[1]

# Zipping the weights together with the feature names and sorting by weight
# (ties are ordered by feature name)
feat_with_weights = sorted(zip(feature_weights, feature_names))

# Printing the first class label and the 20 lowest-weighted entries.
# NOTE(review): these are the terms least likely under the second class, which
# is not necessarily the same as the terms most characteristic of the first.
print(class_labels[0], feat_with_weights[:20])

print()

# Printing the second class label and the 20 highest-weighted entries
print(class_labels[1], feat_with_weights[-20:])

FAKE [(-11.316312804238807, '0000'), (-11.316312804238807, '000035'), (-11.316312804238807, '0001'), (-11.316312804238807, '0001pt'), (-11.316312804238807, '000km'), (-11.316312804238807, '0011'), (-11.316312804238807, '006s'), (-11.316312804238807, '007'), (-11.316312804238807, '007s'), (-11.316312804238807, '008s'), (-11.316312804238807, '0099'), (-11.316312804238807, '00am'), (-11.316312804238807, '00p'), (-11.316312804238807, '00pm'), (-11.316312804238807, '014'), (-11.316312804238807, '015'), (-11.316312804238807, '018'), (-11.316312804238807, '01am'), (-11.316312804238807, '020'), (-11.316312804238807, '023')]

REAL [(-7.742481952533027, 'states'), (-7.717550034444668, 'rubio'), (-7.703583809227384, 'voters'), (-7.654774992495461, 'house'), (-7.649398936153309, 'republicans'), (-7.6246184189367, 'bush'), (-7.616556675728881, 'percent'), (-7.545789237823644, 'people'), (-7.516447881078008, 'new'), (-7.448027933291952, 'party'), (-7.411148410203476, 'cruz'), (-7.410910239085596, 's