## NLP Assignment #1: Fake News Detection
##### By: Areknaz Khaligian & Conrad Lee

### Import Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from nltk.classify import MaxentClassifier
import nltk.corpus
import string
import re
from nltk.stem import SnowballStemmer
from nltk.stem.snowball import EnglishStemmer
from nltk.stem import WordNetLemmatizer
import spacy
import en_core_web_sm
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS as stopwords 
from spacy.lang.en import English

### Import the Data

In [2]:
train = pd.read_csv("fake_or_real_news_training.csv")
train.head()

Unnamed: 0,ID,title,text,label,X1,X2
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE,,
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE,,
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL,,
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE,,
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL,,


In [3]:
test = pd.read_csv("fake_or_real_news_test.csv")
test.head()

Unnamed: 0,ID,title,text
0,10498,September New Homes Sales Rise——-Back To 1992 ...,September New Homes Sales Rise Back To 1992 Le...
1,2439,Why The Obamacare Doomsday Cult Can't Admit It...,But when Congress debated and passed the Patie...
2,864,"Sanders, Cruz resist pressure after NY losses,...",The Bernie Sanders and Ted Cruz campaigns vowe...
3,4128,Surviving escaped prisoner likely fatigued and...,Police searching for the second of two escaped...
4,662,Clinton and Sanders neck and neck in Californi...,No matter who wins California's 475 delegates ...


### Check the 'label' column for incorrect values (not REAL or FAKE)

In [4]:
train['label'].value_counts()

REAL                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    

It looks like there are some misplaced values in the 'label' column that we will need to correct. Our method is to take the extra text in the label, X1, and X2 columns and concatenate it with the existing text in the 'text' column, then fill the 'label' column with the correct REAL or FAKE class.

### Create a copy of the train dataset

In [5]:
clean_train = train.copy()

### Clean the data columns

In [6]:
for i in np.arange(0,len(clean_train)):
    if (clean_train.loc[i]['X2'] == 'REAL' or clean_train.loc[i]['X2'] == 'FAKE'):
        clean_train.loc[i, 'text'] = clean_train.loc[i]['text'] + ' ' + clean_train.loc[i]['label'] + ' ' + clean_train.loc[i]['X1']
        clean_train.loc[i, 'label'] = clean_train.loc[i]['X2']
        clean_train.loc[i, 'X1'] = None
        clean_train.loc[i, 'X2'] = None
    elif (clean_train.loc[i]['X1'] == 'REAL' or clean_train.loc[i]['X1'] == 'FAKE'):
        clean_train.loc[i, 'text'] = clean_train.loc[i]['text'] + ' ' + clean_train.loc[i]['label']
        clean_train.loc[i, 'label'] = clean_train.loc[i]['X1']
        clean_train.loc[i, 'X1'] = None
        clean_train.loc[i, 'X2'] = None
    else:
        pass

### Confirm that the 'label' column contains only REAL or FAKE values

Count existing real/fake values.

In [7]:
clean_train['label'].value_counts()

REAL    2008
FAKE    1991
Name: label, dtype: int64

### Replace the original train dataset with the clean version

In [8]:
train = clean_train

In [9]:
train['label'].value_counts()

REAL    2008
FAKE    1991
Name: label, dtype: int64

### Assign the training labels to variable y

In [10]:
# set y to target column
y = train["label"]
y.head()

0    FAKE
1    FAKE
2    REAL
3    FAKE
4    REAL
Name: label, dtype: object

### 1.0 - Baseline Model (Naive Bayes & CountVectorizer with English Stopwords)

First, we decided to test a baseline model with the Naive Bayes Classifier, Count Vectorizer, and removing English stopwords.  This gives us score than we can use to compare against more complex models and pre-processing techniques.

### Experimenting with different column combinations

One thing we can try is using different combinations of title and text columns to see if it improves the accuracy of the classifier.

#### Creating New Column (title_text)

Here we concatenate the title and text column to create a new "title_text" variable.

In [11]:
# create title_text by strining together title and text
train["title_text"] = train["title"] + " " + train["text"]
test["title_text"] = test["title"] + " " + test["text"]

In [12]:
train.head()

Unnamed: 0,ID,title,text,label,X1,X2,title_text
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE,,,You Can Smell Hillary’s Fear Daniel Greenfield...
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE,,,Watch The Exact Moment Paul Ryan Committed Pol...
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL,,,Kerry to go to Paris in gesture of sympathy U....
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE,,,Bernie supporters on Twitter erupt in anger ag...
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL,,,The Battle of New York: Why This Primary Matte...


In [13]:
test.head()

Unnamed: 0,ID,title,text,title_text
0,10498,September New Homes Sales Rise——-Back To 1992 ...,September New Homes Sales Rise Back To 1992 Le...,September New Homes Sales Rise——-Back To 1992 ...
1,2439,Why The Obamacare Doomsday Cult Can't Admit It...,But when Congress debated and passed the Patie...,Why The Obamacare Doomsday Cult Can't Admit It...
2,864,"Sanders, Cruz resist pressure after NY losses,...",The Bernie Sanders and Ted Cruz campaigns vowe...,"Sanders, Cruz resist pressure after NY losses,..."
3,4128,Surviving escaped prisoner likely fatigued and...,Police searching for the second of two escaped...,Surviving escaped prisoner likely fatigued and...
4,662,Clinton and Sanders neck and neck in Californi...,No matter who wins California's 475 delegates ...,Clinton and Sanders neck and neck in Californi...


#### Naive Bayes + title only

Here is the result with the NB Classifier, Count Vectorizer (english stopwords), and only the title variable field.

In [14]:
# train test split
X_train, X_test, y_train, y_test = train_test_split(
    train["title"], y, test_size=0.2, random_state=42
)

In [15]:
# define count vectorizer with english stopwords
count_vectorizer = CountVectorizer(stop_words="english")

# train model on X_train
count_train = count_vectorizer.fit_transform(X_train.values)

# transform X_test to match X_train
count_test = count_vectorizer.transform(X_test.values)

# define Naive Bayes Classifier
nb_classifier = MultinomialNB()

# train NB
nb_classifier.fit(count_train, y_train)

# predict
pred = nb_classifier.predict(count_test)

# return accuracy score
nb_title_cvect_score = metrics.accuracy_score(y_test, pred)
nb_title_cvect_score

0.83

#### Naive Bayes + text only

Here is the result with the NB Classifier, Count Vectorizer (english stopwords), and only the text variable field.     
Note that the score has increased.  This was to be expected because there is more information to train on compared to the shorter title variable.

In [16]:
# train test split
X_train, X_test, y_train, y_test = train_test_split(
    train["text"], y, test_size=0.2, random_state=42
)

In [17]:
# define count vectorizer with english stopwords
count_vectorizer = CountVectorizer(stop_words="english")

# train model on X_train
count_train = count_vectorizer.fit_transform(X_train.values)

# transform X_test to match X_train
count_test = count_vectorizer.transform(X_test.values)

# define Naive Bayes Classifier
nb_classifier = MultinomialNB()

# train NB
nb_classifier.fit(count_train, y_train)

# predict
pred = nb_classifier.predict(count_test)

# return accuracy score
nb_text_cvect_score = metrics.accuracy_score(y_test, pred)
nb_text_cvect_score

0.88375

#### Naive Bayes + title_text

Here is the result with the NB Classifier, Count Vectorizer (english stopwords), and the combined title_text variable field.     
Once again, the score improved (incrementally) due to the increase in information.  This smaller margin of improvement is reasonable since the amount of increased information from the title is marginally less given that we already have a majority of the information from the text.

In [18]:
# train test split
X_train, X_test, y_train, y_test = train_test_split(
    train["title_text"], y, test_size=0.2, random_state=42
)

In [19]:
# define count vectorizer with english stopwords
count_vectorizer = CountVectorizer(stop_words="english")

# train model on X_train
count_train = count_vectorizer.fit_transform(X_train.values)

# transform X_test to match X_train
count_test = count_vectorizer.transform(X_test.values)

# define Naive Bayes Classifier
nb_classifier = MultinomialNB()

# train NB
nb_classifier.fit(count_train, y_train)

# predict
pred = nb_classifier.predict(count_test)

# return accuracy score
nb_title_text_cvect_score = metrics.accuracy_score(y_test, pred)
nb_title_text_cvect_score

0.88875

### Results/Comments

Here we see that combining title and text give the highest accuracy.       
We will store the title_text score as the representative socre for the baseline model and we will use the title_text field for training the remainder of the models.

In [20]:
# organize scores in a dataframe
nb_cvect_scores = pd.DataFrame(
    data=[
        nb_title_cvect_score,
        nb_text_cvect_score,
        nb_title_text_cvect_score
    ],
    index=["title", "text", "title_text"],
    columns=["nb_cvect_scores"],
)
nb_cvect_scores

Unnamed: 0,nb_cvect_scores
title,0.83
text,0.88375
title_text,0.88875


In [21]:
# store title text score as the main count vectorizer score
nb_cvect_score = nb_title_text_cvect_score

In [22]:
# train test split (to be used for the remainder of the notebook)
X_train, X_test, y_train, y_test = train_test_split(
    train["title_text"], y, test_size=0.2, random_state=42
)

### 2.0 - Naive Bayes: Count vs. TFIDF vs. Hashing Vectorizer

Next, we compare the Naive Bayes Classifier with different vectorizers.

- Count Vectorizer: this vectorizer converts the text documents to a token count matrix where each row represents a document and each column represents a token.  The values are the token counts for each document.
- TFIDF Vectorizer: this vectorizer works similarly to the Count Vectorizer except instead of a basic token count for each value, it uses a weighted term frequency aspect to penalize tokens which are very common.
- Hashing Vectorizer: the difference with this vectorizer is that the tokens are encoded with a hashing function.  This is a more efficient way to store and access the feature names.  That is, it should increase computation speed compared to the Count Vectorizer, but there is a chance for information loss if multiple features are mapped to the same hash code.

In [23]:
# define tfidf vectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words = 'english')

# train model on X_train
tfidf_train = tfidf_vectorizer.fit_transform(X_train.values)

# transform X_test to match X_train
tfidf_test = tfidf_vectorizer.transform(X_test.values)

# define Naive Bayes Classifier
nb_classifier = MultinomialNB()

# train NB 
nb_classifier.fit(tfidf_train, y_train)

# predict
pred = nb_classifier.predict(tfidf_test)

# save accuracy score
nb_tvect_score = metrics.accuracy_score(y_test, pred)
nb_tvect_score

0.82125

In [24]:
# define hash vectorizer
hash_vectorizer = HashingVectorizer(stop_words="english", non_negative=True)

# train model on X_train
hash_train = hash_vectorizer.fit_transform(X_train.values)

# transform X_test to match X_train
hash_test = hash_vectorizer.transform(X_test.values)

# define Naive Bayes Classifier
nb_classifier = MultinomialNB()

# train NB
nb_classifier.fit(hash_train, y_train)

# predict
pred = nb_classifier.predict(hash_test)

# save accuracy score
nb_hvect_score = metrics.accuracy_score(y_test, pred)
nb_hvect_score



0.81625

### Results/Comments

Here we see that the basic Count Vectorizer works best with the NB Classifier.

In [25]:
# organize scores in a dataframe
nb_scores = pd.DataFrame(
    data=[nb_cvect_score, nb_tvect_score, nb_hvect_score],
    index=["count", "tfidf", "hash"],
    columns=["nb_scores"],
)
nb_scores

Unnamed: 0,nb_scores
count,0.88875
tfidf,0.82125
hash,0.81625


### 2.1 - Optimizing Naive Bayes

We can optimize the Naive Bayes "alpha" smoothing parameter to improve the performance.

#### Count Vectorizer

In [26]:
# Create the list of alphas to try
parameters = {'alpha':np.arange(0,1,0.05)}

# define optimized Naive Bayes Classifier
nb_classifier = MultinomialNB()

# GridSearchCV
nb_cv = GridSearchCV(nb_classifier, parameters, cv=3)

# train NB (using count_train from previously defined Count Vectorizer)
nb_cv.fit(count_train, y_train)

# predict
pred = nb_cv.predict(count_test)

# save accuracy score
nb_opt_cvect_score = metrics.accuracy_score(y_test, pred)
nb_opt_cvect_score

# save optimal parameters
nb_opt_cvect_params = nb_cv.best_params_
nb_opt_cvect_params

  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)


{'alpha': 0.05}

#### TFIDF Vectorizer

In [27]:
# Create the list of alphas to try
parameters = {'alpha':np.arange(0,1,0.05)}

# define optimized Naive Bayes Classifier
nb_classifier = MultinomialNB()

# GridSearchCV
nb_cv = GridSearchCV(nb_classifier, parameters, cv=3)

# train NB (using tfidf_train from previously defined TFIDF Vectorizer)
nb_cv.fit(tfidf_train, y_train)

# predict
pred = nb_cv.predict(tfidf_test)

# save accuracy score
nb_opt_tvect_score = metrics.accuracy_score(y_test, pred)
nb_opt_tvect_score

# save optimal parameters
nb_opt_tvect_params = nb_cv.best_params_
nb_opt_tvect_params

  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)


{'alpha': 0.05}

#### Hash Vectorizer

In [28]:
# Create the list of alphas to try
parameters = {'alpha':np.arange(0,1,0.05)}

# define optimized Naive Bayes Classifier
nb_classifier = MultinomialNB()

# GridSearchCV
nb_cv = GridSearchCV(nb_classifier, parameters, cv=3)

# train NB (using hash_train from previously defined Hash Vectorizer)
nb_cv.fit(hash_train, y_train)

# predict
pred = nb_cv.predict(hash_test)

# save accuracy score
nb_opt_hvect_score = metrics.accuracy_score(y_test, pred)
nb_opt_hvect_score

# save optimal parameters
nb_opt_hvect_params = nb_cv.best_params_
nb_opt_hvect_params

  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)


{'alpha': 0.05}

### Results/Comments

After optimizing the NB Classifier, the TFIDF Vectorizer performs slightly better than Count Vectorizer.

In [29]:
# organize scores in a dataframe
nb_opt_scores = pd.DataFrame(
    data=[nb_opt_cvect_score, nb_opt_tvect_score, nb_opt_hvect_score],
    index=["count", "tfidf", "hash"],
    columns=["nb_opt_scores"],
)
nb_opt_scores

Unnamed: 0,nb_opt_scores
count,0.9
tfidf,0.90125
hash,0.8825


### 3.0 - MaxEnt (Logistic Reg): Count vs. TFIDF vs. Hashing Vectorizer

Next, we try a new model the Maximum Entropy Classifier (this gives virtually the same results as a Logistic Regression, so we will use a Logistic Regression model as our MaxEnt Classifier).

In [30]:
# define MaxEnt (logistic Regression) Model
maxent = LogisticRegression()

# train based on previously defined count vectorizer train data
maxent.fit(count_train, y_train)

# predict
pred = maxent.predict(count_test)

# save accuracy score
maxent_cvect_score = metrics.accuracy_score(y_test, pred)
maxent_cvect_score



0.90875

In [31]:
# define MaxEnt (logistic Regression) Model
maxent = LogisticRegression()

# train based on previously defined tfidf vectorizer train data
maxent.fit(tfidf_train, y_train)

# predict
pred = maxent.predict(tfidf_test)

# save accuracy score
maxent_tvect_score = metrics.accuracy_score(y_test, pred)
maxent_tvect_score

0.89375

In [32]:
# define MaxEnt (logistic Regression) Model
maxent = LogisticRegression()

# train based on previously defined hash vectorizer train data
maxent.fit(hash_train, y_train)

# predict
pred = maxent.predict(hash_test)

# save accuracy score
maxent_hvect_score = metrics.accuracy_score(y_test, pred)
maxent_hvect_score

0.87875

### Results/Comments

Similarly to the NB Classifier,  we see that the Count Vectorizer works best with the basic MaxEnt Classifier.

In [33]:
# organize scores in a dataframe
maxent_scores = pd.DataFrame(
    data=[maxent_cvect_score, maxent_tvect_score, maxent_hvect_score],
    index=["count", "tfidf", "hash"],
    columns=["maxent_scores"],
)
maxent_scores

Unnamed: 0,maxent_scores
count,0.90875
tfidf,0.89375
hash,0.87875


### 3.1 - Optimizing MaxEnt (Vectorizer)

Here we can optimize the "C" regularization parameter to improve the model.

#### Count Vectorizer

In [34]:
# Create the list of alphas to try
parameters = {'C':[0.001,0.01,0.1,1,10,100,1000]}

# define MaxEnt (logistic Regression) Model
maxent = LogisticRegression()

# GridSearchCV
maxent_cv = GridSearchCV(maxent, parameters, cv=3)

# train MaxEnt (using count_train from previously defined Count Vectorizer)
maxent_cv.fit(count_train, y_train)

# predict
pred = maxent_cv.predict(count_test)

# save accuracy score
maxent_opt_cvect_score = metrics.accuracy_score(y_test, pred)
maxent_opt_cvect_score

# save optimal parameters
maxent_opt_cvect_params = maxent_cv.best_params_
maxent_opt_cvect_params



{'C': 0.1}

#### TFIDF Vectorizer

In [35]:
# Create the list of alphas to try
parameters = {'C':[0.001,0.01,0.1,1,10,100,1000]}

# define MaxEnt (logistic Regression) Model
maxent = LogisticRegression()

# GridSearchCV
maxent_cv = GridSearchCV(maxent, parameters, cv=3)

# train MaxEnt (using tfidf_train from previously defined TFIDF Vectorizer)
maxent_cv.fit(tfidf_train, y_train)

# predict
pred = maxent_cv.predict(tfidf_test)

# save accuracy score
maxent_opt_tvect_score = metrics.accuracy_score(y_test, pred)
maxent_opt_tvect_score

# save optimal parameters
maxent_opt_tvect_params = maxent_cv.best_params_
maxent_opt_tvect_params



{'C': 100}

#### Hash Vectorizer

In [36]:
# Create the list of alphas to try
parameters = {'C':[0.001,0.01,0.1,1,10,100,1000]}

# define MaxEnt (logistic Regression) Model
maxent = LogisticRegression()

# GridSearchCV
maxent_cv = GridSearchCV(maxent, parameters, cv=3)

# train MaxEnt (using hash_train from previously defined Hash Vectorizer)
maxent_cv.fit(hash_train, y_train)

# predict
pred = maxent_cv.predict(hash_test)

# save accuracy score
maxent_opt_hvect_score = metrics.accuracy_score(y_test, pred)
maxent_opt_hvect_score

# save optimal parameters
maxent_opt_hvect_params = maxent_cv.best_params_
maxent_opt_hvect_params



{'C': 1000}

### Results/Comments

Once again, the TFIDF Vectorizer outperforms the Count Vectorizer when the MaxEnt Classifier is optimized.  

In [37]:
# organize scores in a dataframe
maxent_opt_scores = pd.DataFrame(
    data=[maxent_opt_cvect_score, maxent_opt_tvect_score, maxent_opt_hvect_score],
    index=["count", "tfidf", "hash"],
    columns=["maxent_opt_scores"],
)
maxent_opt_scores

Unnamed: 0,maxent_opt_scores
count,0.91125
tfidf,0.9325
hash,0.91875


### 4.0 - SVM: Count vs. TFIDF vs. Hashing Vectorizer

Now, we try the SVM Classifier.

In [38]:
# Define linear SVM model
linear_svm = svm.LinearSVC()

# train on pre-defined count vectorizer train data
linear_svm.fit(count_train, y_train)

# predict
pred = linear_svm.predict(count_test)

# save score
svm_cvect_score = metrics.accuracy_score(y_test, pred)
svm_cvect_score



0.88625

In [39]:
# Define linear SVM model
linear_svm = svm.LinearSVC()

# train on pre-defined tfidf vectorizer train data
linear_svm.fit(tfidf_train, y_train)

# predict
pred = linear_svm.predict(tfidf_test)

# save score
svm_tvect_score = metrics.accuracy_score(y_test, pred)
svm_tvect_score

0.93

In [40]:
# Define linear SVM model
linear_svm = svm.LinearSVC()

# train on pre-defined hash vectorizer train data
linear_svm.fit(hash_train, y_train)

# predict
pred = linear_svm.predict(hash_test)

# save score
svm_hvect_score = metrics.accuracy_score(y_test, pred)
svm_hvect_score

0.92625

### Results/Comments

Here the TFIDF Vectorizer performs best.

In [41]:
# organize scores in a dataframe
svm_scores = pd.DataFrame(
    data=[svm_cvect_score, svm_tvect_score, svm_hvect_score],
    index=["count", "tfidf", "hash"],
    columns=["svm_scores"],
)
svm_scores

Unnamed: 0,svm_scores
count,0.88625
tfidf,0.93
hash,0.92625


### 4.1 - Opitimzing SVM 

Again, SVM has a "C" regularization parameter that we can optimize.

#### Count Vectorizer

In [42]:
# Create the list of alphas to try
parameters = {'C':[0.001,0.01,0.1,1,10,100,1000]}

# define the linear SVM Model
linear_svm = svm.LinearSVC()

# GridSearchCV
svm_cv = GridSearchCV(linear_svm, parameters, cv=3)

# train SVM (using count_train from previously defined Count Vectorizer)
svm_cv.fit(count_train, y_train)

# predict
pred = svm_cv.predict(count_test)

# save accuracy score
svm_opt_cvect_score = metrics.accuracy_score(y_test, pred)
svm_opt_cvect_score

# save optimal parameters
svm_opt_cvect_params = svm_cv.best_params_
svm_opt_cvect_params



{'C': 0.01}

#### TFIDIF Vectorizer

In [43]:
# Create the list of alphas to try
parameters = {'C':[0.001,0.01,0.1,1,10,100,1000]}

# define the linear SVM Model
linear_svm = svm.LinearSVC()

# GridSearchCV
svm_cv = GridSearchCV(linear_svm, parameters, cv=3)

# train SVM (using tfidf_train from previously defined TFIDF Vectorizer)
svm_cv.fit(tfidf_train, y_train)

# predict
pred = svm_cv.predict(tfidf_test)

# save accuracy score
svm_opt_tvect_score = metrics.accuracy_score(y_test, pred)
svm_opt_tvect_score

# save optimal parameters
svm_opt_tvect_params = svm_cv.best_params_
svm_opt_tvect_params



{'C': 1000}

#### Hash Vectorizer

In [44]:
# Create the list of alphas to try
parameters = {'C':[0.001,0.01,0.1,1,10,100,1000]}

# define the linear SVM Model
linear_svm = svm.LinearSVC()

# GridSearchCV
svm_cv = GridSearchCV(linear_svm, parameters, cv=3)

# train SVM (using hash_train from previously defined Hash Vectorizer)
svm_cv.fit(hash_train, y_train)

# predict
pred = svm_cv.predict(hash_test)

# save accuracy score
svm_opt_hvect_score = metrics.accuracy_score(y_test, pred)
svm_opt_hvect_score

# save optimal parameters
svm_opt_hvect_params = svm_cv.best_params_
svm_opt_hvect_params



{'C': 1}

### Results/Comments

The TFIDF Vectorizer still performs best when optimizing the SVM Model

In [45]:
# organize scores in a dataframe
svm_opt_scores = pd.DataFrame(
    data=[svm_opt_cvect_score, svm_opt_tvect_score, svm_opt_hvect_score],
    index=["count", "tfidf", "hash"],
    columns=["svm_opt_scores"],
)
svm_opt_scores

Unnamed: 0,svm_opt_scores
count,0.905
tfidf,0.9375
hash,0.92625


### 5.0 - Passive Aggressive: Count vs. TFIDF vs Hashing Vectorizer

The Passive Aggressive Classifier works similarly to SVM in terms of the cost function.  But the process of updating it takes the "passive-aggressive" approach.  When the classifier makes a correct decision, it acts "passively" and does not make any adjustments to the weight vector, but when the classifier makes an incorrect decision, it acts aggresively and will continue making small adjustments in the weight vector until the decision is correct (but the adjustments must be close to the previous weight vector to not lose the information gained from the previous iterations).  

https://www.bonaccorso.eu/2017/10/06/ml-algorithms-addendum-passive-aggressive-algorithms/

In [46]:
# Define Passive Aggresive Classifier
linear_clf = PassiveAggressiveClassifier()

# train model using pre-defined count vectorizer train data
linear_clf.fit(count_train, y_train)

# predict
pred = linear_clf.predict(count_test)

# save score
pa_cvect_score = metrics.accuracy_score(y_test, pred)
pa_cvect_score



0.88625

In [47]:
# Define Passive Aggresive Classifier
linear_clf = PassiveAggressiveClassifier()

# train model using pre-defined tfidf vectorizer train data
linear_clf.fit(tfidf_train, y_train)

# predict
pred = linear_clf.predict(tfidf_test)

# save score
pa_tvect_score = metrics.accuracy_score(y_test, pred)
pa_tvect_score

0.93875

In [48]:
# Define Passive Aggresive Classifier
linear_clf = PassiveAggressiveClassifier()

# train model using pre-defined hash vectorizer train data
linear_clf.fit(hash_train, y_train)

# predict
pred = linear_clf.predict(hash_test)

# save score
pa_hvect_score = metrics.accuracy_score(y_test, pred)
pa_hvect_score

0.90875

### Results/Comments

Again, TFIDF Vectorizer performs best, and the Passive Aggressive Classifier was able to out perform SVM!

In [49]:
# organize scores in a dataframe
pa_scores = pd.DataFrame(
    data=[pa_cvect_score, pa_tvect_score, pa_hvect_score],
    index=["count", "tfidf", "hash"],
    columns=["pa_scores"],
)
pa_scores

Unnamed: 0,pa_scores
count,0.88625
tfidf,0.93875
hash,0.90875


### 5.1 - Optimizing Passive Aggressive Model 

Again, we optimize the "C" regularization parameter.

#### Count Vectorizer

In [50]:
# Create the list of alphas to try
parameters = {'C':[0.001,0.01,0.1,1,10,100,1000]}

# Define Passive Aggresive Classifier
linear_clf = PassiveAggressiveClassifier()

# GridSearchCV
pa_cv = GridSearchCV(linear_clf, parameters, cv=3)

# train Passive Aggressive (using count_train from previously defined Count Vectorizer)
pa_cv.fit(count_train, y_train)

# predict
pred = pa_cv.predict(count_test)

# save accuracy score 
pa_opt_cvect_score = metrics.accuracy_score(y_test, pred)
pa_opt_cvect_score

# save optimal parameters
pa_opt_cvect_params = pa_cv.best_params_
pa_opt_cvect_params



{'C': 0.001}

#### TFIDF Vectorizer


In [51]:
# Create the list of alphas to try
parameters = {'C':[0.001,0.01,0.1,1,10,100,1000]}

# Define Passive Aggresive Classifier
linear_clf = PassiveAggressiveClassifier()

# GridSearchCV
pa_cv = GridSearchCV(linear_clf, parameters, cv=3)

# train Passive Aggressive (using tfidf_train from previously defined TFIDF Vectorizer)
pa_cv.fit(tfidf_train, y_train)

# predict
pred = pa_cv.predict(tfidf_test)

# save accuracy score
pa_opt_tvect_score = metrics.accuracy_score(y_test, pred)
pa_opt_tvect_score

# save optimal parameters
pa_opt_tvect_params = pa_cv.best_params_
pa_opt_tvect_params



{'C': 100}

#### Hash Vectorizer

In [52]:
# Create the list of alphas to try
parameters = {'C':[0.001,0.01,0.1,1,10,100,1000]}

# Define Passive Aggresive Classifier
linear_clf = PassiveAggressiveClassifier()

# GridSearchCV
pa_cv = GridSearchCV(linear_clf, parameters, cv=3)

# train Passive Aggressive (using hash_train from previously defined Hash Vectorizer)
pa_cv.fit(hash_train, y_train)

# predict
pred = pa_cv.predict(hash_test)

# save accuracy score
pa_opt_hvect_score = metrics.accuracy_score(y_test, pred)
pa_opt_hvect_score

# save optimal parameters
pa_opt_hvect_params = pa_cv.best_params_
pa_opt_hvect_params



{'C': 10}

### Results/Comments

TFIDF Vectorizer still outperforms the others.

In [53]:
# organize scores in a dataframe
pa_opt_scores = pd.DataFrame(
    data=[pa_opt_cvect_score, pa_opt_tvect_score, pa_opt_hvect_score],
    index=["count", "tfidf", "hash"],
    columns=["pa_opt_scores"],
)
pa_opt_scores

Unnamed: 0,pa_opt_scores
count,0.9025
tfidf,0.94
hash,0.91875


### Comparing Models

Here we can see that the optimized SVM Model performs the best (but only slightly better than the Passive Aggresive Model)!  Moving on we will try more sophisticated pre-processing techniques on the TFIDF Vectorizer since it consistently outperforms the other vectorizers.

In [54]:
# concatenate scores 
model_scores = pd.concat([nb_opt_scores,maxent_opt_scores,svm_opt_scores,pa_opt_scores],axis=1)
model_scores

Unnamed: 0,nb_opt_scores,maxent_opt_scores,svm_opt_scores,pa_opt_scores
count,0.9,0.91125,0.905,0.9025
tfidf,0.90125,0.9325,0.9375,0.94
hash,0.8825,0.91875,0.92625,0.91875


### Inspecting the TFIDF Vectorizer

To get a better understanding of how the vectorizer is working, we can inspect the top features (keywords) that result in a Fake or Real classification.

Link: https://www.datacamp.com/community/tutorials/scikit-learn-fake-news

#### Naive Bayes Model

In [55]:
# store vectorizer feature names
feature_names = tfidf_vectorizer.get_feature_names()

In [56]:
# we need to redefine the classifier with the optimal parameters obtained from the previous gridsearch

# Define NB Classifier
nb = MultinomialNB(**nb_opt_tvect_params)

# train model using pre-defined tfidf vectorizer train data
nb.fit(tfidf_train, y_train)

MultinomialNB(alpha=0.05, class_prior=None, fit_prior=True)

In [97]:
### 20 "most fake" features for Naive Bayes Model
nb_fake = pd.DataFrame(sorted(zip(nb.coef_[0], feature_names), reverse=False)[:20])[1]
nb_fake

In [58]:
### 20 "most real" features for Naive Bayes Model
nb_real = pd.DataFrame(sorted(zip(nb.coef_[0], feature_names), reverse=True)[:20])[1]
nb_real

0           trump
1            said
2         clinton
3           obama
4         sanders
5       president
6        campaign
7            cruz
8      republican
9           state
10          party
11            new
12            gop
13    republicans
14           bush
15         people
16          house
17        percent
18         voters
19          rubio
Name: 1, dtype: object

#### MaxEnt Model

In [59]:
# we need to redefine the classifier with the optimal parameters obtained from the previous gridsearch

# Define Passive Aggresive Classifier
maxent = LogisticRegression(**maxent_opt_tvect_params)

# train model using pre-defined tfidf vectorizer train data
maxent.fit(tfidf_train, y_train)



LogisticRegression(C=100, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [60]:
### 20 "most fake" features for MaxEnt Model
maxent_fake = pd.DataFrame(sorted(zip(maxent.coef_[0], feature_names), reverse=False)[:20])[1]
maxent_fake

0           october
1              2016
2           hillary
3           article
4            source
5             mosul
6             share
7          november
8          election
9               fbi
10        corporate
11              oct
12          podesta
13              nov
14              com
15    advertisement
16              war
17        wikileaks
18            photo
19             just
Name: 1, dtype: object

In [61]:
### 20 "most real" features for MaxEnt Model
maxent_real = pd.DataFrame(sorted(zip(maxent.coef_[0], feature_names), reverse=True)[:20])[1]
maxent_real

0             said
1             cruz
2           debate
3     conservative
4              gop
5              tax
6          tuesday
7          islamic
8            state
9         marriage
10             sen
11         attacks
12          friday
13          monday
14      department
15          attack
16             nbc
17        campaign
18          reform
19        saturday
Name: 1, dtype: object

#### SVM Model

In [62]:
# we need to redefine the classifier with the optimal parameters obtained from the previous gridsearch

# Define SVM Classifier
linear_svm = svm.LinearSVC(**svm_opt_tvect_params)

# train model using pre-defined tfidf vectorizer train data
linear_svm.fit(tfidf_train, y_train)

LinearSVC(C=1000, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [63]:
### 20 "most fake" features for Passive Aggressive Model
svm_fake = pd.DataFrame(sorted(zip(linear_svm.coef_[0], feature_names), reverse=False)[:20])[1]
svm_fake

0           october
1              2016
2           article
3           hillary
4            source
5             share
6             mosul
7         corporate
8               nov
9             photo
10    advertisement
11              oct
12         november
13          podesta
14         election
15              fbi
16             reno
17           dakota
18      suppression
19           demand
Name: 1, dtype: object

In [64]:
### 20 "most real" features for Passive Aggressive Model
svm_real = pd.DataFrame(sorted(zip(linear_svm.coef_[0], feature_names), reverse=True)[:20])[1]
svm_real

0             said
1     conservative
2          tuesday
3             cruz
4              tax
5           debate
6              gop
7          attacks
8         marriage
9          islamic
10             nbc
11             sen
12          monday
13          reform
14        saturday
15          friday
16            rush
17            held
18       potential
19           state
Name: 1, dtype: object

#### Passive Aggresive Model

In [65]:
# we need to redefine the classifier with the optimal parameters obtained from the previous gridsearch

# Define Passive Aggresive Classifier
linear_clf = PassiveAggressiveClassifier(**pa_opt_tvect_params)

# train model using pre-defined tfidf vectorizer train data
linear_clf.fit(tfidf_train, y_train)



PassiveAggressiveClassifier(C=100, average=False, class_weight=None,
              early_stopping=False, fit_intercept=True, loss='hinge',
              max_iter=None, n_iter=None, n_iter_no_change=5, n_jobs=None,
              random_state=None, shuffle=True, tol=None,
              validation_fraction=0.1, verbose=0, warm_start=False)

In [66]:
### 20 "most fake" features for Passive Aggressive Model
pa_fake = pd.DataFrame(sorted(zip(linear_clf.coef_[0], feature_names), reverse=False)[:20])[1]
pa_fake

0              2016
1           october
2           hillary
3           article
4             mosul
5             share
6            source
7         corporate
8          november
9               oct
10         election
11              nov
12              fbi
13    advertisement
14          podesta
15        wikileaks
16            north
17              war
18            photo
19              com
Name: 1, dtype: object

In [67]:
### 20 "most real" features for Passive Aggressive Model
pa_real = pd.DataFrame(sorted(zip(linear_clf.coef_[0],feature_names), reverse=True)[:20])[1]
pa_real

0             said
1             cruz
2     conservative
3          tuesday
4              tax
5           debate
6          islamic
7         marriage
8              gop
9              sen
10         attacks
11          monday
12             nbc
13           state
14          friday
15          voters
16          attack
17      department
18        campaign
19            rush
Name: 1, dtype: object

### Results/Comments

The import features for distinguishing fake and real news are mostly similar for the maxent, svm, and passive aggresive models, but it is clear that the naive bayes model is a bit of an outlier.  

The words themselves are also interesting.  We new the the dataset was made up of fake and real news articles, but here we see that the main keywords for classifying fake and real news are very political.  For example, "hillary" is an important keyword for fake news and "gop" is an important keyword for real news.

Surprisingly, "said" is the top keyword for real news.  This would suggest that direct quotes, which often are preceded or followed by the word said, occur more frequently in real news.  On the other hand "source" is an important keyword for fake news.  This would suggest that fake news articles more often refer to other "sources" for credibility.  

In [68]:
# organize real and fake features in a data frame
basic_preproc_features = pd.concat(
    [nb_fake, maxent_fake, svm_fake, pa_fake, nb_real, maxent_real, svm_real, pa_real],
    axis=1,
)
basic_preproc_features.columns = [
    "nb_fake",
    "maxent_fake",
    "svm_fake",
    "pa_fake",
    "nb_real",
    "maxent_real",
    "svm_real",
    "pa_real",
]
basic_preproc_features

Unnamed: 0,nb_fake,maxent_fake,svm_fake,pa_fake,nb_real,maxent_real,svm_real,pa_real
0,0001,october,october,2016,trump,said,said,said
1,000billion,2016,2016,october,said,cruz,conservative,cruz
2,000km,hillary,article,hillary,clinton,debate,tuesday,conservative
3,005,article,hillary,article,obama,conservative,cruz,tuesday
4,00684,source,source,mosul,sanders,gop,tax,tax
5,007s,mosul,share,share,president,tax,debate,debate
6,0099,share,mosul,source,campaign,tuesday,gop,islamic
7,00am,november,corporate,corporate,cruz,islamic,attacks,marriage
8,00p,election,nov,november,republican,state,marriage,gop
9,00pm,fbi,photo,oct,state,marriage,islamic,sen


### 6.0 - More Sophisticated Preprocessing : Lemmatization

Since the Naive Bayes model does not perform as well as the others, we will not include it going forward.

Lemmatization reduces words to their basic form.  For example: practice, practiced, and practicing would all be reduced to the basic word, practice.  This process should reduce keyword redundancy and make it easier for the models to correctly classify the documents.  

We will use the SpaCy library to assign each keyword to its lemma.  

Furthermore, we will create a tokenizer which also removes punctuation and english stopwords.

Sources:
- https://www.datacamp.com/community/tutorials/stemming-lemmatization-python
- https://www.machinelearningplus.com/nlp/lemmatization-examples-python/
- https://towardsdatascience.com/a-short-introduction-to-nlp-in-python-with-spacy-d0aa819af3ad

In [69]:
# define punctuations
punctuations = string.punctuation
punctuations += "‘" + "’" + "…" + "--" + "—" + "©" + "“" + "›" + "–" +"”" 
punctuations

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~‘’…--—©“›–”'

In [70]:
stopwords

frozenset({'a',
           'about',
           'above',
           'across',
           'after',
           'afterwards',
           'again',
           'against',
           'all',
           'almost',
           'alone',
           'along',
           'already',
           'also',
           'although',
           'always',
           'am',
           'among',
           'amongst',
           'amoungst',
           'amount',
           'an',
           'and',
           'another',
           'any',
           'anyhow',
           'anyone',
           'anything',
           'anyway',
           'anywhere',
           'are',
           'around',
           'as',
           'at',
           'back',
           'be',
           'became',
           'because',
           'become',
           'becomes',
           'becoming',
           'been',
           'before',
           'beforehand',
           'behind',
           'being',
           'below',
           'beside',
           'besides'

In [71]:
# define parser
parser = English()

# define tokenizer with lemmatization with English stopwords and punctuation removal
def spacy_tokenizer(sentence):
    tokens = parser(sentence)   
    tokens = [tok.lemma_.lower().strip() for tok in tokens]
    tokens = [tok for tok in tokens if tok not in ["-pron-"]]
    tokens = [tok for tok in tokens if tok not in stopwords]
    tokens = [tok for tok in tokens if tok not in punctuations]    
    return tokens

In [72]:
# redefine TFIDF Vectorizer with spacy tokenizer
tfidf_vectorizer = TfidfVectorizer(tokenizer = spacy_tokenizer)

# redefine tfidf train and test
tfidf_train = tfidf_vectorizer.fit_transform(X_train.values)
tfidf_test = tfidf_vectorizer.transform(X_test.values)

In [73]:
# fit MaxEnt
maxent.fit(tfidf_train, y_train)

# predict
pred = maxent.predict(tfidf_test)

# save score
maxent_lem_tvect_score = metrics.accuracy_score(y_test, pred)
maxent_lem_tvect_score

0.94375

In [74]:
# fit SVM
linear_svm.fit(tfidf_train, y_train)

# predict
pred = linear_svm.predict(tfidf_test)

# save score
svm_lem_tvect_score = metrics.accuracy_score(y_test, pred)
svm_lem_tvect_score



0.9325

In [75]:
# fit Passive Aggresive 
linear_clf.fit(tfidf_train, y_train)

# predict
pred = linear_clf.predict(tfidf_test)

# save score
pa_lem_tvect_score = metrics.accuracy_score(y_test, pred)
pa_lem_tvect_score

0.93875

### Results/Comments

Lemmatization in the TFIDF vectorizer improves all the models!

Looking at the top 30 important words for real or fake news, the word choices for all the models seem even more similar to each other, while the main trends observed earlier are still present.

In [76]:
# organize scores in a dataframe
lemma_scores = pd.DataFrame(
    data=[[maxent_opt_tvect_score, maxent_lem_tvect_score], [svm_opt_tvect_score,
         svm_lem_tvect_score], [pa_opt_tvect_score, pa_lem_tvect_score]],
    index=["maxent", "svm","pa"],
    columns=["basic","basic + lemma"],
)
lemma_scores

Unnamed: 0,basic,basic + lemma
maxent,0.9325,0.94375
svm,0.9375,0.9325
pa,0.94,0.93875


In [77]:
feature_names = tfidf_vectorizer.get_feature_names()

In [78]:
maxent_lem_fake = pd.DataFrame(sorted(zip(maxent.coef_[0], feature_names), reverse=False)[:30])[1]
maxent_lem_real = pd.DataFrame(sorted(zip(maxent.coef_[0],feature_names), reverse=True)[:30])[1]

svm_lem_fake = pd.DataFrame(sorted(zip(linear_svm.coef_[0], feature_names), reverse=False)[:30])[1]
svm_lem_real = pd.DataFrame(sorted(zip(linear_svm.coef_[0],feature_names), reverse=True)[:30])[1]

pa_lem_fake = pd.DataFrame(sorted(zip(linear_clf.coef_[0], feature_names), reverse=False)[:30])[1]
pa_lem_real = pd.DataFrame(sorted(zip(linear_clf.coef_[0],feature_names), reverse=True)[:30])[1]

In [79]:
# organize real and fake features in a data frame
lemma_features = pd.concat(
    [maxent_lem_fake, svm_lem_fake, pa_lem_fake, maxent_lem_real, svm_lem_real, pa_lem_real],
    axis=1,
)
lemma_features.columns = [
    "maxent_lem_fake",
    "svm_lem_fake",
    "pa_lem_fake",
    "maxent_lem_real",
    "svm_lem_real",
    "pa_lem_real",
]
lemma_features

Unnamed: 0,maxent_lem_fake,svm_lem_fake,pa_lem_fake,maxent_lem_real,svm_lem_real,pa_lem_real
0,october,october,october,say,say,say
1,2016,0,2016,conservative,conservative,debate
2,0,2016,0,debate,debate,conservative
3,hillary,article,hillary,cruz,cruz,cruz
4,article,source,article,tax,marriage,tax
5,source,share,source,gop,tax,tuesday
6,share,hillary,share,marriage,reform,islamic
7,mosul,advertisement,mosul,islamic,tuesday,attack
8,war,mosul,advertisement,attack,u.n,marriage
9,november,corporate,oct,tuesday,nbc,reform


### 6.1 - More Sophisticated Preprocessing : Ngrams

Ngrams are the process of including more than one word in a feature.  We will try different ngram ranges to see if we can optimize the models further.  For example, instead of two separate words: "apple" and "computer" a bigram would give us "apple computer" the meaning here changes.  

#### Ngrams (1, 1)

In [80]:
tfidf_vectorizer = TfidfVectorizer(tokenizer = spacy_tokenizer, ngram_range=(1, 1))
tfidf_train = tfidf_vectorizer.fit_transform(X_train.values)
tfidf_test = tfidf_vectorizer.transform(X_test.values)

In [81]:
# fit MaxEnt
maxent.fit(tfidf_train, y_train)

# predict
pred = maxent.predict(tfidf_test)

# save score
maxent_lem_ngram11_tvect_score = metrics.accuracy_score(y_test, pred)

# ---------------------------------------------------------------------------

# fit SVM
linear_svm.fit(tfidf_train, y_train)

    # predict
pred = linear_svm.predict(tfidf_test)

# save score
svm_lem_ngram11_tvect_score = metrics.accuracy_score(y_test, pred)

# ---------------------------------------------------------------------------

# fit Passive Aggressive
linear_clf.fit(tfidf_train, y_train)

# predict
pred = linear_clf.predict(tfidf_test)

# save score
pa_lem_ngram11_tvect_score = metrics.accuracy_score(y_test, pred)

print("maxent:", maxent_lem_ngram11_tvect_score)
print("svm:", svm_lem_ngram11_tvect_score)
print("pa:", pa_lem_ngram11_tvect_score)

maxent: 0.94375
svm: 0.9325
pa: 0.9425


#### Ngrams (1, 2)

In [82]:
tfidf_vectorizer = TfidfVectorizer(tokenizer = spacy_tokenizer, ngram_range=(1, 2))
tfidf_train = tfidf_vectorizer.fit_transform(X_train.values)
tfidf_test = tfidf_vectorizer.transform(X_test.values)

In [83]:
# fit MaxEnt
maxent.fit(tfidf_train, y_train)

# predict
pred = maxent.predict(tfidf_test)

# save score
maxent_lem_ngram12_tvect_score = metrics.accuracy_score(y_test, pred)

# ---------------------------------------------------------------------------

# fit SVM
linear_svm.fit(tfidf_train, y_train)

# predict
pred = linear_svm.predict(tfidf_test)

# save score
svm_lem_ngram12_tvect_score = metrics.accuracy_score(y_test, pred)

# ---------------------------------------------------------------------------

# fit Passive Aggressive
linear_clf.fit(tfidf_train, y_train)

# predict
pred = linear_clf.predict(tfidf_test)

# save score
pa_lem_ngram12_tvect_score = metrics.accuracy_score(y_test, pred)

print("maxent:", maxent_lem_ngram12_tvect_score)
print("svm:", svm_lem_ngram12_tvect_score)
print("pa:", pa_lem_ngram12_tvect_score)

maxent: 0.9425
svm: 0.94375
pa: 0.9475


#### Ngrams (1, 3)

In [84]:
tfidf_vectorizer = TfidfVectorizer(tokenizer = spacy_tokenizer, ngram_range=(1, 3))
tfidf_train = tfidf_vectorizer.fit_transform(X_train.values)
tfidf_test = tfidf_vectorizer.transform(X_test.values)

In [85]:
# fit MaxEnt
maxent.fit(tfidf_train, y_train)

# predict
pred = maxent.predict(tfidf_test)

# save score
maxent_lem_ngram13_tvect_score = metrics.accuracy_score(y_test, pred)

# ---------------------------------------------------------------------------

# fit SVM
linear_svm.fit(tfidf_train, y_train)

# predict
pred = linear_svm.predict(tfidf_test)

# save score
svm_lem_ngram13_tvect_score = metrics.accuracy_score(y_test, pred)

# ---------------------------------------------------------------------------

# fit Passive Aggressive
linear_clf.fit(tfidf_train, y_train)

# predict
pred = linear_clf.predict(tfidf_test)

# save score
pa_lem_ngram13_tvect_score = metrics.accuracy_score(y_test, pred)

print("maxent:", maxent_lem_ngram13_tvect_score)
print("svm:", svm_lem_ngram13_tvect_score)
print("pa:", pa_lem_ngram13_tvect_score)

maxent: 0.9325
svm: 0.94375
pa: 0.9425


#### Ngrams (2, 2)

In [86]:
tfidf_vectorizer = TfidfVectorizer(tokenizer = spacy_tokenizer, ngram_range=(2, 2))
tfidf_train = tfidf_vectorizer.fit_transform(X_train.values)
tfidf_test = tfidf_vectorizer.transform(X_test.values)

In [87]:
# fit MaxEnt
maxent.fit(tfidf_train, y_train)

# predict
pred = maxent.predict(tfidf_test)

# save score
maxent_lem_ngram22_tvect_score = metrics.accuracy_score(y_test, pred)

# ---------------------------------------------------------------------------

# fit SVM
linear_svm.fit(tfidf_train, y_train)

# predict
pred = linear_svm.predict(tfidf_test)

# save score
svm_lem_ngram22_tvect_score = metrics.accuracy_score(y_test, pred)

# ---------------------------------------------------------------------------

# fit Passive Aggressive
linear_clf.fit(tfidf_train, y_train)

# predict
pred = linear_clf.predict(tfidf_test)

# save score
pa_lem_ngram22_tvect_score = metrics.accuracy_score(y_test, pred)

print("maxent:", maxent_lem_ngram22_tvect_score)
print("svm:", svm_lem_ngram22_tvect_score)
print("pa:", pa_lem_ngram22_tvect_score)

maxent: 0.92
svm: 0.92625
pa: 0.9225


#### Ngrams (2, 3)

In [88]:
tfidf_vectorizer = TfidfVectorizer(tokenizer = spacy_tokenizer, ngram_range=(2, 3))
tfidf_train = tfidf_vectorizer.fit_transform(X_train.values)
tfidf_test = tfidf_vectorizer.transform(X_test.values)

In [89]:
# fit MaxEnt
maxent.fit(tfidf_train, y_train)

# predict
pred = maxent.predict(tfidf_test)

# save score
maxent_lem_ngram23_tvect_score = metrics.accuracy_score(y_test, pred)

# ---------------------------------------------------------------------------

# fit SVM
linear_svm.fit(tfidf_train, y_train)

# predict
pred = linear_svm.predict(tfidf_test)

# save score
svm_lem_ngram23_tvect_score = metrics.accuracy_score(y_test, pred)

# ---------------------------------------------------------------------------

# fit Passive Aggressive
linear_clf.fit(tfidf_train, y_train)

# predict
pred = linear_clf.predict(tfidf_test)

# save score
pa_lem_ngram23_tvect_score = metrics.accuracy_score(y_test, pred)

print("maxent:", maxent_lem_ngram23_tvect_score)
print("svm:", svm_lem_ngram23_tvect_score)
print("pa:", pa_lem_ngram23_tvect_score)

maxent: 0.9275
svm: 0.93375
pa: 0.925


#### Ngrams (3, 3)

In [90]:
tfidf_vectorizer = TfidfVectorizer(tokenizer = spacy_tokenizer, ngram_range=(3, 3))
tfidf_train = tfidf_vectorizer.fit_transform(X_train.values)
tfidf_test = tfidf_vectorizer.transform(X_test.values)

In [91]:
# fit MaxEnt
maxent.fit(tfidf_train, y_train)

# predict
pred = maxent.predict(tfidf_test)

# save score
maxent_lem_ngram33_tvect_score = metrics.accuracy_score(y_test, pred)

# ---------------------------------------------------------------------------

# fit SVM
linear_svm.fit(tfidf_train, y_train)

# predict
pred = linear_svm.predict(tfidf_test)

# save score
svm_lem_ngram33_tvect_score = metrics.accuracy_score(y_test, pred)

# ---------------------------------------------------------------------------

# fit Passive Aggressive
linear_clf.fit(tfidf_train, y_train)

# predict
pred = linear_clf.predict(tfidf_test)

# save score
pa_lem_ngram33_tvect_score = metrics.accuracy_score(y_test, pred)

print("maxent:", maxent_lem_ngram33_tvect_score)
print("svm:", svm_lem_ngram33_tvect_score)
print("pa:", pa_lem_ngram33_tvect_score)

maxent: 0.90875
svm: 0.91375
pa: 0.90625


### Results/Comments

Here we see the models tend to perform best with ngrams of range (1, 2).


In the chart below, we see that mostly unigrams are used, but there are a few important bigrams such as "hillary clinton" and "donald trump".

In [92]:
# organize scores in a dataframe
lemma_ngram_scores = pd.DataFrame(
    data=[[maxent_lem_tvect_score, maxent_lem_ngram11_tvect_score, maxent_lem_ngram12_tvect_score,
          maxent_lem_ngram13_tvect_score, maxent_lem_ngram22_tvect_score, maxent_lem_ngram23_tvect_score, 
          maxent_lem_ngram33_tvect_score], [svm_lem_tvect_score, svm_lem_ngram11_tvect_score, svm_lem_ngram12_tvect_score,
          svm_lem_ngram13_tvect_score, svm_lem_ngram22_tvect_score, svm_lem_ngram23_tvect_score, 
          svm_lem_ngram33_tvect_score], [pa_lem_tvect_score, pa_lem_ngram11_tvect_score, pa_lem_ngram12_tvect_score,
          pa_lem_ngram13_tvect_score, pa_lem_ngram22_tvect_score, pa_lem_ngram23_tvect_score, 
          pa_lem_ngram33_tvect_score]],
    index=["maxent", "svm","pa"],
    columns=["no ngram","ngram (1, 1)", "ngram (1, 2)", "ngram (1, 3)", "ngram (2, 2)", "ngram (2, 3)", "ngram (3, 3)"],
)
lemma_ngram_scores

Unnamed: 0,no ngram,"ngram (1, 1)","ngram (1, 2)","ngram (1, 3)","ngram (2, 2)","ngram (2, 3)","ngram (3, 3)"
maxent,0.94375,0.94375,0.9425,0.9325,0.92,0.9275,0.90875
svm,0.9325,0.9325,0.94375,0.94375,0.92625,0.93375,0.91375
pa,0.93875,0.9425,0.9475,0.9425,0.9225,0.925,0.90625


In [93]:
# redefine vectorizer with ngram range (1, 2)
tfidf_vectorizer = TfidfVectorizer(tokenizer = spacy_tokenizer, ngram_range=(1, 2))

# set train and test values
tfidf_train = tfidf_vectorizer.fit_transform(X_train.values)
tfidf_test = tfidf_vectorizer.transform(X_test.values)

# fit models
maxent.fit(tfidf_train, y_train)
linear_svm.fit(tfidf_train, y_train)
linear_clf.fit(tfidf_train, y_train)

PassiveAggressiveClassifier(C=100, average=False, class_weight=None,
              early_stopping=False, fit_intercept=True, loss='hinge',
              max_iter=None, n_iter=None, n_iter_no_change=5, n_jobs=None,
              random_state=None, shuffle=True, tol=None,
              validation_fraction=0.1, verbose=0, warm_start=False)

In [94]:
# store feature names
feature_names = tfidf_vectorizer.get_feature_names()

In [95]:
# store top features for real and fake news classification
maxent_lem_ngram12_fake = pd.DataFrame(sorted(zip(maxent.coef_[0], feature_names), reverse=False)[:30])[1]
maxent_lem_ngram12_real = pd.DataFrame(sorted(zip(maxent.coef_[0],feature_names), reverse=True)[:30])[1]

svm_lem_ngram12_fake = pd.DataFrame(sorted(zip(linear_svm.coef_[0], feature_names), reverse=False)[:30])[1]
svm_lem_ngram12_real = pd.DataFrame(sorted(zip(linear_svm.coef_[0],feature_names), reverse=True)[:30])[1]

pa_lem_ngram12_fake = pd.DataFrame(sorted(zip(linear_clf.coef_[0], feature_names), reverse=False)[:30])[1]
pa_lem_ngram12_real = pd.DataFrame(sorted(zip(linear_clf.coef_[0],feature_names), reverse=True)[:30])[1]

In [98]:
# organize real and fake features in a data frame
lemma_ngram_features = pd.concat(
    [maxent_lem_fake, maxent_lem_ngram12_fake, svm_lem_fake, svm_lem_ngram12_fake, pa_lem_fake, pa_lem_ngram12_fake,
     maxent_lem_real, maxent_lem_ngram12_real, svm_lem_real, svm_lem_ngram12_real, pa_lem_real, pa_lem_ngram12_real],
    axis=1,
)
lemma_ngram_features.columns = [
    "maxent_lem_fake",
    "maxent_lem_ngram_fake",
    "svm_lem_fake",
    "svm_lem_ngram_fake",
    "pa_lem_fake",
    "pa_lem_ngram_fake",
    "maxent_lem_real",
    "maxent_lem_ngram_real",
    "svm_lem_real",
    "svm_lem_ngram_real",
    "pa_lem_real",
    "pa_lem_ngram_real"
]
lemma_ngram_features

Unnamed: 0,maxent_lem_fake,maxent_lem_ngram_fake,svm_lem_fake,svm_lem_ngram_fake,pa_lem_fake,pa_lem_ngram_fake,maxent_lem_real,maxent_lem_ngram_real,svm_lem_real,svm_lem_ngram_real,pa_lem_real,pa_lem_ngram_real
0,october,october,october,october,october,2016,say,say,say,say,say,say
1,2016,2016,0,2016,2016,october,conservative,conservative,conservative,conservative,debate,cruz
2,0,hillary,2016,0,0,hillary,debate,debate,debate,debate,conservative,conservative
3,hillary,0,article,hillary,hillary,0,cruz,cruz,cruz,cruz,cruz,debate
4,article,article,source,article,article,mosul,tax,obama,marriage,gop,tax,campaign
5,source,election,share,source,source,election,gop,gop,tax,campaign,tuesday,sander
6,share,mosul,hillary,mosul,share,article,marriage,sander,reform,attack,islamic,obama
7,mosul,source,advertisement,share,mosul,share,islamic,campaign,tuesday,sander,attack,gop
8,war,russia,mosul,election,advertisement,source,attack,state,u.n,obama,marriage,state
9,november,war,corporate,war,oct,november,tuesday,president,nbc,state,reform,attack


### Final Conclusions

In conclusion, the best vectorizer combination was TFIDF with lemmatization and ngram range (1, 2). The Passive Aggressive Model performed the best.

The highest accuracy acheived was 0.947, to improve this we could consider a neural network algorithm.

From this dataset, fake news was clearly connected to political events influencing elections during a very specific time frame (October 2016).  An interesting experiment could be to look at real and fake news from a politically calm time frame and see how the classifcation keywords differ.  

In [101]:
test_set = tfidf_vectorizer.transform(test["title_text"].values)
test_pred = linear_clf.predict(test_set)
test_pred

array(['FAKE', 'REAL', 'REAL', ..., 'FAKE', 'REAL', 'REAL'], dtype='<U4')

In [108]:
test_w_pred = test
test_w_pred["predictions"] = test_pred
test_w_pred

Unnamed: 0,ID,title,text,title_text,predictions
0,10498,September New Homes Sales Rise——-Back To 1992 ...,September New Homes Sales Rise Back To 1992 Le...,September New Homes Sales Rise——-Back To 1992 ...,FAKE
1,2439,Why The Obamacare Doomsday Cult Can't Admit It...,But when Congress debated and passed the Patie...,Why The Obamacare Doomsday Cult Can't Admit It...,REAL
2,864,"Sanders, Cruz resist pressure after NY losses,...",The Bernie Sanders and Ted Cruz campaigns vowe...,"Sanders, Cruz resist pressure after NY losses,...",REAL
3,4128,Surviving escaped prisoner likely fatigued and...,Police searching for the second of two escaped...,Surviving escaped prisoner likely fatigued and...,REAL
4,662,Clinton and Sanders neck and neck in Californi...,No matter who wins California's 475 delegates ...,Clinton and Sanders neck and neck in Californi...,REAL
5,8430,Hillary’s Crime Family: End Of Days For The U....,"Financial Markets , Market Manipulation , U.S....",Hillary’s Crime Family: End Of Days For The U....,FAKE
6,1220,"Why pundits, politicians and the press hate Te...",Senator Ted Cruz is now the frontrunner in the...,"Why pundits, politicians and the press hate Te...",REAL
7,9624,"WSJ Report Not About Black Rapists At Baylor, ...",X Dear Reader! VDARE.com isn’t just a website....,"WSJ Report Not About Black Rapists At Baylor, ...",FAKE
8,8211,The ‘Two-Party Racket’ Is Incapable of Dealing...,The ‘Two-Party Racket’ Is Incapable of Dealing...,The ‘Two-Party Racket’ Is Incapable of Dealing...,FAKE
9,4099,"Just the Beginning? Religious Freedom, Gay Rig...",Ten months after the Supreme Court passed a la...,"Just the Beginning? Religious Freedom, Gay Rig...",REAL


In [111]:
test_w_pred.to_csv('test_predictions.csv')