# Building Spam Classifier Using Naive Bayes and Random Forest

In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier

In [3]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split

In [11]:
from sklearn.feature_extraction.text import CountVectorizer

In [27]:
from sklearn.ensemble import AdaBoostClassifier

In [4]:
df = pd.read_csv("fake_or_real_news.csv")

In [5]:
df.columns

Index(['Unnamed: 0', 'title', 'text', 'label'], dtype='object')

In [6]:
df.drop(['Unnamed: 0'],axis=1,inplace=True)

In [7]:
df.head().T

Unnamed: 0,0,1,2,3,4
title,You Can Smell Hillary’s Fear,Watch The Exact Moment Paul Ryan Committed Pol...,Kerry to go to Paris in gesture of sympathy,Bernie supporters on Twitter erupt in anger ag...,The Battle of New York: Why This Primary Matters
text,"Daniel Greenfield, a Shillman Journalism Fello...",Google Pinterest Digg Linkedin Reddit Stumbleu...,U.S. Secretary of State John F. Kerry said Mon...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",It's primary day in New York and front-runners...
label,FAKE,FAKE,REAL,FAKE,REAL


In [8]:
import re
import nltk
from nltk.stem.porter import PorterStemmer
stopwords = set(w.rstrip() for w in open('stopwords.txt'))

df['label']=np.where(df['label']=="REAL",1,0)

In [9]:
corpus_text = []
for i in range(df.shape[0]):
    review = re.sub('[^a-zA-Z]'," ",df['text'][i])
    review = review.lower()
    review = review.split()
    review = [word for word in review if len(review)>2]
    ps = PorterStemmer()
    review = [ps.stem(word) for word in review if not word in stopwords]
    review = " ".join(review)
    corpus_text.append(review)  

In [10]:
corpus_title = []
for i in range(df.shape[0]):
    review = re.sub('[^a-zA-Z]'," ",df['title'][i])
    review = review.lower()
    review = review.split()
    review = [word for word in review if len(review)>2]
    ps = PorterStemmer()
    review = [ps.stem(word) for word in review if not word in stopwords]
    review = " ".join(review)
    corpus_title.append(review)

In [12]:
cv = CountVectorizer()
X = cv.fit_transform(corpus_text).toarray()
X2 = cv.fit_transform(corpus_title).toarray()

y= df.iloc[:,-1]

In [13]:
def ReportAccuracy(X_test,y_test):
    y_pred = classifier.predict(X_test)
    a = confusion_matrix(y_test,y_pred)
    accuracy = (a[0][0]+a[1][1])/a.sum()
    print("accuracy = {} %".format(round(accuracy*100,2)))

# Using only Title ( Gaussian vs Multinomial vs Bernoulli ) 

In [29]:
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB


In [31]:
X_train, X_test, y_train, y_test = train_test_split(X2,y, test_size = 0.2 , random_state=0)
classifier = GaussianNB()
classifier.fit(X_train,y_train)
ReportAccuracy(X_test,y_test)

accuracy = 70.64 %


In [32]:
X_train, X_test, y_train, y_test = train_test_split(X2,y, test_size = 0.2 , random_state=0)
classifier = BernoulliNB()
classifier.fit(X_train,y_train)
ReportAccuracy(X_test,y_test)

accuracy = 82.08 %


In [14]:
X_train, X_test, y_train, y_test = train_test_split(X2,y, test_size = 0.2 , random_state=0)
classifier = MultinomialNB()
classifier.fit(X_train,y_train)
ReportAccuracy(X_test,y_test)

accuracy = 82.48 %


# As expected, we should use Multinomial Naive Bayes as the word count will contain numbers like 1,2,3 

## Following code will only use Multinomial Naive Bayes

### Multinomial Naive Bayes Cross validated 


In [15]:
accuracies = cross_val_score(classifier,X_train,y_train,cv =5)
print("Cross Validation Mean accuracy = {} %".format(round(accuracies.mean()*100,2)))
print("Cross Validation Std = {} %".format(round(accuracies.std()*100,2)))

Cross Validation Mean accuracy = 80.01 %
Cross Validation Std = 0.8 %


# Using Only Content

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.2 , random_state=0)
classifier = MultinomialNB()
classifier.fit(X_train,y_train)
ReportAccuracy(X_test,y_test)

accuracy = 87.37 %


### Cross validated  Content


In [18]:
accuracies = cross_val_score(classifier,X_train,y_train,cv =5)
print("Cross Validation Mean accuracy = {} %".format(round(accuracies.mean()*100,2)))
print("Cross Validation Std = {} %".format(round(accuracies.std()*100,2)))

Cross Validation Mean accuracy = 88.1 %
Cross Validation Std = 0.66 %


# Using Both Content and text

In [19]:
X.shape

(6335, 43659)

In [20]:
X2.shape

(6335, 6767)

In [21]:
X = np.array(X)
X2 = np.array(X2)

In [22]:
X3= np.concatenate((X,X2),axis=1)

In [23]:
X3.shape

(6335, 50426)

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X3,y, test_size = 0.2 , random_state=0)
classifier = MultinomialNB()
classifier.fit(X_train,y_train)
ReportAccuracy(X_test,y_test)

accuracy = 87.92 %


## Cross Validated Content and text

In [25]:
accuracies = cross_val_score(classifier,X_train,y_train,cv =5)
print("Cross Validation Mean accuracy = {} %".format(round(accuracies.mean()*100,2)))
print("Cross Validation Std = {} %".format(round(accuracies.std()*100,2)))

Cross Validation Mean accuracy = 88.93 %
Cross Validation Std = 0.77 %


# Using Random forest

In [33]:
X_train, X_test, y_train, y_test = train_test_split(X3,y, test_size = 0.2 , random_state=0)
classifier = RandomForestClassifier(n_estimators=100)
classifier.fit(X_train,y_train)
ReportAccuracy(X_test,y_test)

accuracy = 90.37 %


In [34]:
accuracies = cross_val_score(classifier,X_train,y_train,cv =5)
print("Cross Validation Mean accuracy = {} %".format(round(accuracies.mean()*100,2)))
print("Cross Validation Std = {} %".format(round(accuracies.std()*100,2)))

Cross Validation Mean accuracy = 89.94 %
Cross Validation Std = 0.99 %


# Using Adaboost Classifier

In [28]:
X_train, X_test, y_train, y_test = train_test_split(X3,y, test_size = 0.2 , random_state=0)
classifier = AdaBoostClassifier()
classifier.fit(X_train,y_train)
ReportAccuracy(X_test,y_test)

accuracy = 85.95 %


# Final Remark: 

### Accurary table: ( On Test Set )

#### Multinomial NB: 87.92%
#### Cross Validated NB: 88.93%
#### Random Forest: 90.37% 
#### Cross Validated RF: 89.94 %
#### Adaboost Classifier: 85.95% 

## So the winner is Random Forest Classifier in this particular case .
## But Execution time for NB was much lesser than RF . 
## and RF may overfit very easily.  