# Spam Vs. Ham

Predict whether a given SMS message is spam or ham (a legitimate message).

In [15]:
import numpy as np
import pandas as pd

import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer  
from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, confusion_matrix, accuracy_score

Read the data. The dataset is from Kaggle.
Retain only columns 'v1' and 'v2' and rename them to 'label' and 'text' respectively.

In [2]:
# Load the raw CSV, keep only the label/message columns and give them readable names.
data = (
    pd.read_csv('spam.csv', encoding='latin-1')
    .loc[:, ['v1', 'v2']]
    .rename(columns={'v1': 'label', 'v2': 'text'})
)

data.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
# Summary of both columns: row count, number of distinct values, most frequent value and its frequency.
data.describe()

Unnamed: 0,label,text
count,5572,5572
unique,2,5169
top,ham,"Sorry, I'll call later"
freq,4825,30


In [4]:
# The same summary of the text column, computed separately for each label.
data.groupby('label').describe()

Unnamed: 0_level_0,text,text,text,text
Unnamed: 0_level_1,count,unique,top,freq
label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,653,Please call our customer service representativ...,4


We will introduce a new column - 'label_num'. It stores 1 for spam and 0 for ham.

In [5]:
# Numeric encoding of the class label: ham -> 0, spam -> 1.
label_to_num = {'ham': 0, 'spam': 1}
data['label_num'] = data['label'].map(label_to_num)
data.head()

Unnamed: 0,label,text,label_num
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


# Text Preprocessing

Stopwords and stemmer.

In [6]:
# Stemmer used to reduce each token to its stem.
porter = PorterStemmer()

# NLTK's English stop-word list, stored as a set for fast membership tests.
stop_words = set(stopwords.words('english'))

`corpus` is a list which holds the cleaned SMS messages. Each message goes through these steps:
- Remove punctuation
- Convert to lower case
- Tokenize the words
- Remove stop words
- Stem the words
- Join all words in an sms text back into a single string
- Append it to the corpus

In [7]:
def clean_sms(text):
    """Normalise one raw SMS message into a string of stemmed tokens.

    Steps: replace every non-letter character with a space, convert to lower
    case, tokenize, drop English stop words, Porter-stem each remaining token
    and join the stems back together with single spaces.
    """
    # Non-letters are replaced by a space (not deleted) so neighbouring words stay separated.
    letters_only = re.sub('[^a-zA-Z]', ' ', text).lower()
    tokens = word_tokenize(letters_only)
    stems = [porter.stem(token) for token in tokens if token not in stop_words]
    return ' '.join(stems)


# Iterate over the column values directly instead of data['text'][i]:
# the latter is a label lookup and only works while the index is exactly 0..n-1.
corpus = [clean_sms(text) for text in data['text']]    # cleaned SMS messages, in row order

In [8]:
# Spot-check one cleaned message.
corpus[111]

'go dinner msg'

# Feature Extraction

We will extract two feature sets: Bag of Words features and TF-IDF features.

Bag of Words - 

In [21]:
# Vocabulary is pruned by document frequency only: max_df=0.90 drops terms that occur in
# more than 90% of the messages, min_df=2 drops terms that occur in fewer than 2 messages.
# NOTE: no max_features is set, so the vocabulary is NOT capped at a fixed number of words.
bow_word_vectorizer = CountVectorizer(max_df=0.90, min_df=2)
bow_word_feature = bow_word_vectorizer.fit_transform(corpus).toarray()       # dense document-term matrix of raw term counts

TF-IDF features - 

In [11]:
# TF-IDF vectorizer; max_df=0.90 drops terms that occur in more than 90% of the messages,
# min_df=2 drops terms that occur in fewer than 2 messages.
tfidf_word_vectorizer = TfidfVectorizer(max_df=0.90, min_df=2)
# Learn the vocabulary from the corpus and convert the resulting TF-IDF matrix to a dense array.
tfidf_word_feature = tfidf_word_vectorizer.fit_transform(corpus).toarray()  

# Classification

In [16]:
# Trying 4 different classifiers - Naive Bayes, Logistic Regression, Decision Tree, Random Forest
def classification(X_train, X_test, y_train, y_test):
    """Fit four classifiers on the training split and print their test scores.

    The models are tried in a fixed order: Gaussian Naive Bayes, Logistic
    Regression, Decision Tree and Random Forest. For each one, the accuracy
    and the F1 score on (X_test, y_test) are printed. Nothing is returned.

    Parameters
    ----------
    X_train, X_test : feature matrices passed to ``fit`` / ``predict``.
    y_train, y_test : label vectors matching the rows of the feature matrices.
    """
    # (display name, unfitted estimator) pairs, in the order they are reported.
    # Seeds are fixed wherever the estimator accepts one.
    classifiers = [
        ("Naive Bayes", GaussianNB()),
        ("Logistic Regression", LogisticRegression(random_state=42)),
        ("Decision Tree", DecisionTreeClassifier(random_state=42)),
        ("Random Forest", RandomForestClassifier(n_estimators=50, random_state=42)),
    ]

    for name, classifier in classifiers:
        classifier.fit(X_train, y_train)
        y_pred = classifier.predict(X_test)
        print(name + ": Accuracy:", accuracy_score(y_test, y_pred))
        print(name + ": F1 Score:", f1_score(y_test, y_pred))

Classification using Bag of Words Features.

In [22]:
#X now has independent variables
X = bow_word_feature
#y has dependent variable
y = data.iloc[:,2].values

#splitting to train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [23]:
# Fit and score every classifier on the Bag of Words train/test split.
print('Classification using Bag of Words Features:')
classification(X_train, X_test, y_train, y_test)

Classification using Bag of Words Features:
Naive Bayes: Accuracy: 0.8648325358851675
Naive Bayes: F1 Score: 0.6295081967213115
Logistic Regression: Accuracy: 0.9778708133971292
Logistic Regression: F1 Score: 0.908641975308642
Decision Tree: Accuracy: 0.9688995215311005
Decision Tree: F1 Score: 0.8767772511848341
Random Forest: Accuracy: 0.9760765550239234
Random Forest: F1 Score: 0.900497512437811


Classification using TF-IDF Features.

In [19]:
# X holds the independent variables: the TF-IDF feature matrix
X = tfidf_word_feature
# y holds the dependent variable (1 = spam, 0 = ham). The column is selected by name
# rather than by position so this keeps working if columns are added or reordered.
y = data['label_num'].values

# Hold out 30% of the messages for testing; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [20]:
# Fit and score every classifier on the TF-IDF train/test split.
print('Classification using TF-IDF Features:')
classification(X_train, X_test, y_train, y_test)

Classification using TF-IDF Features:
Naive Bayes: Accuracy: 0.8600478468899522
Naive Bayes: F1 Score: 0.6125827814569537
Logistic Regression: Accuracy: 0.9569377990430622
Logistic Regression: F1 Score: 0.8085106382978723
Decision Tree: Accuracy: 0.9539473684210527
Decision Tree: F1 Score: 0.8261851015801355
Random Forest: Accuracy: 0.9784688995215312
Random Forest: F1 Score: 0.9113300492610836
