In [None]:
# Colab-only step: mount Google Drive at /content/drive so the dataset stored
# under MyDrive can be read by the loading cell below.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# IMPORTS

In [None]:
import pandas as pd
import string
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English
import spacy

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn import naive_bayes
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer

from scipy.sparse import hstack, csr_matrix

# LOADING DATA

In [None]:
# Absolute path anchored at the Drive mount point ('/content/drive'), so the read
# no longer depends on the kernel's current working directory being /content.
DATA_PATH = '/content/drive/MyDrive/SMS-Spam-Classifier/spamdata.csv'

# NOTE(review): the preview shows an 'Unnamed: 0' column, i.e. the CSV carries a
# saved index; pass index_col=0 here if that column is not wanted downstream.
data = pd.read_csv(DATA_PATH)
data.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
# Class balance as fractions of all rows. The recorded output shows the data is
# imbalanced (mostly ham), which is why the later splits stratify on the label.
data['label'].value_counts(normalize=True)

ham     0.865937
spam    0.134063
Name: label, dtype: float64

# PREPROCESSING

In [None]:
# Blank English pipeline used for the text-cleaning step.
# NOTE: the name `nlp` is rebound further down (spacy.load("en_core_web_sm")),
# so which pipeline `nlp` refers to depends on which cells have been run.
nlp = English()

In [None]:
def clean_text(text):
    """Normalise one message: lowercase, strip punctuation, drop stop words.

    Steps, in order:
      1. lowercase the text;
      2. delete every character listed in ``string.punctuation``;
      3. tokenise the result with the module-level ``nlp`` pipeline;
      4. discard tokens whose vocabulary entry is flagged as a stop word;
      5. join the surviving tokens with a space separator.

    Parameters
    ----------
    text : str
        Raw message text.

    Returns
    -------
    str
        The remaining tokens joined by spaces (empty string if none remain).

    Notes
    -----
    * ``nlp`` is looked up at call time, and the notebook rebinds that name
      later on, so the pipeline used depends on cell execution order.
    * Apostrophes are removed before tokenising, so contractions can leave
      fragments behind (the preview output shows "don't" ending up as "nt").
    """
    # One C-level pass that deletes all ASCII punctuation characters.
    without_punctuation = text.lower().translate(
        str.maketrans("", "", string.punctuation)
    )

    # Keep a token only when its lexeme in the vocabulary is not a stop word.
    kept_tokens = [
        token.text
        for token in nlp(without_punctuation)
        if not nlp.vocab[token.text].is_stop
    ]

    return " ".join(kept_tokens)

In [None]:
# Clean every message; the raw text column is kept alongside the new one.
data["cleaned"] = data["text"].map(clean_text)
data.head()

Unnamed: 0,label,text,cleaned
0,ham,"Go until jurong point, crazy.. Available only ...",jurong point crazy available bugis n great wor...
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry 2 wkly comp win fa cup final tkts 2...
3,ham,U dun say so early hor... U c already then say...,u dun early hor u c
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah nt think goes usf lives


# FEATURE ENGINEERING

In [None]:
# Creating meta features
# (per-message size statistics, computed with pandas' string accessor)

original_text = data["text"]
cleaned_text = data["cleaned"]

# Number of whitespace-separated words in the original text
data["word_count"] = original_text.str.split().str.len()

# Number of whitespace-separated words in the cleaned text
# (the column name keeps its existing spelling because later cells select it by this name)
data["word_count_cleand"] = cleaned_text.str.split().str.len()

# Number of characters, spaces included, in the cleaned text
data["char_count"] = cleaned_text.str.len()

# Number of characters in the cleaned text once spaces are removed
data["char_count_without_spaces"] = cleaned_text.str.replace(" ", "", regex=False).str.len()

# Number of all-digit tokens in the cleaned text
# (counts whole tokens for which str.isdigit() is true, not individual digit characters)
data["num_dig"] = cleaned_text.str.split().map(
    lambda tokens: sum(token.isdigit() for token in tokens)
)

In [None]:
# Preview the frame with the newly added meta-feature columns.
data.head()

Unnamed: 0,label,text,cleaned,word_count,word_count_cleand,char_count,char_count_without_spaces,num_dig
0,ham,"Go until jurong point, crazy.. Available only ...",jurong point crazy available bugis n great wor...,20,15,79,65,0
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni,6,6,23,18,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry 2 wkly comp win fa cup final tkts 2...,28,22,131,110,3
3,ham,U dun say so early hor... U c already then say...,u dun early hor u c,11,6,19,14,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah nt think goes usf lives,13,6,27,22,0


## COUNTING NOUNS AND VERBS

In [None]:
# Load the small English model; pos_check below reads each token's `tag_` from it.
# NOTE: this rebinds `nlp`, replacing the blank English() pipeline created for the
# cleaning step. clean_text looks `nlp` up at call time, so re-running the cleaning
# cell after this one would run this model instead of the blank pipeline.
nlp = spacy.load("en_core_web_sm")

In [None]:
# Dictionary of noun and verb POS tags (Penn Treebank fine-grained tags, matched
# against spaCy's token.tag_ in pos_check).
# "VBP" (non-3rd-person singular present, e.g. "I think") is part of the verb
# family; without it present-tense verbs are silently left out of the verb count.
pos_dic = {
    "noun": ["NNP", "NN", "NNS", "NNPS"],
    "verb": ["VBZ", "VB", "VBD", "VBG", "VBN", "VBP"],
}

In [None]:
def pos_check(txt, family):
    """Count the tokens of ``txt`` whose fine-grained tag belongs to ``family``.

    Parameters
    ----------
    txt : str
        Text to run through the module-level ``nlp`` pipeline.
    family : str
        Key into the module-level ``pos_dic`` (the notebook defines "noun"
        and "verb").

    Returns
    -------
    int
        Number of tokens whose ``tag_`` appears in ``pos_dic[family]``.
    """
    # pos_dic[family] is looked up per token, so an unknown family only raises
    # KeyError when the text yields at least one token.
    return sum(1 for token in nlp(txt) if token.tag_ in pos_dic[family])

In [None]:
# Tag every cleaned message once and derive both counts from the same tags.
# Calling pos_check separately for "noun" and "verb" runs the spaCy pipeline
# twice over each message only to obtain identical tags.
tags_per_message = data["cleaned"].apply(
    lambda text: [token.tag_ for token in nlp(text)]
)

# Same membership test as pos_check: a token counts when its tag is in the family list.
data["noun_count"] = tags_per_message.apply(
    lambda tags: sum(tag in pos_dic["noun"] for tag in tags)
)
data["verb_count"] = tags_per_message.apply(
    lambda tags: sum(tag in pos_dic["verb"] for tag in tags)
)

In [None]:
# Preview the frame with the noun_count / verb_count columns added.
data.head()

Unnamed: 0,label,text,cleaned,word_count,word_count_cleand,char_count,char_count_without_spaces,num_dig,noun_count,verb_count
0,ham,"Go until jurong point, crazy.. Available only ...",jurong point crazy available bugis n great wor...,20,15,79,65,0,12,1
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni,6,6,23,18,0,6,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry 2 wkly comp win fa cup final tkts 2...,28,22,131,110,3,13,0
3,ham,U dun say so early hor... U c already then say...,u dun early hor u c,11,6,19,14,0,5,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah nt think goes usf lives,13,6,27,22,0,3,2


# TRAIN / VAL SPLIT (META FEATURES ONLY)

In [None]:
# Encode the string labels as integers. LabelEncoder numbers the classes in sorted
# order, so with the two labels present in this data "ham" -> 0 and "spam" -> 1.
label_encoder = LabelEncoder()
target = label_encoder.fit_transform(data["label"].values)

In [None]:
# Hand-crafted numeric columns only (no text vectors) for the first model.
META_FEATURE_COLUMNS = [
    'word_count',
    'word_count_cleand',
    'char_count',
    'char_count_without_spaces',
    'num_dig',
    'noun_count',
    'verb_count',
]
train = data[META_FEATURE_COLUMNS]

In [None]:
# Library-default split proportions (no test_size/train_size given), stratified on
# the label so both parts keep the ham/spam ratio, with a fixed seed for repeatability.
x_train, x_valid, y_train, y_valid = train_test_split(
    train,
    target,
    stratify=target,
    random_state=20,
)

# MODEL (META FEATURES ONLY)

In [None]:
# First model: multinomial Naive Bayes with library defaults, to be trained on
# the meta features only.
model = naive_bayes.MultinomialNB()

In [None]:
# Fit on the training split only; the validation rows stay held out for scoring.
model.fit(x_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [None]:
# Predict on both splits so training and validation accuracy can be compared.
pred_train = model.predict(x_train)
pred_valid = model.predict(x_valid)

In [None]:
# Accuracy on the rows the model was fitted on.
accuracy_score(y_train, pred_train)

0.9430485762144054

In [None]:
# Accuracy on the held-out rows. The recorded label distribution is ~86.6% ham,
# so always predicting "ham" would already score about 0.866 on a stratified
# split; read this number against that floor.
accuracy_score(y_valid, pred_valid)

0.9382627422828428

# TF-IDF FEATURES

In [None]:
# Word-level TF-IDF vectoriser with the vocabulary capped at 500 terms.
word_tfidf = TfidfVectorizer(max_features = 500)

In [None]:
# Learn the vocabulary and IDF weights from the training rows only, so nothing
# about the validation messages leaks into the features. train_test_split picks
# rows from the sample count, the stratify labels and random_state alone, so this
# call selects the same training rows as the split applied to the combined
# feature matrix further down (same seed, same stratify labels).
cleaned_train = train_test_split(
    data["cleaned"].values,
    random_state=20,
    stratify=target,
)[0]

word_tfidf.fit(cleaned_train)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=500,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [None]:
# Vectorise every cleaned message (training and validation rows alike) with the
# fitted vocabulary; rows keep the order of `data`.
word_vectors_tfidf = word_tfidf.transform(data["cleaned"].values)

In [None]:
# Display the matrix summary (shape, dtype, number of stored elements).
word_vectors_tfidf

<5572x500 sparse matrix of type '<class 'numpy.float64'>'
	with 21920 stored elements in Compressed Sparse Row format>

# TRAIN / VAL SPLIT (TF-IDF + META FEATURES)

In [None]:
# The same seven meta-feature columns used for the first model.
meta_feature_columns = [
    'word_count',
    'word_count_cleand',
    'char_count',
    'char_count_without_spaces',
    'num_dig',
    'noun_count',
    'verb_count',
]
feature_set1 = data[meta_feature_columns]

# Column-wise concatenation: TF-IDF columns first, then the meta features.
# NOTE(review): the TF-IDF rows are L2-normalised (norm='l2' in the vectoriser repr)
# while the meta features are raw counts in the tens or hundreds, so the two groups
# sit on very different scales; confirm this is intended for MultinomialNB.
train = hstack(
    [word_vectors_tfidf, csr_matrix(feature_set1)],
    format="csr",
)

In [None]:
# Same seed and stratification as the meta-features-only split, now applied to
# the combined sparse matrix; rebinding these names replaces the earlier split.
x_train, x_valid, y_train, y_valid = train_test_split(
    train,
    target,
    stratify=target,
    random_state=20,
)

# MODEL (TF-IDF + META FEATURES)

In [None]:
# Fresh MultinomialNB for the combined TF-IDF + meta feature matrix. Rebinding
# `model` discards the meta-features-only classifier fitted earlier.
model = naive_bayes.MultinomialNB()

In [None]:
# Fit on the training rows of the combined feature matrix.
model.fit(x_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [None]:
# Predict on both splits; these names overwrite the predictions of the first model.
pred_train = model.predict(x_train)
pred_valid = model.predict(x_valid)

In [None]:
# Accuracy on the rows the combined-feature model was fitted on.
accuracy_score(y_train, pred_train)

0.9676956209619526

In [None]:
# Held-out accuracy for the combined features. Compare with the meta-features-only
# validation score from the recorded run (~0.938) and with the ~0.866 share of ham
# that an always-"ham" classifier would reach.
accuracy_score(y_valid, pred_valid)

0.9619526202440776