# SMS Spam Detection Using Machine Learning


---


## Practice Module: Pattern Recognition Systems (PRS)

## Group: 18

## Members:

Lim Jun Ming, A0231523U

Mediana, A0231458E

Yeong Wee Ping, A0231533R

# Feature Engineering & Selection

## 0. File Path & Library Setup

In [1]:
# Load All Necessary Packages

import os
import time
from google.colab import drive

import pandas as pd
import numpy as np

import re
import fnmatch
import nltk

from collections import defaultdict
from nltk.tokenize import word_tokenize
nltk.download('punkt')

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from imblearn.over_sampling import SMOTE

import h5py
import pickle

seed = 18

print('Versions of key libraries')
print('-------------------------')
print('pandas:  ', pd.__version__)
print('numpy:   ', np.__version__)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
Versions of key libraries
-------------------------
pandas:   1.1.5
numpy:    1.19.5




In [2]:
# Mounting to Google Drive
drive.mount('/content/gdrive')

# Change Working Directory
os.chdir('/content/gdrive/My Drive/iss/prs_pm/training')

print('Working Directory: ')
!pwd

Mounted at /content/gdrive
Working Directory: 
/content/gdrive/My Drive/iss/prs_pm/training


## 1. Load Data and Train-Validation-Test Split

In [3]:
# Train Test Set Split
header = ['Label', 'Text']
rawdata = pd.read_csv('structured_data/smsdata.csv', encoding='UTF-8', names=header)
data = pd.read_csv('structured_data/procdata.csv', encoding='UTF-8', names=header)

data['Text'] = data['Text'].astype(str)
data['Text_Token'] = data['Text'].apply(lambda x: x.split(' '))

le = LabelEncoder()

X_data = data["Text"]
y_data = le.fit_transform(data["Label"])

# Split Train-Validation-Test set 0.7-0.15-0.15 ratio
feature_train, feature_valntest, y_train, y_valntest = train_test_split(X_data, y_data, test_size=0.3, stratify=y_data, random_state=seed) 

feature_val, feature_test, y_val, y_test = train_test_split(feature_valntest, y_valntest, test_size=0.5, stratify=y_valntest, random_state=seed)

train_idx = feature_train.index
val_idx = feature_val.index
test_idx = feature_test.index

X_train_token = data['Text_Token'][train_idx] 
X_val_token = data['Text_Token'][val_idx]
X_test_token = data['Text_Token'][test_idx] 

X_train_sent = data['Text'][train_idx] 
X_val_sent = data['Text'][val_idx] 
X_test_sent = data['Text'][test_idx] 

y_train_sm = np.append(y_train, np.ones((y_train==0).sum()-(y_train==1).sum())).astype(np.int)

print('Total number of data points in training dataset    : ' + str(feature_train.shape[0]))
print('Total number of data points in validation dataset  : ' + str(feature_val.shape[0]))
print('Total number of data points in testing dataset     : ' + str(feature_test.shape[0]))

Total number of data points in training dataset    : 4085
Total number of data points in validation dataset  : 876
Total number of data points in testing dataset     : 876


In [4]:
# Saving y label dataset into folder in pickle format
y_pickle = [y_train, y_val, y_test]
y_file = open('input_data/y_label.pickle', 'wb')
pickle.dump(y_pickle, y_file)
y_file.close()

print('Save Complete!')

Save Complete!


In [5]:
# # Testing Load Speed

# start = time.time()
# pickle_load = pickle.load(open('input_data/y_label.pickle', 'rb'))
# runspeed = round(time.time()-start,3)
# print(runspeed)

In [6]:
# Create word dictionary sorted by frequency
spam_words = " ".join([word for msg in data['Text_Token'][data['Label']=='spam'] for word in msg]) # list of words

# Create spam words dictionary
spam_dic = defaultdict(int)
for i in spam_words.split(" "):
  spam_dic[i] += 1
spam_dic_sorted = dict(sorted(spam_dic.items(), key=lambda item:item[1], reverse=True))
spam_show = 20 # To show top 20 spam words

## Approach 1: Manual Feature Design

In [7]:
# Custom Functions for manual feature extractions
# Count for the presence of mathematical symbols
math_symbol = ["+", "-", "/", "^", "<", ">"]
def count_math_symbol(sms):
    counter = 0
    for i in math_symbol:
      counter += sms.count(i)
    return counter

# Count for the presence of special symbols 
special_symbol = ["!", "@", "$", "~", "#", "&", "*"]
def count_special_symbol(sms):
    counter = 0
    for i in special_symbol:
      counter += sms.count(i)
    return counter

# Count for the presence of fully uppercased words
def count_uppercase(sms):
    counter = 0
    text = sms.strip() # Strip off excess spacing in the end of message
    text = re.sub(r'\d+', '', text) # Remove all numbers
    text = re.sub('[^A-Za-z0-9]+', ' ', text) # Remove punctuations
    text = word_tokenize(text) # Tokenization
    for i in text:
      counter += i.isupper()
    return counter

# Count for the presence of the top 20 common words for spam messages
# List of top 20 spam words
top_spam_words = list(spam_dic_sorted.keys())[:20]
# Function for counting spam words in processed text
def count_spam_words(tokens):
    counter = 0
    for i in top_spam_words:
      counter += tokens.count(i)
    return counter

# Indicator for presence of phone number (Taking consecutive numbers of length 5 or 8 and above as phone number)
phoneno_format = ["*xxxxx*", "*xxxx-xxxx*", "*xxxxxxxx*", "*+xx-xxxx-xxxx*"]
def check_phoneno(sms):
    text = re.sub(r'\d', 'x', sms)
    indicator = 0
    for i in phoneno_format:
      if fnmatch.fnmatch(text, i):
        indicator = 1
        break
    return indicator

# Function to extract all manually designed features
def mfd_extract(sms):
    feature = []
    feature.append(count_math_symbol(sms))
    feature.append(count_special_symbol(sms))
    feature.append(count_uppercase(sms))
    # feature.append(count_spam_words(text_process(sms)))
    feature.append(check_phoneno(sms))
    # feature.append(len(text_process(sms)))
    return feature

In [8]:
# Creating the feature map dataframe (For manual feature design)
feature_vector = rawdata['Text'].apply(lambda x: mfd_extract(x))
feature_name = ['Math_Symbol', 'Special_Symbol', 'Uppercase_Words', 'Phone_Number']# On Raw Data
data_mfd = pd.DataFrame(np.vstack(feature_vector), columns=feature_name) 
data_mfd['Spam_Words'] = data['Text_Token'].apply(lambda x: count_spam_words(x)) # Add feature: number of occurence of top 20 spam words
data_mfd['Num_WOrds'] = data['Text_Token'].apply(lambda x: len(x)) # Add feature: length of preprocessed text
X_train_mfd = data_mfd.loc[train_idx, :]
X_val_mfd = data_mfd.loc[val_idx, :]
X_test_mfd = data_mfd.loc[test_idx, :]


# Normalization
train_max = X_train_mfd.max() # Max Values
train_min = X_train_mfd.min() # Min Values
feature_name = X_train_mfd.columns
for i in data_mfd.columns:
  if i != 'Phone_Number': # Normalization exclude 'Phone_Number' feature
    X_train_mfd[i] = X_train_mfd[i].apply(lambda x: (x - train_min[i])/(train_max[i]- train_min[i]))
    X_val_mfd[i] = X_val_mfd[i].apply(lambda x: (x - train_min[i])/(train_max[i]- train_min[i]))
    X_test_mfd[i] = X_test_mfd[i].apply(lambda x: (x - train_min[i])/(train_max[i]- train_min[i]))

print('Feature Map for Train Dataset:\n')
print(X_train_mfd.head())
print('------------------------------------------------------------\n')
print('Feature Map for Validation Dataset:\n')
print(X_val_mfd.head())
print('------------------------------------------------------------\n')
print('Feature Map for Test Dataset:\n')
print(X_test_mfd.head())
print('------------------------------------------------------------')

Feature Map for Train Dataset:

      Math_Symbol  Special_Symbol  ...  Spam_Words  Num_WOrds
1074          0.0             0.0  ...         0.0   0.097222
1310          0.0             0.0  ...         0.0   0.055556
5257          0.0             0.0  ...         0.2   0.041667
597           0.0             0.0  ...         0.0   0.083333
1530          0.0             0.0  ...         0.0   0.180556

[5 rows x 6 columns]
------------------------------------------------------------

Feature Map for Validation Dataset:

      Math_Symbol  Special_Symbol  ...  Spam_Words  Num_WOrds
1711     0.000000        0.055556  ...         0.0   0.069444
687      0.615385        0.000000  ...         0.1   0.236111
1229     0.153846        0.018519  ...         0.0   0.027778
3460     0.000000        0.000000  ...         0.0   0.138889
617      0.000000        0.000000  ...         0.1   0.055556

[5 rows x 6 columns]
------------------------------------------------------------

Feature Map for Tes

In [9]:
# Saving Constants - Top 20 Spam Words
np.savetxt('feature_extraction_constants/top20spamwords.txt', top_spam_words, delimiter=" ", fmt="%s") 

In [10]:
# Saving Constants - Max and Min Parameters Values for Normalization
minmaxscale = pd.DataFrame(np.array([train_min, train_max]).transpose(), index=data_mfd.columns, columns=['Min', 'Max'])
minmaxscale.to_csv('feature_extraction_constants/minmaxscale.csv')


In [11]:
# Saving X_mfd dataset into folder in pickle format
X_mfd_pickle = [X_train_mfd, X_val_mfd, X_test_mfd]
X_mfd_file = open('input_data/X_mfd.pickle', 'wb')
pickle.dump(X_mfd_pickle, X_mfd_file)
X_mfd_file.close()

print('Save Complete!')

Save Complete!


In [12]:
# # Testing Load Speed

# start = time.time()
# pickle_load = pickle.load(open('input_data/X_mfd.pickle', 'rb'))
# runspeed = round(time.time()-start,3)
# print(runspeed)

## Approach 2: Bag-Of-Words (BOW)

In [13]:
# Initialize Tensorflow Tokenizer
tk = Tokenizer()

# Fit Tokenizer
X_train_tf = tk.fit_on_texts(X_train_sent)

### i. CountVectorizer

In [14]:
X_train_cv = tk.texts_to_matrix(X_train_sent, mode='count')
X_val_cv = tk.texts_to_matrix(X_val_sent, mode='count')
X_test_cv = tk.texts_to_matrix(X_test_sent, mode='count')

print('Train data feature space shape      : ' + str(X_train_cv.shape))
print('Validation data feature space shape : ' + str(X_val_cv.shape))
print('Test data feature space shape       : ' + str(X_test_cv.shape))
print('Size of Vocabulary                  : ' + str(X_train_cv.shape[1]))

Train data feature space shape      : (4085, 6390)
Validation data feature space shape : (876, 6390)
Test data feature space shape       : (876, 6390)
Size of Vocabulary                  : 6390


In [15]:
# Saving X_cv dataset into folder in pickle format
X_cv_pickle = [X_train_cv, X_val_cv, X_test_cv]
X_cv_file = open('input_data/X_cv.pickle', 'wb')
pickle.dump(X_cv_pickle, X_cv_file)
X_cv_file.close()

print('Save Complete!')

Save Complete!


In [16]:
# # Testing Load Speed

# start = time.time()
# pickle_load = pickle.load(open('input_data/X_cv.pickle', 'rb'))
# runspeed = round(time.time()-start,3)
# print(runspeed)

### ii. Tf-Idf Vectorizer

In [17]:
X_train_tfidf = tk.texts_to_matrix(X_train_sent, mode='tfidf')
X_val_tfidf = tk.texts_to_matrix(X_val_sent, mode='tfidf')
X_test_tfidf = tk.texts_to_matrix(X_test_sent, mode='tfidf')

print('Train data feature space shape      : ' + str(X_train_tfidf.shape))
print('Validation data feature space shape : ' + str(X_val_tfidf.shape))
print('Test data feature space shape       : ' + str(X_test_tfidf.shape))
print('Size of Vocabulary                  : ' + str(X_train_tfidf.shape[1]))

Train data feature space shape      : (4085, 6390)
Validation data feature space shape : (876, 6390)
Test data feature space shape       : (876, 6390)
Size of Vocabulary                  : 6390


In [18]:
# Saving X_tfidf dataset into folder in pickle format
X_tfidf_pickle = [X_train_tfidf, X_val_tfidf, X_test_tfidf]
X_tfidf_file = open('input_data/X_tfidf.pickle', 'wb')
pickle.dump(X_tfidf_pickle, X_tfidf_file)
X_tfidf_file.close()

print('Save Complete!')

Save Complete!


In [19]:
# # Testing Load Speed

# start = time.time()
# pickle_load = pickle.load(open('input_data/X_tfidf.pickle', 'rb'))
# runspeed = round(time.time()-start,3)
# print(runspeed)

### iii. Word Dictionary Index Sequencing

In [31]:
# Function for padding vectors
def CreateIndexMatrix(data, tokenizer=tk, sms_length=50, pad_mode='post'):
    mat = tokenizer.texts_to_sequences(data)
    mat = pad_sequences(mat, maxlen=sms_length, padding=pad_mode)
    return mat

# Size 50 words length vectorization
size30 = 30
X_train_30idx = CreateIndexMatrix(X_train_sent, tk, size30)
X_val_30idx = CreateIndexMatrix(X_val_sent, tk, size30)
X_test_30idx = CreateIndexMatrix(X_test_sent, tk, size30)

print('Feature Space Information based on 50 words index vectorization:')
print('Train data feature space shape      : ' + str(X_train_30idx.shape))
print('Validation data feature space shape : ' + str(X_val_30idx.shape))
print('Test data feature space shape       : ' + str(X_test_30idx.shape))
print('Size of Vocabulary                  : ' + str(len(list(tk.word_counts.keys()))))


Feature Space Information based on 50 words index vectorization:
Train data feature space shape      : (4085, 30)
Validation data feature space shape : (876, 30)
Test data feature space shape       : (876, 30)
Size of Vocabulary                  : 6389


In [21]:
# Saving X_30idx dataset into folder in pickle format
X_30idx_pickle = [X_train_30idx, X_val_30idx, X_test_30idx]
X_30idx_file = open('input_data/X_30idx.pickle', 'wb')
pickle.dump(X_30idx_pickle, X_30idx_file)
X_30idx_file.close()

print('Save Complete!')

Save Complete!


In [22]:
# # Testing Load Speed

# start = time.time()
# pickle_load = pickle.load(open('input_data/X_30idx.pickle', 'rb'))
# runspeed = round(time.time()-start,3)
# print(runspeed)

In [23]:
# Saving Serialized Tokenizer
tokenizer_file = open('feature_extraction_constants/bowtokenizer.pickle', 'wb')
pickle.dump(tk, tokenizer_file, protocol=pickle.HIGHEST_PROTOCOL)
tokenizer_file.close()

## Approach 3: Pre-Trained Word Embeddings

In [24]:
# Load and Read Pre-Trained Word Embeddings into Dictionaries

embeddings_50d = {}
with open("raw_data/glove.6B.50d.txt", 'r') as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], "float32")
        embeddings_50d[word] = vector


In [25]:
# Filter pre-trained word embeddings to training dataset word vocabulary
vec_dim_50d = len(list(embeddings_50d.values())[0])
# vec_dim_100d = len(list(embeddings_100d.values())[0])
# vec_dim_200d = len(list(embeddings_200d.values())[0])

train_vocab = list(tk.word_index.keys())
train_vocab_vect = {}
for word in train_vocab:
  vect_50d  = embeddings_50d.get(word)
  if vect_50d is None: # If the word is not found in GloVe 50d vector dictionary, assign vector of zeros to the word
    vect_50d = np.zeros((vec_dim_50d))
  train_vocab_vect[word] = [vect_50d]

dict_vect = np.array(list(train_vocab_vect.values()), dtype=object)
lost_50d  = sum([np.array_equal(np.zeros(vec_dim_50d), word) for word in dict_vect[:, 0]])

print('Total number of unique words from training dataset           : ' + str(len(train_vocab)))
print('Total number of vectorized words using 50d GloVe Embeddings  : ' + str(len(train_vocab) - lost_50d))
print('Total number of words not vectorized and assigned with zeros : ' + str(lost_50d))

Total number of unique words from training dataset           : 6389
Total number of vectorized words using 50d GloVe Embeddings  : 5181
Total number of words not vectorized and assigned with zeros : 1208


In [26]:
# Extract pre-trained embeddings based on training vocabulary words
train_50d_dic = {}
for word in list(train_vocab_vect.keys()):
  if not(np.array_equal(train_vocab_vect[word][0], np.zeros(50))):
    train_50d_dic[word] = train_vocab_vect[word][0]


In [27]:
# Save Pre-trained GloVe Embeddings
with open('feature_extraction_constants/embeddingsglove.pickle', 'wb') as f:
    pickle.dump(train_50d_dic, f, protocol=pickle.HIGHEST_PROTOCOL)
f.close()


In [28]:
# Function for transforming data using pre-trained GloVe embeddings on local vocabulary
def convert_to_emb(tokens, emb_dim, sms_length=50):
    vect = np.zeros((sms_length, emb_dim))
    for i in np.arange(min(len(tokens), sms_length)):
      get_vect = train_vocab_vect.get(tokens[i])
      if get_vect is not None:
        vect[i] = get_vect[0]
    return vect

print(' ')
# Embedding dim = 50, tokens lenght = 30
mlen = 30
emb = 50
X_train_vec_30x50 = np.array([convert_to_emb(tokens, emb, mlen) for tokens in X_train_token])
X_val_vec_30x50 = np.array([convert_to_emb(tokens, emb, mlen) for tokens in X_val_token])
X_test_vec_30x50 = np.array([convert_to_emb(tokens, emb, mlen) for tokens in X_test_token])
print('Shapes of train dataset of word length 50 and embedding size 50 is       : ' + str(X_train_vec_30x50.shape))
print('Shapes of validation dataset of word length 50 and embedding size 50 is  : ' + str(X_val_vec_30x50.shape))
print('Shapes of test dataset of word length 50 and embedding size 50 is        : ' + str(X_test_vec_30x50.shape))
print('-------------------------------------------------------------------------------------------\n')



 
Shapes of train dataset of word length 50 and embedding size 50 is       : (4085, 30, 50)
Shapes of validation dataset of word length 50 and embedding size 50 is  : (876, 30, 50)
Shapes of test dataset of word length 50 and embedding size 50 is        : (876, 30, 50)
-------------------------------------------------------------------------------------------



In [29]:
# Saving X_vec_30x50 dataset into folder in pickle format
X_vec_30x50_pickle = [X_train_vec_30x50, X_val_vec_30x50, X_test_vec_30x50]
X_vec_30x50_file = open('input_data/X_vec_30x50.pickle', 'wb')
pickle.dump(X_vec_30x50_pickle, X_vec_30x50_file)
X_vec_30x50_file.close()

print('Save Complete!')

Save Complete!


In [30]:
# # Testing Load Speed

# start = time.time()
# pickle_load = pickle.load(open('input_data/X_vec_30x50.pickle', 'rb'))
# runspeed = round(time.time()-start,3)
# print(runspeed)