In [3]:
# standard lib
import re
from collections import Counter

# data lib
import numpy as np
import pandas as pd

# nlp lib
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

# preprocessing lib
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# modeling lib
from sklearn.svm import SVC

# metric lib
from sklearn.metrics import f1_score

In [15]:
# Load the raw SMS spam dataset; the file is decoded with the 'latin' codec.
data = pd.read_csv('../input/sms-spam-collection-dataset/spam.csv' , encoding = 'latin')

In [16]:
data

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


In [17]:
# Keep only the label (v1) and message text (v2) columns by dropping the three
# trailing columns at positions 2-4. Slicing `data.columns[2:5]` and rebinding
# `data` (rather than indexing each position and dropping in place) keeps the
# cell re-runnable: on an already-trimmed frame the slice is empty and the
# drop is a no-op instead of an IndexError.
data = data.drop(columns=data.columns[2:5])

In [18]:
data

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [19]:
# Integer-encode the label column and remember which integer stands for which class.
encoder = LabelEncoder()
data['v1'] = encoder.fit_transform(data['v1'])

# Encoded integer -> original label string.
class_mapping = dict(enumerate(encoder.classes_))

In [20]:
class_mapping

{0: 'ham', 1: 'spam'}

In [21]:
data

Unnamed: 0,v1,v2
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...
5568,0,Will Ì_ b going to esplanade fr home?
5569,0,"Pity, * was in mood for that. So...any other s..."
5570,0,The guy did some bitching but I acted like i'd...


In [22]:
# process an email
def process_email(contents):
    """
    Normalise a raw message string into a list of stemmed tokens.

    The text is lower-cased; HTML-like tags are blanked out; digit runs, URLs,
    email addresses and dollar signs are replaced by placeholder words. The
    result is tokenised, each token is stripped of non-alphanumeric characters
    and Porter-stemmed, and tokens that end up empty are discarded.
    """
    # Applied in this exact order: later patterns see the output of earlier ones.
    substitutions = (
        (r'<[^<>]+>' , ' '),
        (r'[0-9]+' , 'number'),
        (r'(http|https)://[^\s]*' , 'httpsaddr'),
        (r'[^\s]+@[^\s]+' , 'emailaddr'),
        (r'[$]+' , 'dollar'),
    )

    stemmer = PorterStemmer()

    text = contents.lower()
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    stemmed = (
        stemmer.stem(re.sub(r'[^a-zA-Z0-9]' , '' , token))
        for token in word_tokenize(text)
    )

    # Tokens made only of non-alphanumeric characters are emptied by the
    # substitution above; drop whatever is empty after stemming.
    return [token for token in stemmed if len(token) >= 1]

In [23]:
# Sample message for a manual smoke test of process_email: it contains quoted
# lines, plain numbers, a dollar amount, a URL and an email address.

email = """> Anyone knows how much it costs to host a web portal ?
>
Well, it depends on how many visitors you're expecting.
This can be anywhere from less than 10 bucks a month to a couple of $100.
You should checkout https://www.rakspace.com/ or perhaps Amazon EC2
if youre running something big..

To unsubscribe yourself from this mailing list, send an email to:
groupname-unsubscribe@egroups.com

"""

In [24]:
process_email(email)

['anyon',
 'know',
 'how',
 'much',
 'it',
 'cost',
 'to',
 'host',
 'a',
 'web',
 'portal',
 'well',
 'it',
 'depend',
 'on',
 'how',
 'mani',
 'visitor',
 'you',
 're',
 'expect',
 'thi',
 'can',
 'be',
 'anywher',
 'from',
 'less',
 'than',
 'number',
 'buck',
 'a',
 'month',
 'to',
 'a',
 'coupl',
 'of',
 'dollarnumb',
 'you',
 'should',
 'checkout',
 'httpsaddr',
 'or',
 'perhap',
 'amazon',
 'ecnumb',
 'if',
 'your',
 'run',
 'someth',
 'big',
 'to',
 'unsubscrib',
 'yourself',
 'from',
 'thi',
 'mail',
 'list',
 'send',
 'an',
 'email',
 'to',
 'emailaddr']

In [38]:
# vocabulary 
def get_vocabulary(emails , vocab_length , tokenizer = None):
    """
    Build an index -> word vocabulary of the most frequent words in a corpus.

    Parameters
    ----------
    emails : iterable of str
        Raw message texts. The iterable is only read, never modified.
    vocab_length : int
        Maximum number of words to keep.
    tokenizer : callable, optional
        Maps one raw text to a list of tokens. Defaults to ``process_email``.

    Returns
    -------
    dict
        ``{index: word}`` where index 0 is the most frequent word. Words with
        equal counts keep the order in which they were first encountered.
        The dict is shorter than ``vocab_length`` when the corpus contains
        fewer distinct words.
    """
    if tokenizer is None:
        tokenizer = process_email

    # Tally tokens without writing the tokenised form back into `emails`,
    # so the caller's list of raw texts stays intact.
    counts = Counter()
    for email in emails:
        counts.update(tokenizer(email))

    # most_common() is a stable descending sort by count; slicing afterwards
    # keeps plain list-slice semantics for `vocab_length`.
    most_frequent = counts.most_common()[:vocab_length]
    return {index : word for index , (word , _count) in enumerate(most_frequent)}

In [39]:
def get_key(dictionary, val):
    """
    Return the first key in ``dictionary`` whose value equals ``val``.

    Keys are examined in the dictionary's iteration order; ``None`` is
    returned when no value matches.
    """
    matching_keys = (key for key, value in dictionary.items() if value == val)
    return next(matching_keys, None)

In [31]:
def get_indices(email, vocabulary):
    """
    Get the indices of vocab words used in a given email

    Parameters
    ----------
    email : iterable of str
        Tokens of one processed email.
    vocabulary : dict
        ``{index: word}`` mapping, as returned by ``get_vocabulary``.

    Returns
    -------
    set
        Indices of the vocabulary words that occur in ``email``; empty when
        no token is in the vocabulary.
    """
    # Invert the vocabulary once so each token costs a single dict lookup
    # instead of a scan over every vocabulary value. setdefault keeps the
    # first index should a word ever be listed twice.
    index_of = {}
    for index, word in vocabulary.items():
        index_of.setdefault(word, index)

    return {index_of[word] for word in email if word in index_of}

In [33]:
def get_feature_vector(word_indices, vocab_length):
    """
    Build a binary bag-of-words vector for one email.

    Returns a NumPy array of ``vocab_length`` zeros in which every position
    listed in ``word_indices`` has been set to 1.
    """
    vector = np.zeros(vocab_length)
    for position in word_indices:
        vector[position] = 1
    return vector

In [34]:
# Number of most-frequent words kept in the vocabulary, and therefore the
# length of every feature vector.
vocab_length = 2000

In [41]:
# The vocabulary is built from its own list of the raw message texts.
vocabulary = get_vocabulary(data['v2'].to_list(), vocab_length)

# Tokenise every message for feature extraction.
emails = [process_email(text) for text in data['v2'].to_list()]

In [42]:
# Target vector: the integer-encoded label column.
y = data['v1']

In [43]:
# One binary bag-of-words row per message. The helpers are called by the
# snake_case names they are defined under (get_feature_vector / get_indices);
# camelCase spellings are not defined in the notebook and raise NameError on a
# fresh kernel.
X = [get_feature_vector(get_indices(tokens, vocabulary), vocab_length) for tokens in emails]
X = pd.DataFrame(np.array(X).astype(np.int16))

In [44]:
# Hold out 20% of the messages for evaluation. The split is seeded so the
# reported scores are reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=42)

In [45]:
# Support-vector classifier constructed with no arguments, i.e. library defaults.
model = SVC()

# Fit on the training split; as the cell's last expression the call's return
# value is echoed as the cell output.
model.fit(X_train, y_train)

SVC()

In [46]:
model.score(X_test, y_test)

0.9856502242152466

In [47]:
# Predicted labels for the held-out split; reused by the F1 computations below.
y_pred = model.predict(X_test)

In [48]:
f1_score(y_test, y_pred)

0.9477124183006536

In [53]:
# Print both evaluation figures for the held-out split side by side: the
# estimator's own score() and the F1 score of the stored predictions.
print(f"Model Score : {model.score(X_test , y_test)}")
print(f"F1 Score : {f1_score(y_test , y_pred)}")

Model Score : 0.9856502242152466
F1 Score : 0.9477124183006536
