# Spam Classification 

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the raw dataset (the CSV is decoded as latin-1) and preview the first rows.
spam_data = pd.read_csv('spam.csv', encoding='latin-1')
spam_data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


##### Clean the data

In [3]:
# Clean the raw frame in one guarded, non-mutating step.
# The guard on the raw column name 'v1' keeps this cell idempotent: re-running
# it after the frame has already been cleaned is a no-op instead of raising a
# KeyError from drop() (or, worse, re-coding the already-numeric target to 0).
if 'v1' in spam_data.columns:
    spam_data = (
        spam_data
        # Delete useless columns from dataframe
        .drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'])
        # Rename columns v1 and v2 to 'target' and 'text'
        .rename(columns={'v1': 'target', 'v2': 'text'})
        # Change the data in the target variable to be a binary indicator:
        # 1 = spam, 0 = not spam (ham)
        .assign(target=lambda df: np.where(df['target'] == 'spam', 1, 0))
    )

In [4]:
# Preview the cleaned frame: only the binary `target` and the `text` columns remain.
spam_data.head()

Unnamed: 0,target,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
# Hold out a test set: X_* carry the message text (the feature) and y_* the
# 0/1 target. random_state is fixed so the split is reproducible.

from sklearn.model_selection import train_test_split


X_train, X_test, y_train, y_test = train_test_split(
    spam_data['text'],
    spam_data['target'],
    random_state=0,
)

Calculate the percentage of the data labeled as spam

In [6]:
# Percentage of rows labeled spam (target == 1); shows that the dataset is imbalanced

100 * int((spam_data['target'] == 1).sum()) / len(spam_data)

13.406317300789663

#### Build a multinomial Naive Bayes model using CountVectorizer to transform the current features (SMS text) into a vector of word counts.
Since the classes in the data are imbalanced, instead of using the accuracy score of the model, which measures (true positives + true negatives) / all instances, I'll evaluate the performance of the model by calculating the area under the ROC (Receiver Operating Characteristic) curve, which plots the true positive rate against the false positive rate. The closer the area is to 1, the better the performance.

In [7]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import roc_auc_score
from sklearn.feature_extraction.text import CountVectorizer

In [8]:
# Train and fit a naive bayes classifier with parameters alpha = 0.1
# the alpha parameter will ensure that the p(word|class) != 0 even if that word 
# has not previously been encountered by our classifier

def naive_bayes():
    """Fit a multinomial Naive Bayes model on word counts and print a report.

    Uses the notebook-level ``X_train``, ``X_test``, ``y_train`` and ``y_test``.
    A ``CountVectorizer`` is fitted on the training text only, the model is
    trained with ``alpha=0.1`` and the test text is classified.

    Prints the ten vocabulary entries with the lowest and the highest
    log P(word | spam), followed by ``roc_auc_score`` of the test predictions.
    Returns ``None``.
    """
    vect = CountVectorizer().fit(X_train)
    X_train_vectorized = vect.transform(X_train)
    model = MultinomialNB(alpha=0.1).fit(X_train_vectorized, y_train)
    predictions = model.predict(vect.transform(X_test))

    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and fall back to the old name on older releases.
    get_names = getattr(vect, 'get_feature_names_out', None) or vect.get_feature_names
    feature_names = np.array(get_names())

    # MultinomialNB.coef_ was removed in scikit-learn 1.1. For a two-class
    # model it was feature_log_prob_[1] (the log-probabilities for class 1,
    # i.e. spam), so sorting that row reproduces the original ordering.
    sorted_coef_index = model.feature_log_prob_[1].argsort()

    # Return feature names (words) sorted by the sorted coefficient index value and slice top 10 and bottom 10 segments
    print('Words least likely to predict spam:\n{}\n'.format(feature_names[sorted_coef_index[:10]]))
    print('Words most likely to predict spam: \n{}\n'.format(feature_names[sorted_coef_index[:-11:-1]]))
    # NOTE(review): the score is computed from hard 0/1 predictions, which
    # yields a single-threshold ROC; passing model.predict_proba(...)[:, 1]
    # would give the full area under the curve. Left unchanged so the
    # recorded outputs stay comparable.
    print('ROC_auc Score: {}'.format(roc_auc_score(y_test, predictions)))
    

In [9]:
# Run the word-count Naive Bayes experiment; it prints the extreme-coefficient words and the ROC AUC score.
naive_bayes()

Words least likely to predict spam:
['kaiez' 'needy' 'needs' 'needing' 'needed' 'needa' 'ned' 'necklace' 'neck'
 'necessity']

Words most likely to predict spam: 
['to' 'call' 'you' 'your' 'free' 'for' 'the' 'now' 'or' 'txt']

ROC_auc Score: 0.9720812182741116


The smallest coefficients learned by the model correspond to words that rarely (or never) appear in messages labeled as spam, whereas the largest coefficients correspond to the words that appear most often in spam messages.

#### Naive Bayes model using TfidfVectorizer
Unlike the count vectorizer, which uses a 'bag of words' approach to give us a word-count frequency as a feature, tf-idf (term frequency–inverse document frequency) uses a weighting system to determine an individual word's importance based on its frequency in a document and in the corpus as a whole. The weight increases with the word's frequency in the document but decreases with its frequency across the corpus. By this method (as opposed to simply returning counts of words) tf-idf will penalize stop words such as 'the', 'and', and 'a', as these words likely won't be strong indicators of what we're trying to predict.

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
def nb_tfidf():
    """Fit a multinomial Naive Bayes model on tf-idf features and print a report.

    Uses the notebook-level ``X_train``, ``X_test``, ``y_train`` and ``y_test``.
    A ``TfidfVectorizer(min_df=3)`` is fitted on the training text only, the
    model is trained with ``alpha=0.1`` and the test text is classified.

    Prints the ten vocabulary entries with the lowest and the highest
    log P(word | spam), followed by ``roc_auc_score`` of the test predictions.
    Returns ``None``.
    """
    vect = TfidfVectorizer(min_df=3).fit(X_train)
    X_train_vectorized = vect.transform(X_train)
    model = MultinomialNB(alpha=0.1).fit(X_train_vectorized, y_train)
    predictions = model.predict(vect.transform(X_test))

    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and fall back to the old name on older releases.
    get_names = getattr(vect, 'get_feature_names_out', None) or vect.get_feature_names
    feature_names = np.array(get_names())

    # MultinomialNB.coef_ was removed in scikit-learn 1.1. For a two-class
    # model it was feature_log_prob_[1] (the log-probabilities for class 1,
    # i.e. spam), so sorting that row reproduces the original ordering.
    sorted_coef_index = model.feature_log_prob_[1].argsort()

    print('Words least likely to predict spam:\n{}\n'.format(feature_names[sorted_coef_index[:10]]))
    print('words most likely to predict spam: \n{}\n'.format(feature_names[sorted_coef_index[:-11:-1]]))

    # NOTE(review): the score is computed from hard 0/1 predictions rather
    # than predicted probabilities; left unchanged so the recorded outputs
    # stay comparable.
    print('ROC_auc Score: {}'.format(roc_auc_score(y_test, predictions)))

In [11]:
# Run the tf-idf Naive Bayes experiment; it prints the extreme-coefficient words and the ROC AUC score.
nb_tfidf()

Words least likely to predict spam:
['ûò' 'hell' 'height' 'hee' 'heavy' 'shower' 'showing' 'head' 'hella' 'he']

words most likely to predict spam: 
['to' 'call' 'free' 'your' 'txt' 'you' 'or' 'for' 'now' 'stop']

ROC_auc Score: 0.9416243654822335


To improve the performance of the model I want to add features to the feature space that serve as strong indicators of whether or not a message is classified as spam.

In [12]:
# Partition the rows by label: target 0 is ham (not spam), target 1 is spam.
not_spam = spam_data.loc[spam_data['target'] == 0]
spam = spam_data.loc[spam_data['target'] == 1]

In [13]:
# Mean character length of the messages in each class.
spam_text_len = spam['text'].str.len().mean()
non_spam_text_len = not_spam['text'].str.len().mean()
print(
    'Average length of spam messages: {}\nAverage length of non-spam messages: {}'.format(
        spam_text_len, non_spam_text_len
    )
)

Average length of spam messages: 138.8661311914324
Average length of non-spam messages: 71.02362694300518


In [14]:
# Mean number of digit characters per message in each class.
spam_digit_count = spam['text'].str.count(r'\d').mean()
non_spam_digit_count = not_spam['text'].str.count(r'\d').mean()

print(
    'Average number of digets in spam messages: {}\nAverage number of digits in non-spam messages: {}'.format(
        spam_digit_count, non_spam_digit_count
    )
)

Average number of digets in spam messages: 15.759036144578314
Average number of digits in non-spam messages: 0.2992746113989637


In [15]:
# Mean number of non-word characters (regex \W) per message in each class.
no_word_spam = spam['text'].str.count(r'\W').mean()
no_word_not_spam = not_spam['text'].str.count(r'\W').mean()

print(
    'Average number of non-word characters in spam messages: {}\nAverage number of non-word characters in non-spam messages: {}'.format(
        no_word_spam, no_word_not_spam
    )
)

Average number of non-word characters in spam messages: 29.041499330655956
Average number of non-word characters in non-spam messages: 17.29181347150259


As there are substantial differences between the spam and not spam rows described by the 
statistics above, these counts would likely make good indicators for our classifier.

In [16]:
from sklearn.linear_model import LogisticRegression

In [17]:
def add_feature(X, feature_to_add):
    """Return a CSR sparse matrix made of ``X`` plus extra feature column(s).

    ``feature_to_add`` may be a single sequence of per-row values or a list of
    such sequences; each sequence becomes one new column to the right of ``X``.
    """
    from scipy import sparse

    # Each feature is given as a row; transpose so it lines up as a column.
    extra_columns = sparse.csr_matrix(feature_to_add).transpose()
    return sparse.hstack([X, extra_columns], format='csr')

Using the add_feature function to add the above-mentioned features to our data, I'll train and fit a logistic regression model and evaluate the ROC AUC score.

In [18]:
def added_features_log_reg():
    """Fit logistic regression on character n-grams plus three count features.

    Uses the notebook-level ``X_train``, ``X_test``, ``y_train``, ``y_test``
    and ``add_feature``. The text is vectorized with character n-grams
    (``char_wb``, lengths 2-6, ``min_df=5``) and each document additionally
    gets its length, digit count and non-word-character count as columns.

    Prints the ten features with the lowest and the highest model
    coefficients, followed by ``roc_auc_score`` of the test predictions.
    Returns ``None``.
    """
    def extra_features(docs):
        # Per-document length, digit count and non-word-character count, in
        # the same order as the names appended to feature_names below.
        return [docs.str.len().values,
                docs.str.findall(r'\d').str.len().values,
                docs.str.findall(r'\W').str.len().values]

    vect = CountVectorizer(min_df=5, analyzer='char_wb', ngram_range=(2, 6)).fit(X_train)

    X_train_vectorized = add_feature(vect.transform(X_train), extra_features(X_train))
    X_test_vectorized = add_feature(vect.transform(X_test), extra_features(X_test))

    model = LogisticRegression(C=100).fit(X_train_vectorized, y_train)

    predictions = model.predict(X_test_vectorized)

    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and fall back to the old name on older releases. The result
    # is converted to a list so the extra feature names can be appended.
    get_names = getattr(vect, 'get_feature_names_out', None) or vect.get_feature_names
    feature_names = list(get_names())

    sorted_coef_index = model.coef_[0].argsort()

    # The added columns sit after the n-gram columns, so their names go last.
    feature_names.extend(['length_of_doc', 'digit_count', 'non_word_char_count'])
    feature_names = np.array(feature_names)

    print('Words segments least likely to predict spam:\n{}\n'.format(feature_names[sorted_coef_index[:10]]))
    print('Words segments most likely to predict spam: \n{}\n'.format(feature_names[sorted_coef_index[:-11:-1]]))

    # NOTE(review): the score is computed from hard 0/1 predictions rather
    # than predicted probabilities; left unchanged so the recorded outputs
    # stay comparable.
    print('ROC_auc Score: {}'.format(roc_auc_score(y_test, predictions)))

In [19]:
# Run the logistic-regression experiment with the added count features; it prints the extreme-coefficient features and the ROC AUC score.
added_features_log_reg()

Words segments least likely to predict spam:
['. ' '..' '? ' ' i' ' y' ' go' ':)' ' h' ' m' 'h ']

Words segments most likely to predict spam: 
['digit_count' 'ne' 'co' 'ia' 'xt' ' ch' 'mob' 'ar' 'ww' ' x']

ROC_auc Score: 0.9788593110707434


Unlike the models trained before which used whole words as features to predict our target variable spam, this model (as a parameter in the count vectorizer function) uses segments of words ranging from 2 to 6 characters long in order to make its predictions. Although this makes interpretation of the words as features more difficult, it also makes the model more robust to spelling errors made by the senders of the messages.