## Alexander Ye, Spam Detection Classification
### CS89.21, 23F, Vosoughi

Creating a classifier for detecting spam

Each line in the SPAM.csv dataset corresponds to one message and has a label of either "ham" or "spam". In this assignment, you are experimenting with different features and models to create the best spam detector possible.  

I tested a combination of: <br> 
(1) Logistic Regression (LR) <br> 
(2) Random Forest (RF) <br> 

(3) with and without lowercasing <br> 
(4) with and without stopword removal  <br> 
(5) with and without lemmatization <br>

(6) unigrams <br>
(7) unigrams and bigrams <br>
(8) unigrams, bigrams and trigrams <br>
(9) tfidf unigrams <br>
(10) tfidf  unigrams and bigrams <br>
(11) tfidf unigrams, bigrams and trigrams  <br>

That's 2 model types x 8 possible prepreocessing combinations x 6 possible features = 96 models <br>

In [98]:
import pandas as pd

spam_df = pd.read_csv('SPAM.csv')

In [99]:
spam_df.head(10)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [100]:
import nltk

spam_df['Tokenized_Message'] = spam_df['Message'].apply(nltk.word_tokenize) 

In [101]:
spam_df.head(10)

Unnamed: 0,Category,Message,Tokenized_Message
0,ham,"Go until jurong point, crazy.. Available only ...","[Go, until, jurong, point, ,, crazy, .., Avail..."
1,ham,Ok lar... Joking wif u oni...,"[Ok, lar, ..., Joking, wif, u, oni, ...]"
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,"[Free, entry, in, 2, a, wkly, comp, to, win, F..."
3,ham,U dun say so early hor... U c already then say...,"[U, dun, say, so, early, hor, ..., U, c, alrea..."
4,ham,"Nah I don't think he goes to usf, he lives aro...","[Nah, I, do, n't, think, he, goes, to, usf, ,,..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...,"[FreeMsg, Hey, there, darling, it, 's, been, 3..."
6,ham,Even my brother is not like to speak with me. ...,"[Even, my, brother, is, not, like, to, speak, ..."
7,ham,As per your request 'Melle Melle (Oru Minnamin...,"[As, per, your, request, 'Melle, Melle, (, Oru..."
8,spam,WINNER!! As a valued network customer you have...,"[WINNER, !, !, As, a, valued, network, custome..."
9,spam,Had your mobile 11 months or more? U R entitle...,"[Had, your, mobile, 11, months, or, more, ?, U..."


Labeling the target column

In [102]:
category_numeric = []

for email in spam_df['Category']:
    if email == 'spam':
        category_numeric.append(1)
    elif email == 'ham':
        category_numeric.append(0)
    else:
        category_numeric.append('NULL')
category_numeric[:10]

[0, 0, 1, 0, 0, 1, 0, 0, 1, 1]

Sanity check

In [7]:
# single run test with no preprocessing
# from sklearn.linear_model import LogisticRegression
# from sklearn.feature_extraction.text import CountVectorizer
# from sklearn.metrics import f1_score
# from sklearn.model_selection import train_test_split

# X_train, X_test, y_train, y_test = train_test_split(spam_df, category_numeric, 
#                                                     test_size=0.15, random_state=20)

# LR = LogisticRegression()

# vectorizer = CountVectorizer(ngram_range=(1,1))
# features = vectorizer.fit_transform(X_train['Message'])
# test_features = vectorizer.transform(X_test['Message'])

# LR.fit(features, y_train)
# y_pred = LR.predict(test_features)

# f1_score(y_test, y_pred)

0.929460580912863

Helper Functions

In [8]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

def preprocess(spam_df,lowercase=False,remove_stopwords=False,lemmatize=False):
    spam_df['Filtered_Message'] = spam_df['Tokenized_Message']
    
    if lowercase:
        spam_df['Filtered_Message'] = spam_df['Tokenized_Message'].apply(lambda x: [token.lower() for token in x])

    if remove_stopwords:
        stop_words = set(stopwords.words('english'))
        spam_df['Filtered_Message'] = spam_df['Filtered_Message'].apply(lambda x: [token for token in x if token not in stop_words])

    if lemmatize:
        lemmatizer = WordNetLemmatizer()
        spam_df['Filtered_Message'] = spam_df['Filtered_Message'].apply(lambda x: [lemmatizer.lemmatize(token) for token in x])

    spam_df['Filtered_Message'] = spam_df['Filtered_Message'].apply(lambda x: ' '.join(x))

In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

def train_model(model_type,train_features, train_labels):
    trained_model = None
    
    if model_type == 'LR':
        trained_model = LogisticRegression()
    elif model_type == 'RF':
        trained_model = RandomForestClassifier()
        
    trained_model.fit(train_features, train_labels)
    
    return trained_model

In [10]:
from sklearn.metrics import accuracy_score, f1_score

def evaluate_model(trained_model,metric,eval_features,eval_labels):
    y_pred = trained_model.predict(eval_features)
    
    if metric == 'f1':
        model_eval = f1_score(eval_labels, y_pred)
    elif metric == 'weighted_f1':
        model_eval = f1_score(eval_labels, y_pred, average='weighted')
    elif metric == 'accuracy':
        model_eval = accuracy_score(eval_labels, y_pred)
    
    return model_eval

In [11]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix

def run_model(spam_df, category_numeric, model_type = 'LR', lowercase=False, remove_stopwords=False,lemmatize=False, 
              tfidf=False, ngram_range=(1,1)):

    preprocess(spam_df, lowercase, remove_stopwords, lemmatize)
    
    X = spam_df['Filtered_Message']
    y = category_numeric
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=20)
    
    vectorizer = CountVectorizer(ngram_range=ngram_range)
    if tfidf:
        vectorizer = TfidfVectorizer(ngram_range=ngram_range)

    features = vectorizer.fit_transform(X_train)
    test_features = vectorizer.transform(X_test)

    model = train_model(model_type, features, y_train)
    
    metrics_data = []
    
    # metrics from train data
    for metric in ['f1', 'weighted_f1', 'accuracy']:
        data = evaluate_model(model, metric, features, y_train)
        metrics_data.append(data)
        
    # metrics from test data
    for metric in ['f1', 'weighted_f1', 'accuracy']:
        data = evaluate_model(model, metric, test_features, y_test)
        metrics_data.append(data)
    
    # building in confusion matrix into DataFrame
    y_pred = model.predict(test_features)
    cm = confusion_matrix(y_test, y_pred)
    
    return metrics_data, cm

Individual Test

In [12]:
run_model(spam_df, category_numeric, model_type = 'LR', lowercase=False, remove_stopwords=True,lemmatize=False, 
              tfidf=False, ngram_range=(1,1))

([0.9853181076672105,
  0.9961782111219822,
  0.9961993243243243,
  0.9288702928870293,
  0.9792042677748827,
  0.9796650717703349],
 array([[708,   2],
        [ 15, 111]]))

Running all models

In [13]:
df = pd.DataFrame(columns=['model','lowercased','stopwords_removed', 'lemmatized', 'unigrams', 
                          'bigrams', 'trigrams', 'tfidf unigrams', 'tfidf bigrams','tfidf trigrams', 
                          'f1_train', 'weighted_f1_train', 'accuracy_train', 'f1_test', 
                          'weighted_f1_test', 'accuracy_test', 'confusion_matrix_test'])

row = []
i=0
models = ['LR', 'RF']

for model in models:
    for lowercase in [False, True]:
        for remove_stopwords in [False, True]:
            for lemmatize in [False, True]:
                for n_grams in [[True, False, False], [True, True, False], [True,True,True]]:
                    for tfidf in [False, True]:
                        
                        # Fill out boolean labels
                        row.append(model)
                        row.append(lowercase)
                        row.append(remove_stopwords)
                        row.append(lemmatize)
                        if tfidf:
                            row.extend([False, False, False])
                            row.extend(n_grams)
                        else:
                            row.extend(n_grams)
                            row.extend([False, False, False])
                        
                        n_gram_len = sum(n_grams)
                        res, cm = run_model(spam_df, category_numeric, model, lowercase, remove_stopwords,
                                        lemmatize, tfidf, ngram_range=(1,n_gram_len))
                        
                        row.extend(res)
                        row.append(cm)
    
                        df.loc[i] = row
                        i = i+1
                        row = []
df.head(20)

Unnamed: 0,model,lowercased,stopwords_removed,lemmatized,unigrams,bigrams,trigrams,tfidf unigrams,tfidf bigrams,tfidf trigrams,f1_train,weighted_f1_train,accuracy_train,f1_test,weighted_f1_test,accuracy_test,confusion_matrix_test
0,LR,False,False,False,True,False,False,False,False,False,0.987775,0.996816,0.996833,0.929461,0.979279,0.979665,"[[707, 3], [14, 112]]"
1,LR,False,False,False,False,False,False,True,False,False,0.892444,0.973302,0.974451,0.878261,0.965161,0.966507,"[[707, 3], [25, 101]]"
2,LR,False,False,False,True,True,False,False,False,False,0.995958,0.998942,0.998944,0.92766,0.97905,0.979665,"[[710, 0], [17, 109]]"
3,LR,False,False,False,False,False,False,True,True,False,0.839552,0.96117,0.963682,0.844037,0.956634,0.95933,"[[710, 0], [34, 92]]"
4,LR,False,False,False,True,True,True,False,False,False,0.995958,0.998942,0.998944,0.923077,0.977776,0.978469,"[[710, 0], [18, 108]]"
5,LR,False,False,False,False,False,False,True,True,True,0.762948,0.944497,0.949747,0.796209,0.944289,0.948565,"[[709, 1], [42, 84]]"
6,LR,False,False,True,True,False,False,False,False,False,0.986122,0.996389,0.99641,0.938272,0.981782,0.982057,"[[707, 3], [12, 114]]"
7,LR,False,False,True,False,False,False,True,False,False,0.891459,0.97307,0.97424,0.892704,0.969074,0.970096,"[[707, 3], [22, 104]]"
8,LR,False,False,True,True,True,False,False,False,False,0.995958,0.998942,0.998944,0.919831,0.976672,0.977273,"[[708, 2], [17, 109]]"
9,LR,False,False,True,False,False,False,True,True,False,0.846797,0.962834,0.96516,0.849315,0.958001,0.960526,"[[710, 0], [33, 93]]"


In [14]:
df.shape

(96, 17)

In [15]:
import numpy as np

print("Model with the best weighted f1 according to the train set:")
print(df.iloc[df['weighted_f1_train'].idxmax()])

print("\n")

print("Model with the best weighted f1 according to the test set:")
print(df.iloc[df['weighted_f1_test'].idxmax()])

Model with the best weighted f1 according to the train set:
model                                       RF
lowercased                               False
stopwords_removed                        False
lemmatized                               False
unigrams                                  True
bigrams                                  False
trigrams                                 False
tfidf unigrams                           False
tfidf bigrams                            False
tfidf trigrams                           False
f1_train                                   1.0
weighted_f1_train                          1.0
accuracy_train                             1.0
f1_test                               0.899563
weighted_f1_test                      0.971326
accuracy_test                         0.972488
confusion_matrix_test    [[710, 0], [23, 103]]
Name: 48, dtype: object


Model with the best weighted f1 according to the test set:
model                                       LR
lowercase

In [16]:
# Confusion matrix for best preforming model on test set

print("Confusion matrix for best preforming model on test set:")
df.iloc[df['weighted_f1_test'].idxmax()]['confusion_matrix_test']

Confusion matrix for best preforming model on test set:


array([[710,   0],
       [ 14, 112]])

The top left number (710) refers to the true positive count, that is correctly predicted spam emails. The bottom right number (112) refers to the true negative count, correctly predicted non-spam. The top right number (0) refers to the false positive count, that is ham emails that have been labeled spam. And the bottom left number (14) refers to the false negative count, that is spam emails that have falsey been identified as ham emails. 

Repeat the experiment for the best-performing combination of model type, preprocessing, and lexical features above, but this time limit the analysis to all the combinations of the parts-of-speeches below (total of 8 combinations):<br>

(1) Adjectives <br>
(2) Nouns  <br>
(3) Verbs  <br>

What this means is that after tokenization and before preprocessing, you remove all words that do not have the part of speech you are looking at. E.g., for the combination Adjectives & Nouns, all words that are not a noun or adjective should be removed. <br>

That's 8 new models. <br>

In [18]:
import nltk

verb_tags = ['VB','VBD','VBG','VBN','VBP','VBZ']
adj_tags = ['JJ', 'JJR', 'JJS']
noun_tags = ['NN','NNS','NNP','NNPS']

def filter_tokens(tokens, include_adjectives, include_nouns, include_verbs):
    filtered_tokens = []
    
    # Get POS tags from tokenized message
    pos_tags = nltk.pos_tag(tokens)
    
    for word, pos in pos_tags:       
        if include_adjectives and pos in adj_tags:
            filtered_tokens.append(word)
        if include_nouns and pos in noun_tags:
            filtered_tokens.append(word)
        if include_verbs and pos in verb_tags:
            filtered_tokens.append(word)
        
        # Exception: all False, words that are not adjectives, nouns, or verbs (adverbs, determiners, etc.) all come back in
        if include_adjectives == False and include_nouns == False and include_verbs == False:
            if (pos not in adj_tags) and (pos not in noun_tags) and (pos not in verb_tags):
                filtered_tokens.append(word)
        
    return filtered_tokens
    
def filter_POS(spam_df, adjectives=True, nouns=True, verbs=True):
    spam_df['Tokenized_Message'] = spam_df['Message'].apply(nltk.word_tokenize)
    spam_df['Tokenized_Message'] = spam_df['Tokenized_Message'].apply(lambda x: filter_tokens(x, adjectives, nouns, verbs))   
    

In [19]:
# input_tokens = nltk.word_tokenize("The quick brown fox jumps over the lazy dog.")
# print(nltk.pos_tag(input_tokens))
# filter_tokens(input_tokens, False, False, False)

[('The', 'DT'), ('quick', 'JJ'), ('brown', 'NN'), ('fox', 'NN'), ('jumps', 'VBZ'), ('over', 'IN'), ('the', 'DT'), ('lazy', 'JJ'), ('dog', 'NN'), ('.', '.')]


['The', 'over', 'the', '.']

In [20]:
# Best preforming model from Q1                                     
# model                                     LR
# lowercased                                True
# stopwords_removed                         True
# lemmatized                               False
# unigrams                                  True
# bigrams                                  False
# trigrams                                 False
# tfidf unigrams                           False
# tfidf bigrams                            False
# tfidf trigrams                           False

df2 = pd.DataFrame(columns=['Adjectives','Nouns','Verbs', 'f1_test', 
                          'weighted_f1_test', 'accuracy_test', 'confusion_matrix_test'])

row = []
i=0
for include_adjectives in [True, False]:
    for include_nouns in [True, False]:
        for include_verbs in [True, False]:
            row.append(include_adjectives)
            row.append(include_nouns)
            row.append(include_verbs)
            
            filter_POS(spam_df, include_adjectives, include_nouns, include_verbs)
            
            res, cm = run_model(spam_df, category_numeric, 'LR', lowercase=True, remove_stopwords=True,
                lemmatize=False, tfidf=False, ngram_range=(1,1))

            row.extend(res[-3:])
            row.append(cm)
            
            df2.loc[i] = row
            i = i+1
            row = []
            
df2.head(10)

Unnamed: 0,Adjectives,Nouns,Verbs,f1_test,weighted_f1_test,accuracy_test,confusion_matrix_test
0,True,True,True,0.92437,0.977941,0.978469,"[[708, 2], [16, 110]]"
1,True,True,False,0.918455,0.976496,0.977273,"[[710, 0], [19, 107]]"
2,True,False,True,0.805556,0.946195,0.949761,"[[707, 3], [39, 87]]"
3,True,False,False,0.642105,0.907091,0.91866,"[[707, 3], [65, 61]]"
4,False,True,True,0.896552,0.970254,0.971292,"[[708, 2], [22, 104]]"
5,False,True,False,0.886957,0.967649,0.9689,"[[708, 2], [24, 102]]"
6,False,False,True,0.611399,0.898364,0.910287,"[[702, 8], [67, 59]]"
7,False,False,False,0.604396,0.899336,0.913876,"[[709, 1], [71, 55]]"


In [21]:
print("Model with the best weighted f1 according to the test set:")
print(df2.iloc[df2['weighted_f1_test'].idxmax()])

Model with the best weighted f1 according to the test set:
Adjectives                                True
Nouns                                     True
Verbs                                     True
f1_test                                0.92437
weighted_f1_test                      0.977941
accuracy_test                         0.978469
confusion_matrix_test    [[708, 2], [16, 110]]
Name: 0, dtype: object


In [22]:
# Confusion matrix for best preforming model on test set

print("Confusion matrix for best preforming model on test set:")
df2.iloc[df2['weighted_f1_test'].idxmax()]['confusion_matrix_test']

Confusion matrix for best preforming model on test set:


array([[708,   2],
       [ 16, 110]])

Repeat the experiment for the best-performing model type from Q1 (i.e., LR or RF) using the following features (no preprocessing is required): <br>
(1) Word2Vec features from GoogleNews (limit vocabulary to 40000 words) <br>
(2) Features from a new Word2Vec model trained on the **train** set of your dataset. Use the following hyperparameters: window=5,vector_size=100 ,min_count=5 <br>

You can average the semantic embeddings for the words in a document to create a single semantic vector for the document. You can ignore words that are not present in your Word2Vec model. <br>

In [23]:
import gensim
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
    
X_train, X_test, y_train, y_test = train_test_split(spam_df, category_numeric, test_size=0.15, random_state=20)
    
    
google_news_model = gensim.models.KeyedVectors.load_word2vec_format('../GoogleNews-vectors-negative300.bin', 
                                                                    binary=True, limit=40000)

train_word2vec_model = Word2Vec(sentences=X_train['Message'].apply(lambda x: x.split()), 
                                window=5, vector_size=100, min_count=5)

In [24]:
import numpy as np
# averaging the semantic embeddings for a document

def get_word2vec_features(text, w2vmodel):
    tokens = nltk.word_tokenize(text)
    valid_words = [word for word in tokens if word in w2vmodel]

    if valid_words:
        return np.mean(w2vmodel[valid_words], axis=0)
    else:
        return np.zeros(w2vmodel.vector_size)

In [25]:
# training semantic only
vectors = ['google_news_semantic', 'train_word2vec_semantic']

for vector in vectors:

    X_train_raw = X_train['Message']
    X_test_raw = X_test['Message']
    
    if vector == 'google_news_semantic': 
        X_train_array = np.array([get_word2vec_features(text, google_news_model) for text in X_train_raw])
        X_test_array = np.array([get_word2vec_features(text, google_news_model) for text in X_test_raw])
    elif vector == 'train_word2vec_semantic':
        X_train_array = np.array([get_word2vec_features(text, train_word2vec_model.wv) for text in X_train_raw])
        X_test_array = np.array([get_word2vec_features(text, train_word2vec_model.wv) for text in X_test_raw])
        
    model = train_model('LR', X_train_array, y_train)
    
    data = evaluate_model(model, 'weighted_f1', X_test_array, y_test)
    
    print(vector + " Features -  Weighted_f1 score: " + str(data))

google_news_semantic Features -  Weighted_f1 score: 0.9492811007888508
train_word2vec_semantic Features -  Weighted_f1 score: 0.7800652401319659


The google news vector has better semantic features. Now combine with Q2

In [26]:
from sklearn.feature_extraction.text import CountVectorizer

X_train_raw = X_train['Message']
X_test_raw = X_test['Message']
X_train_array = np.array([get_word2vec_features(text, google_news_model) for text in X_train_raw])
X_test_array = np.array([get_word2vec_features(text, google_news_model) for text in X_test_raw])

Q3_A_model = train_model('LR', X_train_array, y_train)
data = evaluate_model(Q3_A_model, 'weighted_f1', X_test_array, y_test)
print("Semantic only model: " + str(data))

# Filter the dataset using parts of speech we found from Q2 
filter_POS(X_train, True, True, True)
filter_POS(X_test, True, True, True)

X_train_filtered = X_train['Tokenized_Message'].apply(lambda x: ' '.join(x))
X_test_filtered = X_test['Tokenized_Message'].apply(lambda x: ' '.join(x))

vectorizer = CountVectorizer(ngram_range=(1,1))
features = vectorizer.fit_transform(X_train_filtered)
test_features = vectorizer.transform(X_test_filtered)

X_train_combined = np.concatenate([features.toarray(), X_train_array], axis=1)
X_test_combined = np.concatenate([test_features.toarray(), X_test_array], axis=1)

Q3_B_model = train_model("LR", X_train_combined, y_train)

data = evaluate_model(Q3_B_model, 'weighted_f1', X_test_combined, y_test)
print("Semantic and lexical model: " + str(data))

Semantic only model: 0.9492811007888508
Semantic and lexical model: 0.9879175677615438


Part C

In [27]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix

def run_model2(spam_df, y, model, combined_w_Q2=False):

    # Assume spam has message and tokenized message
    
    X_train, X_test, y_train, y_test = train_test_split(spam_df, y, test_size=0.15, random_state=20)
    
    X_train_array = np.array([get_word2vec_features(text, model) for text in X_train['Message']])
    X_test_array = np.array([get_word2vec_features(text, model) for text in X_test['Message']])
    
    if combined_w_Q2: 
        filter_POS(X_train, True, True, True)
        filter_POS(X_test, True, True, True)
        
        X_train_filtered = X_train['Tokenized_Message'].apply(lambda x: ' '.join(x))
        X_test_filtered = X_test['Tokenized_Message'].apply(lambda x: ' '.join(x))

        vectorizer = CountVectorizer(ngram_range=(1,1))
        features = vectorizer.fit_transform(X_train_filtered)
        test_features = vectorizer.transform(X_test_filtered)

        X_train_array = np.concatenate([features.toarray(), X_train_array], axis=1)
        X_test_array = np.concatenate([test_features.toarray(), X_test_array], axis=1)

    model = train_model("LR", X_train_array, y_train)
    
    metrics_data = []

    for metric in ['f1', 'weighted_f1', 'accuracy']:
        data = evaluate_model(model, metric, X_test_array, y_test)
        metrics_data.append(data)
    
    y_pred = model.predict(X_test_array)
    cm = confusion_matrix(y_test, y_pred)
    
    return metrics_data, cm

In [37]:
df3 = pd.DataFrame(columns=['model', 'lowercased','stopwords_removed', 'lemmatized', 'adjectives','nouns','verbs',
                           'all', 'unigrams', 'bigrams', 'trigrams', 'tfidf unigrams', 'tfidf bigrams',
                           'tfidf trigrams', 'w2v_GoogleNews', 'w2v_Span',
                           'f1_test', 'weighted_f1_test', 'accuracy_test', 'confusion_matrix_test'])

# Constants
model_1 = ["Q1", True, True, False, False, False, False, True, True, False, False, False, False, False, False, False]
model_2 = ["Q2", True, True, False, True, True, True, True, True, False, False, False, False, False, False, False]
model_3 = ["Q3-A", False, False, False, True, True, True, True, False, False, False, False, False, False, True, False]
model_4 = ["Q3-B", False, False, False, True, True, True, True, True, False, False, False, False, False, True, False]
    
row = []
i=0
for model in [model_1, model_2, model_3, model_4]:
    row.extend(model)

    if model[0] == 'Q2':
        filter_POS(spam_df, True, True, True)
        
    if model[0] == 'Q1' or model[0] == 'Q2':
        res, cm = run_model(spam_df, category_numeric, 'LR', lowercase=True, remove_stopwords=True,
                        lemmatize=False, tfidf=False, ngram_range=(1,1))
        res = res[-3:]

    # assume the google news model is loaded in a previous kernal
    if model[0] == 'Q3-A':
        res, cm = run_model2(spam_df, category_numeric, google_news_model, combined_w_Q2=False)
    
    if model[0] == 'Q3-B':
        res, cm = run_model2(spam_df, category_numeric, google_news_model, combined_w_Q2=True)
    
    row.extend(res)
    row.append(cm)
                
    df3.loc[i] = row
    i = i+1
    row = []
            
df3.head(10)

Unnamed: 0,model,lowercased,stopwords_removed,lemmatized,adjectives,nouns,verbs,all,unigrams,bigrams,trigrams,tfidf unigrams,tfidf bigrams,tfidf trigrams,w2v_GoogleNews,w2v_Span,f1_test,weighted_f1_test,accuracy_test,confusion_matrix_test
0,Q1,True,True,False,False,False,False,True,True,False,False,False,False,False,False,False,0.941176,0.982843,0.983254,"[[710, 0], [14, 112]]"
1,Q2,True,True,False,True,True,True,True,True,False,False,False,False,False,False,False,0.92437,0.977941,0.978469,"[[708, 2], [16, 110]]"
2,Q3-A,False,False,False,True,True,True,True,False,False,False,False,False,False,True,False,0.824034,0.949281,0.950957,"[[699, 11], [30, 96]]"
3,Q3-B,False,False,False,True,True,True,True,True,False,False,False,False,False,True,False,0.95935,0.987918,0.988038,"[[708, 2], [8, 118]]"


Part D

In [38]:
print("Model with the best weighted f1 according to the test set:")
print(df3.iloc[df3['weighted_f1_test'].idxmax()])

print("\n")

print("Confusion matrix for best preforming model on test set:")
df3.iloc[df3['weighted_f1_test'].idxmax()]['confusion_matrix_test']

Model with the best weighted f1 according to the test set:
model                                    Q3-B
lowercased                              False
stopwords_removed                       False
lemmatized                              False
adjectives                               True
nouns                                    True
verbs                                    True
all                                      True
unigrams                                 True
bigrams                                 False
trigrams                                False
tfidf unigrams                          False
tfidf bigrams                           False
tfidf trigrams                          False
w2v_GoogleNews                           True
w2v_Span                                False
f1_test                               0.95935
weighted_f1_test                     0.987918
accuracy_test                        0.988038
confusion_matrix_test    [[708, 2], [8, 118]]
Name: 3, dtype: objec

array([[708,   2],
       [  8, 118]])

### Trying to improve even more using grid search cross validation

In [112]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(spam_df, category_numeric, 
                                                    test_size=0.15, random_state=20)

X_train_array = np.array([get_word2vec_features(text, google_news_model) for text in X_train['Message']])
X_test_array = np.array([get_word2vec_features(text, google_news_model) for text in X_test['Message']])

filter_POS(X_train, True, True, True)
filter_POS(X_test, True, True, True)

# Preprocessing couldn't seem to bring my score up

X_train_filtered = X_train['Tokenized_Message'].apply(lambda x: ' '.join(x))
X_test_filtered = X_test['Tokenized_Message'].apply(lambda x: ' '.join(x))

vectorizer = CountVectorizer(ngram_range=(1,1))
features = vectorizer.fit_transform(X_train_filtered)
test_features = vectorizer.transform(X_test_filtered)

X_train_combined = np.concatenate([features.toarray(), X_train_array], axis=1)
X_test_combined = np.concatenate([test_features.toarray(), X_test_array], axis=1)

In [113]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

bonus_model = LogisticRegression()

param_grid = {'penalty': ['l1', 'l2'],
              'C': [0.1, 1, 10],
              'solver': ['liblinear'],
              'max_iter': [1000],
              'class_weight': ['balanced']}

bonus_model_tuned = GridSearchCV(bonus_model, param_grid=param_grid, error_score='raise', scoring='f1_weighted')
bonus_model_tuned.fit(X_train_combined, y_train)
bonus_model_tuned.best_params_

{'C': 10,
 'class_weight': 'balanced',
 'max_iter': 1000,
 'penalty': 'l2',
 'solver': 'liblinear'}

In [114]:
from sklearn.model_selection import cross_val_score
# from last kernel
bonus_model.set_params(**bonus_model_tuned.best_params_)
bonus_model.fit(X_train_combined, y_train)

print(cross_val_score(bonus_model, X_train_combined, y_train, cv=5))

data = evaluate_model(bonus_model, 'weighted_f1', X_test_combined, y_test)

print("Bonus model Weighted F1 score: " + str(data))

[0.98417722 0.98521647 0.99155227 0.98099261 0.97676874]
Bonus model Weighted F1 score: 0.9904306220095693
