In [1]:
import pandas as pd

import nltk

from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import SVC

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.metrics import accuracy_score, recall_score

from prepare import prepare_messages
from get_db_url import get_db_url

# Model Exercises

## Acquire and Prepare Spam Messages

In [2]:
# We'll use the spam data since it is labeled.
df = pd.read_sql('SELECT * FROM spam;', get_db_url('spam_db'), index_col = 'id')
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   5572 non-null   object
 1   text    5572 non-null   object
dtypes: object(2)
memory usage: 130.6+ KB


In [3]:
df.head(2)

Unnamed: 0_level_0,label,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...


In [4]:
df = prepare_messages(df)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   label       5572 non-null   object
 1   original    5572 non-null   object
 2   clean       5572 non-null   object
 3   stemmed     5572 non-null   object
 4   lemmatized  5572 non-null   object
dtypes: object(5)
memory usage: 261.2+ KB


## Split Data

In [5]:
# Let's split the data into two sets: train and test
train, test = train_test_split(df, train_size = 0.8, test_size = 0.2, random_state = 24)

## Establish a Baseline

In [6]:
# Let's establish a baseline model.
# First let's see the value counts of ham and spam in the label feature.
train.label.value_counts()

ham     3854
spam     603
Name: label, dtype: int64

In [7]:
# Most messages are ham so predicting ham will be the baseline.
baseline = pd.DataFrame({'label' : 'ham'}, index = train.index)

In [8]:
accuracy_score(train.label, baseline.label)

0.8647072021539152

## Decision Tree

In [9]:
# We'll use a CountVectorizer first
cv = CountVectorizer()
X_bow = cv.fit_transform(train.clean)

# Let's create and fit a decision tree using the cleaned version of the messages
model = DecisionTreeClassifier(max_depth = 5, random_state = 24)
model.fit(X_bow, train.label)

DecisionTreeClassifier(max_depth=5, random_state=24)

In [10]:
model.score(X_bow, train.label)

0.9288759255104331

In [11]:
recall_score(train.label, model.predict(X_bow), pos_label = 'spam')

0.6965174129353234

In [12]:
model.score(cv.transform(test.clean), test.label)

0.9264573991031391

In [14]:
recall_score(test.label, model.predict(cv.transform(test.clean)), pos_label = 'spam')

0.6527777777777778

This model has good performance. Let's try using the tfidf vectorizer.

In [15]:
# Use a TfidfVectorizer now
tfidf = TfidfVectorizer()
X_tfidf = tfidf.fit_transform(train.clean)

# Let's create the same model by this time using the tfidf vectorizer we just created.
model = DecisionTreeClassifier(max_depth = 5, random_state = 24)
model.fit(X_tfidf, train.label)

DecisionTreeClassifier(max_depth=5, random_state=24)

In [16]:
model.score(X_tfidf, train.label)

0.9544536683868072

In [17]:
recall_score(train.label, model.predict(X_bow), pos_label = 'spam')

0.06633499170812604

In [18]:
model.score(tfidf.transform(test.clean), test.label)

0.9354260089686098

In [19]:
recall_score(test.label, model.predict(cv.transform(test.clean)), pos_label = 'spam')

0.06944444444444445

We got slightly better performance with the tfidf vectorizer. Now let's try using the stemmed version of the messages.

In [20]:
# We can re-use the cv vectorizer
X_bow = cv.fit_transform(train.stemmed)

# Again we'll use the same model but with the new X_bow.
model = DecisionTreeClassifier(max_depth = 5, random_state = 24)
model.fit(X_bow, train.label)

DecisionTreeClassifier(max_depth=5, random_state=24)

In [21]:
model.score(X_bow, train.label)

0.9306708548350908

In [22]:
recall_score(train.label, model.predict(X_bow), pos_label = 'spam')

0.494195688225539

In [23]:
model.score(cv.transform(test.stemmed), test.label)

0.9273542600896861

In [24]:
recall_score(test.label, model.predict(cv.transform(test.clean)), pos_label = 'spam')

0.2777777777777778

The performance is slightly better than the count vectorizer model using the simple cleaned version of the messages.

In [25]:
# Now the tfidf vectorizer
X_tfidf = tfidf.fit_transform(train.stemmed)

# And now the model with the new X_tfidf.
model = DecisionTreeClassifier(max_depth = 5, random_state = 24)
model.fit(X_tfidf, train.label)

DecisionTreeClassifier(max_depth=5, random_state=24)

In [26]:
model.score(X_tfidf, train.label)

0.9551267668835539

In [27]:
recall_score(train.label, model.predict(X_bow), pos_label = 'spam')

0.14427860696517414

In [28]:
model.score(tfidf.transform(test.stemmed), test.label)

0.9282511210762332

In [29]:
recall_score(test.label, model.predict(cv.transform(test.clean)), pos_label = 'spam')

0.04861111111111111

There is a larger drop off in performance compared to the model that used the simple cleaned version of the messages. Next we'll use the lemmatized version of the messages.

In [30]:
# The cv vectorizer again
X_bow = cv.fit_transform(train.lemmatized)

# Now the same model again with the new X_bow
model = DecisionTreeClassifier(max_depth = 5, random_state = 24)
model.fit(X_bow, train.label)

DecisionTreeClassifier(max_depth=5, random_state=24)

In [31]:
model.score(X_bow, train.label)

0.9299977563383441

In [32]:
recall_score(train.label, model.predict(X_bow), pos_label = 'spam')

0.4892205638474295

In [33]:
model.score(cv.transform(test.lemmatized), test.label)

0.9264573991031391

In [34]:
recall_score(test.label, model.predict(cv.transform(test.clean)), pos_label = 'spam')

0.4166666666666667

Similar performance to the model with the cleaned version of the messages. Now the tfidf vectorizer.

In [35]:
# the tfidf vectorizer again
X_tfidf = tfidf.fit_transform(train.lemmatized)

# Now the model, again
model = DecisionTreeClassifier(max_depth = 5, random_state = 24)
model.fit(X_tfidf, train.label)

DecisionTreeClassifier(max_depth=5, random_state=24)

In [36]:
model.score(X_tfidf, train.label)

0.9540049360556428

In [37]:
recall_score(train.label, model.predict(X_bow), pos_label = 'spam')

0.1691542288557214

In [38]:
model.score(tfidf.transform(test.lemmatized), test.label)

0.9318385650224216

In [39]:
recall_score(test.label, model.predict(cv.transform(test.clean)), pos_label = 'spam')

0.1527777777777778

This is similar performance to the same model with the cleaned version of the messages.

## Naive Bayes

This time let's focus on the recall score.

In [40]:
# This time we'll try putting the code in a loop.
# We'll use the BernoulliNB model from SK learn.
# We'll use the count vectorizer.

columns = [
    'clean',
    'stemmed',
    'lemmatized'
]

for column in columns:
    cv = CountVectorizer()
    X_bow = cv.fit_transform(train[column])
    
    model = BernoulliNB()
    model.fit(X_bow, train.label)
    
    print(f'{column} train recall: {recall_score(train.label, model.predict(X_bow), pos_label = "spam")}')
    print(f'{column} test recall: {recall_score(test.label, model.predict(cv.transform(test[column])), pos_label = "spam")}')
    print()

clean train recall: 0.8905472636815921
clean test recall: 0.7708333333333334

stemmed train recall: 0.8922056384742952
stemmed test recall: 0.7847222222222222

lemmatized train recall: 0.8905472636815921
lemmatized test recall: 0.7638888888888888



For all three we have similar recall scores. Let's try building models with n-grams this time.

In [41]:
for column in columns:
    # We'll use bi-grams and tri-grams
    cv = CountVectorizer(ngram_range = (2, 3))
    X_bow = cv.fit_transform(train[column])
    
    model = BernoulliNB()
    model.fit(X_bow, train.label)
    
    print(f'{column} train recall: {recall_score(train.label, model.predict(X_bow), pos_label = "spam")}')
    print(f'{column} test recall: {recall_score(test.label, model.predict(cv.transform(test[column])), pos_label = "spam")}')
    print()

clean train recall: 0.48258706467661694
clean test recall: 0.20833333333333334

stemmed train recall: 0.4975124378109453
stemmed test recall: 0.2013888888888889

lemmatized train recall: 0.4892205638474295
lemmatized test recall: 0.20833333333333334



Wow, that's significantly worse. Let's try only bi-grams with the tfidf vectorizer.

In [42]:
for column in columns:
    tfidf = TfidfVectorizer(ngram_range = (2, 2))
    X_bow = tfidf.fit_transform(train[column])
    
    model = BernoulliNB()
    model.fit(X_bow, train.label)
    
    print(f'{column} train recall: {recall_score(train.label, model.predict(X_bow), pos_label = "spam")}')
    print(f'{column} test recall: {recall_score(test.label, model.predict(tfidf.transform(test[column])), pos_label = "spam")}')
    print()

clean train recall: 0.4975124378109453
clean test recall: 0.2361111111111111

stemmed train recall: 0.5290215588723052
stemmed test recall: 0.2638888888888889

lemmatized train recall: 0.5058043117744611
lemmatized test recall: 0.24305555555555555



That's still really bad.

## Random Forest

Now let's try a random forest model with the count vectorizer and only single words.

In [43]:
# We'll try various values of max_depth

for column in columns:
    for depth in range(3, 9):
        cv = CountVectorizer()
        X_bow = cv.fit_transform(train[column])

        model = RandomForestClassifier(max_depth = depth)
        model.fit(X_bow, train.label)

        print(f'{column}, depth = {depth}, train recall: {recall_score(train.label, model.predict(X_bow), pos_label = "spam")}')
        print(f'{column}, depth = {depth}, test recall: {recall_score(test.label, model.predict(cv.transform(test[column])), pos_label = "spam")}')
        print()

clean, depth = 3, train recall: 0.0
clean, depth = 3, test recall: 0.0

clean, depth = 4, train recall: 0.009950248756218905
clean, depth = 4, test recall: 0.013888888888888888

clean, depth = 5, train recall: 0.008291873963515755
clean, depth = 5, test recall: 0.006944444444444444

clean, depth = 6, train recall: 0.03150912106135987
clean, depth = 6, test recall: 0.013888888888888888

clean, depth = 7, train recall: 0.06799336650082918
clean, depth = 7, test recall: 0.041666666666666664

clean, depth = 8, train recall: 0.0945273631840796
clean, depth = 8, test recall: 0.06944444444444445

stemmed, depth = 3, train recall: 0.0
stemmed, depth = 3, test recall: 0.0

stemmed, depth = 4, train recall: 0.0
stemmed, depth = 4, test recall: 0.0

stemmed, depth = 5, train recall: 0.008291873963515755
stemmed, depth = 5, test recall: 0.006944444444444444

stemmed, depth = 6, train recall: 0.04643449419568822
stemmed, depth = 6, test recall: 0.020833333333333332

stemmed, depth = 7, train recall

In [44]:
# Let's try that again with bi-grams

for column in columns:
    for depth in range(3, 9):
        cv = CountVectorizer(ngram_range = (1, 2))
        X_bow = cv.fit_transform(train[column])

        model = RandomForestClassifier(max_depth = depth)
        model.fit(X_bow, train.label)

        print(f'{column}, depth = {depth}, train recall: {recall_score(train.label, model.predict(X_bow), pos_label = "spam")}')
        print(f'{column}, depth = {depth}, test recall: {recall_score(test.label, model.predict(cv.transform(test[column])), pos_label = "spam")}')
        print()

clean, depth = 3, train recall: 0.0
clean, depth = 3, test recall: 0.0

clean, depth = 4, train recall: 0.0
clean, depth = 4, test recall: 0.0

clean, depth = 5, train recall: 0.004975124378109453
clean, depth = 5, test recall: 0.006944444444444444

clean, depth = 6, train recall: 0.014925373134328358
clean, depth = 6, test recall: 0.006944444444444444

clean, depth = 7, train recall: 0.01990049751243781
clean, depth = 7, test recall: 0.006944444444444444

clean, depth = 8, train recall: 0.04477611940298507
clean, depth = 8, test recall: 0.020833333333333332

stemmed, depth = 3, train recall: 0.0
stemmed, depth = 3, test recall: 0.0

stemmed, depth = 4, train recall: 0.0
stemmed, depth = 4, test recall: 0.0

stemmed, depth = 5, train recall: 0.0
stemmed, depth = 5, test recall: 0.0

stemmed, depth = 6, train recall: 0.009950248756218905
stemmed, depth = 6, test recall: 0.0

stemmed, depth = 7, train recall: 0.03316749585406302
stemmed, depth = 7, test recall: 0.020833333333333332

stem

## Conclusion

Overall it looks like the BernoulliNB model with the CountVectorizer produces the best results.