In [1]:
import math
import numpy as np
import pandas as pd


import nltk
import nltk.classify.util
from nltk.metrics.scores import (precision, recall, f_measure, accuracy)
from nltk.classify import NaiveBayesClassifier
from nltk.probability import DictionaryProbDist
from nltk.stem.snowball import SnowballStemmer

from collections import defaultdict, Counter

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

import matplotlib.pyplot as plt

In [2]:
# Read the first two CSV columns and cut the frame, in file order, into a
# leading training slice and a trailing dev slice.
ALL_DATA = pd.read_csv("spam.csv", usecols=[0, 1], encoding_errors="ignore")
TRAIN_PERCENT = 0.80

# Row position at which the training slice ends and the dev slice begins.
SPLIT_INDEX = int(TRAIN_PERCENT * len(ALL_DATA))
TRAIN_DATA = ALL_DATA[:SPLIT_INDEX]
DEV_DATA = ALL_DATA[SPLIT_INDEX:]

# Display only: the result is not assigned, so DEV_DATA keeps its original index.
DEV_DATA.reset_index()

Unnamed: 0,index,v1,v2
0,4457,ham,Die... I accidentally deleted e msg i suppose ...
1,4458,spam,Welcome to UK-mobile-date this msg is FREE giv...
2,4459,ham,This is wishing you a great day. Moji told me ...
3,4460,ham,Thanks again for your reply today. When is ur ...
4,4461,ham,"Sorry I flaked last night, shit's seriously go..."
...,...,...,...
1110,5567,spam,This is the 2nd time we have tried 2 contact u...
1111,5568,ham,Will _ b going to esplanade fr home?
1112,5569,ham,"Pity, * was in mood for that. So...any other s..."
1113,5570,ham,The guy did some bitching but I acted like i'd...


In [3]:
def generate_tuples_from_data(data) -> tuple:
    """
    Splits a data frame into tokenized texts and their labels.

    The column at position 0 is read as the label and the column at
    position 1 as the raw text, which is tokenized with nltk.word_tokenize.

    Result layout:
        tokenized texts: [[word1, word2, ...], [word1, word2, ...], ...]
        labels: [label1, label2, ...]  (the values of column 0, in row order)

    Parameters:
        data - data frame with labels in column 0 and texts in column 1
    Return:
        a tuple (X, y): a list of token lists and a list of labels, aligned
        by row position
    """
    # Walk each column once instead of doing two positional cell lookups per row.
    X = [nltk.word_tokenize(text) for text in data.iloc[:, 1]]
    y = list(data.iloc[:, 0])
    return X, y

In [4]:
# Tokenize both splits; each result is a (token lists, labels) pair.
train_tups, dev_tups = (
    generate_tuples_from_data(split) for split in (TRAIN_DATA, DEV_DATA)
)

In [5]:
def featurize(words: list) -> dict:
    """
    Builds a bag-of-stems feature dictionary for one tokenized message.

    Every token is lower-cased and stemmed with the English Snowball stemmer;
    the value stored for a stem is the number of times it occurs. One extra
    feature, "CAPS AMOUNT", holds the share of tokens for which str.isupper()
    is true, bucketed into steps of 5% (an integer from 0 to 20).

    Parameters:
        words - list of string tokens; may be empty
    Return:
        a defaultdict(int) mapping each stem to its count, plus the
        "CAPS AMOUNT" bucket
    """
    stemmer = SnowballStemmer("english")
    features = defaultdict(int)
    all_caps = 0

    for word in words:
        if word.isupper():
            all_caps += 1
        features[stemmer.stem(word.lower())] += 1

    # A message with no tokens has no upper-case share: report bucket 0
    # instead of dividing by zero.
    total = len(words)
    percent_caps = all_caps / total if total else 0.0
    features["CAPS AMOUNT"] = math.floor(percent_caps * 20)
    return features

In [6]:
def prepare_featureset(tups) -> list:
    """
    Turns a (token lists, labels) pair into a list of (features, label) pairs.

    Parameters:
        tups - indexable pair: tups[0] holds the token lists, tups[1] the
               labels at the same positions
    Return:
        a list with one (featurize(tokens), label) tuple per token list
    """
    token_lists, labels = tups[0], tups[1]
    return [
        (featurize(tokens), labels[position])
        for position, tokens in enumerate(token_lists)
    ]

In [7]:
# Fit a Naive Bayes classifier on the featurized training split, then print
# the features it reports as most informative.
train_featureset = prepare_featureset(train_tups)
model = NaiveBayesClassifier.train(train_featureset)
model.show_most_informative_features()

Most Informative Features
                   award = 1                spam : ham    =    168.3 : 1.0
                  servic = 1                spam : ham    =    152.1 : 1.0
                       t = 1                spam : ham    =    139.4 : 1.0
                    free = 2                spam : ham    =     91.5 : 1.0
                     txt = 1                spam : ham    =     89.8 : 1.0
                   await = 1                spam : ham    =     87.4 : 1.0
                    code = 1                spam : ham    =     87.4 : 1.0
                      uk = 1                spam : ham    =     87.3 : 1.0
                      16 = 1                spam : ham    =     78.9 : 1.0
                    2000 = 1                spam : ham    =     74.6 : 1.0


In [10]:
# Score the classifier on the dev split, treating "spam" as the positive class.
dev_tokens, dev_labels = dev_tups
preds = model.classify_many([featurize(tokens) for tokens in dev_tokens])

# The metrics are computed outside the f-string: reusing double quotes inside
# a replacement field (pos_label="spam") is a SyntaxError before Python 3.12.
dev_scores = {
    "Accuracy": accuracy_score(dev_labels, preds),
    "Precision": precision_score(dev_labels, preds, pos_label="spam"),
    "Recall": recall_score(dev_labels, preds, pos_label="spam"),
    "F1": f1_score(dev_labels, preds, pos_label="spam"),
}
for metric_name, score in dev_scores.items():
    print(f"{metric_name}: {score}")


Accuracy: 0.9309417040358744
Precision: 0.6545454545454545
Recall: 0.993103448275862
F1: 0.7890410958904109
