# A Gibbs Sampler for Spam Detection

In an [earlier post](https://nbviewer.jupyter.org/github/bobflagg/gibbs-sampling-for-the-uninitiated/blob/master/Gibbs-sampling-for-the-Uninitiated.ipynb) I gave a Python implementation of the Gibbs sampler for text classification described in the excellent tutorial paper
[Gibbs Sampling for the Uninitiated](https://www.umiacs.umd.edu/~resnik/pubs/LAMP-TR-153.pdf).  In this notebook, I'll show how to use that sampler to detect spam.  


## A Spam-vs-Ham Training Corpus

I'll use a combination of the [Enron-Spam](http://www.aueb.gr/users/ion/data/enron-spam/) data set and the [SpamAssassin public corpus](https://spamassassin.apache.org/publiccorpus/) to buld a training set for our spam detector. To simpify the presentation, I've done some minimal pre-processing of the original data and collected 
the results in a file in which each line contains the class (ham or spam) and the text of an 
e-mail message separated by a tab.  The corpus is in contained in the archive at data/spam-or-ham.zip.

In [125]:
from collections import Counter
import numpy as np
from numpy.random import beta, binomial, dirichlet

In [99]:
def read_data(path='data/spam-or-ham.txt'):
    fp = open(path, 'r')
    texts = []
    labels = []
    for line in fp:
        line = line.strip()
        if line:
            label, text = line.split('\t')
            labels.append(label)
            texts.append(text)
    fp.close()    
    return texts, labels

In [100]:
def select_vocabulary(texts, V, min_cnt=50, max_cnt=10000):
    counter = Counter()
    for text in texts:
        for word in text.split():
            counter[word.lower()] += 1    
    words = [w for w in counter.keys() if counter[w] > min_cnt and counter[2] < max_cnt]
    words = sorted(words, key=lambda x: counter[x])
    return set(words[-V:])

In [108]:
V = 10000
texts, labels = read_data()
vocabulary = select_vocabulary(texts, V)
word2id = {w:i for i, w in enumerate(vocabulary)}
id2word = {i:w  for i, w in enumerate(vocabulary)}

In [119]:
def build_corpus(texts, vocabulary):
    corpus = []
    for text in texts:
        words = [w.lower() for w in text.split() if w.lower() in vocabulary]
        ids = [word2id[w] for w in words]
        counter = Counter(ids)
        document = {(i,c) for i, c in counter.items()}
        corpus.append(document)
    return corpus

In [120]:
corpus = build_corpus(texts, vocabulary)

In [126]:
def sample_labels(J, gamma_pi):
    pi = beta(gamma_pi[0], gamma_pi[1])
    return binomial(1, pi, J)

def initialize(W, labels, gamma_pi, gamma_theta):
    N = len(W)
    M = len(labels)
    V = len(gamma_theta)

    L = sample_labels(N - M, gamma_pi)
    theta = dirichlet(gamma_theta, 2)

    C = np.zeros((2,))
    C += gamma_pi
    cnts = np.zeros((2, V))
    cnts += gamma_theta
    
    for d, l in zip(W, labels.tolist() + L.tolist()):
        for i, c in d: cnts[l][i] += c
        C[l] += 1

    return {'C':C, 'N':cnts, 'L':L, 'theta':theta}

In [184]:
def update(state, X):
    C = state['C']
    N = state['N']
    L = state['L']
    theta = state['theta']
    # Update the labels for all documents:
    for j, l in enumerate(L):
        # Drop document j from the corpus:
        for i, c in X[j]: N[l][i] -= c
        C[l] -= 1  
        # Compute the conditional probability that L[j] = 1:  
        if C[0] == 1: pi = 1.0
        elif C[1] == 1 <= 0: pi = 0.0 
        else:
            # compute the product of probabilities (sum of logs)
            d = np.sum(C) - 1
            v0 = np.log((C[0] - 1.0) / d)
            v1 = np.log((C[1] - 1.0) / d)
            for i, c in X[j]:
                v0 += c * np.log(theta[0,i])
                v1 += c * np.log(theta[1,i])
            m = max(v0, v1)
            v0 = np.exp(v0 - m)
            v1 = np.exp(v1 - m)
            pi = v1 / (v0 + v1)
        if np.isnan(pi):
            d = np.sum(C) - 1
            v0 = np.log((C[0] - 1.0) / d)
            v1 = np.log((C[1] - 1.0) / d)
            print('v0, v1',v0, v1)
            for i, c in X[j]:
                v0 += c * np.log(theta[0,i])
                v1 += c * np.log(theta[1,i])
                print('v0, v1', v0, v1)
            m = max(v0, v1)
            print('m', m, v0 - m, v1 - m)
            v0 = np.exp(v0 - m)
            v1 = np.exp(v1 - m)
            print('v0, v1', v0, v1)
            pi = v1 / (v0 + v1)
        # Sample the new label from the conditional probability:
        l = binomial(1, pi)
        L[j] = l
        # Add document j back into the corpus:
        C[l] += 1
        for i, c in X[j]: N[l][i] += c
    #print('--->>>', np.min(cnts[0]), np.min(cnts[1]))
    # Update the topics:
    theta[0] = dirichlet(N[0])
    theta[1] = dirichlet(N[1])

In [128]:
def run_sampler(W, labels, iterations, gamma_pi, gamma_theta):
    state = initialize(W, labels, gamma_pi, gamma_theta)
    X = W[len(labels):]
    for t in range(iterations): update(state, X)
    return state['L']

In [129]:
def compute_accuracy(L_true, L_predicted):
    correct = 0
    for i, l in enumerate(L_predicted):
        if L_true[i] == l: correct += 1
    accuracy = float(correct)/len(L_predicted)
    return accuracy

In [202]:
gamma_pi = (1, 1)
gamma_theta = [1] * V

N = 10000
W = corpus[:N]
n = int(N * 0.9)
labels_observed = np.array([0 if x == 'ham' else 1 for x in labels[:n]])
labels_unobserved = np.array([0 if x == 'ham' else 1 for x in labels[n:N]])
    
iterations = 200
L = run_sampler(W, labels_observed, iterations, gamma_pi, gamma_theta)
accuracy = compute_accuracy(labels_unobserved, L)
print(accuracy)

0.92


In [58]:
from sklearn.feature_extraction.text import CountVectorizer

count_vectorizer = CountVectorizer()
counts = count_vectorizer.fit_transform(texts)

In [1]:
import os

NEWLINE = '\n'
SKIP_FILES = {'cmds'}


def read_files(path):
    for root, dir_names, file_names in os.walk(path):
        for path in dir_names:
            read_files(os.path.join(root, path))
        for file_name in file_names:
            if file_name not in SKIP_FILES:
                file_path = os.path.join(root, file_name)
                if os.path.isfile(file_path):
                    past_header, lines = False, []
                    f = open(file_path, encoding="latin-1")
                    for line in f:
                        if past_header:
                            lines.append(line)
                        elif line == NEWLINE:
                            past_header = True
                    f.close()
                    content = NEWLINE.join(lines)
                    yield file_path, content

In [2]:
from pandas import DataFrame


def build_data_frame(path, classification):
    rows = []
    index = []
    for file_name, text in read_files(path):
        rows.append({'text': text, 'class': classification})
        index.append(file_name)

    data_frame = DataFrame(rows, index=index)
    return data_frame

In [6]:
HAM = 'ham'
SPAM = 'spam'

SOURCES = [
    ('data/spamassassin/spam',        SPAM),
    ('data/spamassassin/easy_ham',    HAM),
    ('data/spamassassin/hard_ham',    HAM),
    ('data/enron/beck-s',      HAM),
    ('data/enron/BG',          SPAM),
    ('data/enron/farmer-d',    HAM),
    ('data/enron/GP',          SPAM),
    ('data/enron/kaminski-v',  HAM),
    ('data/enron/kitchen-l',   HAM),
    ('data/enron/lokay-m',     HAM),
    ('data/enron/SH',          SPAM),
    ('data/enron/williams-w3', HAM)
]

data = DataFrame({'text': [], 'class': []})
for path, classification in SOURCES:
    data = data.append(build_data_frame(path, classification))

data = data.reindex(np.random.permutation(data.index))

In [29]:
path = '/home/data/english-words/words.txt'
fp = open(path, 'r')
english_words = set()
for word in fp:
    word = word.strip()
    if word: english_words.add(word)
fp.close()

In [35]:
import string 
from nltk import word_tokenize
english_words |= set(string.punctuation)

In [50]:
import re

TAG_RE = re.compile(r'<[^>]+>')
WS_RE = re.compile(r'\s\s+')

def remove_tags(text):
    return TAG_RE.sub('', text)

def remove_white_extra_space(text):
    text = text.strip()
    return WS_RE.sub(' ', text)

def tokenize_and_join(text):
    words = word_tokenize(text)
    return " ".join(words)

In [51]:
import codecs 
path = 'data/spam-or-ham.txt'
fp = codecs.open(path, 'w', 'UTF-8')
cnt = 0
for label, message in zip(data['class'].tolist(), data['text'].tolist()):
    text = remove_tags(message)
    text = remove_white_extra_space(text)
    text = tokenize_and_join(text)
    if len(text) > 10: fp.write("%s\t%s\n" % (label, text))
    cnt += 1
    #if cnt > 10: break
fp.close()
print(cnt)

55326


In [None]:
6.1.1 Beyond SGD

In [49]:
text = '''The CPUC still plans to vote on suspending DA on Thursday but a few new twists: The State Treasurer announced last week that the State has no cash flow crisis and has adequate cash reserves and short-term borrowing capacity to last until the end of the fascal year in 2002 (we think July ends the fiscal year) President of Senate says at press conference on Monday that he intends to ask CPUC to study how to provide DA while still protecting small customers (letter not yet written) Enron's coalitions, AReM and WPTF, filed a motion at the CPUC today saying.'''
word_tokenize(text)

['The',
 'CPUC',
 'still',
 'plans',
 'to',
 'vote',
 'on',
 'suspending',
 'DA',
 'on',
 'Thursday',
 'but',
 'a',
 'few',
 'new',
 'twists',
 ':',
 'The',
 'State',
 'Treasurer',
 'announced',
 'last',
 'week',
 'that',
 'the',
 'State',
 'has',
 'no',
 'cash',
 'flow',
 'crisis',
 'and',
 'has',
 'adequate',
 'cash',
 'reserves',
 'and',
 'short-term',
 'borrowing',
 'capacity',
 'to',
 'last',
 'until',
 'the',
 'end',
 'of',
 'the',
 'fascal',
 'year',
 'in',
 '2002',
 '(',
 'we',
 'think',
 'July',
 'ends',
 'the',
 'fiscal',
 'year',
 ')',
 'President',
 'of',
 'Senate',
 'says',
 'at',
 'press',
 'conference',
 'on',
 'Monday',
 'that',
 'he',
 'intends',
 'to',
 'ask',
 'CPUC',
 'to',
 'study',
 'how',
 'to',
 'provide',
 'DA',
 'while',
 'still',
 'protecting',
 'small',
 'customers',
 '(',
 'letter',
 'not',
 'yet',
 'written',
 ')',
 'Enron',
 "'s",
 'coalitions',
 ',',
 'AReM',
 'and',
 'WPTF',
 ',',
 'filed',
 'a',
 'motion',
 'at',
 'the',
 'CPUC',
 'today',
 'saying',
 

In [9]:
from sklearn.feature_extraction.text import CountVectorizer

count_vectorizer = CountVectorizer()
counts = count_vectorizer.fit_transform(data['text'].values)

In [10]:
from sklearn.naive_bayes import MultinomialNB

classifier = MultinomialNB()
targets = data['class'].values
classifier.fit(counts, targets)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [11]:
examples = ['Free Viagra call today!', "I'm going to attend the Linux users group tomorrow."]
example_counts = count_vectorizer.transform(examples)
predictions = classifier.predict(example_counts)
predictions # [1, 0]

array(['spam', 'ham'], 
      dtype='<U4')

In [13]:
from sklearn.pipeline import Pipeline

pipeline = Pipeline([
    ('vectorizer',  CountVectorizer()),
    ('classifier',  MultinomialNB()) ])

pipeline.fit(data['text'].values, data['class'].values)
pipeline.predict(examples) # ['spam', 'ham']

array(['spam', 'ham'], 
      dtype='<U4')

In [14]:
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix, f1_score

k_fold = KFold(n=len(data), n_folds=6)
scores = []
confusion = numpy.array([[0, 0], [0, 0]])
for train_indices, test_indices in k_fold:
    train_text = data.iloc[train_indices]['text'].values
    train_y = data.iloc[train_indices]['class'].values

    test_text = data.iloc[test_indices]['text'].values
    test_y = data.iloc[test_indices]['class'].values

    pipeline.fit(train_text, train_y)
    predictions = pipeline.predict(test_text)

    confusion += confusion_matrix(test_y, predictions)
    score = f1_score(test_y, predictions, pos_label=SPAM)
    scores.append(score)

print('Total emails classified:', len(data))
print('Score:', sum(scores)/len(scores))
print('Confusion matrix:')
print(confusion)




Total emails classified: 55326
Score: 0.942656880273
Confusion matrix:
[[21658   180]
 [ 3472 30016]]
