# Notebook Imports

In [1]:
import pandas as pd
import numpy as np

# Constants

In [2]:
VOCAB_SIZE = 2500

TRAINING_DATA_FILE = 'SpamData/02_Training/train-data.txt'
TESTING_DATA_FILE = 'SpamData/03_Testing/test-data.txt'

TOKEN_SPAM_PROB_FILE = 'SpamData/03_Testing/prob-spam.txt'
TOKEN_HAM_PROB_FILE = 'SpamData/03_Testing/prob-nonspam.txt'
TOKEN_ALL_PROB_FILE = 'SpamData/03_Testing/prob-all-tokens.txt'

TEST_FEATURE_MATRIX = 'SpamData/03_Testing/test-features.txt'
TEST_TRAGET_FILE = 'SpamData/03_Testing/test-target.txt'

# Read and load features from txt files into numpy arrays

In [3]:
sparse_train_data = np.loadtxt(TRAINING_DATA_FILE, delimiter=' ',dtype=int)

In [4]:
sparse_test_data = np.loadtxt(TESTING_DATA_FILE,delimiter=' ',dtype=int)

In [5]:
sparse_test_data[:5]

array([[     8,      2,      1,  30048,      1],
       [     8,      3,      1, 120139,      4],
       [     8,      4,      1,  59908,      2],
       [     8,      5,      1,  30061,      1],
       [     8,      6,      1,  59855,      2]])

In [6]:
sparse_test_data[-5:]

array([[  5793,   2350,      0,  91898,      2],
       [  5793,   2360,      0,  45908,      1],
       [  5793,   2364,      0, 184266,      4],
       [  5793,   2414,      0, 137769,      3],
       [  5793,   2489,      0,  45978,      1]])

In [7]:
print(f'Number of rows in training file {sparse_train_data.shape[0]}')
print(f'Number of rows in testing file {sparse_test_data.shape[0]}')

Number of rows in training file 258362
Number of rows in testing file 117772


In [8]:
print(f'Number of unique emails in the training file {np.unique(sparse_train_data[:,0]).size}')
print(f'Number of unique emails in the testing file {np.unique(sparse_test_data[:,0]).size}')

Number of unique emails in the training file 4013
Number of unique emails in the testing file 1724


### How to create an empty dataframe

In [9]:
column_names = ['DOC_ID']+['CATEGORY']+list(range(0,VOCAB_SIZE))
column_names[:5]

['DOC_ID', 'CATEGORY', 0, 1, 2]

In [10]:
index_names = np.unique(sparse_train_data[:,0])
index_names

array([   0,    1,    2, ..., 5791, 5794, 5795])

In [11]:
full_train_data = pd.DataFrame(index=index_names, columns=column_names)

In [12]:
full_train_data.fillna(0,inplace=True)
full_train_data.head()

Unnamed: 0,DOC_ID,CATEGORY,0,1,2,3,4,5,6,7,...,2490,2491,2492,2493,2494,2495,2496,2497,2498,2499
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
def make_full_matrix(sparse_matrix, nr_words, doc_idx=0, word_idx=1, cat_idx=2, freq_idx=4):
    """
    Form a full matrix from a sparse matrix. Return a pandas dataframe. 
    Keyword arguments:
    sparse_matrix -- numpy array
    nr_words -- size of the vocabulary. Total number of tokens. 
    doc_idx -- position of the document id in the sparse matrix. Default: 1st column
    word_idx -- position of the word id in the sparse matrix. Default: 2nd column
    cat_idx -- position of the label (spam is 1, nonspam is 0). Default: 3rd column
    freq_idx -- position of occurrence of word in sparse matrix. Default: 4th column
    """
    column_names = ['DOC_ID'] + ['CATEGORY'] + list(range(0, VOCAB_SIZE))
    doc_id_names = np.unique(sparse_matrix[:, 0])
    full_matrix = pd.DataFrame(index=doc_id_names, columns=column_names)
    full_matrix.fillna(value=0, inplace=True)
    
    for i in range(sparse_matrix.shape[0]):
        doc_nr = sparse_matrix[i][doc_idx]
        word_id = sparse_matrix[i][word_idx]
        label = sparse_matrix[i][cat_idx]
        occurrence = sparse_matrix[i][freq_idx]
        
        full_matrix.at[doc_nr, 'DOC_ID'] = doc_nr
        full_matrix.at[doc_nr, 'CATEGORY'] = label
        full_matrix.at[doc_nr, word_id] = occurrence
    
    full_matrix.set_index('DOC_ID', inplace=True)
    return full_matrix

In [14]:
full_train_data = make_full_matrix(sparse_train_data,VOCAB_SIZE)

In [15]:
full_train_data.head()

Unnamed: 0_level_0,CATEGORY,0,1,2,3,4,5,6,7,8,...,2490,2491,2492,2493,2494,2495,2496,2497,2498,2499
DOC_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1,0,0,1,2,1,0,0,3,0,...,0,0,0,0,0,0,0,0,0,0
1,1,7,1,2,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,1,6,1,1,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,1,6,0,0,2,4,0,3,14,0,...,0,0,0,0,0,0,0,0,0,0
4,1,5,1,2,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


# Training the Naive Bayes Probability

In [16]:
prob_spam = full_train_data['CATEGORY'].sum()/full_train_data.shape[0]
prob_ham = 1-prob_spam

In [17]:
print(f'Probability of spam is {prob_spam}')
print(f'Probability of ham is {prob_ham}')

Probability of spam is 0.310989284824321
Probability of ham is 0.689010715175679


# Total Number of Words or Tokens

In [18]:
full_train_features = full_train_data.iloc[:,1:]

In [19]:
full_train_features.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,2490,2491,2492,2493,2494,2495,2496,2497,2498,2499
DOC_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,0,1,2,1,0,0,3,0,0,...,0,0,0,0,0,0,0,0,0,0
1,7,1,2,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,6,1,1,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,6,0,0,2,4,0,3,14,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,1,2,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [20]:
email_lengths = full_train_features.sum(axis=1)

In [21]:
email_lengths.head()

DOC_ID
0     87
1     53
2     40
3    183
4     43
dtype: int64

In [22]:
total_wc = email_lengths.sum()
total_wc

429243

# Number of tokens in spam and ham emails

In [23]:
spam_lengths = email_lengths.loc[full_train_data[full_train_data['CATEGORY'] == 1].index]
spam_wc = spam_lengths.sum()

In [24]:
ham_lengths = email_lengths.loc[full_train_data[full_train_data['CATEGORY'] == 0].index]
ham_wc = ham_lengths.sum()

In [25]:
email_lengths.shape[0]-spam_lengths.shape[0]-ham_lengths.shape[0]

0

In [26]:
total_wc-spam_wc-ham_wc

0

In [27]:
print(f'Average number of words in spam emails {spam_lengths.mean()}')
print(f'Average number of words in ham emails {ham_lengths.mean()}')

Average number of words in spam emails 141.2908653846154
Average number of words in ham emails 91.46907775768535


# Summing the tokens occuring in spam

In [28]:
full_train_features.shape

(4013, 2500)

In [29]:
train_spam_tokens = full_train_features.loc[full_train_data['CATEGORY'] == 1]
train_spam_tokens.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,2490,2491,2492,2493,2494,2495,2496,2497,2498,2499
DOC_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,0,1,2,1,0,0,3,0,0,...,0,0,0,0,0,0,0,0,0,0
1,7,1,2,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,6,1,1,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,6,0,0,2,4,0,3,14,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,1,2,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [30]:
train_ham_tokens = full_train_features.loc[full_train_data.CATEGORY == 0]
summed_ham_tokens = train_ham_tokens.sum(axis=0) + 1

In [31]:
summed_spam_tokens = train_spam_tokens.sum(axis=0) + 1 #apply laplace smoothing 
# because we will be dividing the total number of occurances by total number of words in numerator and denominator, if any of them is eqeual to 0, then we will get 0 or ZeroDivisionError, to avoid that, we add 1

In [32]:
summed_spam_tokens.head()

0    2179
1     935
2    1217
3    2022
4    1219
dtype: int64

In [33]:
summed_ham_tokens = train_ham_tokens.sum(axis=0) + 1

$P(Token|Spam) = \frac{P(Spam|Token) \times P(Token)}{P(Spam)}$

In [34]:
prob_tokens_spam = summed_spam_tokens/(spam_wc+VOCAB_SIZE) # To balance laplace smoothing
prob_tokens_spam.head()

0    0.012185
1    0.005228
2    0.006805
3    0.011307
4    0.006816
dtype: float64

In [35]:
prob_tokens_spam.sum() #sum of all probabilities should be 1 

1.0

$P(Token|Ham) = \frac{P(Ham|Token) \times P(Token)}{P(Ham)}$

In [36]:
prob_tokens_ham = summed_ham_tokens/(ham_wc+VOCAB_SIZE)
prob_tokens_ham.head()

0    0.021471
1    0.010140
2    0.008007
3    0.003672
4    0.006311
dtype: float64

In [37]:
prob_tokens_ham.sum()

1.0

# P(Token) - probability that a token occurs

In [38]:
prob_tokens_all = full_train_features.sum(axis=0)/total_wc
prob_tokens_all.head()

0    0.017848
1    0.008207
2    0.007595
3    0.006891
4    0.006591
dtype: float64

In [39]:
prob_tokens_all.sum()

1.0

# Save the trained model

In [40]:
np.savetxt(TOKEN_SPAM_PROB_FILE, prob_tokens_spam)
np.savetxt(TOKEN_HAM_PROB_FILE, prob_tokens_ham)
np.savetxt(TOKEN_ALL_PROB_FILE, prob_tokens_all)

# Prepare the testing data

In [41]:
%%time
full_test_data = make_full_matrix(sparse_test_data, VOCAB_SIZE)

Wall time: 5.91 s


In [42]:
X_test = full_test_data.loc[:,full_test_data.columns != 'CATEGORY']
y_test = full_test_data.CATEGORY

In [43]:
np.savetxt(TEST_FEATURE_MATRIX, X_test)
np.savetxt(TEST_TRAGET_FILE, y_test)