# Overview
This is a Python notebook that performs spam filtering on the lingspam dataset. 

In [None]:
# imports
import numpy as np
import pandas as pd
import os
from os.path import join
import math
from sklearn.metrics import confusion_matrix
#from re import search
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score


In [None]:
# Import to disable warnigns. Don't import to see function run with warnings. 
import warnings; warnings.simplefilter('ignore')

# Data Definition
Loading the raw data


In [None]:
#Download data source
!wget http://www.aueb.gr/users/ion/data/lingspam_public.tar.gz
#Unzip the download. Use the first command if you want to view files inside.
#!tar -xvf lingspam_public.tar.gz
!tar -xf lingspam_public.tar.gz

--2020-10-23 20:01:14--  http://www.aueb.gr/users/ion/data/lingspam_public.tar.gz
Resolving www.aueb.gr (www.aueb.gr)... 195.251.255.156
Connecting to www.aueb.gr (www.aueb.gr)|195.251.255.156|:80... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://www2.aueb.gr/users/ion/data/lingspam_public.tar.gz [following]
--2020-10-23 20:01:14--  http://www2.aueb.gr/users/ion/data/lingspam_public.tar.gz
Resolving www2.aueb.gr (www2.aueb.gr)... 195.251.255.138
Connecting to www2.aueb.gr (www2.aueb.gr)|195.251.255.138|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 11564714 (11M) [application/x-gzip]
Saving to: ‘lingspam_public.tar.gz’


2020-10-23 20:01:30 (775 KB/s) - ‘lingspam_public.tar.gz’ saved [11564714/11564714]



# Pre Processing


In [None]:
def _read_part(directory):
  """Read every e-mail file in `directory` and return (texts, labels).

  A file whose name starts with 'spm' is spam and gets label 0; every other
  file is legitimate mail (ham) and gets label 1.
  """
  texts, labels = [], []
  # os.listdir returns names in arbitrary order; sorting keeps the row order
  # (and therefore every positional index used later) reproducible.
  for name in sorted(os.listdir(directory)):
    labels.append(0 if name.startswith('spm') else 1)
    with open(join(directory, name)) as f:
      texts.append(f.read())
  return texts, labels

# reading training data (parts 1-9)
path = 'lingspam_public/lemm_stop/part'
train_emails = []
train_labels = []
for i in range(1, 10):
  part_emails, part_labels = _read_part(path + str(i))
  train_emails.extend(part_emails)
  train_labels.extend(part_labels)

# Number of spam / ham e-mails in the training set (used as class totals later)
spam_counter = train_labels.count(0)
ham_counter = train_labels.count(1)

train_df = pd.DataFrame({'emails': train_emails, 'labels': train_labels})

# reading testing data (part 10)
test_path = 'lingspam_public/lemm_stop/part10'
test_emails, test_labels = _read_part(test_path)
test_df = pd.DataFrame({'emails': test_emails, 'labels': test_labels})

In [None]:
# Index labels of the spam (label 0) and ham (label 1) e-mails in each split
spam_columns = np.array(train_df.index[train_df['labels'] == 0])
test_spam_columns = np.array(test_df.index[test_df['labels'] == 0])
ham_columns = np.array(train_df.index[train_df['labels'] == 1])

In [None]:
# Samples
print('Training Samples: ')
print(train_df.sample(5))

print('\nTesting Samples: ')
print(test_df.sample(5))


Training Samples: 
                                                 emails  labels
2537  Subject: distribute access linguistic resource...       1
2328  Subject: linguistic typology 2 : 1 ( 1998 )\n\...       1
2519  Subject: anthropological linguistic , vol . 39...       1
1187  Subject: call : weisgerber colloquium\n\ncall ...       1
1328  Subject: semantic / syntax - semantic interfac...       1

Testing Samples: 
                                                emails  labels
107  Subject: semantic : il dominio tempo-aspettual...       1
264  Subject: honor two keynote speaker\n\ninternat...       1
195  Subject: vacation !\n\nfull detail : http : / ...       0
269  Subject: mt special issue slt : reminder\n\nre...       1
254  Subject: workshop announcement\n\ncall papers ...       1


## Binary feature extraction

In [None]:
# Binary bag-of-words: with binary=True each feature only records whether a
# token occurs in an e-mail. token_pattern keeps tokens made of 3 or more
# word characters that are not digits.
# NOTE(review): the single-character stop word '_' can never match a 3+
# character token, so stop_words looks like a no-op here - confirm.
stop_words =['_']
vectorizer = CountVectorizer(binary=True, lowercase= True, token_pattern=r'\b[^\d\W]{3,}\b',
                             stop_words=stop_words)
transformed_data = vectorizer.fit_transform(train_df.emails)

# Densify and transpose so that rows are vocabulary terms and columns are
# the training e-mails.
document_vector = transformed_data.toarray()
document_df = pd.DataFrame(document_vector).transpose()
# Per-term totals over the spam columns and over the ham columns
# (with binary features: the number of e-mails of that class containing the term).
document_df['spam_count'] = document_df[spam_columns].sum(axis=1)
document_df['ham_count'] = document_df[ham_columns].sum(axis=1)

# row index = word index in the vocabulary, columns = the e-mails
print(document_df.head(5))

# Test data: encoded with the vocabulary fitted on the training e-mails;
# here rows are e-mails and columns are vocabulary terms (no transpose).
test_data = vectorizer.transform(test_df.emails)
testing_df = pd.DataFrame(test_data.toarray())

   0  1  2  3  4  5  6  ...  2597  2598  2599  2600  2601  spam_count  ham_count
0  0  0  0  0  0  0  0  ...     0     0     0     0     0           4          6
1  0  0  0  0  0  0  0  ...     0     0     0     0     0           0          6
2  0  0  0  0  0  0  0  ...     0     0     0     0     0           0          2
3  0  0  0  0  0  0  0  ...     0     0     0     0     0           1          0
4  0  0  0  0  0  0  0  ...     0     0     0     0     0           0          3

[5 rows x 2604 columns]


## Term Frequency Feature Extraction

In [None]:
# Term-frequency bag-of-words: same tokenisation as the binary vectorizer,
# but the features keep the raw occurrence counts.
stop_words =['_']
# binary=False makes non-zero terms retain their original count value
vectorizer2 = CountVectorizer(binary=False, lowercase= True, token_pattern=r'\b[^\d\W]{3,}\b',
                             stop_words=stop_words)
# Note: transformed_data and document_vector are reused names and are
# overwritten here with the term-frequency matrices.
transformed_data = vectorizer2.fit_transform(train_df.emails)

# Densify and transpose so that rows are vocabulary terms and columns are
# the training e-mails.
document_vector = transformed_data.toarray()
tf_df = pd.DataFrame(document_vector).transpose()
# Per-term totals over the spam columns and over the ham columns
# (with counts: total occurrences of the term in that class).
tf_df['spam_count'] = tf_df[spam_columns].sum(axis=1)
tf_df['ham_count'] = tf_df[ham_columns].sum(axis=1)

# row index = word index in the vocabulary, columns = the e-mails
print(tf_df.sample(5))

# Test data: encoded with the vocabulary fitted on the training e-mails;
# here rows are e-mails and columns are vocabulary terms (no transpose).
tf_test = vectorizer2.transform(test_df.emails)
tf_testing_df = pd.DataFrame(tf_test.toarray())

       0  1  2  3  4  5  6  ...  2597  2598  2599  2600  2601  spam_count  ham_count
27612  0  0  0  0  0  0  0  ...     0     0     0     0     0           0          2
3707   0  0  0  0  0  0  0  ...     0     0     0     0     0           1          0
10730  0  0  0  0  0  0  0  ...     0     0     0     0     0           0         12
932    0  0  0  0  0  0  0  ...     0     0     0     0     0           0          6
26862  0  0  0  0  0  0  0  ...     0     0     0     0     0           0          1

[5 rows x 2604 columns]


In [None]:
# For sanity check, both vocabularies are equal
print(vectorizer.vocabulary_ == vectorizer2.vocabulary_)

True


# Information Gain - Part 1


## Calculating Information Gain


#### **UNCOMMENT AND RUN THIS BEFORE RUNNING THE IG CALCULATION A SECOND TIME OR ANY SUBSEQUENT NUMBER OF TIMES**

In [None]:
# if re running IG please run this
# undoing laplacian smoothing
'''document_df['ham_count'] = document_df['ham_count']-1
document_df['spam_count'] = document_df['spam_count']-1
spam_counter -= 2
ham_counter -= 2
#p = (ham_counter)/(spam_counter+ham_counter)
total_train_emails -= 4'''

"document_df['ham_count'] = document_df['ham_count']-1\ndocument_df['spam_count'] = document_df['spam_count']-1\nspam_counter -= 2\nham_counter -= 2\n#p = (ham_counter)/(spam_counter+ham_counter)\ntotal_train_emails -= 4"

In [None]:
# document_df holds one row per vocabulary term; its integer-labelled columns
# are the training e-mails. The term for a row can be retrieved through
# vectorizer.vocabulary_.

# Laplacian smoothing: +1 on every per-term class count, +2 on every class
# total. The smoothed values are recomputed from the raw data each time, so
# this cell gives the same result on every run and nothing has to be undone
# before re-running it.
document_df['spam_count'] = document_df[spam_columns].sum(axis=1) + 1
document_df['ham_count'] = document_df[ham_columns].sum(axis=1) + 1
spam_counter = len(spam_columns) + 2
ham_counter = len(ham_columns) + 2
total_train_emails = len(train_df) + 4
p = (ham_counter)/(spam_counter+ham_counter)

# Class entropy H(C)
entropy = -p*np.log2(p) - (1-p)*np.log2(1-p)

# Smoothed e-mail counts per (class, term present / term absent) cell
spam_with_word = document_df['spam_count']
ham_with_word = document_df['ham_count']
spam_without_word = spam_counter - spam_with_word
ham_without_word = ham_counter - ham_with_word
with_word = spam_with_word + ham_with_word
without_word = spam_without_word + ham_without_word

# IG(X) = H(C) - H(C|X) = H(C) + sum over (x, c) of P(x, c) * log2 P(c | x),
# evaluated for all terms at once instead of one Python iteration per term.
information_gain = (
    entropy
    + (ham_with_word / total_train_emails) * np.log2(ham_with_word / with_word)
    + (ham_without_word / total_train_emails) * np.log2(ham_without_word / without_word)
    + (spam_with_word / total_train_emails) * np.log2(spam_with_word / with_word)
    + (spam_without_word / total_train_emails) * np.log2(spam_without_word / without_word)
)

# IG is kept as a plain list, one value per vocabulary term
IG = information_gain.tolist()
document_df['IG'] = IG


### Check IG Values are non negative

In [None]:
(document_df['IG']>0).value_counts()

True    44864
Name: IG, dtype: int64

## N largest IG features (N=10,100,1000)
 

In [None]:
document_df['IG'] = document_df['IG'].fillna(0)
ten_largest = document_df.nlargest(10,'IG')
hundred_largest = document_df.nlargest(100,'IG')
thousand_largest = document_df.nlargest(1000,'IG')

In [None]:
# vocabulary_ maps term -> row index. Invert it once so that every lookup is a
# dictionary access instead of a linear scan over the whole vocabulary.
index_to_word = {index: word for word, index in vectorizer.vocabulary_.items()}

# Terms behind the top-N IG rows, in descending IG order
ten_largest_words = [index_to_word[each] for each in ten_largest.index]
hundred_largest_words = [index_to_word[each] for each in hundred_largest.index]
thousand_largest_words = [index_to_word[each] for each in thousand_largest.index]

In [None]:
print('Ten largest IG features :' )
print(*ten_largest_words,sep=', ')
print('Hundred largest IG features :' )
print(*hundred_largest_words,sep=', ')
print('Thousand largest IG features :' )
print(*thousand_largest_words,sep=', ')


Ten largest IG features :
language, remove, free, linguistic, university, money, click, market, our, business
Hundred largest IG features :
language, remove, free, linguistic, university, money, click, market, our, business, today, advertise, product, company, sell, million, internet, english, income, linguistics, easy, save, guarantee, thousand, best, check, purchase, buy, win, cash, day, over, bulk, want, cost, dollar, every, service, mailing, edu, com, yourself, hundred, papers, earn, linguist, hour, theory, customer, offer, profit, success, fun, month, abstract, here, yours, conference, watch, receive, pay, ever, speaker, credit, bonus, start, zip, sale, amaze, live, discussion, toll, syntax, investment, financial, anywhere, online, department, dream, huge, grammar, friend, simply, science, structure, week, need, wait, deadline, mlm, fresh, study, security, marketing, xxx, ship, analysis, workshop, off, line
Thousand largest IG features :
language, remove, free, linguistic, univers

# Naive Bayes- Part2

In [None]:
accuracy_df = pd.DataFrame(columns=['model','precision','recall'])

## Bernoulli Naive Bayes with Binary Features

**Please note:** Laplacian smoothing was already applied to the term counts during the IG calculation, so it is not applied again here.

In [None]:
# Function to determine Naive Bayes with Binary Features
def predict_binary_BernoulliNB(training_data, test_data, smoothing_applied=True):
  """Classify e-mails with a Bernoulli Naive Bayes model.

  Reads the module-level class totals `spam_counter` and `ham_counter`.

  Parameters
  ----------
  training_data : DataFrame
      One row per selected term, with 'spam_count' and 'ham_count' columns
      (number of spam / ham training e-mails containing the term).
  test_data : DataFrame
      One row per e-mail; columns are matched by label against the index of
      `training_data`. Values are used as exponents of the term
      probabilities (presumably 0/1 indicators - verify against caller).
  smoothing_applied : bool
      If False, Laplacian smoothing (+1 per term count, +2 per class total)
      is applied to local copies; neither `training_data` nor the global
      counters are modified.

  Returns
  -------
  numpy.ndarray
      One label per row of `test_data`: 0 = spam, 1 = ham.
  """
  spam_count = training_data['spam_count']
  ham_count = training_data['ham_count']
  n_spam = spam_counter
  n_ham = ham_counter
  if not smoothing_applied:
    # Smooth local values only, so repeated calls cannot compound the smoothing
    spam_count = spam_count + 1
    ham_count = ham_count + 1
    n_spam += 2
    n_ham += 2

  p_word_spam = spam_count / n_spam
  p_word_ham = ham_count / n_ham

  # Work with log-odds: a product of many probabilities underflows to 0 for
  # large feature sets, which turns the spam/ham ratio into 0/0 = NaN and
  # silently labels the e-mail as ham.
  log_prior_odds = np.log(n_spam) - np.log(n_ham)
  with np.errstate(divide='ignore', invalid='ignore'):
    present_weight = np.log(p_word_spam) - np.log(p_word_ham)
    absent_weight = np.log(1 - p_word_spam) - np.log(1 - p_word_ham)
    # mul(..., axis=1) aligns the per-term weights with the test columns by label
    log_odds = (log_prior_odds
                + test_data.mul(present_weight, axis=1).sum(axis=1)
                + (1 - test_data).mul(absent_weight, axis=1).sum(axis=1))

  # log-odds > 0 is the same decision as P(spam|x) / P(ham|x) > 1
  return np.where(log_odds.to_numpy() > 0, 0, 1)

### Ten Largest Features

In [None]:
# Ten Largest Features
# sklearn expects one row per e-mail: drop the bookkeeping columns and transpose back
training_data = ten_largest.drop(['spam_count','ham_count','IG'], axis = 1).transpose()

predicted_NB_binary = predict_binary_BernoulliNB(ten_largest, testing_df[ten_largest.index])

# Using inbuilt_library as a cross-check of the hand-written classifier
clf = BernoulliNB()
clf.fit(X=training_data, y=train_labels)
y_BNB_ten = clf.predict(testing_df[ten_largest.index])
print("Comparing function results to that of  sklearn library.\nNumber of values that are same: {}. Test dataset length: {}"
      .format((y_BNB_ten==predicted_NB_binary).sum(),len(testing_df)))

# Labels are flipped (1 - label) so that spam counts as the positive class
recall_BNB_ten = recall_score(1-np.array(test_labels),1-predicted_NB_binary)
precision_BNB_ten = precision_score(1-np.array(test_labels), 1-predicted_NB_binary)
print("Bernoulli Naive Bayes with 10 top IG features.")
print("Precision: {:.2f}, Recall: {:.2f}".format(precision_BNB_ten,recall_BNB_ten))
# DataFrame.append was removed in pandas 2.0; pd.concat works on old and new versions
accuracy_df = pd.concat([accuracy_df,
                         pd.DataFrame([{'model':'Bernoulli NB Binary 10 Features',
                                        'precision':precision_BNB_ten,
                                        'recall':recall_BNB_ten}])],
                        ignore_index=True)



Comparing function results to that of  sklearn library.
Number of values that are same: 291. Test dataset length: 291
Bernoulli Naive Bayes with 10 top IG features.
Precision: 0.87, Recall: 0.82


### Hundred Largest Features

In [None]:
# Hundred Largest Features
predicted_NB_binary_hundred = predict_binary_BernoulliNB(hundred_largest, testing_df[hundred_largest.index])
# Labels are flipped (1 - label) so that spam counts as the positive class
recall_BNB_hundred = recall_score(1-np.array(test_labels),1-predicted_NB_binary_hundred)
precision_BNB_hundred = precision_score(1-np.array(test_labels), 1-predicted_NB_binary_hundred)
print("Bernoulli Naive Bayes with 100 top IG features.")
print("Precision:{:.2f}, Recall:{:.2f}".format(precision_BNB_hundred,recall_BNB_hundred))
# DataFrame.append was removed in pandas 2.0; pd.concat works on old and new versions
accuracy_df = pd.concat([accuracy_df,
                         pd.DataFrame([{'model':'Bernoulli NB Binary 100 Features',
                                        'precision':precision_BNB_hundred,
                                        'recall':recall_BNB_hundred}])],
                        ignore_index=True)

Bernoulli Naive Bayes with 100 top IG features.
Precision:1.00, Recall:0.67


### Thousand Largest Features

In [None]:
# Thousand Largest Features
predicted_NB_binary_thousand = predict_binary_BernoulliNB(thousand_largest, testing_df[thousand_largest.index])
# Labels are flipped (1 - label) so that spam counts as the positive class
recall_BNB_thousand = recall_score(1-np.array(test_labels),1-predicted_NB_binary_thousand)
precision_BNB_thousand = precision_score(1-np.array(test_labels),1-predicted_NB_binary_thousand)
print("Bernoulli Naive Bayes with 1000 top IG features.")
print("Precision:{:.2f}, Recall:{:.2f}".format(precision_BNB_thousand,recall_BNB_thousand))
# DataFrame.append was removed in pandas 2.0; pd.concat works on old and new versions
accuracy_df = pd.concat([accuracy_df,
                         pd.DataFrame([{'model':'Bernoulli NB Binary 1000 Features',
                                        'precision':precision_BNB_thousand,
                                        'recall':recall_BNB_thousand}])],
                        ignore_index=True)

Bernoulli Naive Bayes with 1000 top IG features.
Precision:1.00, Recall:0.61


## Multinomial Naive Bayes with Binary Features


In [None]:
# Function to determine multinomial Naive Bayes.
def predict_multinomial_NB(training_data, test_data, smoothing_applied = True):
  """Classify e-mails with a multinomial Naive Bayes model.

  The class priors come from the module-level totals `spam_counter` and
  `ham_counter`, which are used as they are.

  Parameters
  ----------
  training_data : DataFrame
      One row per selected term, with 'spam_count' and 'ham_count' columns.
      Each term probability is its count divided by the column total.
  test_data : DataFrame
      One row per e-mail; columns are matched by label against the index of
      `training_data`. Values are used as exponents of the term probabilities
      (binary indicators or term frequencies).
  smoothing_applied : bool
      If False, +1 is added to every term count on local copies;
      `training_data` itself is not modified.

  Returns
  -------
  numpy.ndarray
      One label per row of `test_data`: 0 = spam, 1 = ham.
  """
  spam_count = training_data['spam_count']
  ham_count = training_data['ham_count']
  if not smoothing_applied:
    # Smooth local copies only, so repeated calls cannot compound the smoothing
    spam_count = spam_count + 1
    ham_count = ham_count + 1

  # Work with log-odds: a product of many probabilities raised to term counts
  # underflows to 0, which turns the spam/ham ratio into 0/0 = NaN and
  # silently labels the e-mail as ham.
  log_prior_odds = np.log(spam_counter) - np.log(ham_counter)
  with np.errstate(divide='ignore', invalid='ignore'):
    term_weight = (np.log(spam_count / spam_count.sum())
                   - np.log(ham_count / ham_count.sum()))
    # mul(..., axis=1) aligns the per-term weights with the test columns by label
    log_odds = log_prior_odds + test_data.mul(term_weight, axis=1).sum(axis=1)

  # log-odds > 0 is the same decision as P(spam|x) / P(ham|x) > 1
  return np.where(log_odds.to_numpy() > 0, 0, 1)

### Ten Largest Features

In [None]:
# Ten Largest Features
# sklearn expects one row per e-mail: drop the bookkeeping columns and transpose back
training_data = ten_largest.drop(['spam_count','ham_count','IG'], axis = 1).transpose()

predicted_NB_multinomial = predict_multinomial_NB(ten_largest, testing_df[ten_largest.index])

# Using inbuilt_library as a cross-check of the hand-written classifier
clf = MultinomialNB()
clf.fit(X=training_data, y=train_labels)
y_MNB_ten = clf.predict(testing_df[ten_largest.index])
# Compare against the MultinomialNB predictions (y_MNB_ten), not the Bernoulli
# predictions from the earlier cell.
print("Comparing function results to that of  sklearn library.\nNumber of values that are same: {}. Test dataset length: {}"
      .format((y_MNB_ten==predicted_NB_multinomial).sum(),len(testing_df)))

# Labels are flipped (1 - label) so that spam counts as the positive class
recall_MNB_ten = recall_score(1-np.array(test_labels),1-predicted_NB_multinomial)
precision_MNB_ten = precision_score(1-np.array(test_labels), 1-predicted_NB_multinomial)
print("Multinomial binary features Naive Bayes with 10 top IG features.")
print("Precision: {:.2f}, Recall: {:.2f}".format(precision_MNB_ten,recall_MNB_ten))
# DataFrame.append was removed in pandas 2.0; pd.concat works on old and new versions
accuracy_df = pd.concat([accuracy_df,
                         pd.DataFrame([{'model':'Multinomial NB Binary 10 Features',
                                        'precision':precision_MNB_ten,
                                        'recall':recall_MNB_ten}])],
                        ignore_index=True)

Comparing function results to that of  sklearn library.
Number of values that are same: 290. Test dataset length: 291
Multinomial binary features Naive Bayes with 10 top IG features.
Precision: 0.89, Recall: 0.82


### Hundred Largest Features

In [None]:
# Hundred largest IG features, binary inputs
predicted_NB_multinomial_hundred = predict_multinomial_NB(hundred_largest, testing_df[hundred_largest.index])

# Labels are flipped (1 - label) so that spam counts as the positive class
recall_MNB_hundred = recall_score(1-np.array(test_labels),1-predicted_NB_multinomial_hundred)
precision_MNB_hundred = precision_score(1-np.array(test_labels), 1-predicted_NB_multinomial_hundred)
print("Multinomial binary features Naive Bayes with 100 top IG features.")
print("Precision: {:.2f}, Recall: {:.2f}".format(precision_MNB_hundred,recall_MNB_hundred))
# DataFrame.append was removed in pandas 2.0; pd.concat works on old and new versions
accuracy_df = pd.concat([accuracy_df,
                         pd.DataFrame([{'model':'Multinomial NB Binary 100 Features',
                                        'precision':precision_MNB_hundred,
                                        'recall':recall_MNB_hundred}])],
                        ignore_index=True)

Multinomial binary features Naive Bayes with 100 top IG features.
Precision: 0.96, Recall: 0.92


### Thousand Largest Features

In [None]:
# Thousand largest IG features, binary inputs
predicted_NB_multinomial_thousand = predict_multinomial_NB(thousand_largest, testing_df[thousand_largest.index])

# Labels are flipped (1 - label) so that spam counts as the positive class
recall_MNB_thousand = recall_score(1-np.array(test_labels),1-predicted_NB_multinomial_thousand)
precision_MNB_thousand = precision_score(1-np.array(test_labels), 1-predicted_NB_multinomial_thousand)
print("Multinomial binary features Naive Bayes with 1000 top IG features.")
print("Precision: {:.2f}, Recall: {:.2f}".format(precision_MNB_thousand,recall_MNB_thousand))
# DataFrame.append was removed in pandas 2.0; pd.concat works on old and new versions
accuracy_df = pd.concat([accuracy_df,
                         pd.DataFrame([{'model':'Multinomial NB Binary 1000 Features',
                                        'precision':precision_MNB_thousand,
                                        'recall':recall_MNB_thousand}])],
                        ignore_index=True)

Multinomial binary features Naive Bayes with 1000 top IG features.
Precision: 1.00, Recall: 0.84


## Multinomial Naive Bayes with Term Frequencies


In [None]:
# First assigning IG to retrieve top 10,100,1000 largest df
tf_df['IG'] = document_df['IG']
ten_largest_tf = tf_df.nlargest(10,'IG')
hundred_largest_tf = tf_df.nlargest(100,'IG')
thousand_largest_tf = tf_df.nlargest(1000,'IG')

### Ten Largest Features

In [None]:
# Ten largest IG features, term-frequency inputs.
# smoothing_applied=False: the term-frequency counts in tf_df were never smoothed.
predicted_NB_multinomial_tf = predict_multinomial_NB(ten_largest_tf, tf_testing_df[ten_largest_tf.index], False)

# Labels are flipped (1 - label) so that spam counts as the positive class
recall_MNB_ten_tf = recall_score(1-np.array(test_labels),1-predicted_NB_multinomial_tf)
precision_MNB_ten_tf = precision_score(1-np.array(test_labels), 1-predicted_NB_multinomial_tf)
print("Multinomial TF features Naive Bayes with 10 top IG features.")
print("Precision: {:.2f}, Recall: {:.2f}".format(precision_MNB_ten_tf,recall_MNB_ten_tf))
# DataFrame.append was removed in pandas 2.0; pd.concat works on old and new versions
accuracy_df = pd.concat([accuracy_df,
                         pd.DataFrame([{'model':'Multinomial NB TF 10 Features',
                                        'precision':precision_MNB_ten_tf,
                                        'recall':recall_MNB_ten_tf}])],
                        ignore_index=True)

Multinomial TF features Naive Bayes with 10 top IG features.
Precision: 0.85, Recall: 0.94


### Hundred Largest Features

In [None]:
# Hundred largest IG features, term-frequency inputs.
# smoothing_applied=False: the term-frequency counts in tf_df were never smoothed.
predicted_NB_multinomial_hundred_tf = predict_multinomial_NB(hundred_largest_tf, tf_testing_df[hundred_largest_tf.index],False)

# Labels are flipped (1 - label) so that spam counts as the positive class
recall_MNB_hundred_tf = recall_score(1-np.array(test_labels),1-predicted_NB_multinomial_hundred_tf)
precision_MNB_hundred_tf = precision_score(1-np.array(test_labels), 1-predicted_NB_multinomial_hundred_tf)
print("Multinomial TF features Naive Bayes with 100 top IG features.")
print("Precision: {:.2f}, Recall: {:.2f}".format(precision_MNB_hundred_tf,recall_MNB_hundred_tf))
# DataFrame.append was removed in pandas 2.0; pd.concat works on old and new versions
accuracy_df = pd.concat([accuracy_df,
                         pd.DataFrame([{'model':'Multinomial NB TF 100 Features',
                                        'precision':precision_MNB_hundred_tf,
                                        'recall':recall_MNB_hundred_tf}])],
                        ignore_index=True)

Multinomial TF features Naive Bayes with 100 top IG features.
Precision: 0.96, Recall: 0.96


### Thousand Largest Features

In [None]:
# Thousand largest IG features, term-frequency inputs.
# The test columns are selected with thousand_largest_tf.index so that the
# model and the test matrix are guaranteed to use the same terms.
# smoothing_applied=False: the term-frequency counts in tf_df were never smoothed.
predicted_NB_multinomial_thousand_tf = predict_multinomial_NB(thousand_largest_tf, tf_testing_df[thousand_largest_tf.index], False)

# Labels are flipped (1 - label) so that spam counts as the positive class
recall_MNB_thousand_tf = recall_score(1-np.array(test_labels),1-predicted_NB_multinomial_thousand_tf)
precision_MNB_thousand_tf = precision_score(1-np.array(test_labels), 1-predicted_NB_multinomial_thousand_tf)
print("Multinomial TF Naive Bayes with 1000 top IG features.")
print("Precision: {:.2f}, Recall: {:.2f}".format(precision_MNB_thousand_tf,recall_MNB_thousand_tf))
# DataFrame.append was removed in pandas 2.0; pd.concat works on old and new versions
accuracy_df = pd.concat([accuracy_df,
                         pd.DataFrame([{'model':'Multinomial NB TF 1000 Features',
                                        'precision':precision_MNB_thousand_tf,
                                        'recall':recall_MNB_thousand_tf}])],
                        ignore_index=True)

Multinomial TF Naive Bayes with 1000 top IG features.
Precision: 1.00, Recall: 0.67


## Precision and Recall Results

In [None]:
print(accuracy_df)

                                 model  precision    recall
0      Bernoulli NB Binary 10 Features   0.869565  0.816327
1     Bernoulli NB Binary 100 Features   1.000000  0.673469
2    Bernoulli NB Binary 1000 Features   1.000000  0.612245
3    Multinomial NB Binary 10 Features   0.888889  0.816327
4   Multinomial NB Binary 100 Features   0.957447  0.918367
5  Multinomial NB Binary 1000 Features   1.000000  0.836735
6        Multinomial NB TF 10 Features   0.851852  0.938776
7       Multinomial NB TF 100 Features   0.959184  0.959184
8      Multinomial NB TF 1000 Features   1.000000  0.673469


# SVM - Part 3

### Data Definition
Since we will be using the entire dataset with cross-validation, we create a new dataset by combining the training and testing dataframes.

In [None]:
# Recovering the original dataset from the training and testing dataframes:
# transpose the term-by-e-mail training frame back to one row per e-mail and
# stack the test e-mails underneath.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
training_data = pd.concat(
    [document_df.drop(['spam_count','ham_count','IG'],axis=1).transpose(), testing_df],
    ignore_index=True)

# Doing 1 - labels to flip the labels and have 0 represent ham, 1 represent spam.
# All previous experiments were run with the basis that 1 represents ham, 0 represent spam.
labels = 1-np.array(train_labels+test_labels)

### Model Evaluation

We will be using IG to select top features.

In [None]:
# Using 5 consecutive crossvalidated scores
def evaluate_svm(model, X, y, splits = 5):
  """Return the cross-validation scores of `model` on (X, y).

  `splits` is handed to cross_val_score as its `cv` argument (5 by default).
  """
  return cross_val_score(model, X, y, cv=splits)
  

In [None]:
kernels = ['linear', 'poly', 'rbf', 'sigmoid']
# Column labels of the top-N IG features, keyed by N
features = {10:ten_largest.index, 100:hundred_largest.index, 1000:thousand_largest.index}

# Collect one record per (kernel, feature count) and build the frame once,
# instead of growing a DataFrame row by row (DataFrame.append was removed in
# pandas 2.0).
svm_records = []
for kernel in kernels:
  model = SVC(kernel=kernel)
  for number in features:
    scores = evaluate_svm(model, training_data[features[number]], train_labels+test_labels)
    svm_records.append({'model':'SVM {} kernel'.format(kernel), 'accuracy_mean':scores.mean(),
                        'accuracy_std':scores.std(),'number_of_features': number})
SVM_accuracy = pd.DataFrame(svm_records,
                            columns=['model', 'number_of_features', 'accuracy_mean', 'accuracy_std'])

print('The model accuracies are shown below:')
print(SVM_accuracy)
print('\n\nThe best model scores are given by:')
# Every row whose mean accuracy equals the maximum (ties are all shown)
print(SVM_accuracy[SVM_accuracy['accuracy_mean']==SVM_accuracy['accuracy_mean'].max()])

The model accuracies are shown below:
                 model number_of_features  accuracy_mean  accuracy_std
0    SVM linear kernel                 10       0.963359      0.007128
1    SVM linear kernel                100       0.980987      0.007737
2    SVM linear kernel               1000       0.986518      0.007609
3      SVM poly kernel                 10       0.960589      0.013866
4      SVM poly kernel                100       0.938123      0.004741
5      SVM poly kernel               1000       0.898025      0.014808
6       SVM rbf kernel                 10       0.965430      0.008345
7       SVM rbf kernel                100       0.984099      0.006417
8       SVM rbf kernel               1000       0.991358      0.002445
9   SVM sigmoid kernel                 10       0.935356      0.019702
10  SVM sigmoid kernel                100       0.975801      0.010945
11  SVM sigmoid kernel               1000       0.984789      0.006511


The best model scores are given by:
 

# Adversarial Classification- Part4

## Baseline classifier

In [None]:
training_data = ten_largest.drop(['spam_count','ham_count','IG'], axis = 1).transpose()

# Using inbuilt_library
clf = MultinomialNB()
clf.fit(X=training_data, y=train_labels)
baseline_predicted = clf.predict(testing_df[ten_largest.index])
spam_classified = np.where(baseline_predicted==0)

## $LO(x_i)$ on top 10 words

In [None]:
def calculate_LO(training_data, testing_data, sample_indices=None):
  """Accumulate per-term log-odds log2(P(x_i|spam) / P(x_i|ham)) over e-mails.

  Reads the module-level class totals `spam_counter` and `ham_counter`.

  Parameters
  ----------
  training_data : DataFrame
      One row per term, with 'spam_count' and 'ham_count' columns.
  testing_data : DataFrame
      One row per e-mail; columns are matched by label against the index of
      `training_data`, and rows are looked up by label with `.loc`.
  sample_indices : iterable, optional
      Row labels of `testing_data` to accumulate over. Defaults to the
      module-level `predicted_true_spam`.

  Returns
  -------
  numpy.ndarray
      One accumulated log-odds value per row of `training_data`.
  """
  if sample_indices is None:
    sample_indices = predicted_true_spam

  # The term probabilities do not depend on the e-mail: compute them once
  p_spam_i = training_data['spam_count']/spam_counter
  p_ham_i = training_data['ham_count']/ham_counter

  # Sized from the data instead of a hard-coded 10 terms
  LO = np.zeros(len(training_data))
  for i in sample_indices:
    sample = testing_data.loc[i]
    # p ** x is p when the term is present (x = 1) and 1 when it is absent (x = 0)
    LO += np.log2((p_spam_i ** sample)/(p_ham_i ** sample)).to_numpy()
  return LO

In [None]:
# Work on a new frame: assigning into ten_largest itself would change the
# shared top-10 frame and add another +1 every time this cell is re-run.
# NOTE(review): the counts in ten_largest were already smoothed in the IG
# cell, so this +1 looks like a second round of smoothing - confirm it is intended.
training_data = ten_largest.assign(spam_count=ten_largest['spam_count'] + 1,
                                   ham_count=ten_largest['ham_count'] + 1)
testing_data = testing_df[ten_largest.index]
# Test e-mails that are spam (label 0) and that the baseline also classified as spam
predicted_true_spam = np.intersect1d(spam_classified,np.where(np.array(test_labels)==0))
testing_data = testing_data.iloc[predicted_true_spam]

LO = calculate_LO(training_data, testing_data)
# Same accumulation on the inverted features (term absent instead of present)
LO_compliment = calculate_LO(training_data, 1-testing_data)

Confirming all $LO(x_i)$ values are positive

In [None]:
print(*zip(LO,LO>=0))

(0.0, True) (83.74383707620471, True) (79.70818938184435, True) (0.0, True) (0.0, True) (38.342190746372616, True) (84.97894916894892, True) (23.77848954669334, True) (45.73007877092342, True) (23.762492520176533, True)


## Using minimum N words as ADDWORDS and calculating cost. $N\in[1,3]$

Since 3 terms in the LO array equate to 0, we shall use those words as ADDWORDS in all test emails where the term is 0. 

In [None]:
# Since 3 terms are 0 in LO, we shall add these words to the emails.
zero_indices = np.where(LO==0) 
word_indices = ten_largest.index[zero_indices]

# helper
def sub_lists (l):
    """Return every non-empty sub-list of `l` as a list of lists.

    Each element doubles the collection: all subsets built so far are kept,
    followed by a copy of each with the new element appended.
    """
    subsets = [[]]
    for position in range(len(l)):
        element = l[position]
        extended = [subset + [element] for subset in subsets]
        subsets = subsets + extended

    # The first entry is always the empty list; leave it out
    return subsets[1:]
word_indices = sub_lists(word_indices)
  

In [None]:
# Rows of confusion_matrix are the true labels (0 = spam, 1 = ham) and columns
# the predictions, so ravel()[1] counts spam e-mails predicted as ham. sklearn
# calls this cell a "false positive"; from the spam filter's point of view it
# is a missed spam, hence the False_Negatives_* column names.
false_positive = confusion_matrix(np.array(test_labels), baseline_predicted).ravel()[1]

# Collect one record per ADDWORDS combination and build the frame once
# (DataFrame.append was removed in pandas 2.0).
adversarial_records = []
for indices in word_indices:
  # Explicit copy: the words are added to a scratch frame, never to testing_df
  temp_test = testing_df[ten_largest.index].copy()
  temp_test[indices] = 1
  predicted = clf.predict(temp_test)
  LO_temp = calculate_LO(training_data, temp_test)
  # Negative totals are clipped to a cost of 0
  cost = max(np.sum(LO_temp),0)
  fp = confusion_matrix(np.array(test_labels), predicted).ravel()[1]
  adversarial_records.append({'ADDWORD_indices':indices, 'COST':cost,
                              'False_Negatives_before':false_positive,
                              'False_Negatives_after':fp})

adversarial_df = pd.DataFrame(adversarial_records,
                              columns=['ADDWORD_indices', 'COST',
                                       'False_Negatives_before', 'False_Negatives_after'])


In [None]:
adversarial_df

Unnamed: 0,ADDWORD_indices,COST,False_Negatives_before,False_Negatives_after
0,[22426],183.019742,9,29
1,[23322],128.373259,9,39
2,"[22426, 23322]",0.0,9,47
3,[41630],230.499505,9,25
4,"[22426, 41630]",35.99513,9,43
5,"[23322, 41630]",0.0,9,47
6,"[22426, 23322, 41630]",0.0,9,49


## Classifier update strategy
To do