In [5]:
import numpy as np
import pandas as pd
from collections import Counter
from random import randint

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler, normalize

In [7]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Vectorization
This function takes a pandas df of subject + email content and vectorizes them using a bag-of-words approach

Output is a df of size (n, m) where n is the number of datapoints and m is the size of the vocabulary




## Vectorization Function

In [None]:
def vectorize(df, vocab):
    
    # make initialized dict (words with all counts = 0)
    init_dict = dict()
    for w,count in vocab:
        init_dict[w] = 0

    # covert data to counts of word frequencies
    BOW_vectors = []
    for i, row in df.iterrows():
        row_dict = init_dict.copy()
        words = row[0].split()
        for word in words:
            if word in row_dict.keys():
                row_dict[word] += 1
        BOW_vectors.append(list(row_dict.values()))

    BOW = pd.DataFrame(BOW_vectors)
    return BOW

def make_vocabulary(df):
    vocab = set()
    for i, row in df.iterrows():
        words = row[0].split()

        for w in words:
            vocab.add(w)
    return list(vocab) # vocabulary as 1-row list of words

def most_common_vocab(df):
    all_words = []       
    for i, row in df.iterrows():
        words = row[0].split()
        all_words += words
    dictionary = Counter(all_words)
    dictionary_to_alter = dictionary.copy()

    for item in dictionary.keys():
        if item.isalpha() == False: 
            del dictionary_to_alter[item]
        elif len(item) == 1:
            del dictionary_to_alter[item]

    dictionary = dictionary_to_alter.copy().most_common(6000)
    return dictionary



## Vectorization Example

In [None]:
example = ['this is an email', 'this is spam', 'how are you doing doing doing doing', 'snake oil for sale', 'this is an widget']

example_pd = pd.DataFrame({"Content": example})

print("example data:\n")
print(example_pd)
print("\nvocabulary:\n")
print(make_vocabulary(example_pd))
print("\nBag of words output (vectorization), dims: (n x len(vocab)):\n")
vectorize(example_pd, most_common_vocab(example_pd))

example data:

                               Content
0                     this is an email
1                         this is spam
2  how are you doing doing doing doing
3                   snake oil for sale
4                    this is an widget

vocabulary:

['how', 'for', 'widget', 'email', 'an', 'doing', 'sale', 'snake', 'is', 'this', 'oil', 'you', 'spam', 'are']

Bag of words output (vectorization), dims: (n x len(vocab)):



Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,0,1,1,1,1,0,0,0,0,0,0,0,0,0
1,0,1,1,0,0,1,0,0,0,0,0,0,0,0
2,4,0,0,0,0,0,1,1,1,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,1,1,1,1,0
4,0,1,1,1,0,0,0,0,0,0,0,0,0,1


# Importing Dataset and Vectorizing

In [4]:
# Put email content into pd dataframe to pass into vectorization function
dataset_path = "/content/drive/MyDrive/6220 Project/SpamDatasets/processed_data/fully_combined_data.csv"
df = pd.read_csv(dataset_path, names=['Content', 'Label'])
emails_df = df.iloc[:, [0]]
# print(emails_df)

In [5]:
labels = df.iloc[:, [1]]
# print(labels)

## Find unique vocabulary list

In [None]:
vocab = make_vocabulary(emails_df)
print(len(vocab))

157312


In [None]:
most_common_vocab_list = most_common_vocab(emails_df)
# print(most_common_vocab_list)

### Vectorize based on most common words and then save to Drive

In [None]:
most_common_path = "/content/drive/MyDrive/6220 Project/SpamDatasets/processed_data/vectorized_most_common.csv"

In [None]:
most_common_vectorized_df = vectorize(emails_df, most_common_vocab_list)

In [None]:
most_common_vectorized_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,5960,5961,5962,5963,5964,5965,5966,5967,5968,5969,5970,5971,5972,5973,5974,5975,5976,5977,5978,5979,5980,5981,5982,5983,5984,5985,5986,5987,5988,5989,5990,5991,5992,5993,5994,5995,5996,5997,5998,5999
0,0,0,0,0,0,8,0,0,0,0,0,0,3,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,1,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,2,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33747,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,2,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
33748,4,0,0,0,0,4,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,1,0,1,0,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
33749,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
33750,0,0,4,0,0,0,0,0,2,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [None]:
most_common_vectorized_df.to_csv(most_common_path, mode='w', header=True)

## Append partitions to csv file in Drive
Doing this to avoid losing data whenever the runtime crashes

In [None]:
path = "/content/drive/MyDrive/6220 Project/SpamDatasets/processed_data/vectorized_data.csv"

In [None]:
test_partition = vectorize(emails_df.iloc[:3000], vocab)
print(test_partition.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Columns: 157312 entries, 0 to 157311
dtypes: int64(157312)
memory usage: 3.5 GB
None


In [None]:
partition_1 = vectorize(emails_df.iloc[:3000], vocab)
partition_1.to_csv(path, mode='a', header=True)

In [None]:
partition_2 = vectorize(emails_df.iloc[3000:6000], vocab)
partition_2.to_csv(path, mode='a', header=True)

In [None]:
partition_3 = vectorize(emails_df.iloc[6000:9000], vocab)
partition_3.to_csv(path, mode='a', header=True)

In [None]:
partition_4 = vectorize(emails_df.iloc[9000:12000], vocab)
partition_4.to_csv(path, mode='a', header=True)

In [None]:
partition_5 = vectorize(emails_df.iloc[12000:15000], vocab)
partition_5.to_csv(path, mode='a', header=True)

In [None]:
partition_6 = vectorize(emails_df.iloc[15000:18000], vocab)
partition_6.to_csv(path, mode='a', header=True)

In [None]:
partition_7 = vectorize(emails_df.iloc[18000:21000], vocab)
partition_7.to_csv(path, mode='a', header=True)

In [None]:
partition_8 = vectorize(emails_df.iloc[21000:24000], vocab)
partition_8.to_csv(path, mode='a', header=True)

In [None]:
partition_9 = vectorize(emails_df.iloc[24000:27000], vocab)
partition_9.to_csv(path, mode='a', header=True)

In [None]:
partition_10 = vectorize(emails_df.iloc[27000:30000], vocab)
partition_10.to_csv(path, mode='a', header=True)

In [None]:
# did this make it to the csv file?
partition_11 = vectorize(emails_df.iloc[30000:33000], vocab)
partition_11.to_csv(path, mode='a', header=True)

In [None]:
# did this make it to the csv file?
partition_12 = vectorize(emails_df.iloc[33000:] ,vocab)
partition_12.to_csv(path, mode='a', header=True)
#done up to here

In [None]:
path = "/content/drive/MyDrive/6220 Project/SpamDatasets/processed_data/vectorized_data.csv"

In [None]:
# make df out of CSV
# get 500 random columns
cols = []
for _ in range(7000):
	cols.append(randint(0, 157312))
vectorized = pd.read_csv(path, usecols=cols)
# sum across the rows (df.sum(axis=1))
# sort (retain indices) and get 3000 largest occurrences
# filter original df based on the 3000 indices

# vectorized_df = dd.read_csv(path, sample=100000, dtype={'Unnamed: 0': 'float64'})
# just added this dtype thingie.. try this out and see what happens

In [None]:
print(vectorized)
counts = vectorized.sum(axis=0).sort_values()
print(counts)

       30  214  305  306  355  ...  156822  156886  157239  157244  157295
0       0    0    0    0    0  ...       0       0       0       0       0
1       0    0    0    0    0  ...       0       0       0       0       0
2       0    0    0    0    0  ...       0       0       0       0       0
3       0    0    0    0    0  ...       0       0       0       0       0
4       0    0    0    0    0  ...       0       0       0       0       0
...    ..  ...  ...  ...  ...  ...     ...     ...     ...     ...     ...
33758   0    0    0    0    0  ...       0       0       0       0       0
33759   0    0    0    0    0  ...       0       0       0       0       0
33760   0    0    0    0    0  ...       0       0       0       0       0
33761   0    0    0    0    0  ...       0       0       0       0       0
33762   0    0    0    0    0  ...       0       0       0       0       0

[33763 rows x 1994 columns]
30            332
214          2359
305          3358
306          3420

In [None]:
high_counts = counts[counts >= 900000]
print(high_counts)

81835      900196
81943      901394
82013      902149
82377      906150
82486      907349
           ...   
156822    1725046
156886    1725748
157239    1729648
157244    1729692
157295    1730247
Length: 934, dtype: int64


# Train Test Split

In [None]:
most_common_vectorized_df = pd.read_csv(most_common_path, index_col=0)

In [None]:
most_common_vectorized_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,5960,5961,5962,5963,5964,5965,5966,5967,5968,5969,5970,5971,5972,5973,5974,5975,5976,5977,5978,5979,5980,5981,5982,5983,5984,5985,5986,5987,5988,5989,5990,5991,5992,5993,5994,5995,5996,5997,5998,5999
0,0,0,0,0,0,8,0,0,0,0,0,0,3,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,1,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,2,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33747,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,2,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
33748,4,0,0,0,0,4,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,1,0,1,0,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
33749,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
33750,0,0,4,0,0,0,0,0,2,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [None]:
# data is in most_common_vectorized_df, labels are in labels
labels = labels.squeeze()
X_train, X_test, y_train, y_test = train_test_split(most_common_vectorized_df, labels, test_size=0.4)
print(X_train, X_test, y_train, y_test)

       0  1  2  3  4  5  6  7  ...  5992  5993  5994  5995  5996  5997  5998  5999
1381   0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0     0     0
6700   0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0     0     0
22436  1  0  0  0  0  0  0  0  ...     0     0     0     0     0     0     0     0
9897   0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0     0     0
20846  0  0  0  0  0  0  2  0  ...     0     0     0     0     0     0     0     0
...   .. .. .. .. .. .. .. ..  ...   ...   ...   ...   ...   ...   ...   ...   ...
27809  0  0  0  0  1  0  1  0  ...     0     0     0     0     0     0     0     0
1478   1  0  0  0  0  1  0  0  ...     0     0     0     0     0     0     0     0
23263  0  0  0  0  1  0  3  0  ...     0     0     0     0     0     0     0     0
7050   2  0  0  0  0  2  0  0  ...     0     0     0     0     0     0     0     0
35     1  0  0  0  0  1  0  0  ...     0     0     0     0     0     0     0     0

[20

In [None]:
X_combined = pd.concat([X_train, X_test])
y_combined = pd.concat([y_train, y_test])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,5960,5961,5962,5963,5964,5965,5966,5967,5968,5969,5970,5971,5972,5973,5974,5975,5976,5977,5978,5979,5980,5981,5982,5983,5984,5985,5986,5987,5988,5989,5990,5991,5992,5993,5994,5995,5996,5997,5998,5999
1381,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6700,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
22436,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
9897,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,1,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
20846,0,0,0,0,0,0,2,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,2,0,1,0,2,1,0,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24497,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
21251,1,0,0,0,0,2,2,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
29010,2,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2294,1,0,0,0,0,1,0,0,1,0,2,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [None]:
print(X_combined)
print(y_combined)

       0  1  2  3  4  5  6  7  ...  5992  5993  5994  5995  5996  5997  5998  5999
1381   0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0     0     0
6700   0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0     0     0
22436  1  0  0  0  0  0  0  0  ...     0     0     0     0     0     0     0     0
9897   0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0     0     0
20846  0  0  0  0  0  0  2  0  ...     0     0     0     0     0     0     0     0
...   .. .. .. .. .. .. .. ..  ...   ...   ...   ...   ...   ...   ...   ...   ...
24497  1  0  0  0  0  0  0  0  ...     0     0     0     0     0     0     0     0
21251  1  0  0  0  0  2  2  0  ...     0     0     0     0     0     0     0     0
29010  2  0  0  0  0  0  0  0  ...     0     0     0     0     0     0     0     0
2294   1  0  0  0  0  1  0  0  ...     0     0     0     0     0     0     0     0
1287   3  0  0  0  0  4  0  0  ...     0     0     0     0     0     0     0     0

[33

# MultinomialNB Model

In [None]:
# X_train, X_test, y_train, y_test
model2 = MultinomialNB()

model2.fit(X_train,y_train)

result2 = model2.predict(X_test)

print(confusion_matrix(y_test,result2))
print(model2.score(X_test, y_test))

[[5343 1072]
 [ 477 6609]]
0.885267757943856


# GaussianNB Model

In [None]:
model1 = GaussianNB()

model1.fit(X_train,y_train)

result1 = model1.predict(X_test)

print(confusion_matrix(y_test,result1))
print(model1.score(X_test, y_test))

[[5367 1048]
 [ 960 6126]]
0.8512702762758314


# LinearSVC Model

In [None]:
model = LinearSVC(dual=False)

model.fit(X_train,y_train)

result = model.predict(X_test)

print(confusion_matrix(y_test,result))
print(model.score(X_test, y_test))



[[5610  805]
 [ 596 6490]]
0.8962299088956374


In [None]:
#linearsvc
overall_result = model.predict(X_combined)
#gaussianNB
overall_result1 = model1.predict(X_combined)
#multinomialNB
overall_result2 = model2.predict(X_combined)
overall_result_series = pd.Series(overall_result, index=y_combined.index)
overall_result1_series = pd.Series(overall_result1, index=y_combined.index)
overall_result2_series = pd.Series(overall_result2, index=y_combined.index)

result_df = pd.concat([overall_result_series, overall_result1_series, overall_result2_series, y_combined], axis=1)
result_df

Unnamed: 0,0,1,2,Label
1381,1,1,1,1
6700,0,0,0,0
22436,0,0,0,0
9897,1,1,1,1
20846,1,1,1,1
...,...,...,...,...
24497,1,1,1,0
21251,1,1,1,1
29010,1,1,1,1
2294,1,1,1,1


In [None]:
results_path = "/content/drive/MyDrive/6220 Project/SpamDatasets/processed_data/sklearn_results.csv"
result_df.to_csv(results_path, mode='w', header=True)

# Export Models

In [None]:
import joblib

In [None]:
multi_path = "/content/drive/MyDrive/6220 Project/exported_models/multinomial_nb.sav"

In [None]:
gauss_path = "/content/drive/MyDrive/6220 Project/exported_models/gaussian_nb.sav"

In [None]:
lin_path = "/content/drive/MyDrive/6220 Project/exported_models/linear_svc.sav"

In [None]:
joblib.dump(model, lin_path)

['/content/drive/MyDrive/6220 Project/exported_models/linear_svc.sav']

In [None]:
joblib.dump(model1, gauss_path)

['/content/drive/MyDrive/6220 Project/exported_models/gaussian_nb.sav']

In [None]:
joblib.dump(model2, multi_path)

['/content/drive/MyDrive/6220 Project/exported_models/multinomial_nb.sav']

# How to Import Models w/joblib


```
lin_path = "/content/drive/MyDrive/6220 Project/exported_models/linear_svc.sav"
linear_model = joblib.load(lin_path)
```

