In [1]:
import numpy as np
import pandas as pd
from collections import Counter
from random import randint
import sklearn

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler, normalize

# Vectorization
This function takes a pandas df of subject + email content and vectorizes them using a bag-of-words approach

Output is a df of size (n, m) where n is the number of datapoints and m is the size of the vocabulary




## Vectorization Function

In [3]:
def vectorize(df, vocab):
    
    # make initialized dict (words with all counts = 0)
    init_dict = dict()
    for w,count in vocab:
        init_dict[w] = 0

    # covert data to counts of word frequencies
    BOW_vectors = []
    for i, row in df.iterrows():
        row_dict = init_dict.copy()
        words = row[0].split()
        for word in words:
            if word in row_dict.keys():
                row_dict[word] += 1
        BOW_vectors.append(list(row_dict.values()))

    BOW = pd.DataFrame(BOW_vectors)
    return BOW

def make_vocabulary(df):
    vocab = set()
    for i, row in df.iterrows():
        words = row[0].split()

        for w in words:
            vocab.add(w)
    return list(vocab) # vocabulary as 1-row list of words

# gets the 6000 most common vocabulary words from the full vocabulary set
def most_common_vocab(df):
    all_words = []       
    for i, row in df.iterrows():
        words = row[0].split()
        all_words += words
    dictionary = Counter(all_words)
    dictionary_to_alter = dictionary.copy()

    for item in dictionary.keys():
        if item.isalpha() == False: 
            del dictionary_to_alter[item]
        elif len(item) == 1:
            del dictionary_to_alter[item]

    dictionary = dictionary_to_alter.copy().most_common(6000)
    return dictionary



## Vectorization Example

In [4]:
example = ['this is an email', 'this is spam', 'how are you doing doing doing doing', 'snake oil for sale', 'this is an widget']

example_pd = pd.DataFrame({"Content": example})

print("example data:\n")
print(example_pd)
print("\nvocabulary:\n")
print(make_vocabulary(example_pd))
print("\nBag of words output (vectorization), dims: (n x len(vocab)):\n")
vectorize(example_pd, most_common_vocab(example_pd))

example data:

                               Content
0                     this is an email
1                         this is spam
2  how are you doing doing doing doing
3                   snake oil for sale
4                    this is an widget

vocabulary:

['you', 'widget', 'are', 'this', 'doing', 'snake', 'sale', 'for', 'spam', 'is', 'email', 'how', 'an', 'oil']

Bag of words output (vectorization), dims: (n x len(vocab)):



Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,0,1,1,1,1,0,0,0,0,0,0,0,0,0
1,0,1,1,0,0,1,0,0,0,0,0,0,0,0
2,4,0,0,0,0,0,1,1,1,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,1,1,1,1,0
4,0,1,1,1,0,0,0,0,0,0,0,0,0,1


# Importing Dataset and Vectorizing

In [5]:
# Put email content into pd dataframe to pass into vectorization function
dataset_path = "../../data/fully_combined_data.csv"
df = pd.read_csv(dataset_path, names=['Content', 'Label'])
emails_df = df.iloc[:, [0]]
# print(emails_df)

In [6]:
labels = df.iloc[:, [1]]
# print(labels)

## Find most common vocabulary list

In [7]:
most_common_vocab_list = most_common_vocab(emails_df)
# print(most_common_vocab_list)

### Vectorize based on most common words and then save to Drive

In [8]:
most_common_path = "../../data/vectorized_most_common.csv"

In [9]:
most_common_vectorized_df = vectorize(emails_df, most_common_vocab_list)

In [10]:
most_common_vectorized_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5990,5991,5992,5993,5994,5995,5996,5997,5998,5999
0,0,0,0,0,0,8,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,2,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,2,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33747,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
33748,4,0,0,0,0,4,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
33749,0,0,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
33750,0,0,4,0,0,0,0,0,2,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
most_common_vectorized_df.to_csv(most_common_path, mode='w', header=True)

# Train Test Split

In [13]:
most_common_vectorized_df = pd.read_csv(most_common_path, index_col=0)

In [14]:
most_common_vectorized_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5990,5991,5992,5993,5994,5995,5996,5997,5998,5999
0,0,0,0,0,0,8,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,2,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,2,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33747,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
33748,4,0,0,0,0,4,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
33749,0,0,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
33750,0,0,4,0,0,0,0,0,2,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
# data is in most_common_vectorized_df, labels are in labels
labels = labels.squeeze()
X_train, X_test, y_train, y_test = train_test_split(most_common_vectorized_df, labels, test_size=0.4)
print(X_train, X_test, y_train, y_test)

        0   1  2  3  4  5  6  7  8  9  ...  5990  5991  5992  5993  5994  \
22846   0   0  0  0  0  0  0  0  0  0  ...     0     0     0     0     0   
15725   0   0  0  0  6  0  0  0  0  0  ...     0     0     0     0     0   
22988  12  18  0  4  1  0  1  3  1  5  ...     0     0     0     0     0   
1375    1   0  0  0  0  0  0  0  0  0  ...     0     0     0     0     0   
2600    1   0  1  0  0  0  1  0  1  0  ...     0     0     0     0     0   
...    ..  .. .. .. .. .. .. .. .. ..  ...   ...   ...   ...   ...   ...   
3584    6   9  0  8  0  0  3  6  0  8  ...     0     0     0     0     0   
29596   0   0  0  0  0  0  0  0  2  0  ...     0     0     0     0     0   
22920   0   0  0  0  0  0  1  0  0  0  ...     0     0     0     0     0   
6590    1   0  0  0  0  1  0  0  0  0  ...     0     0     0     0     0   
9330    2   0  0  0  0  2  0  0  1  0  ...     0     0     0     0     0   

       5995  5996  5997  5998  5999  
22846     0     0     0     0     0  
15725     0

In [16]:
X_combined = pd.concat([X_train, X_test])
y_combined = pd.concat([y_train, y_test])

In [17]:
print(X_combined)
print(y_combined)

        0   1  2  3  4  5  6  7  8  9  ...  5990  5991  5992  5993  5994  \
22846   0   0  0  0  0  0  0  0  0  0  ...     0     0     0     0     0   
15725   0   0  0  0  6  0  0  0  0  0  ...     0     0     0     0     0   
22988  12  18  0  4  1  0  1  3  1  5  ...     0     0     0     0     0   
1375    1   0  0  0  0  0  0  0  0  0  ...     0     0     0     0     0   
2600    1   0  1  0  0  0  1  0  1  0  ...     0     0     0     0     0   
...    ..  .. .. .. .. .. .. .. .. ..  ...   ...   ...   ...   ...   ...   
26062   1   0  0  0  0  0  0  0  1  0  ...     0     0     0     0     0   
23748   0   0  0  0  0  5  5  0  1  0  ...     0     0     0     0     0   
1421    0   0  0  0  0  0  1  0  1  0  ...     0     0     0     0     0   
8655    0   0  0  0  0  0  1  0  0  0  ...     0     0     0     0     0   
11094   1   0  0  0  0  0  0  0  0  0  ...     0     0     0     0     0   

       5995  5996  5997  5998  5999  
22846     0     0     0     0     0  
15725     0

# MultinomialNB Model

In [None]:
# X_train, X_test, y_train, y_test
model2 = MultinomialNB()

model2.fit(X_train,y_train)

result2 = model2.predict(X_test)

print(confusion_matrix(y_test,result2))
print(model2.score(X_test, y_test))

[[5343 1072]
 [ 477 6609]]
0.885267757943856


# GaussianNB Model

In [None]:
model1 = GaussianNB()

model1.fit(X_train,y_train)

result1 = model1.predict(X_test)

print(confusion_matrix(y_test,result1))
print(model1.score(X_test, y_test))

[[5367 1048]
 [ 960 6126]]
0.8512702762758314


# LinearSVC Model

In [None]:
model = LinearSVC(dual=False)

model.fit(X_train,y_train)

result = model.predict(X_test)

print(confusion_matrix(y_test,result))
print(model.score(X_test, y_test))



[[5610  805]
 [ 596 6490]]
0.8962299088956374


In [None]:
#linearsvc
overall_result = model.predict(X_combined)
#gaussianNB
overall_result1 = model1.predict(X_combined)
#multinomialNB
overall_result2 = model2.predict(X_combined)
overall_result_series = pd.Series(overall_result, index=y_combined.index)
overall_result1_series = pd.Series(overall_result1, index=y_combined.index)
overall_result2_series = pd.Series(overall_result2, index=y_combined.index)

result_df = pd.concat([overall_result_series, overall_result1_series, overall_result2_series, y_combined], axis=1)
result_df

Unnamed: 0,0,1,2,Label
1381,1,1,1,1
6700,0,0,0,0
22436,0,0,0,0
9897,1,1,1,1
20846,1,1,1,1
...,...,...,...,...
24497,1,1,1,0
21251,1,1,1,1
29010,1,1,1,1
2294,1,1,1,1


In [None]:
results_path = "../ensemble/sklearn_results.csv"
result_df.to_csv(results_path, mode='w', header=True)