In [79]:
import pandas as pd
import tensorflow as tf

from sklearn.model_selection import train_test_split
import sklearn.feature_extraction.text as sk_text

import re

import os
import plotly.express as px
import plotly.graph_objects as go
import numpy as np

In [47]:
def cleanText(text):
    # remove newline
    text = text.strip()
    return re.sub(r"\s+", " ", text)

def readFileContents(filePath):
    with open(filePath, "r", encoding = 'latin-1') as f:
        return cleanText(f.read())
        

In [54]:
def createDataFrame(path = "./"):

    if not os.path.exists(path):
        return None
    
    data = []

    for dirs in  ["ham", "spam"]:

        for file in os.listdir(path + "/" + dirs):
            tmp = dict(label = dirs, text = readFileContents(path + "/" + dirs + '/' + file), 
                spam = 0 if dirs == "ham" else 1)
            data.append(tmp) 
    return pd.DataFrame(data = data, columns=["label", "text", "spam"])

In [55]:
df = createDataFrame("./")
df

Unnamed: 0,label,text,spam
0,ham,From exmh-workers-admin@redhat.com Thu Aug 22 ...,0
1,ham,From Steve_Burt@cursor-system.com Thu Aug 22 1...,0
2,ham,From timc@2ubh.com Thu Aug 22 13:52:59 2002 Re...,0
3,ham,From irregulars-admin@tb.tf Thu Aug 22 14:23:3...,0
4,ham,From exmh-users-admin@redhat.com Thu Aug 22 14...,0
...,...,...,...
3047,spam,From biz2biz2446@Flashmail.com Mon Oct 7 22:42...,1
3048,spam,From cna@insiq.us Tue Oct 8 00:10:39 2002 Retu...,1
3049,spam,From bounce2@u-answer.com Tue Oct 8 11:02:30 2...,1
3050,spam,From beautyinfufuxxxmeb13mxy@aol.com Tue Oct 8...,1


In [56]:
df.to_csv("./spamHamData.csv")

In [57]:
# plot distrubution of message length of messages (histogram)

hist1 = go.Histogram(x = [len(row.text) for ind, row in df[df.label == "spam"].iterrows()],
        nbinsx=10, name = "Spam Length")
hist2 = go.Histogram(x = [len(row.text) for ind, row in df[df.label == "ham"].iterrows()], 
         nbinsx=10, name = "Ham Length")

fig = go.Figure(data = [hist1, hist2])

fig.update_layout(
    title = "Histogram of message length of Spam vs Non spam emails",
    xaxis_title = "Message length",
    yaxis_title = "Count of Range",
    legend_title="Type of Email",
)

fig.show()

In [86]:
fig = go.Figure(
    go.Bar(x = list(set(df.label)), y = df.groupby("label").count().text.tolist())
)

fig.update_layout(
    title = "Distribution of Classes",
    yaxis_title = "Count",
    xaxis_title = "Class"
)

fig.show()

In [71]:
# 20 percent needs to be downsampled
df_spam = df[df.label == "spam"]
df_ham = df[df.label == "ham"]

In [96]:
df_downsampled = df_ham.sample(df_spam.shape[0])
print(df_downsampled.sample(5).text)

2216    From rssfeeds@jmason.org Tue Oct 1 10:36:50 20...
2511    From rssfeeds@jmason.org Wed Oct 9 10:52:55 20...
337     From rpm-list-admin@freshrpms.net Wed Oct 9 10...
2101    From rssfeeds@jmason.org Thu Sep 26 16:43:20 2...
373     From fork-admin@xent.com Mon Aug 26 21:47:40 2...
Name: text, dtype: object


In [99]:
df_bal = df_spam.append(df_downsampled)
df_bal = df_bal.reindex(np.random.permutation(df_bal.index))

In [101]:
df_bal["label"].value_counts()

spam    501
ham     501
Name: label, dtype: int64

In [102]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df['text'], df['spam'], 
        train_size = 0.7) 

In [103]:
print(X_train.head())
print(f"Training set : {X_train.shape}")

234     From fork-admin@xent.com Wed Aug 28 10:50:30 2...
286     From timc@2ubh.com Wed Aug 28 13:55:30 2002 Re...
1063    From exmh-users-admin@redhat.com Tue Sep 10 11...
2694    From donaldbae@purplehotel.com Wed Aug 28 11:0...
109     From fork-admin@xent.com Mon Sep 2 16:22:23 20...
Name: text, dtype: object
Training set : (2136,)
