# Data Import and Cleaning

In [1]:
import pandas as pd
from os import walk
from os.path import join

# Constants

In [2]:
HAM_1= "SpamData/01_Processing/spam_assassin_corpus/easy_ham_1"
HAM_2= "SpamData/01_Processing/spam_assassin_corpus/easy_ham_2"
SPAM_1= "SpamData/01_Processing/spam_assassin_corpus/spam_1"
SPAM_2= "SpamData/01_Processing/spam_assassin_corpus/spam_2"

SPAM_CAT = 1
HAM_CAT = 0


# Generator Functions

In [3]:
def email_body_generator(path):
    
    # os.walk generates a tuple
    for root, dirnames, filenames in walk(path):
        
        # loop to extract the email body from each file
        for file_name in filenames:
        
            file_path = join(root, file_name)

            with open(file_path, encoding= 'latin-1') as f:

                is_body = False
                lines = []
                
                # new line character seperates email body from other content
                for line in f:
                    if is_body:
                        lines.append(line)
                    elif line == '\n':
                        is_body = True
                        
            email_body = '\n'.join(lines)

            yield file_name, email_body

In [4]:
def df_from_directory(path, classification):
    
    rows = []
    row_names = []
    
    for file_name, email_body in email_body_generator(path):
        
        rows.append({'Message': email_body, 'Category': classification})
        
        row_names.append(file_name)
        
    return pd.DataFrame(rows, index=row_names)


In [5]:
spam_emails = df_from_directory(SPAM_1, SPAM_CAT)
spam_emails = pd.concat([spam_emails, df_from_directory(SPAM_2, SPAM_CAT)], axis=0)   # append method adds both rows and new columns
print(spam_emails.shape)
spam_emails.head()

(1898, 2)


Unnamed: 0,Message,Category
00001.7848dde101aa985090474a91ec93fcf0,"<!DOCTYPE HTML PUBLIC ""-//W3C//DTD HTML 4.0 Tr...",1
00002.d94f1b97e48ed3b553b3508d116e6a09,1) Fight The Risk of Cancer!\n\nhttp://www.adc...,1
00003.2ee33bc6eacdb11f38d052c44819ba6c,1) Fight The Risk of Cancer!\n\nhttp://www.adc...,1
00004.eac8de8d759b7e74154f142194282724,##############################################...,1
00005.57696a39d7d84318ce497886896bf90d,I thought you might like these:\n\n1) Slim Dow...,1


In [6]:
ham_emails = df_from_directory(HAM_1, HAM_CAT)
ham_emails = pd.concat([ham_emails, df_from_directory(HAM_2, HAM_CAT)], axis=0)  
print(ham_emails.shape)
ham_emails.head()

(3901, 2)


Unnamed: 0,Message,Category
00001.7c53336b37003a9286aba55d2945844c,"Date: Wed, 21 Aug 2002 10:54:46 -05...",0
00002.9c4069e25e1ef370c078db7ee85ff9ac,"Martin A posted:\n\nTassos Papadopoulos, the G...",0
00003.860e3c3cee1b42ead714c5c874fe25f7,Man Threatens Explosion In Moscow \n\n\n\nThur...,0
00004.864220c5b6930b209cc287c361c99af1,Klez: The Virus That Won't Die\n\n \n\nAlready...,0
00005.bf27cdeaf0b8c4647ecd61b1d09da613,"> in adding cream to spaghetti carbonara, whi...",0


In [7]:
data = pd.concat([ham_emails, spam_emails], axis=0)
print(data.shape)

(5799, 2)


In [8]:
data['Message'].info()

<class 'pandas.core.series.Series'>
Index: 5799 entries, 00001.7c53336b37003a9286aba55d2945844c to cmds
Series name: Message
Non-Null Count  Dtype 
--------------  ----- 
5799 non-null   object
dtypes: object(1)
memory usage: 90.6+ KB


In [9]:
rows_with_empty_strings = data.loc[data['Message'] == '', :]
rows_with_empty_strings.head()

Unnamed: 0,Message,Category
cmds,,0
cmds,,1
cmds,,1


In [10]:
data.index.get_loc('cmds')

array([False, False, False, ..., False, False,  True])

In [11]:
data = data.drop(['cmds'], axis=0)
data.loc[data['Message'] == '', :]

Unnamed: 0,Message,Category


In [12]:
data['file_name'] = data.index
data = data.reset_index(drop=True)
data.head()

Unnamed: 0,Message,Category,file_name
0,"Date: Wed, 21 Aug 2002 10:54:46 -05...",0,00001.7c53336b37003a9286aba55d2945844c
1,"Martin A posted:\n\nTassos Papadopoulos, the G...",0,00002.9c4069e25e1ef370c078db7ee85ff9ac
2,Man Threatens Explosion In Moscow \n\n\n\nThur...,0,00003.860e3c3cee1b42ead714c5c874fe25f7
3,Klez: The Virus That Won't Die\n\n \n\nAlready...,0,00004.864220c5b6930b209cc287c361c99af1
4,"> in adding cream to spaghetti carbonara, whi...",0,00005.bf27cdeaf0b8c4647ecd61b1d09da613


In [15]:
data.to_json('cleaned_data.json')