In [1]:
# Import dependencies
from bs4 import BeautifulSoup
import nltk
from nltk.tokenize import word_tokenize
import os
import pandas as pd
import string


In [2]:
# Download NLTK resources (if not already downloaded)
nltk.download('punkt')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\brnka\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# Creat a function to extract the words if the html file contains a body tag
def extract_words_from_body(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        html_content = file.read()

        # Parse HTML using BeautifulSoup
        soup = BeautifulSoup(html_content, 'html5lib')

        # Check if <body> tag exists in the HTML content
        body_content = soup.body
        if body_content:
            # Extract text from the body tag
            body_text = body_content.get_text()
        else:
            # Skip processing for files without <body> tag
            print(f"No <body> tag found in {file_path}. Skipping...")
            return None

        # Tokenize words using NLTK and filter out punctuation
        words = word_tokenize(body_text)
        words = [word for word in words if word.isalpha()]  # Exclude non-alphabetic tokens

        return words


In [4]:
# Replace 'path_to_html_files_directory' with the path to your HTML files directory
directory = 'emails'


In [5]:
# Create an empty dictionary to store word counts for each file
word_count_dict = {}


In [6]:
# Loop through HTML files in the directory
for filename in os.listdir(directory):
    if filename.endswith('.html'):
        file_path = os.path.join(directory, filename)

        # Extract words from the body of each HTML file and remove punctuation
        words = extract_words_from_body(file_path)

        # If words is None (indicating no <body> tag), skip to the next file
        if words is None:
            continue

        # Count word occurrences for the current email
        word_count = {}
        for word in set(words):  # Use set to get unique words per email
            word_count[word] = words.count(word)

        # Store word counts for the current email in word_count_dict
        word_count_dict[filename] = word_count



In [7]:
# Create a DataFrame from word_count_dict
df = pd.DataFrame.from_dict(word_count_dict, orient='index')
df.fillna(0, inplace=True)  # Fill NaN values with 0



In [8]:
# Add a 'Spam Indicator' column filled with zeros
df['Spam Indicator'] = 0


In [9]:
# Change 'Spam Indicator' column value to 1 if the filename ends with 'spam.html'
df.loc[df.index.str.endswith('spam.html'), 'Spam Indicator'] = 1

In [10]:
# List of specific words to omit (Words are case sensitive)
words_to_omit = ['brian', 'Brian', 'brnkath', 'kabr0501']  # Add the words you want to omit here

# Filter words that are less than or equal to 20 characters and not in the list of words to omit
filtered_words = [word for word in df.columns[1:-1] if len(word) <= 20 and word not in words_to_omit]

# Summing up the word counts across all emails for filtered words
word_counts_total = df[filtered_words].sum(axis=0)

# Sort the words based on their total counts in descending order and select the top 5000
top_5000_words = word_counts_total.nlargest(5000).index

# Filter the DataFrame to keep only the top 5000 columns and include the first and last columns
df_top_5000 = df.iloc[:, [0] + [df.columns.get_loc(col) for col in top_5000_words] + [-1]]

In [11]:
# Reset the index of the DataFrame to ascending numbers
df_top_5000.reset_index(drop=True, inplace=True)

# Set index to start at 1
df_top_5000.index += 1

# Convert columns to integer type
df_top_5000 = df_top_5000.astype(int)

In [12]:
df_top_5000.columns.unique()

Index(['Discover', 'the', 'to', 'of', 'and', 'a', 'in', 'important', 'https',
       'for',
       ...
       'Jonathan', 'ble', 'Version', 'yields', 'silver', 'wildfires',
       'accredited', 'ther', 'mounting', 'Spam Indicator'],
      dtype='object', length=5002)

In [13]:
# Display the DataFrame
df_top_5000.head()

Unnamed: 0,Discover,the,to,of,and,a,in,important,https,for,...,Jonathan,ble,Version,yields,silver,wildfires,accredited,ther,mounting,Spam Indicator
1,2,30,25,23,29,16,13,0,0,9,...,0,0,0,0,0,0,0,0,0,1
2,2,30,27,23,24,23,14,33,0,15,...,0,0,0,0,0,0,0,0,0,1
3,2,24,19,21,18,13,10,0,0,8,...,0,0,0,0,0,0,0,0,0,1
4,3,26,22,25,22,16,14,0,0,8,...,0,0,0,0,0,0,0,0,0,1
5,1,91,57,65,70,40,39,50,39,25,...,0,0,0,2,0,0,0,0,0,0


In [14]:
# Save the DataFrame to a CSV file in the 'outputs' directory
output_directory = 'outputs'
if not os.path.exists(output_directory):
    os.makedirs(output_directory)

output_file_path = os.path.join(output_directory, 'emails_df.csv')
df_top_5000.to_csv(output_file_path, index=False)

print(f"DataFrame saved to {output_file_path}")

DataFrame saved to outputs\emails_df.csv
