In [2]:
import pandas as pd
import os
import re
from sklearn.feature_extraction.text import CountVectorizer
from wordcloud import WordCloud
import matplotlib.pyplot as plt

In [3]:
def read_spam():
    """Load the spam corpus from ``./enron1/spam``.

    Returns whatever ``read_category`` produces for that directory: a list of
    email records labelled with the category ``'spam'``.
    """
    return read_category(category='spam', directory='./enron1/spam')

def read_ham():
    """Load the ham (legitimate mail) corpus from ``./enron1/ham``.

    Returns whatever ``read_category`` produces for that directory: a list of
    email records labelled with the category ``'ham'``.
    """
    return read_category(category='ham', directory='./enron1/ham')

def read_category(category, directory, encoding=None):
    """Read every ``.txt`` file in ``directory`` into a list of email records.

    Parameters
    ----------
    category : str
        Label stored under the ``'category'`` key of every record.
    directory : str
        Folder to scan; only entries whose name ends with ``.txt`` are read.
    encoding : str or None, optional
        Text encoding handed to ``open``. ``None`` (the default) keeps the
        platform default encoding, which is what this function always used.

    Returns
    -------
    list of dict
        One ``{'name', 'content', 'category'}`` dict per file that could be
        read, ordered by file name.

    Notes
    -----
    Files that cannot be opened or decoded are reported on stdout and skipped
    rather than aborting the whole load. Only I/O and decoding errors are
    treated this way, so ``KeyboardInterrupt`` and programming errors still
    propagate.
    """
    emails = []
    # os.listdir returns entries in arbitrary order; sorting makes the row
    # order of the resulting DataFrame reproducible across runs and machines.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith(".txt"):
            continue
        path = os.path.join(directory, filename)
        try:
            # Opening is inside the try block too, so an unreadable entry is
            # skipped just like an undecodable one.
            with open(path, 'r', encoding=encoding) as fp:
                content = fp.read()
        except (UnicodeDecodeError, OSError) as exc:
            print(f'skipped {filename}: {exc!r}')
            continue
        emails.append({'name': filename, 'content': content, 'category': category})
    return emails


In [None]:
# Spam corpus: raw records first, then the tabular view of the same records.
spam = read_spam()
spam_df = pd.DataFrame.from_records(spam)

# Ham corpus, handled the same way.
ham = read_ham()
ham_df = pd.DataFrame.from_records(ham)


In [5]:
# Stack the spam rows on top of the ham rows and renumber the index from zero.
df = pd.concat(objs=[spam_df, ham_df], axis=0, ignore_index=True)

In [6]:
def preprocessor(e):
    """Normalise raw email text before tokenisation.

    Every character that is not an ASCII letter is replaced by a single space
    (one space per character, so runs are not collapsed), and the result is
    lower-cased.
    """
    letters_only = re.sub('[^A-Za-z]', ' ', e)
    return letters_only.lower()

# Words excluded from the word-frequency analysis (passed to CountVectorizer
# as `stop_words`).
#
# Every entry is a single lowercase word. `preprocessor` turns every
# non-letter into a space, so a tokenised word can never contain a space.
# The former entry 'no one' could therefore never match anything; it is
# dropped because 'no' and 'one' are both listed on their own.
# Each word appears once, under the first category it belongs to.
custom_stop_words = [
    # Pronouns
    'i', 'you', 'he', 'she', 'it', 'we', 'they', 'me', 'him', 'her', 'us', 'them',
    'my', 'your', 'his', 'its', 'our', 'their', 'mine', 'yours', 'hers', 'ours', 'theirs',
    'myself', 'yourself', 'himself', 'herself', 'itself', 'ourselves', 'yourselves', 'themselves',
    'this', 'that', 'these', 'those', 'who', 'whom', 'whose', 'which', 'what',
    'anyone', 'everyone', 'someone', 'anybody', 'everybody', 'somebody', 'nobody',
    'anything', 'everything', 'something', 'nothing', 'all', 'each', 'few', 'many', 'none', 'some', 'one',

    # Prepositions
    'about', 'above', 'across', 'after', 'against', 'along', 'amid', 'among', 'around', 'as', 'at',
    'before', 'behind', 'below', 'beneath', 'beside', 'besides', 'between', 'beyond', 'but', 'by',
    'concerning', 'considering', 'despite', 'down', 'during', 'except', 'for', 'from', 'in', 'inside', 'into',
    'like', 'near', 'of', 'off', 'on', 'onto', 'opposite', 'out', 'outside', 'over', 'past', 'regarding',
    'round', 'since', 'through', 'throughout', 'till', 'to', 'toward', 'under', 'underneath', 'until', 'up',
    'upon', 'with', 'within', 'without',

    # Articles
    'the', 'a', 'an',

    # Conjunctions ('but', 'for' and 'since' are already listed under Prepositions)
    'and', 'or', 'nor', 'so', 'yet', 'although', 'because', 'unless', 'while',

    # Auxiliary Verbs
    'be', 'am', 'is', 'are', 'was', 'were', 'been', 'being',
    'have', 'has', 'had', 'having',
    'do', 'does', 'did', 'doing',
    'can', 'could', 'shall', 'should', 'will', 'would', 'may', 'might', 'must', 'ought',

    # Common Action Verbs (Infinitive and Present)
    'go', 'come', 'get', 'make', 'take', 'give', 'say', 'know', 'see', 'think', 'want', 'use',
    'find', 'tell', 'ask', 'work', 'seem', 'feel', 'try', 'leave', 'call',

    # Common Action Verbs (Past and Past Participle Forms)
    'went', 'came', 'got', 'made', 'took', 'gave', 'said', 'knew', 'saw', 'thought', 'wanted',
    'used', 'found', 'told', 'asked', 'worked', 'seemed', 'felt', 'tried', 'left', 'called',

    # Common words
    "subject", "not", "no", "more", "here", "any", "if", "only", "please",
]


In [None]:
# Bag-of-words counts for the spam corpus, using the notebook's own text
# normalisation and stop-word list.
vectorizer_spam = CountVectorizer(
    preprocessor=preprocessor,
    stop_words=custom_stop_words,
)
X_spam = vectorizer_spam.fit_transform(spam_df['content'])

# Sum over documents (axis 0) to get one total per vocabulary term, then pair
# each term with its flattened total.
total_word_frequencies_spam = X_spam.sum(axis=0)
word_frequency_dict_spam = {
    term: total
    for term, total in zip(
        vectorizer_spam.get_feature_names_out(),
        total_word_frequencies_spam.A1,
    )
}

# Highest total first.
sorted_word_frequencies_spam = sorted(
    word_frequency_dict_spam.items(),
    key=lambda entry: entry[1],
    reverse=True,
)

top_n = 20
print(f"Top {top_n} words in Spam Emails:")
for word, frequency in sorted_word_frequencies_spam[:top_n]:
    print(f"{word}: {frequency}")

# The word cloud is built from the full frequency table, not only the top N.
wordcloud_spam = WordCloud(
    width=800,
    height=400,
    background_color='white',
).generate_from_frequencies(word_frequency_dict_spam)

plt.figure(figsize=(10, 5))
plt.imshow(wordcloud_spam, interpolation='bilinear')
plt.title('Word Cloud for Spam Emails')
plt.axis('off')
plt.show()


In [None]:
# Bag-of-words counts for the ham corpus, using the notebook's own text
# normalisation and stop-word list.
vectorizer_ham = CountVectorizer(
    preprocessor=preprocessor,
    stop_words=custom_stop_words,
)
X_ham = vectorizer_ham.fit_transform(ham_df['content'])

# Sum over documents (axis 0) to get one total per vocabulary term, then pair
# each term with its flattened total.
total_word_frequencies_ham = X_ham.sum(axis=0)
word_frequency_dict_ham = {
    term: total
    for term, total in zip(
        vectorizer_ham.get_feature_names_out(),
        total_word_frequencies_ham.A1,
    )
}

# Highest total first.
sorted_word_frequencies_ham = sorted(
    word_frequency_dict_ham.items(),
    key=lambda entry: entry[1],
    reverse=True,
)

top_n = 20
print(f"Top {top_n} words in Ham Emails:")
for word, frequency in sorted_word_frequencies_ham[:top_n]:
    print(f"{word}: {frequency}")

# The word cloud is built from the full frequency table, not only the top N.
wordcloud_ham = WordCloud(
    width=800,
    height=400,
    background_color='white',
).generate_from_frequencies(word_frequency_dict_ham)

plt.figure(figsize=(10, 5))
plt.imshow(wordcloud_ham, interpolation='bilinear')
plt.title('Word Cloud for Ham Emails')
plt.axis('off')
plt.show()


In [None]:
# Number of emails per category label in the combined frame.
class_distribution = df['category'].value_counts()

print("Class Distribution:")
print(class_distribution)

# One bar per category, with horizontal tick labels.
class_distribution.plot.bar(color=['skyblue', 'lightcoral'])
plt.xlabel('Category')
plt.ylabel('Number of Emails')
plt.title('Class Distribution of Emails')
plt.xticks(rotation=0)
plt.show()

In [None]:
# Ten most frequent spam terms, split into parallel label and count tuples.
top_spam_words = sorted_word_frequencies_spam[:10]
spam_words, spam_frequencies = zip(*top_spam_words)

plt.figure(figsize=(10, 5))
plt.bar(x=spam_words, height=spam_frequencies, color='lightcoral')
plt.xlabel('Words')
plt.ylabel('Frequency')
plt.title('Top 10 Words in Spam Emails')
# Tilt the tick labels so longer words do not overlap.
plt.xticks(rotation=45)
plt.show()

In [None]:
# Ten most frequent ham terms, split into parallel label and count tuples.
top_ham_words = sorted_word_frequencies_ham[:10]
ham_words, ham_frequencies = zip(*top_ham_words)

plt.figure(figsize=(10, 5))
plt.bar(x=ham_words, height=ham_frequencies, color='skyblue')
plt.xlabel('Words')
plt.ylabel('Frequency')
plt.title('Top 10 Words in Ham Emails')
# Tilt the tick labels so longer words do not overlap.
plt.xticks(rotation=45)
plt.show()