In [5]:
import pandas as pd
import os

def read_spam():
    category = 'spam'
    directory = './enron1/spam'
    return read_category(category, directory)

def read_ham():
    category = 'ham'
    directory = './enron1/ham'
    return read_category(category, directory)

def read_category(category, directory):
    emails = []
    for filename in os.listdir(directory):
        if not filename.endswith(".txt"):
            continue
        with open(os.path.join(directory, filename), 'r') as fp:
            try:
                content = fp.read()
                emails.append({'name': filename, 'content': content, 'category': category})
            except:
                print(f'skipped {filename}')
    return emails

ham = read_ham()
spam = read_spam()

df = pd.DataFrame.from_records(ham)
df = df.append(pd.DataFrame.from_records(spam))
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
df.head()

skipped 2248.2004-09-23.GP.spam.txt
skipped 2526.2004-10-17.GP.spam.txt
skipped 2698.2004-10-31.GP.spam.txt
skipped 4566.2005-05-24.GP.spam.txt


  df = df.append(pd.DataFrame.from_records(spam))


Unnamed: 0,name,content,category
0,4992.2001-10-18.farmer.ham.txt,Subject: update - supported internet email add...,ham
1,4347.2001-04-23.farmer.ham.txt,Subject: re : noms / actual vol for april 20 t...,ham
2,3229.2001-01-02.farmer.ham.txt,Subject: re : dow trspt .\ni ' m assuming yes ...,ham
3,3766.2005-02-07.GP.spam.txt,Subject: 10 stuff better than v i a g r a\nhey...,spam
4,3097.2000-12-14.farmer.ham.txt,"Subject: re : hpl discrepancy\nrita , please h...",ham


In [6]:
import re

def preprocessor(e):
    return re.sub('[^A-Za-z]', ' ', e).lower()

custom_stop_words = [
    # Pronouns
    'i', 'you', 'he', 'she', 'it', 'we', 'they', 'me', 'him', 'her', 'us', 'them',
    'my', 'your', 'his', 'her', 'its', 'our', 'their', 'mine', 'yours', 'hers', 'ours', 'theirs',
    'myself', 'yourself', 'himself', 'herself', 'itself', 'ourselves', 'yourselves', 'themselves',
    'this', 'that', 'these', 'those', 'who', 'whom', 'whose', 'which', 'that',
    'anyone', 'everyone', 'someone', 'no one', 'anybody', 'everybody', 'somebody', 'nobody',
    'anything', 'everything', 'something', 'nothing', 'all', 'each', 'few', 'many', 'none', 'some', 'one',
    'who', 'whom', 'whose', 'which', 'what',
    
    # Prepositions
    'about', 'above', 'across', 'after', 'against', 'along', 'amid', 'among', 'around', 'as', 'at',
    'before', 'behind', 'below', 'beneath', 'beside', 'besides', 'between', 'beyond', 'but', 'by',
    'concerning', 'considering', 'despite', 'down', 'during', 'except', 'for', 'from', 'in', 'inside', 'into',
    'like', 'near', 'of', 'off', 'on', 'onto', 'opposite', 'out', 'outside', 'over', 'past', 'regarding',
    'round', 'since', 'through', 'throughout', 'till', 'to', 'toward', 'under', 'underneath', 'until', 'up', 
    'upon', 'with', 'within', 'without',
    
    # Articles
    'the', 'a', 'an',
    
    # Conjunctions
    'and', 'but', 'or', 'nor', 'for', 'so', 'yet', 'although', 'because', 'since', 'unless', 'while',

    # Auxiliary Verbs
    'be', 'am', 'is', 'are', 'was', 'were', 'been', 'being',
    'have', 'has', 'had', 'having',
    'do', 'does', 'did', 'doing',
    'can', 'could', 'shall', 'should', 'will', 'would', 'may', 'might', 'must', 'ought',
    
    # Common Action Verbs (Infinitive and Present)
    'go', 'come', 'get', 'make', 'take', 'give', 'say', 'know', 'see', 'think', 'want', 'use', 
    'find', 'tell', 'ask', 'work', 'seem', 'feel', 'try', 'leave', 'call',
    
    # Common Action Verbs (Past and Past Participle Forms)
    'went', 'came', 'got', 'made', 'took', 'gave', 'said', 'knew', 'saw', 'thought', 'wanted', 
    'used', 'found', 'told', 'asked', 'worked', 'seemed', 'felt', 'tried', 'left', 'called',
    
    #Common words
    "subject","not","no","more","here","any","if","only","please"
]

In [12]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler

vectorizer = CountVectorizer(preprocessor=preprocessor, stop_words=custom_stop_words)

X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    df['content'], 
    df['category'].apply(lambda x: 1 if x == 'spam' else 0), 
    test_size=0.2, 
    random_state=42
)

X_train = vectorizer.fit_transform(X_train_raw)
X_test = vectorizer.transform(X_test_raw)

scaler = StandardScaler(with_mean=False)
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

model = LogisticRegression(max_iter=200, solver='lbfgs')

model.fit(X_train_scaled, y_train)

y_pred = model.predict(X_test_scaled)

new_email = """Dear Bhaskar,

We have 15+ projects from top-tech companies where you can practice, and get certificates.

You can put these certificates in your resume & can get shortlisted for any internship or job."""  # Replace with your email content

# Preprocess the email (use the same preprocessing function used in CountVectorizer)
preprocessed_email = preprocessor(new_email)

# Vectorize the email content
email_vectorized = vectorizer.transform([preprocessed_email])

# Scale the vectorized email
email_scaled = scaler.transform(email_vectorized)

# Predict using the trained model
prediction = model.predict(email_scaled)[0]  # Model outputs 0 (ham) or 1 (spam)

# Interpret the result
if prediction == 1:
    print("The email is SPAM.")
else:
    print("The email is HAM.")


accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", class_report)

The email is SPAM.
Accuracy: 0.9583333333333334
Confusion Matrix:
 [[300  17]
 [  8 275]]
Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.95      0.96       317
           1       0.94      0.97      0.96       283

    accuracy                           0.96       600
   macro avg       0.96      0.96      0.96       600
weighted avg       0.96      0.96      0.96       600



In [10]:
print("Class Distribution:")
print(df['category'].value_counts())

Class Distribution:
ham     1500
spam    1496
Name: category, dtype: int64
