In [9]:
import pandas as pd
import os

def read_spam():
    category = 'spam'
    directory = r"D:\project\the forge\enron1\spam"
    return read_category(category, directory)

def read_ham():
    category = 'ham'
    directory =r"D:\project\the forge\enron1\ham"
    return read_category(category, directory)

def read_category(category, directory):
    emails = []
    for filename in os.listdir(directory):
        if not filename.endswith(".txt"):
            continue
        with open(os.path.join(directory, filename), 'r', encoding='latin1') as fp:  # Added encoding to handle non-ASCII characters
            try:
                content = fp.read()
                emails.append({'name': filename, 'content': content, 'category': category})
            except Exception as e:
                print(f'skipped {filename} due to {e}')
    return emails

ham = read_ham()
spam = read_spam()

df_ham = pd.DataFrame.from_records(ham)
df_spam = pd.DataFrame.from_records(spam)

df = pd.concat([df_ham, df_spam], ignore_index=True)

# Optional: Display the DataFrame
print(df.head())


                             name  \
0  0001.1999-12-10.farmer.ham.txt   
1  0002.1999-12-13.farmer.ham.txt   
2  0003.1999-12-14.farmer.ham.txt   
3  0004.1999-12-14.farmer.ham.txt   
4  0005.1999-12-14.farmer.ham.txt   

                                             content category  
0            Subject: christmas tree farm pictures\n      ham  
1  Subject: vastar resources , inc .\ngary , prod...      ham  
2  Subject: calpine daily gas nomination\n- calpi...      ham  
3  Subject: re : issue\nfyi - see note below - al...      ham  
4  Subject: meter 7268 nov allocation\nfyi .\n- -...      ham  


In [10]:
import re

def preprocessor(e):
    e = re.sub(r'[^a-zA-Z]', ' ', e)
    e = e.lower()
    return e
text = "Hello, World! This is a TEST."
cleaned_text = preprocessor(text)
print(cleaned_text)


hello  world  this is a test 


In [13]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
vectorizer = CountVectorizer(preprocessor=preprocessor)

X = df['content'] 
y = df['category']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(X_test)

1566    Subject: eastrans nomination change effective ...
1988    Subject: re : personal information needs to be...
1235    Subject: re : saudi arabia\ni spoke to mr . ma...
3276    Subject: hpl nom for may 24 , 2001\n( see atta...
3438    Subject: re : error repairs\njay ,\nfor june ,...
                              ...                        
1175    Subject: re : new production - sitara deals ne...
2594    Subject: crosstex energy services - camden res...
3377    Subject: lng mtg\nwhen : wednesday , july 11 ,...
5065    Subject: looking for ci _ . a . _ lis ? we ` r...
2142    Subject: beaumont methanol\nthis is to confirm...
Name: content, Length: 1035, dtype: object


In [14]:
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

print(f"Training data shape: {X_train_vectorized.shape}")

Training data shape: (4137, 39999)


In [15]:
log_reg_model = LogisticRegression()
log_reg_model.fit(X_train_vectorized, y_train)
train_accuracy = log_reg_model.score(X_train_vectorized, y_train)
print(f"Training accuracy: {train_accuracy}")


Training accuracy: 0.9997582789460963


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [16]:
X_test_vectorized = vectorizer.transform(X_test)
y_pred = log_reg_model.predict(X_test_vectorized)
print(f"Predictions: {y_pred[:5]}")
print(f"True labels: {y_test[:5].values}")

Predictions: ['ham' 'ham' 'ham' 'ham' 'ham']
True labels: ['ham' 'ham' 'ham' 'ham' 'ham']


In [19]:
test_accuracy = accuracy_score(y_test, y_pred)
print(f"Test accuracy: {test_accuracy:.2f}")
conf_matrix = confusion_matrix(y_test, y_pred, labels=['ham', 'spam'])
print("Confusion Matrix:")
print(conf_matrix)
class_report = classification_report(y_test, y_pred, target_names=['ham', 'spam'])
print("Classification Report:")
print(class_report)

Test accuracy: 0.98
Confusion Matrix:
[[732  17]
 [  8 278]]
Classification Report:
              precision    recall  f1-score   support

         ham       0.99      0.98      0.98       749
        spam       0.94      0.97      0.96       286

    accuracy                           0.98      1035
   macro avg       0.97      0.97      0.97      1035
weighted avg       0.98      0.98      0.98      1035



In [20]:
feature_names = vectorizer.get_feature_names_out()
print("Feature names:")
print(feature_names[:20])
print(f"Total number of features: {len(feature_names)}")


Feature names:
['aa' 'aaa' 'aaas' 'aabda' 'aabvmmq' 'aac' 'aaer' 'aafco' 'aaiabe'
 'aaigrcrb' 'aaihmqv' 'aalland' 'aambique' 'aamlrg' 'aaoeuro' 'aarhus'
 'aaron' 'aashqcsny' 'aavilable' 'aaxrzm']
Total number of features: 39999


In [21]:
import numpy as np

coefficients = log_reg_model.coef_[0]

feature_importance = dict(zip(feature_names, coefficients))

sorted_features = sorted(feature_importance.items(), key=lambda x: x[1], reverse=True)

print("Top positive features:")
for feature, importance in sorted_features[:20]: 
    print(f"{feature}: {importance:.4f}")

print("\nTop negative features:")
for feature, importance in sorted_features[-20:]:  
    print(f"{feature}: {importance:.4f}")


Top positive features:
prices: 0.8814
http: 0.8448
no: 0.8321
hello: 0.7732
pain: 0.7499
paliourg: 0.7062
remove: 0.7043
removed: 0.6887
here: 0.6746
only: 0.6382
money: 0.6323
more: 0.6306
mobile: 0.5919
hi: 0.5664
laptop: 0.5504
vi: 0.5362
loading: 0.5340
meds: 0.5163
software: 0.5160
rolex: 0.5125

Top negative features:
june: -0.7698
deals: -0.7850
numbers: -0.8223
gas: -0.8307
wassup: -0.8564
know: -0.8870
nom: -0.9132
sitara: -0.9571
revised: -0.9967
pictures: -1.0126
xls: -1.0805
hpl: -1.0860
neon: -1.1450
meter: -1.1756
deal: -1.1818
daren: -1.2874
doc: -1.3422
thanks: -1.3592
enron: -1.5275
attached: -1.5785


In [22]:
import pandas as pd

feature_importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Coefficient': coefficients
})

feature_importance_df['Absolute_Coefficient'] = feature_importance_df['Coefficient'].abs()

sorted_features = feature_importance_df.sort_values(by='Absolute_Coefficient', ascending=False)

top_positive_features = sorted_features[sorted_features['Coefficient'] > 0].head(10)
print("Top 10 positive features:")
print(top_positive_features[['Feature', 'Coefficient']])

top_negative_features = sorted_features[sorted_features['Coefficient'] < 0].head(10)
print("\nTop 10 negative features:")
print(top_negative_features[['Feature', 'Coefficient']])


Top 10 positive features:
        Feature  Coefficient
28167    prices     0.881353
17554      http     0.844817
24839        no     0.832126
16806     hello     0.773189
26256      pain     0.749867
26293  paliourg     0.706160
30028    remove     0.704295
30029   removed     0.688692
16867      here     0.674572
25654      only     0.638214

Top 10 negative features:
        Feature  Coefficient
2539   attached    -1.578474
12439     enron    -1.527466
35224    thanks    -1.359199
10878       doc    -1.342169
9327      daren    -1.287420
9524       deal    -1.181765
23111     meter    -1.175568
24532      neon    -1.145046
17474       hpl    -1.085955
39237       xls    -1.080489
