# Preparation

In [127]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

In [129]:
df = pd.read_csv('data\spam.csv')
df['Category'] = df['Category'].map({'ham': 0, 'spam': 1})
df

Unnamed: 0,Category,Message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...
5568,0,Will ü b going to esplanade fr home?
5569,0,"Pity, * was in mood for that. So...any other s..."
5570,0,The guy did some bitching but I acted like i'd...


## Feature engineering

### Frequency of URLS and links

In [133]:
import re
def count_url(text):
    url_pattern = re.compile(r'http[s]?://\S+|www\.\S+')
    return len(url_pattern.findall(text))

df['freq_urls'] = df['Message'].apply(count_url)

### Frequency of 'Urgent' words

In [136]:
urgency_words = [
    "immediate", "urgent", "critical", "important", "now", "ASAP", "as soon as possible",
    "emergency", "priority", "alert", "rush", "prompt", "hasten", "swift", "instantly",
    "right away", "without delay", "high priority", "imminent", "pressing", "time-sensitive",
    "expedite", "top priority", "crucial", "vital", "necessary", "quick", "speedy", "at once",
    "rapid", "flash", "instantaneous", "accelerated", "breakneck", "hurry", "immediately",
    "fast-track", "at the earliest", "act now", "don't delay", "on the double", "without hesitation",
    "fast", "soon", "now or never", "urgent action", "right now", "straightaway", "double-time",
    "speed", "express", "high-priority", "pressing need", "at your earliest convenience", "this instant",
    "forthwith", "like a shot", "snap to it", "on the spot", "no time to lose", "no delay",
    "in a hurry", "right this minute", "get going", "with haste"
]

def count_urgency_words(text, urgency_words):
    words = re.findall(r'\b\w+\b', text.lower())
    count = sum(1 for word in words if word in urgency_words)
    return count

In [138]:
df['freq_urgent_words'] = df['Message'].apply(count_urgency_words, urgency_words=urgency_words)

### Capital run length total

In [141]:
def count_capital_run_length(text, min_length_count=2):
    capital_runs = re.findall(r'[A-Z]+', text)
    run_lengths = [len(run) for run in capital_runs if len(run) >= min_length_count]
    return sum(run_lengths)

In [143]:
df['capital_run_length_total'] = df['Message'].apply(count_capital_run_length)

### important special characters

In [146]:
def count_special_chars(text, char):
    return text.count(char)

In [148]:
df['freq_exclamation'] = df['Message'].apply(count_special_chars, char='!')

## Clean text using nltk

In [151]:
# imports
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

# stop words
# nltk.download('punkt') <------ need these lines to 
# nltk.download('stopwords') <-- load stopwords
stop_words = stopwords.words()
stop_words.append('u')
stop_words.append('ur')

# lemmatizer initialization
# nltk.download('averaged_perceptron_tagger') <---- need these lines to downnload
# nltk.download('wordnet') <----------------------- wordnet used for lemmitization
lemmatizer = WordNetLemmatizer()

### Lemmatize

In [154]:
def get_wordnet_pos(word):
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)

def lemmatize_email(text, lemmatizer):
    words = nltk.word_tokenize(text)
    lemmatized_words = [lemmatizer.lemmatize(w, get_wordnet_pos(w)) for w in words]
    return ' '.join(lemmatized_words)

In [156]:
df['Message'] = df['Message'].apply(lemmatize_email, lemmatizer=lemmatizer)

### Remove stop words

In [158]:
def remove_stop_words(text, stop_words):
    word_tokens = word_tokenize(text)
    new_text = [w for w in word_tokens if not w.lower() in stop_words]

    return ' '.join(new_text)

In [159]:
df['Message'] = df['Message'].apply(remove_stop_words, stop_words=stop_words)

## Bag of words

In [237]:
vectorizer = CountVectorizer()

# vectorize the emails
X = vectorizer.fit_transform(df['Message'])

# Create new data frame
df_bow = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())

# add engineered features
df_bow['freq_urls'] = df['freq_urls']
df_bow = df_bow.drop(columns='www') # turns out this column basically has the same counts as url frequency
df_bow['freq_urgent_words'] = df['freq_urgent_words']
df_bow['freq_exclamation'] = df['freq_exclamation']
df_bow['capital_run_length_total'] = df['capital_run_length_total']

# add class
df_bow['Category'] = df['Category']

In [238]:
# find totals in order to slim features
df_bow.loc['Total'] = df_bow.sum()

In [241]:
# remove features with less than n appearances
min_num_appearances = 30 # n
cols_to_drop = df_bow.columns[df_bow.loc['Total'] < min_num_appearances] 
df_bow = df_bow.drop(columns=cols_to_drop) 

In [242]:
# remove entries with no appearances of the word freq features while keeping the engineered features intact
non_word_freq_columns = {'Category', 'freq_urls', 'freq_urgent_words', 'capital_run_length_total',
                         'freq_exclamation'}
df_bow = df_bow.loc[~(df_bow[df_bow.drop(columns=non_word_freq_columns).columns] == 0).all(axis=1)]

In [243]:
# remove total row
df_bow = df_bow.drop(index='Total')

In [244]:
# reset index
df_bow = df_bow.reset_index(drop=True)

# Models

In [250]:
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, confusion_matrix, classification_report

In [251]:
X = df_bow.drop(columns='Category')
y = df_bow['Category']

## Random Forest Classifier

### Baseline

In [256]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2140)

rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)

y_pred_train = rfc.predict(X_train)
y_pred = rfc.predict(X_test)

print(f'Training precision: {precision_score(y_train, y_pred_train)}')
print(f'Testing precision: {precision_score(y_test, y_pred)}\n')
print(f'Classification Report Training:\n {classification_report(y_train, y_pred_train)}')
print(f'Classification Report Testing:\n {classification_report(y_test, y_pred)}')

Training precision: 1.0
Testing precision: 0.9703703703703703

Classification Report Training:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      3062
           1       1.00      0.99      1.00       588

    accuracy                           1.00      3650
   macro avg       1.00      1.00      1.00      3650
weighted avg       1.00      1.00      1.00      3650

Classification Report Testing:
               precision    recall  f1-score   support

           0       0.98      0.99      0.99       765
           1       0.97      0.89      0.93       148

    accuracy                           0.98       913
   macro avg       0.97      0.94      0.96       913
weighted avg       0.98      0.98      0.98       913



In [257]:
# Get feature importance
feature_importance = pd.DataFrame(rfc.feature_importances_, index=X.columns)

# Displays with values of 0.0 importance dropped
feature_importance[(feature_importance >= 0.005).all(axis=1)]

Unnamed: 0,0
100,0.005734
1000,0.005953
150p,0.020565
16,0.00621
18,0.010412
50,0.012536
500,0.00784
award,0.011048
call,0.058251
camera,0.005595


### KFold CV

In [263]:
model = RandomForestClassifier()
kf = StratifiedKFold(n_splits=5)
# Store results
training_precision = []
testing_precision = []
i = 1

for train_index, test_index in kf.split(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    # Train the model
    model.fit(X_train, y_train)
    
    # Predict on training and testing set
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    
    # Calculate precision
    train_precision = precision_score(y_train, y_train_pred, average='weighted')
    test_precision = precision_score(y_test, y_test_pred, average='weighted')
    training_precision.append(train_precision)
    testing_precision.append(test_precision)
    
    print(f'Fold {i}')
    print(f'Train precision: {train_precision}')
    print(f'Test precision: {test_precision}')
    i += 1
    
# Display results
print("\nAVERAGE RESULTS:")
print("Training Precision:", np.mean(training_precision))
print("Testing Precision:", np.mean(testing_precision))


Fold 1
Train precision: 0.9991788869415931
Test precision: 0.9826549480956582
Fold 2
Train precision: 0.9989055397885986
Test precision: 0.9802461071207007
Fold 3
Train precision: 0.9989055393221278
Test precision: 0.9748581266501603
Fold 4
Train precision: 0.9989058390922395
Test precision: 0.9665878700754713
Fold 5
Train precision: 0.9986327448063279
Test precision: 0.9768596513104886

AVERAGE RESULTS:
Training Precision: 0.9989057099901772
Testing Precision: 0.9762413406504958


## XGBoost

### Baseline

In [265]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=2140)

xgb = XGBClassifier()
xgb.fit(X_train, y_train)

y_pred_train = xgb.predict(X_train)
y_pred = xgb.predict(X_test)

print(f'Training precision: {precision_score(y_train, y_pred_train)}')
print(f'Testing precision: {precision_score(y_test, y_pred)}\n')
print(f'Classification Report Training:\n {classification_report(y_train, y_pred_train)}')
print(f'Classification Report Testing:\n {classification_report(y_test, y_pred)}')

Training precision: 0.9946902654867257
Testing precision: 0.9782608695652174

Classification Report Training:
               precision    recall  f1-score   support

           0       0.99      1.00      1.00      3061
           1       0.99      0.95      0.97       589

    accuracy                           0.99      3650
   macro avg       0.99      0.98      0.98      3650
weighted avg       0.99      0.99      0.99      3650

Classification Report Testing:
               precision    recall  f1-score   support

           0       0.98      1.00      0.99       766
           1       0.98      0.92      0.95       147

    accuracy                           0.98       913
   macro avg       0.98      0.96      0.97       913
weighted avg       0.98      0.98      0.98       913



### KFold CV

In [268]:
model = XGBClassifier()
kf = StratifiedKFold(n_splits=5)
# Store results
training_precision = []
testing_precision = []
i = 1

for train_index, test_index in kf.split(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    # Train the model
    model.fit(X_train, y_train)
    
    # Predict on training and testing set
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    
    # Calculate precision
    train_precision = precision_score(y_train, y_train_pred, average='weighted')
    test_precision = precision_score(y_test, y_test_pred, average='weighted')
    training_precision.append(train_precision)
    testing_precision.append(test_precision)
    
    print(f'Fold {i}')
    print(f'Train precision: {train_precision}')
    print(f'Test precision: {test_precision}')
    i += 1
    
# Display results
print("\nAVERAGE RESULTS:")
print("Training Precision:", np.mean(training_precision))
print("Testing Precision:", np.mean(testing_precision))


Fold 1
Train precision: 0.9915697993654171
Test precision: 0.9756321588848282
Fold 2
Train precision: 0.9898911139399236
Test precision: 0.9801625866026444
Fold 3
Train precision: 0.9931733127428478
Test precision: 0.9703923884691807
Fold 4
Train precision: 0.991572083428005
Test precision: 0.9744912394327789
Fold 5
Train precision: 0.9918627975181267
Test precision: 0.977854490103892

AVERAGE RESULTS:
Training Precision: 0.991613821398864
Testing Precision: 0.9757065726986649


# Test Models on user input "Emails"

In [345]:
def user_input_test(message, rfc, xgb):
    input_x = pd.DataFrame({'Message': [message]})

    # get engineered features
    input_x['freq_urls'] = input_x['Message'].apply(count_url)
    input_x['freq_urgent_words'] = input_x['Message'].apply(count_urgency_words, urgency_words=urgency_words)
    input_x['freq_exclamation'] = input_x['Message'].apply(count_special_chars, char='!')
    input_x['capital_run_length_total'] = input_x['Message'].apply(count_capital_run_length)

    # vectorize 
    X_new = vectorizer.transform(input_x['Message'])
    X_new = pd.DataFrame(X_new.toarray(), columns=vectorizer.get_feature_names_out())

    # add engineered features
    X_new['freq_urls'] = input_x['freq_urls']
    X_new['freq_urgent_words'] = input_x['freq_urgent_words']
    X_new['freq_exclamation'] = input_x['freq_exclamation']
    X_new['capital_run_length_total'] = input_x['capital_run_length_total']
    
    # match the data frame to the training (add and remove columns not in the original frame)
    missing_cols = set(df_bow.columns) - set(X_new.columns) - {'Category'}
    for col in missing_cols:
        X_new[col] = 0

    X_new = X_new[df_bow.columns.drop('Category')]
    predictions = [rfc.predict(X_new), xgb.predict(X_new)]

    print('\n')
    if predictions[0] == 0:
        print('Random Forest classified as not spam.')
    else:
        print('Random Forest classified as spam.')
    
    if predictions[1] == 0:
        print('XGBoost classified as not spam.')
    else:
        print('XGBoost classified as spam.')

In [347]:
message = "Hi, im emailing to inform you that there is a free iphone waiting for you to claim! just go to www.freeiphone.com to claim it! Act now!"
user_input_test(message, rfc, xgb)



Random Forest classified as spam.
XGBoost classified as spam.


In [349]:
# On your own
message = input('type email for model classification:\n')
user_input_test(message, rfc, xgb)

type email for model classification:
 Hi, im emailing to inform you that there is a free iphone waiting for you to claim! just go to www.freeiphone.com to claim it! Act now!




Random Forest classified as spam.
XGBoost classified as spam.


## Test emails 

Please excuse the lack of creativity, and feel free to try your own emails to see my model in action!

### Example test emails (spam)

Hi, im emailing to inform you that there is a free iphone waiting for you to claim! just go to www.freeiphone.com to claim it! Act now!

You're invited to the BEST and MOST AMAZING and FREE concert ever! Your favorite artists will attend!! All you need is a credit card! If you register ASAP you will be added to the VIP list and get to sit backstage with your fav artists! 

Your amazon account was used without your permission! A charge for 5,000 cash was deducted from your bank. Call us now or visit www.amazonnn.com or else you will lose your house!

### Example test emails (not spam)

Hello my name is Brendan from the shipping department. Your order was canceled due to delivery complications. Please contact us so we can fix your order.

Hey John, James from accounting said you havent paid for the damage you did to the water heater. Please send that over to me ASAP.