In [1]:
# Import dependencies
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [2]:
# Read the CSV file from the Resources folder into a Pandas DataFrame
path = "outputs/emails_df.csv"
email_df = pd.read_csv(path)

# Review the DataFrame
email_df.head()

Unnamed: 0,Discover,the,to,of,and,a,in,important,https,for,...,Jonathan,ble,Version,yields,silver,wildfires,accredited,ther,mounting,Spam Indicator
0,2,30,25,23,29,16,13,0,0,9,...,0,0,0,0,0,0,0,0,0,1
1,2,30,27,23,24,23,14,33,0,15,...,0,0,0,0,0,0,0,0,0,1
2,2,24,19,21,18,13,10,0,0,8,...,0,0,0,0,0,0,0,0,0,1
3,3,26,22,25,22,16,14,0,0,8,...,0,0,0,0,0,0,0,0,0,1
4,1,91,57,65,70,40,39,50,39,25,...,0,0,0,2,0,0,0,0,0,0


In [3]:
# Drop the email name column
email_df = email_df.drop(columns=email_df.columns[0])

# Logistic Regression

In [4]:
# Separate the data into labels and features
# Separate the y variable, the labels
y = email_df['Spam Indicator']

# Separate the X variable, the features
X = email_df.drop(columns=['Spam Indicator'])

In [5]:
# Import the train_test_learn module
from sklearn.model_selection import train_test_split

# Split the data using train_test_split
# Assign a random_state of 1 to the function
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

In [6]:
# Import the LogisticRegression module from SKLearn
from sklearn.linear_model import LogisticRegression

# Instantiate the Logistic Regression model
# Assign a random_state parameter of 1 to the model
classifier = LogisticRegression(solver='lbfgs', max_iter=200, random_state=1)

# Fit the model using training data
classifier.fit(X_train, y_train)

In [7]:
# Make a prediction using the testing data
predictions = classifier.predict(X_test)
results = pd.DataFrame({"Prediction": predictions, "Actual": y_test}).reset_index(drop=True)

In [8]:
# Print the balanced_accuracy score of the model
balanced_accuracy_score(y_test, predictions)

0.975

In [9]:
# Generate a confusion matrix for the model
confusion_matrix(y_test, predictions)

array([[38,  2],
       [ 0, 35]], dtype=int64)

In [10]:
# Print the classification report for the model
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       1.00      0.95      0.97        40
           1       0.95      1.00      0.97        35

    accuracy                           0.97        75
   macro avg       0.97      0.97      0.97        75
weighted avg       0.97      0.97      0.97        75



In [11]:
# Import the RandomOverSampler module form imbalanced-learn
from imblearn.over_sampling import RandomOverSampler

# Instantiate the random oversampler model
# # Assign a random_state parameter of 1 to the model
model = RandomOverSampler(random_state=1)

# Fit the original training data to the random_oversampler model
X_over, y_over = model.fit_resample(X_train, y_train)

In [12]:
# Count the distinct values of the resampled labels data
y_over.nunique()

2

In [13]:
# Instantiate the Logistic Regression model
# Assign a random_state parameter of 1 to the model
classifier = LogisticRegression(solver='lbfgs', max_iter=200, random_state=1)

# Fit the model using the resampled training data
classifier.fit(X_over, y_over)

# Make a prediction using the testing data
predictions = classifier.predict(X_test)
results = pd.DataFrame({"Predictions": predictions, "Actual": y_test}).reset_index(drop=True)

In [14]:
# Print the balanced_accuracy score of the model
balanced_accuracy_score(y_test, predictions)

0.975

In [15]:
# Generate a confusion matrix for the model
confusion_matrix(y_test, predictions)

array([[38,  2],
       [ 0, 35]], dtype=int64)

In [16]:
# Print the classification report for the model
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       1.00      0.95      0.97        40
           1       0.95      1.00      0.97        35

    accuracy                           0.97        75
   macro avg       0.97      0.97      0.97        75
weighted avg       0.97      0.97      0.97        75



# Support Vector Machine (SVM)

In [17]:
# Get the target variables.
target = email_df["Spam Indicator"]
target_names = ["ham", "spam"]

In [18]:
# Get the features.
data = email_df.drop(columns=['Spam Indicator'])
feature_names = data.columns

In [19]:
# Split data into training and testing
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=1)

In [20]:
# Support vector machine linear classifier
from sklearn.svm import SVC
model = SVC(kernel='linear')
model.fit(X_train, y_train)

In [21]:
# Model Accuracy
print('Test Acc: %.3f' % model.score(X_test, y_test))

Test Acc: 0.973


In [22]:
# Calculate the classification report
from sklearn.metrics import classification_report
predictions = model.predict(X_test)
print(classification_report(y_test, predictions,
                            target_names=target_names))

              precision    recall  f1-score   support

         ham       0.98      0.98      0.98        43
        spam       0.97      0.97      0.97        32

    accuracy                           0.97        75
   macro avg       0.97      0.97      0.97        75
weighted avg       0.97      0.97      0.97        75



# Random Forest

In [23]:
# Separate the data into labels and features
# Separate the y variable, the labels
y = email_df['Spam Indicator']

# Separate the X variable, the features
X = email_df.drop(columns=['Spam Indicator'])

In [24]:
# Import the train_test_learn module
from sklearn.model_selection import train_test_split

# Split the data using train_test_split
# Assign a random_state of 1 to the function
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

In [25]:
# Create the StandardScaler instance
scaler = StandardScaler()

In [26]:
# Fit the Standard Scaler with the training data
X_scaler = scaler.fit(X_train)

In [27]:
# Scale the training data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [28]:
# Create the random forest classifier instance
rf_model = RandomForestClassifier(n_estimators=500, random_state=78)

In [29]:
# Fit the model and use .ravel()on the "y_train" data.
rf_model = rf_model.fit(X_train_scaled, y_train.ravel())

In [30]:
# Making predictions using the testing data
predictions = rf_model.predict(X_test_scaled)

In [31]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)

# Calculating the accuracy score
acc_score = accuracy_score(y_test, predictions)

In [32]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,38,2
Actual 1,0,35


Accuracy Score : 0.9733333333333334
Classification Report
              precision    recall  f1-score   support

           0       1.00      0.95      0.97        40
           1       0.95      1.00      0.97        35

    accuracy                           0.97        75
   macro avg       0.97      0.97      0.97        75
weighted avg       0.97      0.97      0.97        75



In [33]:
# Get the feature importance array
importances = rf_model.feature_importances_
# List the top 10 most important features
importances_sorted = sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)
importances_sorted[:10]

[(0.02122514563235986, 'From'),
 (0.01879905908995658, 'https'),
 (0.01832941913316902, 'a'),
 (0.018231766194545133, 'our'),
 (0.01792962753137966, 'we'),
 (0.016125840955334217, 'to'),
 (0.01589146662791404, 'inherit'),
 (0.014855549863386061, 'none'),
 (0.014518109437077735, 'p'),
 (0.014333409303235572, 'ing')]