In [None]:
# Import the modules
from pathlib import Path

import numpy as np
import pandas as pd
from imblearn.over_sampling import RandomOverSampler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

In [None]:
# Location of the email dataset inside the Resources folder
path = "Resources/emails.csv"

# Load the CSV into a Pandas DataFrame
email_df = pd.read_csv(filepath_or_buffer=path)

# Preview the first five rows
email_df.head(n=5)

## Model 1 - Logistic Regression Long Words Only (>6 letters)
Our dataset includes 3000 columns. For our first model, we want to narrow the columns used to only those whose word (the column name) has more than 6 letters.

In [None]:
# Drop the Email No. column; none of the models below use it as a feature.
# errors='ignore' keeps this cell idempotent: re-running it after the column
# has already been removed no longer raises a KeyError.
email_df = email_df.drop(columns=['Email No.'], errors='ignore')

In [None]:
# Narrow the columns to only words that are over 6 letters long.
# The 'Prediction' label column passes the length test anyway; naming it
# explicitly documents that it must survive the filter for the next cell.
email_df_filtered = [
    col for col in email_df.columns
    if col == 'Prediction' or len(col) > 6
]
email_df_longwords = email_df[email_df_filtered]


In [None]:
# Separate the data into labels and features
# Separate the y variable, the labels
y = email_df_longwords['Prediction']

# Separate the X variable, the features
X = email_df_longwords.drop(columns=['Prediction'])

In [None]:
# Check the balance of our target values
y.value_counts()

In [None]:
# Split the data into training and testing sets.
# train_test_split is already imported in the first cell, so it is not re-imported here.
# random_state=1 makes the split reproducible; stratify=y keeps the class
# proportions of y the same in the training and testing sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

In [None]:
# Instantiate the Logistic Regression model
# (LogisticRegression is imported with the other modules at the top of the notebook).
# random_state=1 is passed for reproducibility; max_iter=200 caps the lbfgs solver at 200 iterations
classifier = LogisticRegression(solver='lbfgs', max_iter=200, random_state=1)

# Fit the model using training data
classifier.fit(X_train, y_train)

In [None]:
# Make a prediction using the testing data
predictions = classifier.predict(X_test)
results = pd.DataFrame({"Prediction": predictions, "Actual": y_test}).reset_index(drop=True)
results.head(10)

In [None]:
# Print the balanced_accuracy score of the model
balanced_accuracy_score(y_test, predictions)

In [None]:
# Generate a confusion matrix for the model
confusion_matrix(y_test, predictions)

In [None]:
# Print the classification report for the model
print(classification_report(y_test, predictions))

In [None]:
# Instantiate the random oversampler
# (RandomOverSampler is imported with the other modules at the top of the notebook).
# random_state=1 makes the resampling reproducible
model = RandomOverSampler(random_state=1)

# Resample the original training data only; the testing data is not resampled
X_over, y_over = model.fit_resample(X_train, y_train)


In [None]:
# Count the rows in each class of the resampled labels.
# value_counts() shows whether oversampling balanced the classes;
# nunique() would only report how many distinct labels exist.
y_over.value_counts()

In [None]:
# Create a new Logistic Regression model (lbfgs solver, at most 200 iterations,
# random_state=1) and train it on the resampled training data
classifier = LogisticRegression(random_state=1, max_iter=200, solver='lbfgs').fit(X_over, y_over)

# Predict the testing data and line the predictions up against the actual labels
predictions = classifier.predict(X_test)
results = pd.DataFrame({"Predictions": predictions, "Actual": y_test})
results = results.reset_index(drop=True)

results.head(n=10)

In [None]:
# Print the balanced_accuracy score of the model
balanced_accuracy_score(y_test, predictions)

In [None]:
# Generate a confusion matrix for the model
confusion_matrix(y_test, predictions)


In [None]:
# Print the classification report for the model
print(classification_report(y_test, predictions))

Model 1 Logistic Regression Long Words Only : Accuracy 92%

## Model 2 - Logistic Regression Common Words Only 
Even though our first model surpassed our accuracy threshold, we want to know whether accuracy changes if, rather than reducing the number of columns based on letter count, we instead reduce them based on frequency (how commonly a word occurred across the emails).

In [None]:
# Keep only the columns whose total value is greater than 1000.
# The 'Prediction' label is always kept, independent of its own column total,
# so the label/feature split in the following cell cannot fail with a KeyError.
column_totals = email_df.sum()
selected_columns = [
    col for col in email_df.columns
    if col == 'Prediction' or column_totals[col] > 1000
]

email_df_common = email_df[selected_columns]

email_df_common.head(10)

In [None]:
# Separate the data into labels and features
# Separate the y variable, the labels
y = email_df_common['Prediction']

# Separate the X variable, the features
X = email_df_common.drop(columns=['Prediction'])

In [None]:
# Import the train_test_learn module
from sklearn.model_selection import train_test_split

# Split the data using train_test_split
# Assign a random_state of 1 to the function
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

In [None]:
# Import the LogisticRegression module from SKLearn
from sklearn.linear_model import LogisticRegression

# Instantiate the Logistic Regression model
# Assign a random_state parameter of 1 to the model
classifier = LogisticRegression(solver='lbfgs', max_iter=200, random_state=1)

# Fit the model using training data
classifier.fit(X_train, y_train)

In [None]:
# Make a prediction using the testing data
predictions = classifier.predict(X_test)
results = pd.DataFrame({"Prediction": predictions, "Actual": y_test}).reset_index(drop=True)
results.head(10)

In [None]:
# Print the balanced_accuracy score of the model
balanced_accuracy_score(y_test, predictions)

In [None]:
# Generate a confusion matrix for the model
confusion_matrix(y_test, predictions)

In [None]:
# Print the classification report for the model
print(classification_report(y_test, predictions))

In [None]:
# Instantiate the random oversampler (already imported earlier in the notebook).
# random_state=1 makes the resampling reproducible
model = RandomOverSampler(random_state=1)

# Resample the original training data only; the testing data is not resampled
X_over, y_over = model.fit_resample(X_train, y_train)

In [None]:
# Count the rows in each class of the resampled labels.
# value_counts() shows whether oversampling balanced the classes;
# nunique() would only report how many distinct labels exist.
y_over.value_counts()

In [None]:
# Create a new Logistic Regression model (lbfgs solver, at most 200 iterations,
# random_state=1) and train it on the resampled training data
classifier = LogisticRegression(random_state=1, max_iter=200, solver='lbfgs').fit(X_over, y_over)

# Predict the testing data and line the predictions up against the actual labels
predictions = classifier.predict(X_test)
results = pd.DataFrame({"Predictions": predictions, "Actual": y_test})
results = results.reset_index(drop=True)

results.head(n=10)


In [None]:
# Print the balanced_accuracy score of the model
balanced_accuracy_score(y_test, predictions)

In [None]:
# Generate a confusion matrix for the model
confusion_matrix(y_test, predictions)

In [None]:
# Print the classification report for the model
print(classification_report(y_test, predictions))

Model 2 Logistic Regression Common Words Only : Accuracy 95%

## Model 3 Logistic Regression All Columns
Finally we are interested in how utilizing all the columns of data impacts the accuracy of the logistic regression models. 

In [None]:
# Separate the data into labels and features
# Separate the y variable, the labels
y = email_df['Prediction']

# Separate the X variable, the features
X = email_df.drop(columns=['Prediction'])

In [None]:
# Import the train_test_learn module
from sklearn.model_selection import train_test_split

# Split the data using train_test_split
# Assign a random_state of 1 to the function
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

In [None]:
# Import the LogisticRegression module from SKLearn
from sklearn.linear_model import LogisticRegression

# Instantiate the Logistic Regression model
# Assign a random_state parameter of 1 to the model
classifier = LogisticRegression(solver='lbfgs', max_iter=200, random_state=1)

# Fit the model using training data
classifier.fit(X_train, y_train)

In [None]:
# Make a prediction using the testing data
predictions = classifier.predict(X_test)
results = pd.DataFrame({"Prediction": predictions, "Actual": y_test}).reset_index(drop=True)
results.head(10)

In [None]:
# Print the balanced_accuracy score of the model
balanced_accuracy_score(y_test, predictions)

In [None]:
# Generate a confusion matrix for the model
confusion_matrix(y_test, predictions)

In [None]:
# Print the classification report for the model
print(classification_report(y_test, predictions))

In [None]:
# Instantiate the random oversampler (already imported earlier in the notebook).
# random_state=1 makes the resampling reproducible
model = RandomOverSampler(random_state=1)

# Resample the original training data only; the testing data is not resampled
X_over, y_over = model.fit_resample(X_train, y_train)

In [None]:
# Count the rows in each class of the resampled labels.
# value_counts() shows whether oversampling balanced the classes;
# nunique() would only report how many distinct labels exist.
y_over.value_counts()

In [None]:
# Create a new Logistic Regression model (lbfgs solver, at most 200 iterations,
# random_state=1) and train it on the resampled training data
classifier = LogisticRegression(random_state=1, max_iter=200, solver='lbfgs').fit(X_over, y_over)

# Predict the testing data and line the predictions up against the actual labels
predictions = classifier.predict(X_test)
results = pd.DataFrame({"Predictions": predictions, "Actual": y_test})
results = results.reset_index(drop=True)

results.head(n=10)

In [None]:
# Print the balanced_accuracy score of the model
balanced_accuracy_score(y_test, predictions)

In [None]:
# Generate a confusion matrix for the model
confusion_matrix(y_test, predictions)

In [None]:
# Print the classification report for the model
print(classification_report(y_test, predictions))

Model 3 Logistic Regression All Columns : Accuracy 97%

## Model 4 Support Vector Machine (SVM)
Despite all Logistic Regression Models meeting our threshold, we were interested in how other classification models would perform with the data. Since utilizing all of the columns resulted in the most accurate logistic regression model, we want to run this model with all of the columns. 

In [None]:
# Get the target variables. 
target = email_df["Prediction"]
target_names = ["ham", "spam"]

In [None]:
# Get the features. 
data = email_df.drop(columns=['Prediction'])
feature_names = data.columns
data.head()

In [None]:
# Split data into training and testing sets
# (train_test_split is already imported in the first cell).
# stratify=target matches the split settings used for the other models in this
# notebook, so the SVM is evaluated on a test set with the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=1, stratify=target)

In [None]:
# Support vector machine classifier with a linear kernel
# (SVC is imported with the other modules at the top of the notebook)
model = SVC(kernel='linear')
model.fit(X_train, y_train)

In [None]:
# Report the model's score on the testing data, to three decimal places
print(f'Test Acc: {model.score(X_test, y_test):.3f}')

In [None]:
# Calculate the classification report
# (classification_report is already imported in the first cell)
predictions = model.predict(X_test)
print(classification_report(y_test, predictions,
                            target_names=target_names))

Model 4 SVM : Accuracy 95%

## Model 5 Random Forest
Since the SVM model did not perform as well as the Logistic Regression model, we want to continue exploring model types to find which performs best. We will use a Random Forest model to see how its performance compares. Furthermore, we are interested in the model's feature importances, which provide additional insight into our data set.

In [None]:
# Separate the data into labels and features
# Separate the y variable, the labels
y = email_df['Prediction']

# Separate the X variable, the features
X = email_df.drop(columns=['Prediction'])

In [None]:
# Import the train_test_learn module
from sklearn.model_selection import train_test_split

# Split the data using train_test_split
# Assign a random_state of 1 to the function
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

In [None]:
# Create the StandardScaler instance
scaler = StandardScaler()

In [None]:
# Fit the Standard Scaler with the training data
X_scaler = scaler.fit(X_train)

In [None]:
# Fit the Standard Scaler with the training data
X_scaler = scaler.fit(X_train)

In [None]:
# Scale the training data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [None]:
# Create the random forest classifier instance
rf_model = RandomForestClassifier(n_estimators=500, random_state=78)

In [None]:
# Fit the model and use .ravel()on the "y_train" data. 
rf_model = rf_model.fit(X_train_scaled, y_train.ravel())

In [None]:
# Making predictions using the testing data
predictions = rf_model.predict(X_test_scaled)

In [None]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)

# Calculating the accuracy score
acc_score = accuracy_score(y_test, predictions)

In [None]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

In [None]:
# Feature importances from the fitted random forest
importances = rf_model.feature_importances_

# Pair each importance with its column name and order the pairs from largest to smallest
importances_sorted = list(zip(importances, X.columns))
importances_sorted.sort(reverse=True)

# Show the ten highest-ranked features
importances_sorted[:10]

Model 5 Random Forest : Accuracy 98%

### Random Forest Importances
| Feature | Score |
|-------|----------|
|'enron'| .0215 |
|'http'| .0149 |
|'hpl' | .0144 |
|'thanks' | .0126 |
|'hanks'| .0123 |
|'ali'| .0113 |
|'thank' | .0096 |
|'daren' | .0095 |
|'our'| .0093 |
|'subject' | .0089 |

## Summary of Models and Results
| Model | Accuracy |
|-------|----------|
|Logistic Regression Long Words Only| 92% |
|Logistic Regression Common Words Only| 95% |
|Logistic Regression All Columns | 97% |
|Support Vector Machine | 95% |
|Random Forest| 98% |

The Random Forest model utilizing all the columns performed with the highest accuracy of all the models. 

