In [71]:
import os
# Find the latest version of spark 3.x  from http://www.apache.org/dist/spark/ and enter as the spark version
# For example:
# spark_version = 'spark-3.5.0'
spark_version = 'spark-3.5.0'
os.environ['SPARK_VERSION']=spark_version

# Install Spark and Java
!apt-get update
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q http://www.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop3.tgz
!tar xf $SPARK_VERSION-bin-hadoop3.tgz
!pip install -q findspark

# Set Environment Variables
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop3"

# Start a SparkSession
import findspark
findspark.init()

0% [Working]            Hit:1 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Hit:2 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
Get:3 http://security.ubuntu.com/ubuntu jammy-security InRelease [110 kB]
Hit:4 https://ppa.launchpadcontent.net/c2d4u.team/c2d4u4.0+/ubuntu jammy InRelease
Hit:5 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:6 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:7 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Hit:8 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:9 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [119 kB]
Hit:10 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Fetched 229 kB in 16s (14.5 kB/s)
Reading package lists... Done


In [72]:
# Start Spark session
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("emailsFilter").getOrCreate()


In [73]:
# Read in data from git hub utilizing Spark

from pyspark import SparkFiles
url = "https://raw.githubusercontent.com/brnkath/project-4-group-8/main/Resources/emails.csv"
spark.sparkContext.addFile(url)
df = spark.read.csv(SparkFiles.get("emails.csv"), sep=",", header=True)

# Show DataFrame
df.show()

+---------+---+---+---+---+---+---+---+---+---+---+---+---+----+-----+---+---+----+----+----+----+----+---+---+---+---+---+---+---+---+----+---+---+---+---+----+---+-----+---+------+---+---+---+---+----+---+---+---+---+---+----+----+---+---------+---+---+---+---+---+-----+------+---+---+---+---+---+---+-----+-----+-----+---+----+-------+---+-----+---+---+-----+---+----+---+------+--------+---+-----------+----+-------+---+----+---+---+----+----+----+---+------+----+--------+-----+-----+----+------+------+-----+-----+---+------+---+---------+---+-----+-------+---+---+---+-----+----+----+-----+----+----+----+------+-----+----+----+---+----+---+---------+---+----------+----+----+----+----+--------+----+-----+---+------+---+---+-----+----+---+------+-----+-------+----+----+----+----------+------+---+-----+------+----+---+---+-----+------+--------+----+---+-------+----+-------+----+----+-----+---+-----+-----+---+---+---+---+-----+----------+---+----+---+----+-----+---+---------+----+-------+

In [74]:
#Drop the email id column
df = df.drop('Email No.')

In [75]:
#check for null values
is_not_empty = df.dropna().count() == df.count()

print("All rows and columns are not empty:", is_not_empty)

All rows and columns are not empty: True


In [76]:
# Check the balance of our target values
counts_df = df.groupBy('Prediction').count()
counts_df.show()

+----------+-----+
|Prediction|count|
+----------+-----+
|         0| 3672|
|         1| 1500|
+----------+-----+



In [77]:
#Calculate summary statistics on the column headers (email words)
from pyspark.sql import Row
from pyspark.sql.functions import length

#get a list of the columns
columns_list = df.columns

#turn the list of column names into a df
columns_df = spark.createDataFrame([Row(Column_Name=col) for col in columns_list])

#create a new column with the character counts of each word
columns_df_with_length= columns_df.withColumn("Character_Count", length("Column_Name"))

#summarize the character count stats
summary_stats = columns_df_with_length.select('Character_Count').describe()

summary_stats.show()

+-------+-----------------+
|summary|  Character_Count|
+-------+-----------------+
|  count|             3001|
|   mean|6.058980339886705|
| stddev|2.433143404448933|
|    min|                1|
|    max|               16|
+-------+-----------------+



In [78]:
#Convert the entire dataframe to a pandas dataframe to run with SciKit Learn (Models 3, 4 & 5)
email_df = df.toPandas()

In [79]:
#Narrow the columns to only words that are over 6 letters for Model 1

df_filtered = [col for col in df.columns if len(col) >6]
df_longwords = df.select(*df_filtered)

# Convert filtered dataframe to pandas to run with SciKit Learn
email_df_longwords = df_longwords.toPandas()

In [80]:
# Filter columns based on the total value being greater than 1000 for Model 2, this time utilizing a Spark SQL query

#Create a temporary view to query
df.createOrReplaceTempView("email_table")

#Write the query
sql_query = """
  SELECT *
  FROM email_table
  WHERE(
    SELECT SUM({col}) AS total_value
    FROM email_table
  ) >1000
"""

#Replace col with actual column name
columns = df.columns
sql_query = sql_query.format(col=columns[0])

#execute the query
df_common = spark.sql(sql_query)

#Convert filtered dataframe to pandas to run with SciKit Learn
email_df_common = df_common.toPandas()



In [98]:
#Query to find the average usage of all words (columns)
columns = df.columns[0]

final_query = f"""
  SELECT AVG(total_value) AS average_sum
  FROM (
    SELECT SUM( {columns} ) AS total_value
    FROM email_table
    )
"""

df_average_sum = spark.sql(final_query)

df_average_sum.show()

+-----------+
|average_sum|
+-----------+
|    34345.0|
+-----------+



In [None]:
# Import the modules
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

## Model 1 - Logistic Regression Long Words Only (>6 letters)
Our dataset includes 3000 columns.  For our first model we want to narrow down the number of columns used in our model to only those with more than 6 letters.

In [None]:
# Separate the data into labels and features
# Separate the y variable, the labels
y = email_df_longwords['Prediction']

# Separate the X variable, the features
X = email_df_longwords.drop(columns=['Prediction'])

In [None]:
# Import the train_test_learn module
from sklearn.model_selection import train_test_split

# Split the data using train_test_split
# Assign a random_state of 1 to the function
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

In [None]:
# Import the LogisticRegression module from SKLearn
from sklearn.linear_model import LogisticRegression

# Instantiate the Logistic Regression model
# Assign a random_state parameter of 1 to the model
classifier = LogisticRegression(solver='lbfgs', max_iter=200, random_state=1)

# Fit the model using training data
classifier.fit(X_train, y_train)

In [None]:
# Make a prediction using the testing data
predictions = classifier.predict(X_test)
results = pd.DataFrame({"Prediction": predictions, "Actual": y_test}).reset_index(drop=True)

In [None]:
# Print the balanced_accuracy score of the model
balanced_accuracy_score(y_test, predictions)

0.9333856209150326

In [None]:
# Generate a confusion matrix for the model
confusion_matrix(y_test, predictions)

array([[852,  66],
       [ 23, 352]])

In [None]:
# Print the classification report for the model
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.97      0.93      0.95       918
           1       0.84      0.94      0.89       375

    accuracy                           0.93      1293
   macro avg       0.91      0.93      0.92      1293
weighted avg       0.94      0.93      0.93      1293



In [None]:
# Import the RandomOverSampler module form imbalanced-learn
from imblearn.over_sampling import RandomOverSampler

# Instantiate the random oversampler model
# # Assign a random_state parameter of 1 to the model
model = RandomOverSampler(random_state=1)

# Fit the original training data to the random_oversampler model
X_over, y_over = model.fit_resample(X_train, y_train)


In [None]:
# Count the distinct values of the resampled labels data
y_over.nunique()

2

In [None]:
# Instantiate the Logistic Regression model
# Assign a random_state parameter of 1 to the model
classifier = LogisticRegression(solver='lbfgs', max_iter=200, random_state=1)

# Fit the model using the resampled training data
classifier.fit(X_over, y_over)

# Make a prediction using the testing data
predictions = classifier.predict(X_test)
results = pd.DataFrame({"Predictions": predictions, "Actual": y_test}).reset_index(drop=True)


In [None]:
# Print the balanced_accuracy score of the model
balanced_accuracy_score(y_test, predictions)

0.9341350762527233

In [None]:
# Generate a confusion matrix for the model
confusion_matrix(y_test, predictions)


array([[824,  94],
       [ 11, 364]])

In [None]:
# Print the classification report for the model
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.99      0.90      0.94       918
           1       0.79      0.97      0.87       375

    accuracy                           0.92      1293
   macro avg       0.89      0.93      0.91      1293
weighted avg       0.93      0.92      0.92      1293



Model 1 Logistic Regression Long Words Only : Accuracy 92%

## Model 2 - Logistic Regression Common Words Only
Even though our first model surpassed our accuracy threshold we are interested if rather than reducing the number of columns based on letter count, if we instead reduce the number of columns based on the frequency (or how common) a word occurred in emails impacts the accuracy.

In [None]:
# Separate the data into labels and features
# Separate the y variable, the labels
y = email_df_common['Prediction']

# Separate the X variable, the features
X = email_df_common.drop(columns=['Prediction'])

In [None]:
# Import the train_test_learn module
from sklearn.model_selection import train_test_split

# Split the data using train_test_split
# Assign a random_state of 1 to the function
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

In [None]:
# Import the LogisticRegression module from SKLearn
from sklearn.linear_model import LogisticRegression

# Instantiate the Logistic Regression model
# Assign a random_state parameter of 1 to the model
classifier = LogisticRegression(solver='lbfgs', max_iter=1000, random_state=1)

# Fit the model using training data
classifier.fit(X_train, y_train)

In [None]:
# Make a prediction using the testing data
predictions = classifier.predict(X_test)
results = pd.DataFrame({"Prediction": predictions, "Actual": y_test}).reset_index(drop=True)


In [None]:
# Print the balanced_accuracy score of the model
balanced_accuracy_score(y_test, predictions)

0.9669281045751634

In [None]:
# Generate a confusion matrix for the model
confusion_matrix(y_test, predictions)

array([[894,  24],
       [ 15, 360]])

In [None]:
# Print the classification report for the model
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.98      0.97      0.98       918
           1       0.94      0.96      0.95       375

    accuracy                           0.97      1293
   macro avg       0.96      0.97      0.96      1293
weighted avg       0.97      0.97      0.97      1293



In [None]:
# Import the RandomOverSampler module form imbalanced-learn
from imblearn.over_sampling import RandomOverSampler

# Instantiate the random oversampler model
# # Assign a random_state parameter of 1 to the model
model = RandomOverSampler(random_state=1)

# Fit the original training data to the random_oversampler model
X_over, y_over = model.fit_resample(X_train, y_train)

In [None]:
# Count the distinct values of the resampled labels data
y_over.nunique()

2

In [None]:
# Instantiate the Logistic Regression model
# Assign a random_state parameter of 1 to the model
classifier = LogisticRegression(solver='lbfgs', max_iter=1000, random_state=1)

# Fit the model using the resampled training data
classifier.fit(X_over, y_over)

# Make a prediction using the testing data
predictions = classifier.predict(X_test)
results = pd.DataFrame({"Predictions": predictions, "Actual": y_test}).reset_index(drop=True)

In [None]:
# Print the balanced_accuracy score of the model
balanced_accuracy_score(y_test, predictions)

0.9674161220043573

In [None]:
# Generate a confusion matrix for the model
confusion_matrix(y_test, predictions)

array([[890,  28],
       [ 13, 362]])

In [None]:
# Print the classification report for the model
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.99      0.97      0.98       918
           1       0.93      0.97      0.95       375

    accuracy                           0.97      1293
   macro avg       0.96      0.97      0.96      1293
weighted avg       0.97      0.97      0.97      1293



Model 2 Logistic Regression Common Words Only : Accuracy 95%

## Model 3 Logistic Regression All Columns
Finally we are interested in how utilizing all the columns of data impacts the accuracy of the logistic regression models.

In [None]:
# Separate the data into labels and features
# Separate the y variable, the labels
y = email_df['Prediction']

# Separate the X variable, the features
X = email_df.drop(columns=['Prediction'])

In [None]:
# Import the train_test_learn module
from sklearn.model_selection import train_test_split

# Split the data using train_test_split
# Assign a random_state of 1 to the function
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

In [None]:
# Import the LogisticRegression module from SKLearn
from sklearn.linear_model import LogisticRegression

# Instantiate the Logistic Regression model
# Assign a random_state parameter of 1 to the model
classifier = LogisticRegression(solver='lbfgs', max_iter=1000, random_state=1)

# Fit the model using training data
classifier.fit(X_train, y_train)

In [None]:
# Make a prediction using the testing data
predictions = classifier.predict(X_test)
results = pd.DataFrame({"Prediction": predictions, "Actual": y_test}).reset_index(drop=True)


In [None]:
# Print the balanced_accuracy score of the model
balanced_accuracy_score(y_test, predictions)

0.9669281045751634

In [None]:
# Generate a confusion matrix for the model
confusion_matrix(y_test, predictions)

array([[894,  24],
       [ 15, 360]])

In [None]:
# Print the classification report for the model
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.98      0.97      0.98       918
           1       0.94      0.96      0.95       375

    accuracy                           0.97      1293
   macro avg       0.96      0.97      0.96      1293
weighted avg       0.97      0.97      0.97      1293



In [None]:
# Import the RandomOverSampler module form imbalanced-learn
from imblearn.over_sampling import RandomOverSampler

# Instantiate the random oversampler model
# # Assign a random_state parameter of 1 to the model
model = RandomOverSampler(random_state=1)

# Fit the original training data to the random_oversampler model
X_over, y_over = model.fit_resample(X_train, y_train)

In [None]:
# Count the distinct values of the resampled labels data
y_over.nunique()

2

In [None]:
# Instantiate the Logistic Regression model
# Assign a random_state parameter of 1 to the model
classifier = LogisticRegression(solver='lbfgs', max_iter=1000, random_state=1)

# Fit the model using the resampled training data
classifier.fit(X_over, y_over)

# Make a prediction using the testing data
predictions = classifier.predict(X_test)
results = pd.DataFrame({"Predictions": predictions, "Actual": y_test}).reset_index(drop=True)

In [None]:
# Print the balanced_accuracy score of the model
balanced_accuracy_score(y_test, predictions)

0.9674161220043573

In [None]:
# Generate a confusion matrix for the model
confusion_matrix(y_test, predictions)

array([[890,  28],
       [ 13, 362]])

In [None]:
# Print the classification report for the model
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.99      0.97      0.98       918
           1       0.93      0.97      0.95       375

    accuracy                           0.97      1293
   macro avg       0.96      0.97      0.96      1293
weighted avg       0.97      0.97      0.97      1293



Model 3 Logistic Regression All Columns : Accuracy 97%

## Model 4 Support Vector Machine (SVM)
Despite all Logistic Regression Models meeting our threshold, we were interested in how other classification models would perform with the data. Since utilizing all of the columns resulted in the most accurate logistic regression model, we want to run this model with all of the columns.

In [None]:
# Get the target variables.
target = email_df["Prediction"]
target_names = ["ham", "spam"]

In [None]:
# Get the features.
data = email_df.drop(columns=['Prediction'])
feature_names = data.columns

In [None]:
# Split data into training and testing
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=1)

In [None]:
# Support vector machine linear classifier
from sklearn.svm import SVC
model = SVC(kernel='linear')
model.fit(X_train, y_train)

In [None]:
# Model Accuracy
print('Test Acc: %.3f' % model.score(X_test, y_test))

Test Acc: 0.952


In [None]:
# Calculate the classification report
from sklearn.metrics import classification_report
predictions = model.predict(X_test)
print(classification_report(y_test, predictions,
                            target_names=target_names))

              precision    recall  f1-score   support

         ham       0.97      0.97      0.97       909
        spam       0.92      0.92      0.92       384

    accuracy                           0.95      1293
   macro avg       0.94      0.94      0.94      1293
weighted avg       0.95      0.95      0.95      1293



Model 4 SVM : Accuracy 95%

## Model 5 Random Forest
Since the SVM model did not perform as well as the Logistic Regression model we want to continue to explore model types to find which would perform best. We will use a Random Forest model to see how it's performance compares.  Furthermore, we are interested in the features importances element to provide additional insight into our dataset.

In [None]:
# Separate the data into labels and features
# Separate the y variable, the labels
y = email_df['Prediction']

# Separate the X variable, the features
X = email_df.drop(columns=['Prediction'])

In [None]:
# Import the train_test_learn module
from sklearn.model_selection import train_test_split

# Split the data using train_test_split
# Assign a random_state of 1 to the function
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

In [None]:
# Create the StandardScaler instance
scaler = StandardScaler()

In [None]:
# Fit the Standard Scaler with the training data
X_scaler = scaler.fit(X_train)

In [None]:
# Fit the Standard Scaler with the training data
X_scaler = scaler.fit(X_train)

In [None]:
# Scale the training data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [None]:
# Create the random forest classifier instance
rf_model = RandomForestClassifier(n_estimators=500, random_state=78)

In [None]:
# Fit the model and use .ravel()on the "y_train" data.
rf_model = rf_model.fit(X_train_scaled, y_train.ravel())

In [None]:
# Making predictions using the testing data
predictions = rf_model.predict(X_test_scaled)

In [None]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)

# Calculating the accuracy score
acc_score = accuracy_score(y_test, predictions)

In [None]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,900,18
Actual 1,14,361


Accuracy Score : 0.9752513534416086
Classification Report
              precision    recall  f1-score   support

           0       0.98      0.98      0.98       918
           1       0.95      0.96      0.96       375

    accuracy                           0.98      1293
   macro avg       0.97      0.97      0.97      1293
weighted avg       0.98      0.98      0.98      1293



In [None]:
# Get the feature importance array
importances = rf_model.feature_importances_
# List the top 10 most important features
importances_sorted = sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)
importances_sorted[:10]

[(0.02156687312277583, 'enron'),
 (0.014895284618837367, 'http'),
 (0.014422464385800613, 'hpl'),
 (0.012607624252360472, 'thanks'),
 (0.012299429110483333, 'hanks'),
 (0.011368140797780764, 'ali'),
 (0.009555594412565172, 'thank'),
 (0.009500108699916318, 'daren'),
 (0.009330608546977679, 'our'),
 (0.008940569860094765, 'subject')]

Model 5 Random Forest : Accuracy 98%

### Random Forest Importances
| Feature | Score |
|-------|----------|
|'enron'| .0215 |
|'http'| .0149 |
|'hpl' | .0144 |
|'thanks' | .0126 |
|'hanks'| .0123 |
|'ali'| .0113 |
|'thank' | .0096 |
|'daren' | .0095 |
|'our'| .0093 |
|'subject' | .0089 |

## Summary of Models and Results
| Model | Accuracy |
|-------|----------|
|Logistic Regression Long Words Only| 92% |
|Logistic Regression Common Words Only| 95% |
|Logistic Regression All Columns | 97% |
|Support Vector Machine | 95% |
|Random Forest| 98% |

The Random Forest model utilizing all the columns performed with the highest accuracy of all the models.

