In [1]:
# Import the modules
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report


In [2]:
# Read the CSV file from the Resources folder into a Pandas DataFrame
path = "Resources/emails.csv"
email_df = pd.read_csv(path)

# Review the DataFrame
email_df.head()


Unnamed: 0,Email No.,the,to,ect,and,for,of,a,you,hou,...,connevey,jay,valued,lay,infrastructure,military,allowing,ff,dry,Prediction
0,Email 1,0,0,1,0,0,0,2,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Email 2,8,13,24,6,6,2,102,1,27,...,0,0,0,0,0,0,0,1,0,0
2,Email 3,0,0,1,0,0,0,8,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Email 4,0,5,22,0,5,1,51,2,10,...,0,0,0,0,0,0,0,0,0,0
4,Email 5,7,6,17,1,5,2,57,0,9,...,0,0,0,0,0,0,0,1,0,0


In [3]:
# Drop the Email No. column
email_df = email_df.drop(columns=['Email No.'])


In [6]:
# Narrow the columns to only words that are over 6 letters
email_df_filtered = [col for col in email_df.columns if len(col) > 6]
email_df = email_df[email_df_filtered]

email_df.head(10)

Unnamed: 0,forwarded,company,attached,information,message,contract,questions,volumes,following,production,...,imbalances,reallocated,australia,remains,enhancements,connevey,infrastructure,military,allowing,Prediction
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,3,0,1,0,0,0,0,0,1,3,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2,0,0,0,0,4,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
8,0,0,0,0,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
9,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
# Separate the data into labels and features
# Separate the y variable, the labels
y = email_df['Prediction']

# Separate the X variable, the features
X = email_df.drop(columns=['Prediction'])


In [8]:
# Review the y variable Series
y.head(10)


0    0
1    0
2    0
3    0
4    0
5    1
6    0
7    1
8    0
9    0
Name: Prediction, dtype: int64

In [9]:
# Review the X variable DataFrame
X.head(10)


Unnamed: 0,forwarded,company,attached,information,message,contract,questions,volumes,following,production,...,solmonson,imbalances,reallocated,australia,remains,enhancements,connevey,infrastructure,military,allowing
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,3,0,1,0,0,0,0,0,1,3,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2,0,0,0,0,4,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
9,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
# Check the balance of our target values
y.value_counts()


0    3672
1    1500
Name: Prediction, dtype: int64

In [11]:
# Import the train_test_learn module
from sklearn.model_selection import train_test_split

# Split the data using train_test_split
# Assign a random_state of 1 to the function
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)


In [12]:
# Import the LogisticRegression module from SKLearn
from sklearn.linear_model import LogisticRegression

# Instantiate the Logistic Regression model
# Assign a random_state parameter of 1 to the model
classifier = LogisticRegression(solver='lbfgs', max_iter=200, random_state=1)

# Fit the model using training data
classifier.fit(X_train, y_train)


In [13]:
# Make a prediction using the testing data
predictions = classifier.predict(X_test)
results = pd.DataFrame({"Prediction": predictions, "Actual": y_test}).reset_index(drop=True)
results.head(10)


Unnamed: 0,Prediction,Actual
0,0,0
1,0,0
2,0,0
3,0,1
4,0,0
5,0,0
6,1,1
7,0,0
8,0,0
9,0,0


In [14]:
# Print the balanced_accuracy score of the model
balanced_accuracy_score(y_test, predictions)


0.9333856209150326

In [15]:
# Generate a confusion matrix for the model
confusion_matrix(y_test, predictions)


array([[852,  66],
       [ 23, 352]], dtype=int64)

In [16]:
# Print the classification report for the model
print(classification_report(y_test, predictions))


              precision    recall  f1-score   support

           0       0.97      0.93      0.95       918
           1       0.84      0.94      0.89       375

    accuracy                           0.93      1293
   macro avg       0.91      0.93      0.92      1293
weighted avg       0.94      0.93      0.93      1293



In [17]:
# Import the RandomOverSampler module form imbalanced-learn
from imblearn.over_sampling import RandomOverSampler

# Instantiate the random oversampler model
# # Assign a random_state parameter of 1 to the model
model = RandomOverSampler(random_state=1)

# Fit the original training data to the random_oversampler model
X_over, y_over = model.fit_resample(X_train, y_train)


In [18]:
# Count the distinct values of the resampled labels data
y_over.nunique()


2

In [19]:
# Instantiate the Logistic Regression model
# Assign a random_state parameter of 1 to the model
classifier = LogisticRegression(solver='lbfgs', max_iter=200, random_state=1)

# Fit the model using the resampled training data
classifier.fit(X_over, y_over)

# Make a prediction using the testing data
predictions = classifier.predict(X_test)
results = pd.DataFrame({"Predictions": predictions, "Actual": y_test}).reset_index(drop=True)

results.head(10)


Unnamed: 0,Predictions,Actual
0,0,0
1,0,0
2,0,0
3,1,1
4,0,0
5,0,0
6,1,1
7,0,0
8,0,0
9,0,0


In [20]:
# Print the balanced_accuracy score of the model
balanced_accuracy_score(y_test, predictions)


0.9341350762527233

In [21]:
# Generate a confusion matrix for the model
confusion_matrix(y_test, predictions)


array([[824,  94],
       [ 11, 364]], dtype=int64)

In [22]:
# Print the classification report for the model
print(classification_report(y_test, predictions))


              precision    recall  f1-score   support

           0       0.99      0.90      0.94       918
           1       0.79      0.97      0.87       375

    accuracy                           0.92      1293
   macro avg       0.89      0.93      0.91      1293
weighted avg       0.93      0.92      0.92      1293

