In [None]:
# Breanna Powell and Melody Behdarvandian
# CSS 576
# Assignment 3

# IDE: Visual Studio Code 
# Follow these steps if you have not used Jupyter Notebooks in VS Code before:
# https://code.visualstudio.com/docs/languages/python

# -------------------------------------------------------------------------------------------------
#                    FOLLOW THESE STEPS TO INSTALL TENSORFLOW in VS CODE
# -------------------------------------------------------------------------------------------------
# Open Anaconda Navigator 
# Launch VS Code through Anaconda Navigator
# Terminal > New Terminal
#
# https://docs.anaconda.com/anaconda/user-guide/tasks/tensorflow/
# 1) Use the commands to create a tensorflow environment:
#    $ conda create -n tf tensorflow
#    $ conda activate tf
#
# https://code.visualstudio.com/docs/datascience/jupyter-notebooks#_create-or-open-a-jupyter-notebook
# 2) In the upper right hand corner, switch the kernel from "base" over to "tf(Python 3.10.9)"
# This will change the kernel over to tensorflow's kernel.
#
# 3) Close this document and reopen it from Anaconda Navigator, but instead of "base" select "tf" from the dropdown menu
# If you don't see "tf" in the dropdown menu, try closing Anaconda Navigator and reopening it.
#
# 4) Install the following:
#    $ conda install ipykernel
#    $ conda install pandas matplotlib scikit-learn seaborn
#    $ conda install -c conda-forge tensorflow keras
# -------------------------------------------------------------------------------------------------

import numpy as np
import pandas as pd

# Load the raw email dataset; the path is relative to the notebook's working directory.
# Later cells read the 'email' and 'label' columns from this frame.
emailData = pd.read_csv('emails.csv')

## DATA EXPLORATION

In [None]:
emailData.shape # Check the size of the dataset: (number of rows, number of columns)

In [None]:
# Check to see if there are missing values (NaN or null):
# info() lists the non-null count and dtype of every column
emailData.info()

In [None]:
# Preview the first rows of the raw data
emailData.head()

# PRE-PROCESSING TECHNIQUES

In [None]:
# Replace labels with 0 for ham and 1 for spam.
# The dtype guard makes this cell safe to re-run: once the column holds integers,
# comparing it to "spam" a second time would silently turn every label into 0.
if not pd.api.types.is_integer_dtype(emailData["label"]):
    emailData["label"] = (emailData["label"] == "spam").astype(int)

# Check to make sure it worked
emailData.head()

In [None]:
# Features (x) come from the 'email' column, targets (y) from the 'label' column
x, y = emailData['email'], emailData["label"]

In [None]:
# Apply a count vectorizer to the email text to convert it from text to token counts.
# NOTE(review): the vectorizer is fitted on ALL emails here, before the train/test split further
# down, so the test emails do influence which words are kept. To keep the test data out of the
# preprocessing, split first and fit on the training text only -- TODO confirm whether this matters.
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html

from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(strip_accents= "unicode", stop_words="english", max_features=40) # This cuts the features to the 40 most frequent words
features = cv.fit_transform(x)
features.get_shape()  # (number of emails, number of word features)


In [None]:
# Vocabulary words chosen by the vectorizer; used further down as the column names of x
columns = cv.get_feature_names_out()

# Features is now in csr format (rather than a series or dataframe)
# https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.csr_matrix.html
type(features)  # NOTE: not displayed -- a notebook cell only echoes its last expression
x = pd.DataFrame(features.toarray()) # convert it to a (dense) dataframe instead
x.describe()

In [None]:
# Checking to make sure that there are no NaN's:
# the first sum() counts the NaNs in each column, the second adds those counts together
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.isna.html
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.sum.html
x.isna().sum().sum()

# Should print 0

In [None]:
# Rebuild the count table from `features` with the vocabulary words as column names, so this cell
# gives the same result every time it is run (assigning the names onto an already trimmed frame
# fails with a length mismatch).
# In the dataset we are using, &gt; and &lt; are greater than or less than signs in html - not words.
# errors="ignore" keeps the cell working when one of them did not make it into the vocabulary.
x = pd.DataFrame(features.toarray(), columns=columns).drop(columns=["gt", "lt"], errors="ignore")
x.describe()

In [None]:
# Largest value found in each feature column
x.max()

In [None]:
# ----- INITIAL FEATURE REDUCTION: THIS PART OF THE CODE MAY TAKE UP TO 30 MINUTES TO RUN ----
# maxIndex = len(x.columns)-1

# for i in range(maxIndex,0,-1):
    
#     if x[i].max() < 5.0:
#         x = x.drop(x.columns[i], 1)

In [None]:

# x = x.drop(x.columns[0], 1)

# Feature Reduction

In [None]:
# Standardize every feature to a z-score: subtract the column mean, then DIVIDE by the column
# standard deviation. (Subtracting the standard deviation only shifts the data a second time and
# leaves the features on different scales.)
# A constant column has a standard deviation of 0; dividing by 1 instead keeps it at 0 rather
# than turning it into NaN.
feature_std = x.std().replace(0, 1)
x = (x - x.mean()) / feature_std


In [None]:
# Cross-product (scatter) matrix of the feature table: one row and one column per feature
zVar = np.dot(x.T,x)

# zVar is symmetric by construction, so use eigh: it is the routine meant for symmetric matrices
# and always returns real eigenvalues (the general eig gives no ordering guarantee and may return
# complex values caused by rounding).
eigenvalues, eigenvectors = np.linalg.eigh(zVar)

# eigh returns the eigenvalues in ascending order; put the largest first so that position k is the
# k-th principal component, and reorder the eigenvector columns to match.
descending = np.argsort(eigenvalues)[::-1]
eigenvalues = eigenvalues[descending]
eigenvectors = eigenvectors[:, descending]

In [None]:
# Square matrix with the eigenvalues on its main diagonal and zeros everywhere else
diagonalMatrix = np.diag(eigenvalues)

In [None]:
# Product of the matrix zVar with its eigenvectors.
# NOTE(review): if the goal was to project the emails onto the principal components, that would be
# np.dot(x, eigenvectors) -- TODO confirm the intent.
newMatrix = np.dot(zVar, eigenvectors)

In [None]:
# Rank the eigenvalues from largest to smallest: a scree plot labels the component that explains
# the most variance as PC1, and the decomposition is not guaranteed to return them in that order.
ranked_eigenvalues = np.sort(eigenvalues)[::-1]

#1. Calculate the proportion of variance explained by each principal component
sum_eigenvalues = np.sum(ranked_eigenvalues)

prop_var = list(ranked_eigenvalues / sum_eigenvalues)

#2. Calculate the cumulative variance (running total of the proportions, in one pass)
cum_var = list(np.cumsum(prop_var))


# Plot scree plot from PCA
import matplotlib.pyplot as plt

x_labels = ['PC{}'.format(i+1) for i in range(len(prop_var))]

plt.plot(x_labels, prop_var, marker='o', markersize=6, color='skyblue', linewidth=2, label='Proportion of variance')
plt.plot(x_labels, cum_var, marker='o', color='orange', linewidth=2, label="Cumulative variance")
plt.legend()
plt.title('Scree plot')
plt.xlabel('Principal components')
plt.ylabel('Proportion of variance')
plt.show()

In [None]:
# Show the proportion of variance of every component as numbers
display(prop_var)

In [None]:
# Singular value decomposition of the feature table; the rows of vh are used below as loading vectors.
# full_matrices=False: the full decomposition builds a square u with one row AND one column per
# email, which is never used and can take a very large amount of memory. vh keeps the same shape
# as long as there are at least as many emails as features.
u, s, vh = np.linalg.svd(x, full_matrices=False)

In [None]:
# Number of feature columns currently in x
nfeatures = x.shape[1]
nfeatures

In [None]:
# Signed square of every entry of vh: the magnitude is the squared entry and the sign of the
# original entry is kept (negative entries give negative squares).
loadings = np.asarray(vh)[:nfeatures, :nfeatures]
squared_loadings = loadings ** 2
signed_squares = np.where(loadings < 0, squared_loadings * -1, squared_loadings)

# Keep the nested-list layout: one inner list per row of vh
Vsquare = [list(row) for row in signed_squares]
    

In [None]:
# Show the signed squared entries of vh
Vsquare

In [None]:
from matplotlib.pyplot import bar, grid, title, xlabel, ylabel

# Bar chart of the first row of Vsquare (the signed squared entries of the first row of vh),
# one bar per feature.
bar(range(0,nfeatures), Vsquare[0], 0.5)  # 0.5 is the bar width

# grid has to be CALLED to draw the grid lines; a bare `grid` only names the function and does nothing
grid(True)

xlabel('Feature index')
ylabel('Importance of feature')
title("Loading Vector");

In [None]:
# Summary statistics of the feature table as it goes into the train/test split
x.describe()

# Split into Train and Test

In [None]:
# Split into training and testing data (80% train / 20% test)
from sklearn.model_selection import train_test_split

# random_state pins the shuffle so every run produces the same split (and therefore comparable
# scores); stratify=y keeps the spam/ham ratio the same in the training and the testing part.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# Size of the training features
x_train.shape

In [None]:
# Number of training rows ...
len(x_train)

In [None]:
# Number of training labels; should equal the number of training rows
len(y_train)

# Neural Network

In [None]:
import numpy as np
from tensorflow import keras
from tensorflow.keras import layers

# Build the input shape for the first layer from the second dimension of x_train
train_shape = x_train.shape
b = train_shape[1]  # second dimension of the training data
input_shape = (b,)  # 1-tuple, as passed to the first Dense layer

print(input_shape)

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
# https://keras.io/api/models/sequential/


# Activation function options:
# - relu
# - sigmoid - good for last layer of binary classification
# - elu - need to research this one

# Fully connected binary classifier: two hidden ReLU layers (8 and 4 units) feeding a single
# sigmoid unit.
model = Sequential([
    Dense(8, input_shape = input_shape, activation = 'relu'),
    Dense(4, activation = 'relu'),
    Dense(1, activation = 'sigmoid'), # Always use a sigmoid function for final layer when performing binary classification.
])

In [None]:
# https://keras.io/api/callbacks/early_stopping/
# https://machinelearningmastery.com/early-stopping-to-avoid-overtraining-neural-network-models/
# Monitor = (must exactly match a name that Keras logs during model.fit)
#  - loss
#  - accuracy (only if accuracy is listed in the compiled metrics)
#  - precision (the default name of keras.metrics.Precision())
#  - val_loss, val_precision, ... (val stands for "validation"; these only exist if validation
#    data is passed to the model.fit function)

# Mode =
#  - min (stop when the monitored value stops going down, e.g. a loss)
#  - max (stop when the monitored value stops going up, e.g. precision)

# min_delta = 
#  - 0.0001

# https://lambdalabs.com/blog/tensorflow-2-0-tutorial-04-early-stopping
# Earlier attempts, kept for reference:
# callback = keras.callbacks.EarlyStopping(monitor='prec', min_delta=0.0001,patience=1) #TERRIBLE
# callback = keras.callbacks.EarlyStopping(monitor='precision_metric', min_delta=0.0001,patience=1) # error
# callback = keras.callbacks.EarlyStopping(monitor='precision_metric', patience=3, mode=max) # error
# callback = keras.callbacks.EarlyStopping(monitor='precision_metric', patience=3) # error
# callback = keras.callbacks.EarlyStopping(monitor='auc', patience=3) # doesn't work
# callback = keras.callbacks.EarlyStopping(monitor='loss', min_delta=0.0001, patience=3) # BAD
# callback = keras.callbacks.EarlyStopping(monitor='mse', min_delta=0.0001, patience=3) # BAD
# callback = keras.callbacks.EarlyStopping(monitor='loss', patience=3) # monitors loss if the performance goes down

# 'precision_metric' is not a name Keras logs, so a callback watching it can never trigger.
# Watch 'precision' instead and state mode='max' explicitly (higher precision is better) rather
# than relying on Keras to guess the direction from the name.
# Stops training after 1 epoch without an improvement in training precision.
callback = keras.callbacks.EarlyStopping(monitor='precision', mode='max', patience=1)

In [None]:
model.summary() # Prints a summary of the layers that make up this network

In [None]:
# https://www.geeksforgeeks.org/choose-optimal-number-of-epochs-to-train-a-neural-network-in-keras/
# Number of epochs passed to model.fit; the early-stopping callback may end training sooner
epochs = 50

In [None]:
# https://keras.io/api/metrics/classification_metrics/#precision-class
# Optimizer options:
# - sgd (stochastic gradient descent)
# - adam?
# - rmsprop?

# https://keras.io/api/losses/
# https://www.tensorflow.org/api_docs/python/tf/keras/losses 
# Loss parameter options: 
# - binary_crossentropy https://keras.io/api/losses/probabilistic_losses/#binarycrossentropy-class
# - mse 

# https://www.tensorflow.org/api_docs/python/tf/keras/Sequential
# https://www.tensorflow.org/api_docs/python/tf/keras/metrics
# metrics can be a list, like this: metrics=["mae", "acc"]
# Metrics parameter options:
# - accuracy
# - precision https://www.tensorflow.org/api_docs/python/tf/keras/metrics/Precision
# - recall

# TO DO: decide if we should use any optimizers (rmsprop, adam, sgd, adagrad, adadelta)
# Imported from tensorflow.keras like every other Keras import in this notebook; mixing in the
# standalone `keras` package can pull in a different Keras version than the one the model uses.
from tensorflow.keras.optimizers import Adam, SGD, Adagrad, Adadelta, RMSprop

# https://www.tensorflow.org/api_docs/python/tf/keras/metrics/Precision
# https://keras.io/api/metrics/classification_metrics/#precision-class

# OPTIMIZING FOR PRECISION
# Attempt 1:
# Note: With a loss function using from_logits=True, thresholds must be 0
# NOTE(review): the model ends in a sigmoid layer, so it outputs probabilities rather than logits;
# with this model the loss would need from_logits=False -- TODO confirm before retrying.
# model.compile(optimizer='adam',
#               loss=keras.losses.BinaryCrossentropy(from_logits=True),
#               metrics=[keras.metrics.Precision(thresholds=0)]) 
# Attempt 2:
# The metric gets a fixed name so it is always logged as 'precision'; without it Keras may append
# a counter (precision_1, precision_2, ...) each time this cell is re-run.
# NOTE(review): 'mse' trains, but binary_crossentropy is the usual loss for a sigmoid output.
model.compile(optimizer='sgd',
              loss='mse',
              metrics=[keras.metrics.Precision(name='precision')])

# OPTIMIZING FOR AUC
# model.compile(optimizer='sgd',
#               loss=keras.losses.BinaryCrossentropy(from_logits=True),
#               metrics=[keras.metrics.AUC(from_logits=True)])

model.fit(x = x_train, y = y_train, epochs=epochs, callbacks=[callback])

In [None]:
# Evaluate on the held-out test data; score[0] is the loss and score[1] the compiled metric
score = model.evaluate(x_test, y_test, verbose=0)
print(f"Test loss: {score[0]}")
print(f"Test precision: {score[1]}")

# Metrics

In [None]:
from sklearn.metrics import f1_score, precision_score,recall_score

# Model outputs for the test emails. The network ends in a single sigmoid unit, so these are
# scores between 0 and 1, not 0/1 labels.
# NOTE(review): f1_score / precision_score / recall_score are imported above but never called.
# They expect class labels, so the scores need a cut-off first, e.g.
# (y_predict > 0.5).astype(int) -- TODO confirm the threshold to use.
y_predict = model.predict(x_test)


# I wonder about this metric? -- it uses spam and ham in the example!
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.brier_score_loss.html#sklearn.metrics.brier_score_loss

# Visual

In [None]:
# https://towardsdatascience.com/spam-detection-in-emails-de0398ea3b48

# from sklearn.metrics import confusion_matrix
# import seaborn as sns
# import matplotlib.pyplot as plt

# cf_matrix =confusion_matrix(y_test,y_predict)

# ax= plt.subplot()
# #annot=True to annotate cells
# sns.heatmap(cf_matrix, annot=True, ax = ax,cmap='Blues',fmt='');
# # labels, title and ticks
# ax.set_xlabel('Predicted labels');
# ax.set_ylabel('True labels');
# ax.set_title('Confusion Matrix');
# ax.xaxis.set_ticklabels(['Not Spam', 'Spam']); ax.yaxis.set_ticklabels(['Not Spam', 'Spam']);

# Clustering Techniques

## DBSCAN

In [None]:
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

# https://scikit-learn.org/stable/auto_examples/cluster/plot_dbscan.html#sphx-glr-auto-examples-cluster-plot-dbscan-py
#------------------------------DBSCAN ------------------------------------
# Rescale every feature with StandardScaler before the distance-based clustering below.
# From here on `x` is the array returned by the scaler and is indexed as x[:, column].
scaler = StandardScaler()
x = scaler.fit_transform(x)

# Scatter plot of the first two feature columns
plt.scatter(x[:, 0], x[:, 1])
plt.show()



In [None]:
# https://towardsdatascience.com/machine-learning-clustering-dbscan-determine-the-optimal-value-for-epsilon-eps-python-example-3100091cfbc
import numpy as np
from sklearn.neighbors import NearestNeighbors
from matplotlib import pyplot as plt
import seaborn as sns
sns.set()  # apply seaborn's default plot styling

# For every point, look up its 2 nearest points in the fitted data and the distances to them.
# The points queried are the same ones the model was fitted on, so presumably the first neighbour
# is the point itself; the next cell uses the second distance column to help find the ideal eps value.
neigh = NearestNeighbors(n_neighbors=2) # only 2 neighbours per point, not all of them
nbrs = neigh.fit(x)
distances, indices = nbrs.kneighbors(x)

In [None]:
# k-distance plot: take the second distance column (index 1) for every point and sort it from
# smallest to largest.
# The ndim check makes the cell safe to re-run: after the first run `distances` is already the
# sorted 1-D array, and taking a second column from it again would raise an IndexError.
if distances.ndim == 2:
    distances = np.sort(distances[:, 1])
plt.ylabel("epsilon values")
plt.plot(distances);

# The ideal value for epsilon will be near the elbow of this graph
# The next code section searches a small list of candidate values to find a good epsilon
# (the range of 100 to 200 was used before feature reduction)

In [None]:
# NOTE: TAKES ABOUT A MINUTE TO RUN

from sklearn.cluster import DBSCAN
from sklearn import metrics
def find_ideal_eps_and_samples_for_2_clusters(
    data=None,
    eps_values=(1, 2, 3, 4, 5, 6, 7, 9, 10),
    min_samples_values=(3, 4, 5, 6, 7, 9, 10, 11, 12, 13, 14, 15),
    n_clusters=2,
):
    """Try DBSCAN settings in order and return the first fit with the wanted number of clusters.

    Every eps value is combined with every min_samples value (eps in the outer loop). The
    first combination whose labels contain exactly `n_clusters` clusters (noise, labelled -1,
    is not counted) is printed and its fitted DBSCAN model is returned.

    Parameters:
        data: samples to cluster. Defaults to the notebook-level feature matrix `x`.
        eps_values: candidate neighbourhood radii, tried in the given order.
            (Before feature reduction the candidates were [100, 125, 150, 200].)
        min_samples_values: candidate min_samples settings, tried in the given order.
            NOTE(review): 8 is missing from both default lists, as in the original
            search -- TODO confirm whether that is intentional.
        n_clusters: number of clusters a fit must produce to be accepted.

    Returns:
        The fitted DBSCAN model of the first accepted combination.

    Raises:
        ValueError: if no combination produces `n_clusters` clusters. (Returning None here
            would only fail later, with a confusing error on `db.labels_`.)
    """
    if data is None:
        data = x  # notebook-level feature matrix

    for eps in eps_values:
        for min_samples in min_samples_values:
            db = DBSCAN(eps=eps, min_samples=min_samples).fit(data)
            # Count the distinct labels, leaving out the noise label -1
            found_clusters = len(set(db.labels_) - {-1})
            if found_clusters == n_clusters:
                print(f"The ideal eps is: {eps}")
                print(f"The ideal min_samples is: {min_samples}")
                return db

    raise ValueError(
        f"No combination of eps={list(eps_values)} and "
        f"min_samples={list(min_samples_values)} produced {n_clusters} clusters"
    )

# Print out the ideal value for eps and for min_samples parameters,
# and keep the fitted DBSCAN model that produced them:
db = find_ideal_eps_and_samples_for_2_clusters()

In [None]:
# Cluster label of every sample; the label -1 marks noise
labels = db.labels_

# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(labels) - {-1})
n_noise_ = int((labels == -1).sum())

print(f"Estimated number of clusters: {n_clusters_:d}")
print(f"Estimated number of noise points: {n_noise_:d}")

In [None]:
# Scores that compare the cluster labels with the true spam/ham labels
label_agreement_scores = (
    ("Homogeneity", metrics.homogeneity_score),
    ("Completeness", metrics.completeness_score),
    ("V-measure", metrics.v_measure_score),
    ("Adjusted Rand Index", metrics.adjusted_rand_score),
    ("Adjusted Mutual Information", metrics.adjusted_mutual_info_score),
)
for score_name, score_fn in label_agreement_scores:
    print(f"{score_name}: {score_fn(y, labels):.3f}")

# The silhouette coefficient is computed from the features and the cluster labels only
print(f"Silhouette Coefficient: {metrics.silhouette_score(x, labels):.3f}")

In [None]:
unique_labels = set(labels)

# Boolean mask that is True for the samples DBSCAN reported as core samples
core_samples_mask = np.zeros_like(labels, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True

# One colour per label, taken at evenly spaced positions of the Spectral colormap
colors = list(map(plt.cm.Spectral, np.linspace(0, 1, len(unique_labels))))

for cluster_id, rgba in zip(unique_labels, colors):
    # Black used for noise.
    face_color = (0, 0, 0, 1) if cluster_id == -1 else tuple(rgba)
    in_cluster = labels == cluster_id

    # Core samples are drawn first with large markers, the remaining samples with small ones
    for part_mask, marker_size in ((core_samples_mask, 14), (~core_samples_mask, 6)):
        points = x[in_cluster & part_mask]
        plt.plot(
            points[:, 0],
            points[:, 1],
            "o",
            markerfacecolor=face_color,
            markeredgecolor="k",
            markersize=marker_size,
        )

plt.title(f"Estimated number of clusters: {n_clusters_}")
plt.ylabel("")
plt.show()