In [None]:
import torch
import torch.nn as nn
import cv2
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os
from sklearn.manifold import LocallyLinearEmbedding as LLE
from utils import normalize, Assessment, plot_time_comparison
%load_ext autoreload
%autoreload 2

# About the dataset 

The data set contains information about e-mails (frequency of some words and special characters, number of capital letters, etc.). Each e-mail is classified as spam (class = 1) or not spam (class = 0). Each data point has 57 features (48 word frequencies, 6 special-character frequencies and 3 capital-letter statistics); more information about these features is available in the data folder, or directly on the website hosting the data set: https://archive.ics.uci.edu/ml/datasets/Spambase. The goal of this project is to find a manifold on which the data of interest lie (if one exists). Here, the information of interest is whether an e-mail is spam or not, so we try to find a lower-dimensional space in which these two classes can easily be separated.

In [None]:
# Column names for the Spambase CSV: the class label followed by 57 features
# (48 word frequencies, 6 special-character frequencies, 3 capital-letter statistics).
# NOTE(review): the original UCI file stores the label in the LAST column; this
# assumes the local CSV has been reordered to put it first -- confirm.
columns = ["Class"]
columns += ["Frequence word " + str(i) for i in range(1,49)]
columns += ["Special character " + str(i) for i in range(1,7)]
columns += ["Capital length mean"]
columns += ["Capital length longest"]
columns += ["Sum of captital length"]
data = pd.read_csv("../Data/spambasedata.csv",names=columns)
data_np = data.to_numpy().astype(float) # Convert it to a numpy array
# Keep the shape as the LAST expression of the cell: a bare expression in the
# middle of a cell is evaluated and thrown away, so it was never displayed.
data.shape # Let's have a look at the structure of the data set

In [None]:
# Fit a default LLE and show its hyper-parameters.
# Column 0 of data_np is the class label; it is excluded so that the label
# does not leak into the (unsupervised) embedding.
embed = LLE(random_state=10)
embed.fit(data_np[:, 1:])
print(embed.get_params())

In [None]:
# Set the seaborn style BEFORE plotting: the style only applies to axes
# created after the call, so setting it after barplot() missed this figure.
sns.set_style("darkgrid")
# Number of rows per class (0 = not spam, 1 = spam), as a two-column frame.
# rename() returns a new frame, which keeps the cell idempotent (no inplace).
class_dist = (
    data.groupby(["Class"]).count().iloc[:,0]
    .reset_index()
    .rename(columns={"Frequence word 1":"Number of data points"})
)
sns.barplot(data=class_dist,x="Class",y="Number of data points")
plt.show()

# Dimensionality Reduction

We will test and compare two dimensionality reduction methods: Locally Linear Embeddings (LLE) and one of its derivatives, Modified Locally Linear Embeddings (MLLE). To do so, we will check whether these algorithms are able to extract the relevant information, i.e. to separate the two classes into two different clusters.

### Sanity checks

In [None]:
# Grids handed to Assessment for the sanity checks below.
range_components = np.arange(2, 51, 4)   # 2, 6, ..., 50
range_neighbors = np.arange(10, 41, 5)   # 10, 15, ..., 40
myalgo = Assessment(
    data_np,
    range_components,
    range_neighbors,
    check=False,
)

In [None]:
# Split the data into training/testing sets and print a few sizes to check them.
train_set, test_set = myalgo.crossksets(data_np)
print("number of sets", len(train_set))

size_report = (
    ("size of one training set", train_set[1]),
    ("size of one testing set", test_set[1]),
    ("size of last training set", train_set[-1]),
    ("size of last testing set", test_set[-1]),
)
for caption, subset in size_report:
    print(caption, subset.shape)

# Percentage of rows of the first training set whose class (column 0) equals 1.
first_train = train_set[0]
class_one_rows = first_train[first_train[:, 0] == 1]
print("ratio of classes", class_one_rows.shape[0] / first_train.shape[0] * 100)

In [None]:
# Split a second time and count the entries that differ from the first split;
# zeros everywhere mean the two calls produced the same sets.
train_set_2, test_set_2 = myalgo.crossksets(data_np)
pairs_to_compare = (
    (train_set[0], train_set_2[0]),
    (train_set[1], train_set_2[1]),
    (test_set[0], test_set_2[0]),
)
for first_split, second_split in pairs_to_compare:
    print("Number of different values :", (first_split != second_split).sum())

# Locally Linear Embeddings

In [None]:
# Grids handed to Assessment for the LLE study.
range_components = np.arange(1, 4, 2)       # 1, 3
range_neighbors = np.arange(10, 101, 10)    # 10, 20, ..., 100
LLE_algo = Assessment(
    data_np,
    range_components,
    range_neighbors,
    k=5,
    check=False,
    run=1,
)

### A look at the first 4 components

In [None]:
# Pairplot of the LLE embedding. The previous title was a leftover placeholder
# that ended up on the figure; use a descriptive one instead.
# NOTE(review): 700 and 4 are presumably the number of neighbors and of
# components -- confirm against utils.Assessment.generate_pairplot.
LLE_algo.generate_pairplot(700,4,save_file="LLE_pairplot",title="LLE pairplot",norm_0100=True)

In [None]:
# 3D plot for the LLE run.
# NOTE(review): the MLLE section calls generate_3Dplot(50, fig, [1,1,1], "MLLE"),
# i.e. with a figure and a third argument before the label. Here "LLE" is passed
# in the position where that call passes the figure -- one of the two calls is
# probably stale; confirm against the current signature in utils.Assessment.
LLE_algo.generate_3Dplot(10,"LLE")

## Hyperparameters analysis

One classical metric and two additional ones were chosen to select the number of neighbors to use:
* The reconstruction error, or the cumulative sum of the eigenvalues
* The accuracy and the F1 measure using linear SVM to separate the classes
* The accuracy and the F1 measure using KNN to separate the classes

In [None]:
# Reset the LLE assessment object, then run its hyper-parameter search.
# NOTE(review): presumably reset() discards earlier results so that re-running
# this cell starts from scratch -- confirm in utils.Assessment.
LLE_algo.reset()
LLE_algo.find_hyper()

In [None]:
# Produce the LLE result figures; "LLE_test" is passed as the save-file name.
LLE_algo.generate_all(save_file="LLE_test")

In [None]:
# Plot the cumulative reconstruction error of the LLE runs ("LLE_cum_test" is
# passed as the save-file name).
LLE_algo.plot_cumulative_error(title="Reconstruction error",save_file="LLE_cum_test")

# Modified Locally Linear Embeddings 

In [None]:
# Grids handed to Assessment for the modified-LLE study.
range_components = np.arange(2, 8, 5)       # 2, 7
range_neighbors = np.arange(70, 101, 10)    # 70, 80, 90, 100
MLLE_algo = Assessment(
    data_np,
    range_components,
    range_neighbors,
    k=5,
    method="modified",
)

### A look at the first 4 components

In [None]:
# Pairplot of the MLLE embedding. The previous title was a leftover
# placeholder; use a descriptive one instead.
# NOTE(review): the section heading announces the first 4 components but 20 is
# passed here (the LLE cell passes 4) -- confirm which value is intended.
MLLE_algo.generate_pairplot(100,20,title="MLLE pairplot")

In [None]:
# Draw the MLLE 3D plot on a dedicated 10x8 figure and export it as SVG.
fig = plt.figure(figsize=(10, 8))
MLLE_algo.generate_3Dplot(50, fig, [1, 1, 1], "MLLE")
fig.savefig("test.svg")

## Hyperparameters analysis

One classical metric and two additional ones were chosen to select the number of neighbors to use:
* The reconstruction error, or the cumulative sum of the eigenvalues
* The accuracy and the F1 measure using linear SVM to separate the classes
* The accuracy and the F1 measure using KNN to separate the classes

In [None]:
# Reset the MLLE assessment object, then run its hyper-parameter search.
# NOTE(review): presumably reset() discards earlier results so that re-running
# this cell starts from scratch -- confirm in utils.Assessment.
MLLE_algo.reset()
MLLE_algo.find_hyper()

In [None]:
# Produce the MLLE result figures. The save-file name was a debugging
# placeholder ("not none"); use the same naming scheme as the LLE section.
MLLE_algo.generate_all(save_file="MLLE_test")

In [None]:
# Plot the cumulative reconstruction error of the MLLE runs ("MLLE_cum_test"
# is passed as the save-file name).
MLLE_algo.plot_cumulative_error(title="Reconstruction error",save_file="MLLE_cum_test")