In [1]:
import regex as re
from charset_normalizer import from_path
from nltk import tokenize, stem, corpus, download
import numpy as np


# Préparation des données

Dans notre code on utilise deux fichier, un fichier "spam.txt" qui contient tout les chemins vers les fichiers spam, et un fichier "ham.txt" qui contient tout les chemins vers les fichiers ham.

Fonction de nettoyage des mails et d'extractions de tokens.

In [2]:
def preprocess(path):
    
    keywordsReg = r"^\s*(To:|From:|Subject:|Date:|Content\W(.*?):)" #Regex mot cles d'un email (subjec, to, from ...etc)
    HTMLreg = r"<(.|\n)+?>" #Regex balise HTML
    adressReg = r"\b[\w-]+@[\w-]+\.[A-Za-z]\b" #Regex adress email
    urlReg = r"https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)" #Regex URL

    i = 0
    tokensList = []
    fileName = path.split('/')[-1][:-1]
    encoding = from_path(path[:-1]).best().encoding
    spamMail = open(path[:-1], "r", encoding=encoding)
    
    content = ""
    while(spamMail.readline() != '\n'):
        pass

    for line in spamMail.readlines():
        if re.search(keywordsReg, line) == None:
            content = content + line
    
    content = re.sub(HTMLreg, " ", content)
    content = re.sub(adressReg, " emailaddr ", content)
    content = re.sub(urlReg, " httpaddr ", content)
    content = re.sub(r"\d+", " number " , content)
    content = re.sub(r"$", " dollar ", content)
    content = content.lower()

    spamMail.close()

    tokenizer = tokenize.TreebankWordTokenizer()
    tokens = tokenizer.tokenize(content)

    stemmer = stem.SnowballStemmer("english")
    tokens = [stemmer.stem(token) for token in tokens if token.isalpha()]

    return tokens


Extraction des tokens des mails spam

In [3]:
spamPaths = open("spam.txt", "r")
spamTokens = []
for path in spamPaths.readlines():
    spamTokens = spamTokens + preprocess(path)


In [4]:
print(len(spamTokens))

633208


Extraction des tokens des mails ham

In [5]:
hamPaths = open("ham.txt", "r")
hamTokens = []

for path in hamPaths.readlines() :
    hamTokens = hamTokens + preprocess(path)

In [6]:
print(len(hamTokens))

488616


Construction du vocabulaire.
On garde les tokens qui ont au moins K occurrences.

In [7]:
from collections import Counter

initial_vocab = hamTokens + spamTokens
print('Nombre de tokens total: ' ,len(initial_vocab))

download('stopwords', quiet=True)
stopwords = corpus.stopwords.words('english')
initial_vocab = [token for token in initial_vocab if token not in stopwords]
print('Nombre de tokens sans stopwords: ' ,len(initial_vocab))

final_vocab = [] 
k = 10 #nombre d'occurence minimum d'un token pour qu'il fait partie du vocabulaire
frequencies = Counter(initial_vocab)
for key,value in frequencies.items():
    if value >= k:
        final_vocab.append(key)

print("Nombre de token dans le vocabulaire final: ", len(final_vocab))


Nombre de tokens total:  1121824
Nombre de tokens sans stopwords:  739353
Nombre de token dans le vocabulaire final:  5294


# Construction du DataSet

Construction de la liste d'index de mots

In [8]:
token_to_index = dict()
i = 0 
for i in range(len(final_vocab)):
    token_to_index[final_vocab[i]] = i 

Transformation des mail en séquence d'index de mot du vocabulaire

In [9]:
def transform(path):
    tokens = preprocess(path)
    transformed = [token_to_index[token] for token in tokens if token in final_vocab]
    return transformed
    

Extraction des caractéristiques par comptage

In [10]:
def counterFeatures(path):
    indices = transform(path)
    X = np.zeros((1,len(final_vocab)))

    for i in indices:
        X[0][i] += 1 

    return X 


Construction du DataSet

In [11]:
spamPaths.seek(0)
hamPaths.seek(0)
paths = spamPaths.readlines() + hamPaths.readlines()

n = len(final_vocab)
m = len(paths)
X_counter = np.zeros((m, n))
for i in range(m) :
    X_counter[i][:] = counterFeatures(paths[i]) 

X_binary = np.copy(X_counter)
X_binary[X_binary > 1] = 1 ##Extraction des caracteristiques binaires


Y = np.concatenate((np.ones((1396, 1)), np.zeros((2500, 1)))) #1396 email spam, 2500 non spam


# Classification

Séparer les données en un ensemble d’entraînement et un ensemble d'evaluation

In [12]:
from sklearn.model_selection import train_test_split
X_counter_train, X_counter_test, Y_counter_train, Y_counter_test = train_test_split(X_counter, Y, test_size=0.25) 
X_binary_train, X_binary_test, Y_binary_train, Y_binary_test = train_test_split(X_binary, Y, test_size=0.25) 

## SVM

### Approche caractéristiques par comptage:

In [13]:
from sklearn.svm import SVC
svc_counter = SVC(C=0.1,kernel ="linear")
svc_counter.fit(X_counter_train, Y_counter_train.ravel())

SVC(C=0.1, kernel='linear')

In [14]:
print("-----------------SVM Avec caracteristiques par comptages-----------------")
print("Training Accuracy:",(svc_counter.score(X_counter_train, Y_counter_train.ravel()))*100,"%")
print("Testing Accuracy:",(svc_counter.score(X_counter_test, Y_counter_test.ravel()))*100,"%")

-----------------SVM Avec caracteristiques par comptages-----------------
Training Accuracy: 99.7946611909651 %
Testing Accuracy: 98.4599589322382 %


### Approche caractéristiques binaires:

In [15]:
from sklearn.svm import SVC
svc_binary = SVC(C=0.1,kernel ="linear")
svc_binary.fit(X_binary_train, Y_binary_train.ravel())

SVC(C=0.1, kernel='linear')

In [16]:
print("-----------------SVM Avec caracteristiques binaires-----------------")
print("Training Accuracy:",(svc_binary.score(X_binary_train, Y_binary_train.ravel()))*100,"%")
print("Testing Accuracy:",(svc_binary.score(X_binary_test, Y_binary_test.ravel()))*100,"%")

-----------------SVM Avec caracteristiques binaires-----------------
Training Accuracy: 99.69199178644764 %
Testing Accuracy: 98.76796714579056 %


## Naive Bayes

### Approche caractéristiques par comptage:

In [17]:
from sklearn.naive_bayes import MultinomialNB
bayes_counter = MultinomialNB()
bayes_counter.fit(X_counter_train, Y_counter_train.ravel())

MultinomialNB()

In [18]:
print("-----------------Naive Bayes Avec caracteristiques par comptages-----------------")
print("Training Accuracy:",(bayes_counter.score(X_counter_train, Y_counter_train.ravel()))*100,"%")
print("Testing Accuracy:",(bayes_counter.score(X_counter_test, Y_counter_test.ravel()))*100,"%")

-----------------Naive Bayes Avec caracteristiques par comptages-----------------
Training Accuracy: 95.24298425735797 %
Testing Accuracy: 95.27720739219713 %


### Approche caractéristiques binaires

In [19]:
from sklearn.naive_bayes import MultinomialNB
bayes_binary = MultinomialNB()
bayes_binary.fit(X_binary_train, Y_binary_train.ravel())

MultinomialNB()

In [20]:
print("-----------------Naive Bayes Avec caracteristiques binaires-----------------")
print("Training Accuracy:",(bayes_binary.score(X_binary_train, Y_binary_train.ravel()))*100,"%")
print("Testing Accuracy:",(bayes_binary.score(X_binary_test, Y_binary_test.ravel()))*100,"%")

-----------------Naive Bayes Avec caracteristiques binaires-----------------
Training Accuracy: 98.97330595482546 %
Testing Accuracy: 98.35728952772074 %


## Réseau de neurones

### Approche caractéristiques par comptage:

In [21]:
from sklearn.neural_network import MLPClassifier
mlp_counter = MLPClassifier(hidden_layer_sizes=(50,25), activation = 'logistic')
mlp_counter.fit(X_counter_train, Y_counter_train.ravel())

MLPClassifier(activation='logistic', hidden_layer_sizes=(50, 25))

In [22]:
print("-----------------Reseau de neurones Avec caracteristiques par comptages-----------------")
print("Training Accuracy:",(mlp_counter.score(X_counter_train, Y_counter_train.ravel()))*100,"%")
print("Testing Accuracy:",(mlp_counter.score(X_counter_test, Y_counter_test.ravel()))*100,"%")

-----------------Reseau de neurones Avec caracteristiques par comptages-----------------
Training Accuracy: 99.96577686516085 %
Testing Accuracy: 98.4599589322382 %


### Approche caractéristiques binaires

In [23]:
from sklearn.neural_network import MLPClassifier
mlp_binary = MLPClassifier(hidden_layer_sizes=(50,25), activation = 'logistic')
mlp_binary.fit(X_counter_train, Y_counter_train.ravel())

MLPClassifier(activation='logistic', hidden_layer_sizes=(50, 25))

In [24]:
print("-----------------Naive Bayes Avec caracteristiques binaires-----------------")
print("Training Accuracy:",(mlp_binary.score(X_binary_train, Y_binary_train.ravel()))*100,"%")
print("Testing Accuracy:",(mlp_binary.score(X_binary_test, Y_binary_test.ravel()))*100,"%")

-----------------Naive Bayes Avec caracteristiques binaires-----------------
Training Accuracy: 97.98083504449008 %
Testing Accuracy: 98.04928131416838 %
