# Projet AARN

## 1. Importation des librairies

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import re
from bs4 import BeautifulSoup
import email
import nltk
# nltk.download('stopwords')
# nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
import string
from nltk.corpus import stopwords
import tensorflow as tf
#import CounterVectorizer
#import Counter class
from collections import Counter

from sklearn.feature_extraction.text import CountVectorizer

## 2. Importation des données

In [None]:
eham_folder_path = "./data/easy_ham/"
hham_folder_path = "./data/hard_ham/"
spam_folder_path = "./data/spam_2/"

def getEmails(folder_path: str):
    """Read every file of a folder and return the raw contents.

    Args:
        folder_path: Directory holding one e-mail per file. A trailing path
            separator is accepted but not required.

    Returns:
        A NumPy array with the text of each file, in the order produced by
        ``os.listdir``.
    """
    data = []
    for file_name in os.listdir(folder_path):
        # os.path.join copes with a missing trailing separator; the context
        # manager closes the handle even if read() raises.
        with open(os.path.join(folder_path, file_name), encoding="ISO-8859-1") as file:
            data.append(file.read())
    return np.array(data)
eham = getEmails(eham_folder_path)
hham = getEmails(hham_folder_path)
spam = getEmails(spam_folder_path)


## 3. traitement des données (preprocessing)

### 3.1. Extraction du corps du mail

    En utilisant la bibliothèque email, nous pouvons analyser chaque email et en extraire le corps (la seule partie dont nous avons besoin).

In [None]:
def get_body(text_list: np.array):
    """Extract the body of each raw e-mail.

    For a multipart message the first ``text/plain`` part that is not an
    attachment is used. If there is none, the first non-attachment
    ``text/html`` part is used instead, and if that is missing too the body
    is empty. A non-multipart message contributes its whole payload.

    Args:
        text_list: Iterable of raw e-mail sources (headers + body) as strings.

    Returns:
        A NumPy array with one decoded payload (bytes) per input message.
    """
    list_body = []
    for raw in text_list:
        message = email.message_from_string(raw)
        # Reset per message so a mail without a usable part can never
        # inherit the body of the previous one.
        body = None
        html_body = None
        if message.is_multipart():
            for part in message.walk():
                # Skip anything declared as an attachment.
                if 'attachment' in str(part.get('Content-Disposition')):
                    continue
                ctype = part.get_content_type()
                if ctype == 'text/plain':
                    body = part.get_payload(decode=True)
                    break
                if ctype == 'text/html' and html_body is None:
                    # Remember the first HTML part as a fallback.
                    html_body = part.get_payload(decode=True)
        else:
            body = message.get_payload(decode=True)
        if body is None:
            body = html_body if html_body is not None else b""
        list_body.append(body)
    return np.array(list_body)
eham = get_body(eham)
hham = get_body(hham)
spam = get_body(spam)

### 3.2. Suppression des balises HTML

    Nous avons utilisé la bibliothèque BeautifulSoup pour analyser la structure HTML (si elle existe) du corps et supprimer toutes les balises HTML.

In [None]:
def suprimer_HTML(text_list: np.array):
    """Strip HTML markup from every e-mail body.

    Each body is parsed with BeautifulSoup's ``html.parser`` backend and
    replaced by the text returned by ``get_text()``.

    Args:
        text_list: Iterable of e-mail bodies.

    Returns:
        A NumPy array with the tag-free text of each body.
    """
    plain_texts = [
        BeautifulSoup(body, 'html.parser').get_text()
        for body in text_list
    ]
    return np.array(plain_texts)
eham = suprimer_HTML(eham)
hham = suprimer_HTML(hham)
spam = suprimer_HTML(spam)

### 3.3. Minusculisation

In [None]:
def minuscule(text_list: np.array):
    """Lower-case every text of the collection.

    Args:
        text_list: Iterable of strings.

    Returns:
        A NumPy array with the lower-cased strings, in the same order.
    """
    return np.array([entry.lower() for entry in text_list])
eham = minuscule(eham)
hham = minuscule(hham)
spam = minuscule(spam)

### 3.4. Elimination des URLs et des adresses email

    Nous avons utilisé la bibliothèque re pour remplacer les adresses email par le jeton `emailaddr` et les URLs par le jeton `httpaddr`.

In [None]:
def normaliser_url_email(text_list: np.array):
    """Replace e-mail addresses and URLs by placeholder tokens.

    E-mail addresses become ``'emailaddr '`` and URLs become ``'httpaddr '``.
    Addresses are substituted first, then URLs.

    Args:
        text_list: Iterable of strings.

    Returns:
        A NumPy array with the normalised strings, in the same order.
    """
    email_regex = re.compile(r'[\w\.-]+@[\w\.-]+')
    url_regex = re.compile(r'(?i)\b((?:[a-z][\w-]+:(?:/{1,3}|[a-z0-9%])|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?«»“”‘’]))')
    normalised = []
    for entry in text_list:
        without_emails = email_regex.sub('emailaddr ', entry)
        normalised.append(url_regex.sub('httpaddr ', without_emails))
    return np.array(normalised)
eham = normaliser_url_email(eham)
hham = normaliser_url_email(hham)
spam = normaliser_url_email(spam)

### 3.5. Suppression des caractères spéciaux et les nombres

    Nous avons supprimé la ponctuation, remplacé chaque nombre par le jeton `nombre` à l'aide de la bibliothèque re, puis retiré les mots vides (stop words) anglais.

In [None]:
def normaliser_nombre_char_spec(text_list : np.array, stop_words=None):
    """Normalise currency signs, punctuation, numbers and stop words.

    For every text, in this order:
      1. ``$`` is replaced by the word ``dollar`` (this must happen before
         punctuation is stripped, because ``$`` is part of
         ``string.punctuation``);
      2. all ``string.punctuation`` characters are removed;
      3. every run of digits is replaced by the word ``nombre``;
      4. stop words are dropped and the remaining tokens are joined with
         single spaces.

    Args:
        text_list: Iterable of strings.
        stop_words: Optional iterable of words to drop. Defaults to NLTK's
            English stop-word list.

    Returns:
        A NumPy array with the normalised strings, in the same order.
    """
    if stop_words is None:
        stop_words = stopwords.words("english")
    # Built once, outside the loop; a set gives O(1) membership tests.
    stop_set = set(stop_words)
    strip_punctuation = str.maketrans("", "", string.punctuation)
    number_regex = re.compile(r"\d+")

    list_text = []
    for text in text_list:
        text = text.replace("$", " dollar ")
        text = text.translate(strip_punctuation)
        text = number_regex.sub(" nombre ", text)
        kept = [m for m in text.split() if m not in stop_set]
        list_text.append(" ".join(kept))
    return np.array(list_text)
eham = normaliser_nombre_char_spec(eham)
hham = normaliser_nombre_char_spec(hham)
spam  = normaliser_nombre_char_spec(spam)

### 3.6. Radicalisation des mots

    Nous avons utilisé le lemmatiseur WordNetLemmatizer de la bibliothèque nltk pour ramener chaque mot à sa forme de base.

In [None]:
def radicalisation(text_list: np.array):
    """Lemmatise every whitespace-separated word of every text.

    Each word is passed to ``WordNetLemmatizer.lemmatize`` and the results
    are joined back with single spaces.

    Args:
        text_list: Iterable of strings.

    Returns:
        A NumPy array with the lemmatised strings, in the same order.
    """
    lemmatizer = WordNetLemmatizer()
    return np.array([
        " ".join(map(lemmatizer.lemmatize, entry.split()))
        for entry in text_list
    ])

# text = """
# > Anyone knows how much it costs to host a web portal ?
# >
# Well, it depends on how many visitors youre expecting. anywhere from less than 10 bucks a month to a couple of $100. You should checkout http://www.rackspace.com/ or perhaps Amazon EC2 if youre running something big..
# To unsubscribe yourself from this mailing list, send an email to: groupname-unsubscribe@egroups.com
# """
# ar = np.array([text])
# ar = minuscule(ar)
# ar = normaliser_url_email(ar)
# ar = normaliser_nombre_char_spec(ar)
# ar = radicalisation(ar)
# print(ar[0])
hham = radicalisation(hham)
eham = radicalisation(eham)
spam = radicalisation(spam)
print(len(hham))
print(len(eham))
print(len(spam))

##  4. Construction du vocabulaire

    Nous avons utilisé la classe Counter du module collections pour compter les mots et construire le vocabulaire.

### 4.1. regroupement des mots


In [None]:
# Stack the three corpora into one dataset; `labels` below follows the same order.
data = np.concatenate((hham, eham, spam), axis=0)

# Binary target: 0 = legitimate mail (hard ham and easy ham), 1 = spam.
# Easy ham was previously labelled 1, which put it in the same class as spam.
labels = np.concatenate(
    (np.zeros(len(hham)), np.zeros(len(eham)), np.ones(len(spam))),
    axis=0,
)

In [None]:
# Write one "text,label" row per e-mail.
# The encoding is set explicitly to UTF-8 so the file does not depend on the
# platform default and matches what pandas.read_csv expects when it is read back.
with open("data.csv", "w", encoding="utf-8") as f:
    f.writelines(
        f"{text},{int(label)}\n" for text, label in zip(data, labels)
    )

In [None]:
# Flatten the spam corpus into one list of whitespace-separated tokens.
# (Iterate over `data` instead of `spam` to collect the words of every e-mail.)
words = [mot for line in spam for mot in line.split()]
print(len(words))

In [None]:
pd.DataFrame(words).head()

In [None]:
# Count how often each token occurs, then keep those seen at least 4 times.
wordCount = Counter(words)
mostCommon = []
for word in wordCount:
    if wordCount[word] >= 4:
        mostCommon.append(word)
print(mostCommon)

len(mostCommon)

In [None]:
# Save the vocabulary as "word,token" rows, the token being the word's index.
# The encoding is set explicitly to UTF-8 so the file does not depend on the
# platform default and matches what pandas.read_csv expects when it is read back.
with open("vocab.csv", "w", encoding="utf-8") as f:
    f.writelines(f"{word},{i}\n" for i, word in enumerate(mostCommon))


### 4.2. Extraction des caractéristiques

On va utiliser la représentation par comptage des mots (sac de mots).

In [59]:
# Load the "text,label" rows written earlier and drop rows with a missing value.
mails = (
    pd.read_csv("data.csv", header=None)
    .set_axis(["text", "label"], axis=1)
    .dropna()
)
mails.head()

Unnamed: 0,text,label
0,tech update today vital sign july nombre nombr...,0
1,view newsletter fullcolor visit httpaddr mediu...,0
2,today headline register unsubscribe daily news...,0
3,cnet shopper newsletter mac edition shopper cn...,0
4,testing patch top today cv patch didnt help fo...,0


In [57]:
# Read the vocabulary file ("word,token" rows).
# keep_default_na=False stops pandas from turning vocabulary words such as
# "null", "nan" or "none" into float NaN, which would corrupt the word list.
vocab = pd.read_csv(
    "vocab.csv",
    header=None,
    names=["word", "token"],
    dtype={"word": str},
    keep_default_na=False,
)
vocab_words = vocab["word"].values
vocab_tokens = vocab["token"].values

In [60]:
y = np.array(mails["label"])
X = np.array(mails["text"])

pd.DataFrame(X).head()

Unnamed: 0,0
0,tech update today vital sign july nombre nombr...
1,view newsletter fullcolor visit httpaddr mediu...
2,today headline register unsubscribe daily news...
3,cnet shopper newsletter mac edition shopper cn...
4,testing patch top today cv patch didnt help fo...


In [None]:
# generate the caracteristic matrix
def generate_caracteristic_matrix(X, vocab):
    """Build a binary word-presence matrix.

    Args:
        X: Sized iterable of strings; each one is split on whitespace.
        vocab: Sized iterable of vocabulary words; column ``j`` corresponds
            to the ``j``-th word.

    Returns:
        A float NumPy array of shape ``(len(X), len(vocab))`` whose entry
        ``(i, j)`` is 1 when the ``j``-th vocabulary word is one of the
        tokens of the ``i``-th text, and 0 otherwise.
    """
    X_matrix = np.zeros((len(X), len(vocab)))
    for i, text in enumerate(X):
        # A set makes every membership test O(1) instead of rescanning the
        # token list once per vocabulary word.
        tokens = set(text.split())
        for j, voc in enumerate(vocab):
            if voc in tokens:
                X_matrix[i, j] = 1
    return X_matrix



In [63]:
# extract chracteristic matrix using sklearn
def gen_carcateristic(X, vocab):
    """Build the word-count matrix of ``X`` with scikit-learn.

    Args:
        X: Iterable of text documents.
        vocab: Vocabulary handed to ``CountVectorizer`` through its
            ``vocabulary`` argument.

    Returns:
        The matrix returned by ``CountVectorizer.fit_transform``.
    """
    return CountVectorizer(vocabulary=vocab).fit_transform(X)

## 5. Classification des emails



### 5.1. Avec sklearn

### 5.2. Avec un réseau de neurones

    Nous avons utilisé la bibliothèque tensorflow.keras pour construire un réseau de neurones.