# Projet AARN

## 1. Importation des librairies

In [18]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import re
from bs4 import BeautifulSoup
import email
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
import string
from nltk.corpus import stopwords
import tensorflow as tf
#import CounterVectorizer
#import Counter class
from collections import Counter

from sklearn.feature_extraction.text import CountVectorizer

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/abdennour/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/abdennour/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## 2. Importation des données

In [2]:
eham_folder_path = "./data/easy_ham/"
hham_folder_path = "./data/hard_ham/"
spam_folder_path = "./data/spam_2/"

def getEmails(folder_path: str):
    data = []
    for fileName in os.listdir(folder_path):
        file =open(folder_path+fileName, encoding = "ISO-8859-1")
        text =file.read()
        data.append(text)
        file.close()
    return np.array(data)
eham = getEmails(eham_folder_path)
hham = getEmails(hham_folder_path)
spam = getEmails(spam_folder_path)


## 3. traitement des données (preprocessing)

### 3.1. prendre le corps du mail 

    en utilisant la bibliothèque email, nous pouvons analyser l'email et en obtenir le corps (ce dont nous avons besoin)

In [3]:
def get_body(text_list: np.array):
    list_body = []
    for i, text in enumerate(text_list):
        text = email.message_from_string(text)
        if text.is_multipart():
            for part in text.walk():
                ctype = part.get_content_type()
                cdispo = str(part.get('Content-Disposition'))
                    # skip any text/plain (txt) attachments
                if ctype == 'text/plain' and 'attachment' not in cdispo:
                    body = part.get_payload(decode=True)  # get body of email
                    break
        else :
            body = text.get_payload(decode=True)
        list_body.append(body)
    return np.array(list_body)
eham = get_body(eham)
hham = get_body(hham)
spam = get_body(spam)

### 3.2. Suppression des balises HTML

    Nous avons utilisé la bibliothèque beatifulsoup pour analyser la structure html (si elle existe) du corps et supprimer toutes les balises html.

In [4]:
def suprimer_HTML(text_list: np.array):
    list_html = []
    for i, text in enumerate(text_list):
        soup = BeautifulSoup(text, 'html.parser')
        list_html.append(soup.get_text())
    return np.array(list_html)
eham = suprimer_HTML(eham)
hham = suprimer_HTML(hham)
spam = suprimer_HTML(spam)

  soup = BeautifulSoup(text, 'html.parser')
  soup = BeautifulSoup(text, 'html.parser')
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


### 3.3. Minusculisation

In [5]:
def minuscule(text_list: np.array):
    list_mins = []
    for i, text in enumerate(text_list):
        list_mins.append( text.lower())
    return np.array(list_mins)
eham = minuscule(eham)
hham = minuscule(hham)
spam = minuscule(spam)

### 3.4. Elimination des URLs et des adresses email

    Nous avons utilisé la bibliothèque re pour supprimer les URLs et les adresses email.

In [6]:
def normaliser_url_email(text_list: np.array):
    text_url = []
    for i, text in enumerate(text_list):
        #supprimer les emails
        text = re.sub(r'[\w\.-]+@[\w\.-]+', 'emailaddr ', text)
        #supprimer les urls
        link_pattern=r'(?i)\b((?:[a-z][\w-]+:(?:/{1,3}|[a-z0-9%])|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?«»“”‘’]))';
        text = re.sub(link_pattern, 'httpaddr ', text)
        text_url.append(text)
    return np.array(text_url)
eham = normaliser_url_email(eham)
hham = normaliser_url_email(hham)
spam = normaliser_url_email(spam)

### 3.5. Suppression des caractères spéciaux et les nombres

    Nous avons utilisé la bibliothèque re pour supprimer les caractères spéciaux et les nombres.

In [7]:
def normaliser_nombre_char_spec(text_list : np.array):
    list_text = []
    for i, text in enumerate(text_list):
        text = text.translate(str.maketrans("", "", string.punctuation))# supprimer les ponctuations
        #replace every number with the word number
        num_pattern =  r"\d+"
        text = re.sub(num_pattern, " nombre ", text)
        text.replace("$", " dollar ")
        #supprimer les stop words
        stop = stopwords.words("english")
        list = [m for m in text.split() if m not in stop]
        list_text.append(" ".join(list))
        # list_text.append(text)
    return np.array(list_text)
eham = normaliser_nombre_char_spec(eham)
hham = normaliser_nombre_char_spec(hham)
spam  = normaliser_nombre_char_spec(spam)

### 3.5. Radicalisation des mots

    Nous avons utilisé la bibliothèque nltk.PorterStemmer pour radicaliser les mots.

In [8]:
def radicalisation(text_list: np.array):
    list_radical = []
    # stemmer = PorterStemmer()
    stemmer = WordNetLemmatizer()
    for i, text in enumerate(text_list):
        list_mot = [stemmer.lemmatize(m) for m in text.split()]
        list_radical.append(" ".join(list_mot))
    return np.array(list_radical)

# text = """
# > Anyone knows how much it costs to host a web portal ?
# >
# Well, it depends on how many visitors youre expecting. anywhere from less than 10 bucks a month to a couple of $100. You should checkout http://www.rackspace.com/ or perhaps Amazon EC2 if youre running something big..
# To unsubscribe yourself from this mailing list, send an email to: groupname-unsubscribe@egroups.com
# """
# ar = np.array([text])
# ar = minuscule(ar)
# ar = normaliser_url_email(ar)
# ar = normaliser_nombre_char_spec(ar)
# ar = radicalisation(ar)
# print(ar[0])
hham = radicalisation(hham)
eham = radicalisation(eham)
spam = radicalisation(spam)
print(len(hham))
print(len(eham))
print(len(spam))

250
2551
1397


##  4. Construction du vocabulaire

    Nous avons utilisé la bibliothèque nltk.FreqDist pour construire le vocabulaire.

### 4.1. regroupement des mots


In [9]:
data = np.concatenate((hham, eham, spam), axis=0)

#save the data in a file
# with open('data.txt', 'w') as f:
#     for item in data:
#         f.write("%s," % item)

#save the labels in a file

labels = np.concatenate((np.zeros(len(hham)), np.ones(len(eham)), np.ones(len(spam))), axis=0)

In [17]:
words = []
for line in data:
   for word in line.split():
       words.append(word)
words = set(words)
print(len(words)) 

44927


In [21]:
#get the most frequent words using the Counter class
word_counts = Counter(words)
print(word_counts.most_common(10))
most_common_words = word_counts.most_common(30000)
len(most_common_words)

[('naveen', 1), ('ineptly', 1), ('port', 1), ('wirelessno', 1), ('radiouserland', 1), ('abit', 1), ('landspeed', 1), ('disenfranchised', 1), ('sona', 1), ('hdfrl', 1)]


30000

In [23]:
#store the most frequent words in a file
pd.DataFrame(most_common_words).to_csv("most_common_words.csv", index=False)


### 4.2. symbolisation des mots

    Nous avons utilisé la bibliothèque tensorflow.keras.preprocessing.text.Tokenizer pour symboliser les mots.

In [29]:

#reading the data from the file
data = np.genfromtxt('most_common_words.csv', delimiter=',', dtype=str)
#get the first column
data = data[:,0]
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=30000, oov_token="<OOV>")
tokenizer.fit_on_texts(data)
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(data)
padded = tf.keras.preprocessing.sequence.pad_sequences(sequences, maxlen=1000, padding='post', truncating='post')


### 4.3. sauvegarde du vocabulaire dans un fichier csv

In [30]:
#store the words and their index in a file, the first column is the word and the second column is the index
import csv
with open('word_index.csv', 'w') as csv_file:
    writer = csv.writer(csv_file)
    for key, value in word_index.items():
        writer.writerow([key, value])
