# 1. Importation des librairies

In [20]:
import pandas as pd
import re     as re
import os     as os  
import uuid   as uuid
import nltk   as nltk

In [21]:
from xgboost                         import XGBClassifier
from sklearn.cluster                 import MiniBatchKMeans
from sklearn.pipeline                import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing           import LabelEncoder, OrdinalEncoder
from sklearn.compose                 import ColumnTransformer
from sklearn.model_selection         import cross_val_score
from sklearn.metrics                 import accuracy_score, classification_report, confusion_matrix
from nltk.corpus                     import stopwords
from joblib                          import dump

# 2. Chargement des données

In [22]:
# Load the training set from its Parquet file (path is relative to the notebook's working directory)
train = pd.read_parquet('../../../data/parquet/train.parquet')

In [23]:
# Load the test set from its Parquet file (path is relative to the notebook's working directory)
test = pd.read_parquet('../../../data/parquet/test.parquet')

In [24]:
# Preview the first rows of the raw training data
train.head()

Unnamed: 0.1,Unnamed: 0,url,url_len,ip_add,geo_loc,tld,who_is,https,js_len,js_obf_len,content,label
0,0,http://members.tripod.com/russiastation/,40,42.77.221.155,Taiwan,com,complete,yes,58.0,0.0,Named themselves charged particles in a manly ...,good
1,1,http://www.ddj.com/cpp/184403822,32,3.211.202.180,United States,com,complete,yes,52.5,0.0,And filipino field \n \n \n \n \n \n \n \n the...,good
2,2,http://www.naef-usa.com/,24,24.232.54.41,Argentina,com,complete,yes,103.5,0.0,"Took in cognitivism, whose adherents argue for...",good
3,3,http://www.ff-b2b.de/,21,147.22.38.45,United States,de,incomplete,no,720.0,532.8,fire cumshot sodomize footaction tortur failed...,bad
4,4,http://us.imdb.com/title/tt0176269/,35,205.30.239.85,United States,com,complete,yes,46.5,0.0,"Levant, also monsignor georges. In 1800, lists...",good


In [25]:
# Preview the first rows of the raw test data
test.head()

Unnamed: 0.1,Unnamed: 0,url,url_len,ip_add,geo_loc,tld,who_is,https,js_len,js_obf_len,content,label
0,0,http://www.dutchthewiz.com/freeware/,36,175.67.214.68,China,com,complete,yes,38.5,0.0,"Decay suggest in 1315.. Current constitution, ...",good
1,1,http://www.collectiblejewels.com,32,188.120.171.121,Sweden,com,incomplete,yes,187.0,0.0,breast addict nudger whash ky darkie catholics...,good
2,2,http://www.deadlinedata.com,27,193.51.170.1,France,com,complete,yes,31.0,0.0,Nato's military stoic philosophy says to accep...,good
3,3,http://www.mil.fi/maavoimat/kalustoesittely/00...,56,13.237.35.44,Australia,fi,complete,yes,152.0,0.0,Night being newton. according to the formation...,good
4,4,http://www.avclub.com/content/node/24539,40,220.193.62.89,China,com,complete,yes,150.0,0.0,34 per two children. if we exercise simple pra...,good


# 3. Prétraitement des données

In [26]:
# Drop the 'Unnamed: 0', 'ip_add' and 'js_obf_len' columns from the training frame, in place.
# Not re-runnable as is: a second execution raises KeyError because the columns are already gone.
train.drop(columns=['Unnamed: 0', 'ip_add', 'js_obf_len'], inplace=True)

In [27]:
# Drop the 'Unnamed: 0', 'ip_add' and 'js_obf_len' columns from the test frame, in place.
# Not re-runnable as is: a second execution raises KeyError because the columns are already gone.
test.drop(columns=['Unnamed: 0', 'ip_add', 'js_obf_len'], inplace=True)

In [28]:
# Keep only the first 50 whitespace-separated words of each 'content' value.
# Rows with missing values are only removed in a later cell, so non-string
# entries (None / NaN) are passed through untouched here instead of raising
# AttributeError on .split().
train['content'] = train['content'].apply(
    lambda x: ' '.join(x.split()[:50]) if isinstance(x, str) else x
)

In [29]:
# Keep only the first 50 whitespace-separated words of each 'content' value.
# Rows with missing values are only removed in a later cell, so non-string
# entries (None / NaN) are passed through untouched here instead of raising
# AttributeError on .split().
test['content'] = test['content'].apply(
    lambda x: ' '.join(x.split()[:50]) if isinstance(x, str) else x
)

In [30]:
# Remove, in place, every training row that contains a missing value in any column
train.dropna(inplace=True)

In [31]:
# Remove, in place, every test row that contains a missing value in any column
test.dropna(inplace=True)

In [32]:
# Remove, in place, training rows that duplicate an earlier row
train.drop_duplicates(inplace=True)

In [33]:
# Remove, in place, test rows that duplicate an earlier row
test.drop_duplicates(inplace=True)

In [34]:
# (rows, columns) of both frames after column removal, NaN removal and de-duplication
train.shape, test.shape

((1200000, 9), (361934, 9))

# 4. Nettoyage des données

In [35]:
# Download the NLTK 'stopwords' corpus
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/hugueslopezpardo/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [36]:
# English stop words, stored as a set for the membership test done in clean_text
stop = set(stopwords.words('english'))

In [37]:
def clean_text(text):
    """
    Normalise a free-text string for vectorisation.

    Every character that is neither a word character nor whitespace is
    removed, the result is lower-cased, and tokens found in the module-level
    ``stop`` set are discarded.

    :param text: Text to clean.
    :return: Remaining tokens joined by single spaces.
    """
    # NOTE(review): punctuation is stripped before the stop-word filter, so any
    # entry of `stop` that contains an apostrophe can no longer match a token.
    without_punctuation = re.sub(r'[^\w\s]', '', text)
    tokens = without_punctuation.lower().split()
    kept = filter(lambda token: token not in stop, tokens)
    return ' '.join(kept)

In [38]:
def clean_url(url):
    """
    Turn a URL into a space-separated string of tokens.

    :param url: URL to clean.
    :return: The URL with its scheme and "www." removed and with dots and
             slashes replaced by single spaces.
    """
    # Remove every "scheme://www." occurrence outright (https first, then http).
    for www_prefix in ('https://www.', 'http://www.'):
        url = url.replace(www_prefix, '')
    # A leading http(s)/ftp(s) scheme, any remaining "www.", and each dot or
    # slash are turned into a space.
    spaced = re.sub(r'^(?:http|ftp)s?://|www\.|\.|/', ' ', url)
    # split() without arguments discards leading, trailing and repeated whitespace.
    return ' '.join(spaced.split())

In [39]:
def label_encoder(label):
    """
    Encode a class label as a binary integer.

    :param label: Label to encode: the string 'bad', any other raw label,
                  or a value that has already been encoded.
    :return: 1 for 'bad' (or an already encoded 1), 0 for anything else.
    """
    # An already encoded 1 must stay 1: without this check, applying the
    # encoder a second time to the same column (e.g. re-running the encoding
    # cell) would turn every positive label into 0, because 1 != 'bad'.
    if label == 'bad' or label == 1:
        return 1
    return 0

In [40]:
# Tokenise the 'url' column of the training data with clean_url
train['url'] = train['url'].apply(clean_url)

In [41]:
# Tokenise the 'url' column of the test data with clean_url
test['url'] = test['url'].apply(clean_url)

In [42]:
# Clean the 'content' column of the training data with clean_text
train['content'] = train['content'].apply(clean_text)

In [43]:
# Clean the 'content' column of the test data with clean_text
test['content'] = test['content'].apply(clean_text)

In [44]:
# Encode the training labels as integers with label_encoder
train['label'] = train['label'].apply(label_encoder)

In [45]:
# Encode the test labels as integers with label_encoder
test['label'] = test['label'].apply(label_encoder)

In [46]:
# Preview the training data after cleaning and label encoding
train.head()

Unnamed: 0,url,url_len,geo_loc,tld,who_is,https,js_len,content,label
0,members tripod com russiastation,40,Taiwan,com,complete,yes,58.0,named charged particles manly aspect francis g...,0
1,ddj com cpp 184403822,32,United States,com,complete,yes,52.5,filipino field betatron criticized defense pro...,0
2,naef-usa com,24,Argentina,com,complete,yes,103.5,took cognitivism whose adherents argue overall...,0
3,ff-b2b de,21,United States,de,incomplete,no,720.0,fire cumshot sodomize footaction tortur failed...,1
4,us imdb com title tt0176269,35,United States,com,complete,yes,46.5,levant also monsignor georges 1800 lists respe...,0


In [47]:
# Preview the test data after cleaning and label encoding
test.head()

Unnamed: 0,url,url_len,geo_loc,tld,who_is,https,js_len,content,label
0,dutchthewiz com freeware,36,China,com,complete,yes,38.5,decay suggest 1315 current constitution cathed...,0
1,collectiblejewels com,32,Sweden,com,incomplete,yes,187.0,breast addict nudger whash ky darkie catholics...,0
2,deadlinedata com,27,France,com,complete,yes,31.0,natos military stoic philosophy says accept pr...,0
3,mil fi maavoimat kalustoesittely 00116_en dsp,56,Australia,fi,complete,yes,152.0,night newton according formation transformatio...,0
4,avclub com content node 24539,40,China,com,complete,yes,150.0,34 per two children exercise simple practices ...,0


# 5. Création du modèle

In [48]:
# Keyword arguments shared by the TfidfVectorizer steps of the 'content' and 'url' branches
tfidf_params = dict(
    min_df=5,
    max_df=0.95,
    ngram_range=(1, 2),
    stop_words='english',
    max_features=100000,
)

In [49]:
# Keyword arguments shared by the MiniBatchKMeans steps of the 'content' and 'url' branches
kmeans_params = dict(
    n_clusters=4,
    init='k-means++',
    init_size=2048,
    batch_size=4096,
    random_state=20,
)

In [50]:
# Preprocessing: one branch per group of columns, assembled in a ColumnTransformer.

# Text branch for 'content': TF-IDF vectorisation followed by a MiniBatchKMeans step.
content_branch = Pipeline([
    ('tfidf', TfidfVectorizer(**tfidf_params)),
    ('kmeans', MiniBatchKMeans(**kmeans_params)),
])

# Text branch for 'url': same two steps, fitted independently of the 'content' branch.
url_branch = Pipeline([
    ('tfidf', TfidfVectorizer(**tfidf_params)),
    ('kmeans', MiniBatchKMeans(**kmeans_params)),
])

# Categorical columns are ordinal-encoded; categories not seen during fit are encoded as -1.
categorical_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

preprocessor = ColumnTransformer(
    transformers=[
        ('content_tfidf_kmeans', content_branch, 'content'),
        ('url_tfidf_kmeans', url_branch, 'url'),
        ('categorical', categorical_encoder, ['geo_loc', 'tld', 'who_is', 'https']),
        # Numeric columns are forwarded unchanged.
        ('passthrough', 'passthrough', ['js_len', 'url_len']),
    ],
    # remainder='passthrough' forwards any column not listed above unchanged;
    # it does NOT drop them. Every feature column shown in train.head() is
    # listed above, so nothing is expected to reach the remainder.
    remainder='passthrough',
)

In [51]:
# Full model: column preprocessing followed by an XGBoost classifier with default settings
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', XGBClassifier()),
])

# 6. Entraînement du modèle

In [52]:
# Feature matrix: every training column except the target 'label'
X_train = train.drop(columns=['label'])

In [53]:
# Target vector: the encoded 'label' column
y_train = train['label']

In [54]:
# Fit the whole pipeline (preprocessing + classifier) on the training data
pipeline.fit(X_train, y_train)

# 7. Évaluation du modèle

In [55]:
# Predict labels for the test set (the target column is removed before prediction)
y_pred = pipeline.predict(test.drop(columns=['label']))

In [56]:
# Accuracy of the predictions on the test set
accuracy = accuracy_score(test['label'], y_pred)

In [57]:
# Confusion matrix of true test labels versus predictions
confusion = confusion_matrix(test['label'], y_pred)

In [58]:
# Per-class precision / recall / F1 text report
report = classification_report(test['label'], y_pred)

In [59]:
# Show the test accuracy. The classes are heavily imbalanced (see the support
# column of the classification report), so read it together with the per-class metrics.
print(f"Score: {accuracy}")

Score: 0.9995026717578342


In [60]:
# Show the confusion matrix computed on the test set
print(f"Matrice de confusion:\n{confusion}")

Matrice de confusion:
[[353798     74]
 [   106   7956]]


In [61]:
# Show the per-class classification report computed on the test set
print(f"Rapport de classification:\n{report}")

Rapport de classification:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    353872
           1       0.99      0.99      0.99      8062

    accuracy                           1.00    361934
   macro avg       1.00      0.99      0.99    361934
weighted avg       1.00      1.00      1.00    361934



# 8. Sauvegarde du modèle

In [62]:
# Persist the fitted pipeline with joblib.
# NOTE(review): the saved object contains only the ColumnTransformer and the classifier.
# The 50-word truncation, clean_url and clean_text were applied to the frames before
# fitting, so inputs at inference time presumably need the same steps — confirm with the consumer.
dump(pipeline, '../models/model.joblib')

['../models/model.joblib']