### Préparation des données

In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# receiver, sender, sender pseudo, message Id, scenario , a supprimé
def import_dataset(dataset_path, usecols):
    data = pd.read_csv(
        dataset_path,
        usecols=usecols,
        index_col=False
    )
    return data


def clean_dataset(dataset):
    print("Nombre de lignes avant nettoyage : ", dataset.shape[0])

    # On remplace les données infinies par nan si elles existent
    dataset.replace([np.inf, -np.inf], np.nan, inplace=True)

    # Drop les lignes avec nan

    dataset.dropna(inplace=True)
    print("Nombre de lignes après nettoyage : ", dataset.shape[0])


def sample_dataset(dataset, sample_nb):
    return dataset.sample(sample_nb)


def data_preparation(dataset, test_size=0.1):
    # Transformation en array numpy
    X = np.array(dataset.drop(["label"], axis=1))
    y = np.array(dataset["label"])

    for i in range(len(y)):
        if y[i] == 13:
            y[i] = 1

    # Séparation en données d'entrainement et de test
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

    return X_train, X_test, y_train, y_test



In [10]:
#liste des path :
# "./data/1416/merged1.csv"
# "./data/1416/merged2.csv"
# "./data/1416/merged3.csv"
# "./data/1416/merged4.csv"
# "./data/1416/merged5.csv"
# "./data/1416/data1416.csv"
dataset_path = "./data/1416/merged1.csv"

usecols = [ "sendTime",
            "rcvTime",
            "pos_y_rec_f",
            "pos_y_rec",
            "pos_x_rec_f",
            "pos_x_rec",
            "pos_y_send_f",
            "pos_y_send",
            "pos_x_send_f",
            "pos_x_send",
            "label"
        ]
"""
usecols = [ "sendTime",
            "rcvTime",
            "label"
        ]
"""
data = import_dataset(dataset_path, usecols)
clean_dataset(data)
X_train, X_test, y_train, y_test = data_preparation(data, test_size=0.1)


Nombre de lignes avant nettoyage :  1242779
Nombre de lignes après nettoyage :  1224816


# Classification par clustering k-means
https://realpython.com/k-means-clustering-python/

In [11]:
import matplotlib.pyplot as plt
from kneed import KneeLocator
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics import silhouette_score, classification_report, adjusted_rand_score
from sklearn.preprocessing import StandardScaler

### Standardisation des données

Centrée et normalisé les données :  moyenne de 0 et standard déviation de 1

In [12]:
scaler = StandardScaler()
scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.fit_transform(X_test)

### Préparation et entraiement du modèle kmeans

In [20]:
kmeans = KMeans(init="random", n_clusters=2, n_init=50, max_iter=300)
kmeans.fit(scaled_X_train, y_train)

In [21]:
print("The lowest SSE (sum of the squared error) value",kmeans.inertia_)
print("The number of iterations required to converge", kmeans.n_iter_)
print("Predict labels", kmeans.labels_, "VS True label", y_train)
print("Number of features seen during fit", kmeans.n_features_in_)

The lowest SSE (sum of the squared error) value 6568288.618476442
The number of iterations required to converge 4
Predict labels [1 1 0 ... 1 0 1] VS True label [1. 1. 1. ... 1. 1. 1.]
Number of features seen during fit 10


In [22]:
y_pred=kmeans.predict(scaled_X_test)
print("predicted label", y_pred)
print("True label", y_test)

print(classification_report(y_test, y_pred))

# Adjusted rand index (ARI)
ari_kmeans = adjusted_rand_score(y_train, kmeans.labels_)

print("ARI", ari_kmeans)

predicted label [1 1 0 ... 1 1 0]
True label [0. 1. 1. ... 1. 1. 1.]
              precision    recall  f1-score   support

         0.0       0.41      0.25      0.31     51880
         1.0       0.57      0.74      0.65     70602

    accuracy                           0.53    122482
   macro avg       0.49      0.49      0.48    122482
weighted avg       0.50      0.53      0.50    122482

ARI -0.001867186189751843
