# Etape 1 - Preprocessing du dataset

In [None]:
# Import librairies
import os
import re
import sys

import csv
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import warnings


In [None]:
# Set paths
# Every sub-directory is built with os.path.join so the separator is valid on
# any operating system (a hard-coded "\\" only yields a directory on Windows).
path = "."
os.chdir(path)
data_path = os.path.join(path, "data")
output_path = os.path.join(path, "outputs")
fig_path = os.path.join(path, "figures")

In [None]:
# Silence FutureWarning and DeprecationWarning messages
for ignored_category in (FutureWarning, DeprecationWarning):
    warnings.simplefilter(action='ignore', category=ignored_category)

In [None]:
# Parametres graphiques
%matplotlib inline
rc = {
    'font.size': 14,
    'font.family': 'Arial',
    'axes.labelsize': 14,
    'legend.fontsize': 12,
    'axes.titlesize': 14,
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
    'figure.max_open_warning': 30}

sns.set(font='Arial', rc=rc)
sns.set_style(
    "whitegrid", {
        'axes.edgecolor': 'k',
        'axes.linewidth': 1,
        'axes.grid': True,
        'xtick.major.width': 1,
        'ytick.major.width': 1
        })
sns.set_context(
    "notebook",
    font_scale=1.1,
    rc={"lines.linewidth": 1.5})
pd.set_option('display.max_columns', None)

In [None]:
# Name of the CSV file that is read from the data directory
filename = "working_data_sans_dewey.csv"

In [None]:
# Load the dataset, using the first CSV column as the index
csv_file = os.path.join(data_path, filename)
df = pd.read_csv(csv_file, index_col=0)
print("Dimension of the dataset: ", df.shape)

# Clustering avant échantillonnage

In [None]:
# Sampling
# Never request more rows than the dataset holds (DataFrame.sample raises a
# ValueError when n exceeds the population without replacement), and fix the
# seed so that the same sample is drawn on every run.
nsample = min(10000, len(df))
df_sample = df.sample(n=nsample, random_state=200).reset_index()
df_sample.shape

In [None]:
# Check absence of NA: share of missing values in each column of the sample
df_sample.isna().mean()

In [None]:
# Format the labels
# Each "rameau_concepts" cell is a string that is parsed into a Python object.
# ast.literal_eval only accepts literals (lists, strings, numbers, ...), so
# unlike eval() it cannot execute arbitrary code coming from the CSV file.
import ast

df_sample["target"] = df_sample["rameau_concepts"].apply(ast.literal_eval)
df_sample["target"]

In [None]:
# Encode the keywords: one indicator column per distinct keyword
from sklearn.preprocessing import MultiLabelBinarizer, LabelEncoder

mlb = MultiLabelBinarizer()
keyword_lists = df_sample["target"]
labels_encoded = mlb.fit_transform(keyword_lists)

# Keywords learnt by the binarizer, in the column order of labels_encoded
classes = mlb.classes_
print(classes)

In [None]:
# Check encoding: display the matrix returned by the MultiLabelBinarizer
labels_encoded

In [None]:
# Check inverse transformation
# .loc slicing is label-based and includes its end label, so [:nlab-1] selects
# nlab rows; positional slicing excludes its end, so the encoded matrix must be
# sliced with [:nlab] to compare the same nlab rows.
nlab = 5
labels_true = df_sample.loc[:nlab-1, "rameau_concepts"]
print(f"True {nlab} first labels : {labels_true}")
print(f"Recoded {nlab} first labels : {mlb.inverse_transform(labels_encoded[:nlab])}")

In [None]:
# Check classes: print the first and the last nbr keywords of the class list
nbr = 50
first_keywords, last_keywords = classes[:nbr], classes[-nbr:]
print(f"{first_keywords} premiers mots clés (ordre alphabetique)")
print(f"{last_keywords} derniers mots clés (ordre alphabetique)")

In [None]:
# Clustering based on Kmeans
from sklearn.cluster import KMeans
from sklearn import metrics
from sklearn.metrics import silhouette_score, davies_bouldin_score

In [None]:
# Test different numbers of clusters
# For every k in param_range, fit a KMeans model and record its inertia,
# silhouette score and Davies-Bouldin score.
sil = []
davis_bouldin = []
sum_of_squared_distances = []
param_range = range(3, 20)
for k in param_range:
    print(f"Clustering with {k} groups")
    # Fixed seed (same value as the silhouette sampling below) so the scores,
    # and the optimal k derived from them, are identical from one run to the next
    kmeans = KMeans(n_clusters=k, random_state=200)
    kmeans.fit(labels_encoded)
    labels = kmeans.labels_
    sum_of_squared_distances.append(kmeans.inertia_)
    silh = silhouette_score(
        labels_encoded, labels, metric="euclidean", sample_size=50000, random_state=200
        )
    dav = davies_bouldin_score(labels_encoded, labels)
    sil.append(silh)
    davis_bouldin.append(dav)

In [None]:
# Find the optimal values according to the silhouette score (highest value)
# and the Davies-Bouldin score (lowest value); ties keep the first position
best_sil_position = max(range(len(sil)), key=sil.__getitem__)
best_db_position = min(range(len(davis_bouldin)), key=davis_bouldin.__getitem__)
opt_val_sil = param_range[best_sil_position]
opt_val_db = param_range[best_db_position]

In [None]:
def plot_metrics(
    sil, davis_bouldin, param_name, param_range, silhouette_color="red", db_color="blue"
):
    """
    Draw the silhouette and Davies-Bouldin scores against a hyper-parameter range.

    Both curves share the x axis; the silhouette score is drawn on the first
    y axis and the Davies-Bouldin score on a twin y axis.

    Parameters:
    -----------
        - sil (list): silhouette score for each value of the hyper-parameter range
        - davis_bouldin (list): Davies-Bouldin score for each value of the range
        - param_name (str): name of the hyper-parameter, used as x-axis label
        - param_range (list): hyper-parameter values, used as x coordinates
        - silhouette_color (str): color of the silhouette curve and axis (default: 'red')
        - db_color (str): color of the Davies-Bouldin curve and axis (default: 'blue')

    Returns :
    ---------
        - None; the figure is displayed with plt.show()
    """

    def draw_score(axis, score_name, scores, color):
        # Label the y axis, draw the curve and color the y tick labels alike
        axis.set_ylabel(score_name, color=color)
        axis.plot(param_range, scores, color=color)
        axis.tick_params(axis="y", labelcolor=color)

    # First axis: silhouette score
    _, silhouette_axis = plt.subplots()
    silhouette_axis.set_xlabel(param_name)
    draw_score(silhouette_axis, "Silhouette_score", sil, silhouette_color)

    # Twin axis sharing the same x axis: Davies-Bouldin score
    draw_score(silhouette_axis.twinx(), "Davies_bouldin", davis_bouldin, db_color)

    plt.show()

In [None]:
# Silhouette and Davies-Bouldin scores as a function of the number of clusters
plot_metrics(
    sil,
    davis_bouldin,
    param_name="k",
    param_range=param_range,
    silhouette_color="red",
    db_color="blue",
)

In [None]:
# Elbow plot: KMeans inertia for each tested number of clusters
plt.plot(param_range, sum_of_squared_distances, 'bx-')
plt.title('elbow method for optimal k')
plt.ylabel('sum_of_squared_distances')
plt.xlabel('k')
plt.show()

In [None]:
# Best clustering
# The number of clusters is set by hand; assign `opt_val_sil` to k instead to
# use the value that maximises the silhouette score.
k = 4
# Fixed seed so the cluster assignment is the same on every run
kmeans = KMeans(n_clusters=k, random_state=200)
kmeans.fit(labels_encoded)

In [None]:
# Silhouette plot of the clustering held in `kmeans`
from yellowbrick.cluster import SilhouetteVisualizer
print("Graph des Silhouettes\n")
# Wrap the KMeans model, fit the visualizer on the encoded labels, then render
silhouette_vis = SilhouetteVisualizer(kmeans)
silhouette_vis.fit(labels_encoded)
# NOTE(review): `poof` appears to be the legacy name of `show` in recent
# yellowbrick releases - confirm against the installed version
silhouette_vis.poof()

In [None]:
# Plot clusters
# Compute the t-SNE embedding (PCA initialisation) of the encoded labels
import sklearn
from sklearn.manifold import TSNE

tsne_model = TSNE(init="pca")
tsne = tsne_model.fit_transform(labels_encoded)

In [None]:
# Plot TSNE
# Scatter plot of the two t-SNE coordinates, colored by KMeans cluster
plt.figure(figsize=(10, 10))
axe = plt.axes()
clusters = kmeans.labels_
num_classes = k
palette = np.array(sns.color_palette("tab10", num_classes))
# One scatter call per cluster so that every cluster gets a labelled legend
# entry; a single unlabelled scatter leaves plt.legend() with nothing to show.
for cluster_id in range(num_classes):
    in_cluster = clusters == cluster_id
    axe.scatter(
        x=tsne[in_cluster, 0],
        y=tsne[in_cluster, 1],
        color=tuple(palette[cluster_id]),
        label=f"Cluster {cluster_id}",
    )
plt.legend()

In [None]:
# Store the cluster of every sampled row and save the result in the data
# directory; the file name carries the actual number of sampled rows instead
# of a hard-coded 10000.
df_sample["clusters"] = clusters
result_name = f"clustering_result_{len(df_sample)}_notices.csv"
df_sample.to_csv(os.path.join(data_path, result_name))