# Etape 1 - Preprocessing du dataset

In [None]:
# Import librairies
import os
import re
import sys

import csv
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import warnings

from collections import Counter

In [None]:
# Set paths
# Sub-directories are built with os.path.join so that the separator matches
# the host OS (a hard-coded "\\" only resolves on Windows).
path = "."
os.chdir(path)
data_path = os.path.join(path, "data")
output_path = os.path.join(path, "outputs")
fig_path = os.path.join(path, "figures")

In [None]:
# Silence FutureWarning and DeprecationWarning messages for the session
for ignored_category in (FutureWarning, DeprecationWarning):
    warnings.simplefilter(action='ignore', category=ignored_category)

In [None]:
# Enable PEP 8 checking through the pycodestyle_magic IPython extension
%load_ext pycodestyle_magic
%pycodestyle_on

In [None]:
# Parametres graphiques
%matplotlib inline
rc = {
    'font.size': 14,
    'font.family': 'Arial',
    'axes.labelsize': 14,
    'legend.fontsize': 12,
    'axes.titlesize': 14,
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
    'figure.max_open_warning': 30}

sns.set(font='Arial', rc=rc)
sns.set_style(
    "whitegrid", {
        'axes.edgecolor': 'k',
        'axes.linewidth': 1,
        'axes.grid': True,
        'xtick.major.width': 1,
        'ytick.major.width': 1
        })
sns.set_context(
    "notebook",
    font_scale=1.1,
    rc={"lines.linewidth": 1.5})
pd.set_option('display.max_columns', None)

In [None]:
# Load the export: a UTF-8, tab-separated text file read into a list of rows
filename = "export_sans_dewey.dsv"
with open(
        os.path.join(data_path, filename),
        mode='r', newline='', encoding="utf-8") as csv_file:
    csv_reader = csv.reader(csv_file, delimiter='\t')
    data = [row for row in csv_reader]

In [None]:
# Preview the first rows only: echoing the whole list would dump every row of
# the export into the cell output.
data[:5]

In [None]:
# Number of rows read from the file. The first row is used further down as
# the column header, so it is included in this count.
nb_notice = len(data)
print(f"There are {nb_notice} rows in this file")

In [None]:
# Number of fields found in each row of the file
len_col = [len(row) for row in data]
# default=0 keeps the cell usable on an empty file instead of raising
max_number_col = max(len_col, default=0)
print(f"There are up to {max_number_col} columns per row in this file")

In [None]:
# Show the distribution of the number of fields per row
sns.histplot(len_col)

In [None]:
# Checking and removing badly formatted notices
official_nb_col = 5
bad_formated_notices = [x for x in len_col if x != official_nb_col]
print(f"There are {len(bad_formated_notices)} badly formatted notices")

ids_to_keep = [True if x == official_nb_col else False for x in len_col]
print(f"There are {sum(ids_to_keep)} well formatted rows")

data = [row for row, boo in zip(data, ids_to_keep) if boo]
print(f"Working dataset has {len(data)-1} notices")

In [None]:
# Creation du fichier de données à vérifier
data_to_check = [row for row, boo in zip(data, ids_to_keep) if not boo]
print(f"Need to check {len(data_to_check)} notices extractions")

data_to_check = pd.DataFrame(data_to_check)
print(data_to_check.shape)
data_to_check.head()

In [None]:
# Write the notices to check to a CSV file, without the index column
check_file = os.path.join(data_path, "data_to_check_LargeExtraction.csv")
data_to_check.to_csv(check_file, index=False)

In [None]:
# Build the working DataFrame: the first row provides the column names,
# the remaining rows are the records.
column_names = data[0]
records = data[1:]
df = pd.DataFrame(records, columns=column_names)
print(f"le Fichier de données contient {df.shape[0]} lignes et  {df.shape[1]} colonnes")

In [None]:
# Preview the first rows of the DataFrame
df.head()

In [None]:
# Add a description column: title and summary joined by a single space
df["DESCR"] = df["TITRE"] + ' ' + df["RESUME"]

# Get RAMEAU labels (vedettes)

In [None]:
# Flatten ALL the keywords
# NOTE(review): `pattern` is compiled here but not used by this cell.
pattern = re.compile(r"[\w;^\s]| -- ")
# Separators between labels: ' -- ', or a ';' with optional trailing
# whitespace, except when a ')' follows with no other parenthesis in between
# (i.e. the ';' sits inside a parenthesised group).
rameau_separator = re.compile(r';\s*(?![^()]*\))| -- ')
df["rameau_list_unstack"] = df["RAMEAU"].apply(rameau_separator.split)
df.loc[1:20, ["RAMEAU", "rameau_list_unstack"]]

In [None]:
def flatten(l):
    """Concatenate the items of every sub-iterable of `l` into one list.

    Only one level of nesting is removed; the order of the items is kept.
    """
    flat = []
    for sublist in l:
        flat.extend(sublist)
    return flat

In [None]:
# Pool the labels of every notice, then count the distinct ones
keywords = flatten(df['rameau_list_unstack'])
unique_keywords = set(keywords)
print(f"There are {len(unique_keywords)} different RAMEAU labels (Vedettes)")

In [None]:
# Distribution of the labels: number of occurrences of each one
print(Counter(keywords))

In [None]:
from wordcloud import WordCloud
# Word cloud built from the number of occurrences of each label
keyword_counts = Counter(keywords)
wordcloud = WordCloud(
    width=1000, height=500, background_color='white'
    ).generate_from_frequencies(keyword_counts)
plt.figure(figsize=(15, 8))
plt.imshow(wordcloud)

### Retirer les notices avec les mots clés "ouvrages pour la jeunesse" et "romans pour la jeunesse"

In [None]:
def check_strings(df, col, string):
    """Flag the rows of `df` whose `col` value contains `string`.

    Containment is evaluated with the ``in`` operator on each value: exact
    element membership when the value is a list, substring search when it is
    a str. The number of matching rows is printed.

    Args:
        df: DataFrame holding the column to inspect.
        col: Name of the column to inspect.
        string: Value looked for in each cell of the column.

    Returns:
        A Series aligned on `df` holding the result of ``string in value``
        for each row.
    """
    res = df[col].apply(lambda x: string in x)
    print(f"Nbre de notices contenant le concept {string} : {sum(res)}")
    return res

In [None]:
# Flag the notices whose label list contains "Ouvrages pour la jeunesse"
string = "Ouvrages pour la jeunesse"
col = "rameau_list_unstack"
is_string = check_strings(df, col, string)

In [None]:
# Flag the notices whose label list contains "Roman pour la jeunesse"
# NOTE(review): the section heading spells this label in the plural ("romans
# pour la jeunesse"); the lookup is an exact match on list items, so confirm
# the spelling actually used in the data.
string = "Roman pour la jeunesse"
col = "rameau_list_unstack"
is_string2 = check_strings(df, col, string)

In [None]:
# Reduce the dataset: keep the notices flagged by neither of the two checks.
# Boolean masks are combined with `|` and negated with `~`; the previous
# `(a + b) == 0` form relied on arithmetic over booleans to get the same rows.
df_reduced = df[~(is_string | is_string2)]
print(df_reduced.shape)

In [None]:
# Word cloud of the labels remaining in the reduced dataset
keywords2 = flatten(df_reduced['rameau_list_unstack'])
keyword_counts2 = Counter(keywords2)
cloud_builder = WordCloud(width=1000, height=500, background_color='white')
wordcloud2 = cloud_builder.generate_from_frequencies(keyword_counts2)
plt.figure(figsize=(15, 8))
plt.imshow(wordcloud2)

# Clustering avant échantillonnage

In [None]:
# Sampling
nsample = 100000
# The draw is capped at the number of available rows (sampling more rows than
# exist without replacement raises ValueError) and seeded so that reruns pick
# the same notices. reset_index() keeps the former index in an "index" column.
df_sample = df_reduced.sample(
    n=min(nsample, len(df_reduced)), random_state=42).reset_index()
df_sample.shape

In [None]:
# Encode the label lists of the sample as a binary indicator matrix
# (one row per notice, one column per distinct label)
from sklearn.preprocessing import MultiLabelBinarizer, LabelEncoder
mlb = MultiLabelBinarizer()
labels_encoded = mlb.fit_transform(df_sample["rameau_list_unstack"])
# Labels known to the encoder, used below to inspect the vocabulary
classes = mlb.classes_

In [None]:
# Check encoding: display the encoded label matrix
labels_encoded

In [None]:
# Check inverse transformation
nlab = 5
# Positional slice driven by nlab: the former .loc[:5] was label-based and
# inclusive, so it returned six rows while the encoded matrix below is cut
# to nlab rows.
labels_true = df_sample["rameau_list_unstack"].iloc[:nlab]
print(f"True {nlab} first labels : {labels_true}")
print(f"Recoded {nlab} first labels : {mlb.inverse_transform(labels_encoded[:nlab])}")

In [None]:
# Peek at both ends of the label vocabulary
nbr = 50
first_classes, last_classes = classes[:nbr], classes[-nbr:]
print(f"{first_classes} premiers mots clés (ordre alphabetique)")
print(f"{last_classes} derniers mots clés (ordre alphabetique)")

In [None]:
# Clustering based on Kmeans
from sklearn.cluster import KMeans
from sklearn import metrics

In [None]:
# Fit KMeans for a range of cluster counts and record the inertia of each fit
sum_of_squared_distances = []
K = range(1, 30)
# NOTE: `k` keeps the last value of the range once this loop has finished
for k in K:
    print(f"Clustering with {k} groups")
    # random_state makes the curve reproducible from one run to the next;
    # n_init is pinned because its default depends on the scikit-learn
    # version in use.
    k_means = KMeans(n_clusters=k, n_init=10, random_state=42)
    model = k_means.fit(labels_encoded)
    sum_of_squared_distances.append(k_means.inertia_)


In [None]:
# Elbow curve: inertia against the number of clusters
ax = plt.gca()
ax.plot(K, sum_of_squared_distances, 'bx-')
ax.set_xlabel('k')
ax.set_ylabel('sum_of_squared_distances')
ax.set_title('elbow method for optimal k')
plt.show()

In [None]:
# Final clustering with the retained number of clusters
k = 5
# Seeded and with an explicit n_init so that the cluster assignment is the
# same on every run and does not depend on the scikit-learn version default.
k_means = KMeans(n_clusters=k, n_init=10, random_state=42)
model = k_means.fit(labels_encoded)

In [None]:
# Silhouette plot of the clustering
from yellowbrick.cluster import SilhouetteVisualizer
print("Graph des Silhouettes\n")
silhouette_vis = SilhouetteVisualizer(model)
silhouette_vis.fit(labels_encoded)
# show() is the current name of the rendering call; poof() is its deprecated
# alias, and the deprecation notice is hidden by the warning filters set at
# the top of this notebook.
silhouette_vis.show()

In [None]:
# Display the cluster label assigned to each row of labels_encoded
model.labels_

In [None]:
# Compute the t-SNE embedding (PCA initialisation) of the encoded labels;
# it is drawn in the next cell.
import sklearn
from sklearn.manifold import TSNE
tsne_model = TSNE(init="pca")
tsne = tsne_model.fit_transform(labels_encoded)

In [None]:
# Plot the t-SNE embedding, one colour per KMeans cluster
plt.figure(figsize=(10, 10))
axe = plt.axes()
clusters = model.labels_
# Read the cluster count from the fitted model: the global `k` is also the
# loop variable of the elbow search and may hold another value.
num_classes = model.n_clusters
palette = np.array(sns.color_palette("tab10", num_classes))
# One scatter call per cluster so that each one carries a label; a single
# unlabelled scatter left the legend empty.
for cluster_id in range(num_classes):
    in_cluster = clusters == cluster_id
    axe.scatter(
        x=tsne[in_cluster, 0],
        y=tsne[in_cluster, 1],
        color=tuple(palette[cluster_id]),
        label=f"Cluster {cluster_id}")
axe.legend()