# Étape 2 - Exploration des données (y compris prétraitement)

In [None]:
# Import librairies
import os
import re
import pandas as pd
import nltk
import numpy as np
import pyLDAvis
import pyLDAvis.sklearn
import seaborn as sns
import warnings

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

from utils_text_processing import *

In [None]:
# Silence FutureWarning and DeprecationWarning messages for the session
for _ignored_category in (FutureWarning, DeprecationWarning):
    warnings.simplefilter(action='ignore', category=_ignored_category)

In [None]:
# Enable pyLDAvis rendering inside the notebook
pyLDAvis.enable_notebook()

In [None]:
# Paramètres graphiques
%matplotlib inline
rc = {
    'font.size': 14,
    'font.family': 'Arial',
    'axes.labelsize': 14,
    'legend.fontsize': 12,
    'axes.titlesize': 14,
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
    'figure.max_open_warning': 30}

sns.set(font='Arial', rc=rc)
sns.set_style(
    "whitegrid", {
        'axes.edgecolor': 'k',
        'axes.linewidth': 1,
        'axes.grid': True,
        'xtick.major.width': 1,
        'ytick.major.width': 1
        })
sns.set_context(
    "notebook",
    font_scale=1.1,
    rc={"lines.linewidth": 1.5})

pd.set_option('display.max_columns', None)
dpi = 300

In [None]:
# Load the working dataset; the first CSV column is used as the index
working_csv = os.path.join(data_path, "working_data.csv")
df = pd.read_csv(working_csv, index_col=0)
n_rows, n_cols = df.shape
print(f"le Fichier de données contient {n_rows} lignes et  {n_cols} colonnes")

In [None]:
# Preview the first rows of the dataset
df.head()

## Exploration des mots-clés RAMEAU

In [None]:
# Show the RAMEAU field for index labels 1 through 10 (`.loc` slices by label, both ends included)
df.loc[1:10, "RAMEAU"]

In [None]:
# Flag the records whose RAMEAU field contains " -- ", i.e. indexing chains.
# `regex=False` makes this a plain substring test, and `na=False` marks a
# missing RAMEAU value as "no chain" instead of failing on a non-string.
df["test_tiret"] = df["RAMEAU"].str.contains(" -- ", regex=False, na=False)
# The column is already boolean, so it is used directly as the row mask.
df.loc[df["test_tiret"], ["PPN", "RAMEAU"]]

=> 39141 notices avec des chaînes d'indexation

In [None]:
# Split each RAMEAU field into its indexing chains.
# `str.split` treats its argument as a literal separator, not as a regular
# expression, so a pattern has to go through `re.split`.
# The pattern matches a ";" (plus any whitespace after it) that is not inside
# parentheses: the same ";" rule as the keyword-flattening cell further down,
# but without splitting on " -- ", so each chain stays whole.
df["rameau_list"] = df["RAMEAU"].apply(
    lambda x: re.split(r';\s*(?![^()]*\))', x))
print(df.loc[1:10, "rameau_list"])

In [None]:
# Convert the per-record lists to a plain Python list and check the resulting type
rameau_list = df["rameau_list"].tolist()
type(rameau_list)

In [None]:
# Count the indexing chains over all records, and how many of them are distinct
from itertools import chain
rameau_lists = df["rameau_list"].tolist()
# Concatenate the per-record lists into one flat list
rameau_list = list(chain.from_iterable(rameau_lists))
n_chains = len(rameau_list)
n_unique_chains = len(set(rameau_list))
print(f"{n_chains} chaines d'indexation, dont {n_unique_chains} uniques")

In [None]:
# Bar plot of the indexing chains (nb_of_tags=20, presumably the 20 most
# frequent ones), with the figure file written under fig_path
plot_barplot_of_tags(
    tags_list=rameau_list,
    nb_of_tags=20,
    xlabel="Nombre de references",
    ylabel="RAMEAU - Chaines d'indexation",
    figsave=os.path.join(fig_path, 'barplot_Rameau_chaines_index.png'),
    figsize=(8, 8))

In [None]:
# Flatten ALL keywords: split each RAMEAU field both between indexing chains
# (";" outside parentheses, plus any whitespace after it) and inside a chain
# (" -- ").
# The pattern is compiled once and its `split` method is applied to every row.
pattern = re.compile(r';\s*(?![^()]*\))| -- ')
df["rameau_list_unstack"] = df["RAMEAU"].apply(pattern.split)
df.loc[1:10, "rameau_list_unstack"]

In [None]:
# Flatten the keyword lists of index labels 1 to 10
# (`flatten` is not defined in this notebook; presumably it comes from utils_text_processing)
flatten(df.loc[1:10, 'rameau_list_unstack'])

In [None]:
# Flatten the keyword lists of every record into `keywords` and print the result
keywords = flatten(df['rameau_list_unstack'])
print(keywords)

In [None]:
# Number of distinct keywords
len(set(keywords))

In [None]:
# Print the occurrence count of each keyword
print(Counter(keywords))

In [None]:
# Word cloud of the flattened RAMEAU keywords, weighted by their frequency.
# The class is imported explicitly (a bare `import wordcloud` does not bind
# `WordCloud`) and the result gets its own name so the module is not shadowed.
# NOTE(review): `Counter` and `plt` are not imported in this notebook's import
# cell; presumably they come from `from utils_text_processing import *`. Confirm.
from wordcloud import WordCloud
keyword_cloud = WordCloud(
    width=1000, height=500, background_color='white'
).generate_from_frequencies(Counter(keywords))
plt.figure(figsize=(15, 8))
plt.imshow(keyword_cloud)

In [None]:
# Bar plot of the flattened RAMEAU keywords (nb_of_tags=20).
# The figure path is built from `fig_path`, like the other figures saved by
# this notebook, instead of a hard-coded relative 'figures/' directory.
plot_barplot_of_tags(
    tags_list=keywords,
    nb_of_tags=20,
    xlabel="Nombre de references",
    ylabel="RAMEAU - Mots clés",
    figsave=os.path.join(fig_path, 'barplot_Rameau_keywords_unstack.png'),
    figsize=(8, 8))

## Exploration des domaines DDC

In [None]:
# Bar plot of the TEF labels (nb_of_tags=20).
# The figure path is built from `fig_path`, like the other figures saved by
# this notebook, instead of a hard-coded relative 'figures/' directory.
plot_barplot_of_tags(
    tags_list=df["TEF_LABEL"],
    nb_of_tags=20,
    xlabel="Nombre de references",
    ylabel="Libellés TEF",
    figsave=os.path.join(fig_path, 'barplot_libelles_TEF.png'),
    figsize=(8, 8))

In [None]:
# (rows, columns) of the records whose TEF label is "Sciences sociales, sociologie, anthropologie"
df[df["TEF_LABEL"] == "Sciences sociales, sociologie, anthropologie"].shape

In [None]:
# (rows, columns) of the records whose TEF label is "Droit"
df[df["TEF_LABEL"] == "Droit"].shape

In [None]:
# (rows, columns) of the records whose TEF label is "Sport"
df[df["TEF_LABEL"] == "Sport"].shape

In [None]:
# Save the DataFrame, including the RAMEAU columns added in the cells above, to a new CSV file under data_path
df.to_csv(os.path.join(data_path, "working_data_rameau.csv"))

## Exploration des titres

In [None]:
# Distinct entries of the French stop-word list
# (`stopwords` is not imported in this notebook; presumably nltk.corpus.stopwords via utils_text_processing)
list(set(stopwords.words("french")))

In [None]:
# Pick one title (index label 1045) to test the preprocessing function on
idx = 1045
text = df.loc[idx, 'TITRE']
print(text)

In [None]:
# Run the `nlp` pipeline on a sample title and print its lemmas joined by spaces
doc = nlp("Éloge de la folie : Adages : Colloques : Réflexions sur l'éducation, la religion, la guerre, la philosophie")
lemmas = [token.lemma_ for token in doc]
final_string = ' '.join(map(str, lemmas))
print(final_string)

In [None]:
# Display the object returned by `nlp` for the same sample title
doc = nlp("Éloge de la folie : Adages : Colloques : Réflexions sur l'éducation, la religion, la guerre, la philosophie" )
doc

In [None]:
# Extra words handed to `preprocess_text` (presumably additional stop words; confirm in utils_text_processing)
add_words = ["la", "de", "le", "les", "l", "au", "du"]

# Try the preprocessing on the title selected in the test cell above
preprocess_text(
    text,
    add_words,
    numeric=False,
    stopw=True,
    stem=False,
    lem=True,
)

In [None]:
# Test on a random sample of records.
# `random_state` fixes the draw so that the hard-coded `.iloc[idx]` examples
# and the seeded LDA models in the following cells see the same rows on every
# run. The size is capped at the number of rows because sampling more rows
# than exist, without replacement, raises a ValueError.
df_sample = df.sample(n=min(20000, len(df)), random_state=42)

In [None]:
# (rows, columns) of the sample
df_sample.shape

In [None]:
# Preprocess the titles: `Series.apply` forwards the keyword options to
# `preprocess_text` for every title
title_options = dict(
    add_words=add_words, numeric=False, stopw=True, stem=False, lem=True)
df_sample['TITRE_processed'] = df_sample['TITRE'].apply(
    preprocess_text, **title_options)

In [None]:
# Example: print a raw title next to its processed version
# (idx is a position within the sample, since `.iloc` is used)
idx = 300
print("Titre brut: \n", df_sample['TITRE'].iloc[idx])
print("\nTitre après processing :\n", df_sample['TITRE_processed'].iloc[idx])

In [None]:
# Word cloud of the processed titles
plot_wordcloud(df_sample['TITRE_processed'])

### Exploration des topics (pyLDAvis)

In [None]:
# Vectorization of the processed titles: TF-IDF over 1- to 5-grams,
# limited to 300 features
feature = 'TITRE_processed'
tfidf_params = {
    "max_features": 300,
    "ngram_range": (1, 5),
    "min_df": 10,
    "max_df": 0.95,
}
model = TfidfVectorizer(**tfidf_params)
cv_transform = model.fit_transform(df_sample[feature])
print("Dimensions de la matrice", cv_transform.shape)

In [None]:
# Dimension reduction: fit a 15-component LDA on the vectorized titles and
# keep the transformed matrix in `x_red`
n_comp = 15
lda_params = {
    "n_components": n_comp,
    "learning_method": "online",
    "random_state": 42,
}
lda = LatentDirichletAllocation(**lda_params)
x_red = lda.fit_transform(cv_transform)

In [None]:
# Build the pyLDAvis visualisation from the fitted LDA, the vectorized matrix
# and the vectorizer, then display it
p = pyLDAvis.sklearn.prepare(lda, cv_transform, model)
p

In [None]:
# Save the title visualisation as an HTML file under fig_path
pyLDAvis.save_html(p, os.path.join(fig_path, "pyldavis_titres_lemma.html"))

## Exploration des résumés

In [None]:
# Preprocess the abstracts: `Series.apply` forwards the keyword options to
# `preprocess_text` for every abstract
abstract_options = dict(
    add_words=add_words, numeric=False, stopw=True, stem=False, lem=True)
df_sample['RESUME_processed'] = df_sample['RESUME'].apply(
    preprocess_text, **abstract_options)

In [None]:
# Example: print a raw abstract next to its processed version
# (idx is a position within the sample, since `.iloc` is used)
idx = 12945
print("Résumé brut: \n", df_sample['RESUME'].iloc[idx])
print("\nRésumé après processing :\n", df_sample['RESUME_processed'].iloc[idx ])

In [None]:
# Word cloud of the processed abstracts
plot_wordcloud(df_sample['RESUME_processed'])

In [None]:
# Vectorization of the processed abstracts: TF-IDF over 1- to 5-grams,
# limited to 300 features
feature = 'RESUME_processed'
tfidf_params = {
    "max_features": 300,
    "ngram_range": (1, 5),
    "min_df": 10,
    "max_df": 0.95,
}
model = TfidfVectorizer(**tfidf_params)
cv_transform = model.fit_transform(df_sample[feature])
print("Dimensions de la matrice", cv_transform.shape)

In [None]:
# Dimension reduction: fit a 15-component LDA on the vectorized abstracts and
# keep the transformed matrix in `x_red`
n_comp = 15
lda_params = {
    "n_components": n_comp,
    "learning_method": "online",
    "random_state": 42,
}
lda = LatentDirichletAllocation(**lda_params)
x_red = lda.fit_transform(cv_transform)

In [None]:
# Build the pyLDAvis visualisation from the fitted LDA, the vectorized matrix
# and the vectorizer, then display it
p = pyLDAvis.sklearn.prepare(lda, cv_transform, model)
p

In [None]:
# Save the abstract visualisation as an HTML file under fig_path
pyLDAvis.save_html(p, os.path.join(fig_path, "pyldavis_resumes_lemma.html"))

## Exploration des descriptions (titre + résumé)

In [None]:
# Preprocess the descriptions (DESCR column): `Series.apply` forwards the
# keyword options to `preprocess_text` for every description
description_options = dict(
    add_words=add_words, numeric=False, stopw=True, stem=False, lem=True)
df_sample['DESCR_processed'] = df_sample['DESCR'].apply(
    preprocess_text, **description_options)

In [None]:
# Example: print a raw description next to its processed version
# (idx is a position within the sample, since `.iloc` is used)
idx = 6549
print("Description brute: \n", df_sample['DESCR'].iloc[idx])
print("\nDescription après processing :\n", df_sample['DESCR_processed'].iloc[idx])

In [None]:
# Word cloud of the processed descriptions
plot_wordcloud(df_sample['DESCR_processed'])

In [None]:
# Vectorization of the processed descriptions: TF-IDF over 1- to 5-grams,
# limited to 300 features
feature = 'DESCR_processed'
tfidf_params = {
    "max_features": 300,
    "ngram_range": (1, 5),
    "min_df": 10,
    "max_df": 0.95,
}
model = TfidfVectorizer(**tfidf_params)
cv_transform = model.fit_transform(df_sample[feature])
print("Dimensions de la matrice", cv_transform.shape)

In [None]:
# Dimension reduction: fit a 15-component LDA on the vectorized descriptions
# and keep the transformed matrix in `x_red`
n_comp = 15
lda_params = {
    "n_components": n_comp,
    "learning_method": "online",
    "random_state": 42,
}
lda = LatentDirichletAllocation(**lda_params)
x_red = lda.fit_transform(cv_transform)

In [None]:
# Build the pyLDAvis visualisation from the fitted LDA, the vectorized matrix
# and the vectorizer, then display it
p = pyLDAvis.sklearn.prepare(lda, cv_transform, model)
p

In [None]:
# Save the description visualisation as an HTML file under fig_path
pyLDAvis.save_html(p, os.path.join(fig_path, "pyldavis_description_lemma.html"))

# Exploration TEF labels