# Exploration des données textuelles (titres et/ou résumés)

Ce notebook explore les concepts RAMEAU.

# Set project

### Packages

In [None]:
# Import librairies
import os
import re
import pandas as pd
import nltk
import numpy as np
import pyLDAvis
import pyLDAvis.sklearn
import seaborn as sns
import spacy
import warnings

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

from utils_text_processing import *
from utils_visualization import *


# Load the spaCy pipeline "fr_core_news_md"; it is used further down to
# lemmatize a sample title.
nlp = spacy.load("fr_core_news_md")

In [None]:
# Silence FutureWarning and DeprecationWarning messages for the session
for _ignored_category in (FutureWarning, DeprecationWarning):
    warnings.simplefilter(action='ignore', category=_ignored_category)

### Graphical parameters

In [None]:
# Graphical parameters
# IPython magic: render matplotlib figures inline in the notebook
%matplotlib inline
# matplotlib rc overrides (font family and sizes, open-figure warning threshold)
rc = {
    'font.size': 14,
    'font.family': 'Arial',
    'axes.labelsize': 14,
    'legend.fontsize': 12,
    'axes.titlesize': 14,
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
    'figure.max_open_warning': 30}

# The three seaborn calls below are applied in this order: global theme with
# the rc overrides, then the grid style, then the plotting context.
sns.set(font='Arial', rc=rc)
sns.set_style(
    "whitegrid", {
        'axes.edgecolor': 'k',
        'axes.linewidth': 1,
        'axes.grid': True,
        'xtick.major.width': 1,
        'ytick.major.width': 1
        })
sns.set_context(
    "notebook",
    font_scale=1.1,
    rc={"lines.linewidth": 1.5})
# Show every column when a DataFrame is displayed
pd.set_option('display.max_columns', None)


In [None]:
# Enable pyLDAvis rendering inside the notebook
pyLDAvis.enable_notebook()

### Paths

In [None]:
# Set paths
# Project root; every other location is derived from it.
path = "."
os.chdir(path)
# Build the sub-directory paths with os.path.join so the separator matches the
# host OS (a hard-coded "\\" only resolves on Windows).
data_path = os.path.join(path, "data")
output_path = os.path.join(path, "outputs")
fig_path = os.path.join(path, "figs")

### Useful functions

# Import data

In [None]:
# Input data
# Name of the CSV file; it is joined to data_path when the data is loaded
working_data_filename = "working_data_sans_dewey.csv"
# NOTE(review): this flag is not read anywhere in the visible part of the
# notebook — confirm it is still needed
analyse_dewey = False

In [None]:
# Load the working dataset; the first CSV column is used as the index
working_data_file = os.path.join(data_path, working_data_filename)
df = pd.read_csv(working_data_file, index_col=0)
n_rows, n_cols = df.shape
print(f"le Fichier de données contient {n_rows} lignes et  {n_cols} colonnes")

In [None]:
# Preview the first rows of the dataset
df.head()

# Exploration des titres

In [None]:
# List of the French stop words (duplicates dropped by the set round-trip)
# NOTE(review): `stopwords` is not imported explicitly in this notebook —
# presumably provided by the wildcard import from utils_text_processing; confirm.
list(set(stopwords.words("french")))

In [None]:
# Pick one record by index label and display its raw title;
# `text` is reused by the lemmatization and preprocessing examples below
idx = 698
text = df.loc[idx, 'TITRE']
print(text)

In [None]:
# Lemmatization example on the sample title
doc = nlp(text)
print("Sans lemmatization :", doc)

# Replace every token by its lemma and rebuild a space-separated string
final_string = ' '.join(token.lemma_ for token in doc)
print("Après lemmatization :", final_string)

In [None]:
# Extra words handed to preprocess_text together with the text to clean
add_words = ["la", "de", "le", "les", "l", "au", "du"]

# Try the preprocessing helper on the sample title.
# NOTE(review): judging by the flag names this enables stop-word removal and
# lemmatization only (no numeric handling, no stemming) — confirm against
# utils_text_processing.
preprocess_text(
    text,
    add_words,
    numeric=False,
    stopw=True,
    stem=False,
    lem=True,
)

In [None]:
# Test on a sample of records
# Cap the sample size at the number of available rows (DataFrame.sample raises
# a ValueError when n exceeds it without replacement) and fix the seed, in line
# with the random_state=42 used for the LDA models, so that the positional
# examples further down always show the same records.
sample_size = min(20000, len(df))
df_sample = df.sample(n=sample_size, random_state=42)
print(df_sample.shape)

In [None]:
# Preprocess the titles
# Options forwarded unchanged to preprocess_text for every title
# (Series.apply passes extra keyword arguments on to the applied function)
title_options = {
    'add_words': add_words,
    'numeric': False,
    'stopw': True,
    'stem': False,
    'lem': True,
}
df_sample['TITRE_processed'] = df_sample['TITRE'].apply(
    preprocess_text, **title_options)

In [None]:
# Example: compare one sampled title (by position) before and after processing
idx = 300
raw_title = df_sample['TITRE'].iloc[idx]
processed_title = df_sample['TITRE_processed'].iloc[idx]
print("Titre brut: \n", raw_title)
print("\nTitre après processing :\n", processed_title)

In [None]:
# Pass the preprocessed titles to the plot_wordcloud helper (not defined in
# this notebook; presumably provided by utils_visualization — confirm)
plot_wordcloud(df_sample['TITRE_processed'])

## Exploration des topics (pyLDAvis)

In [None]:
# Vectorization of the preprocessed titles
feature = 'TITRE_processed'
# Vocabulary capped at 300 terms, built from 1- to 5-grams present in at least
# 10 documents and in at most 95% of them
# NOTE(review): these TF-IDF weights are fed to the LDA below, whereas
# scikit-learn's LDA examples use raw term counts (CountVectorizer is already
# imported) — confirm this choice is intended.
tfidf_params = {
    'max_features': 300,
    'ngram_range': (1, 5),
    'min_df': 10,
    'max_df': 0.95,
}
model = TfidfVectorizer(**tfidf_params)
cv_transform = model.fit_transform(df_sample[feature])
print("Dimensions de la matrice", cv_transform.shape)

In [None]:
# Dimensionality reduction: fit an LDA topic model on the title matrix;
# x_red receives the transformed (document x topic) output
n_comp = 15  # number of topics
lda = LatentDirichletAllocation(
    n_components=n_comp, learning_method='online', random_state=42)
x_red = lda.fit_transform(cv_transform)

In [None]:
# Build the pyLDAvis visualization data from the fitted LDA model, the
# document-term matrix and the fitted vectorizer
# NOTE(review): recent pyLDAvis releases appear to expose this module as
# `pyLDAvis.lda_model` instead of `pyLDAvis.sklearn` — confirm against the
# installed version.
p = pyLDAvis.sklearn.prepare(lda, cv_transform, model)

In [None]:
# Export the title visualization as an HTML file under fig_path
# NOTE(review): assumes the fig_path directory already exists — confirm
pyLDAvis.save_html(p, os.path.join(fig_path, "pyldavis_titres_lemma.html"))

# Exploration des résumés

In [None]:
# Preprocess the summaries
# Options forwarded unchanged to preprocess_text for every summary
# (Series.apply passes extra keyword arguments on to the applied function)
# NOTE(review): assumes preprocess_text copes with records that have no
# summary (missing values) — TODO confirm
summary_options = {
    'add_words': add_words,
    'numeric': False,
    'stopw': True,
    'stem': False,
    'lem': True,
}
df_sample['RESUME_processed'] = df_sample['RESUME'].apply(
    preprocess_text, **summary_options)

In [None]:
# Example: compare one sampled summary (by position) before and after processing
idx = 12945
raw_summary = df_sample['RESUME'].iloc[idx]
processed_summary = df_sample['RESUME_processed'].iloc[idx]
print("Résumé brut: \n", raw_summary)
print("\nRésumé après processing :\n", processed_summary)

In [None]:
# Pass the preprocessed summaries to the plot_wordcloud helper (not defined in
# this notebook; presumably provided by utils_visualization — confirm)
plot_wordcloud(df_sample['RESUME_processed'])

In [None]:
# Vectorization of the preprocessed summaries
feature = 'RESUME_processed'
# Vocabulary capped at 300 terms, built from 1- to 5-grams present in at least
# 10 documents and in at most 95% of them
tfidf_params = {
    'max_features': 300,
    'ngram_range': (1, 5),
    'min_df': 10,
    'max_df': 0.95,
}
model = TfidfVectorizer(**tfidf_params)
cv_transform = model.fit_transform(df_sample[feature])
print("Dimensions de la matrice", cv_transform.shape)

In [None]:
# Dimensionality reduction: fit an LDA topic model on the summary matrix;
# x_red receives the transformed (document x topic) output
n_comp = 15  # number of topics
lda = LatentDirichletAllocation(
    n_components=n_comp, learning_method='online', random_state=42)
x_red = lda.fit_transform(cv_transform)

In [None]:
# Build the pyLDAvis visualization data from the fitted LDA model, the
# document-term matrix and the fitted vectorizer (summaries)
p = pyLDAvis.sklearn.prepare(lda, cv_transform, model)

In [None]:
# Export the summary visualization as an HTML file under fig_path
# NOTE(review): assumes the fig_path directory already exists — confirm
pyLDAvis.save_html(p, os.path.join(fig_path, "pyldavis_resumes_lemma.html"))

# Exploration des descriptions (titre + résumé)

In [None]:
# Preprocess the descriptions (DESCR column)
# Options forwarded unchanged to preprocess_text for every description
# (Series.apply passes extra keyword arguments on to the applied function)
description_options = {
    'add_words': add_words,
    'numeric': False,
    'stopw': True,
    'stem': False,
    'lem': True,
}
df_sample['DESCR_processed'] = df_sample['DESCR'].apply(
    preprocess_text, **description_options)

In [None]:
# Example: compare one sampled description (by position) before and after processing
idx = 6549
raw_description = df_sample['DESCR'].iloc[idx]
processed_description = df_sample['DESCR_processed'].iloc[idx]
print("Description brute: \n", raw_description)
print("\nDescription après processing :\n", processed_description)

In [None]:
# Pass the preprocessed descriptions to the plot_wordcloud helper (not defined
# in this notebook; presumably provided by utils_visualization — confirm)
plot_wordcloud(df_sample['DESCR_processed'])

In [None]:
# Vectorization of the preprocessed descriptions
feature = 'DESCR_processed'
# Vocabulary capped at 300 terms, built from 1- to 5-grams present in at least
# 10 documents and in at most 95% of them
tfidf_params = {
    'max_features': 300,
    'ngram_range': (1, 5),
    'min_df': 10,
    'max_df': 0.95,
}
model = TfidfVectorizer(**tfidf_params)
cv_transform = model.fit_transform(df_sample[feature])
print("Dimensions de la matrice", cv_transform.shape)

In [None]:
# Dimensionality reduction: fit an LDA topic model on the description matrix;
# x_red receives the transformed (document x topic) output
n_comp = 15  # number of topics
lda = LatentDirichletAllocation(
    n_components=n_comp, learning_method='online', random_state=42)
x_red = lda.fit_transform(cv_transform)

In [None]:
# Build the pyLDAvis visualization data from the fitted LDA model, the
# document-term matrix and the fitted vectorizer (descriptions)
p = pyLDAvis.sklearn.prepare(lda, cv_transform, model)

In [None]:
# Export the description visualization as an HTML file under fig_path
# NOTE(review): assumes the fig_path directory already exists — confirm
pyLDAvis.save_html(p, os.path.join(fig_path, "pyldavis_description_lemma.html"))