# Cleaning ABES extraction 

Ce notebook permet de vérifier l'extraction de données réalisée par l'ABES et d'enlever les notices potentiellement mal formatées (par exemple lorsque les métadonnées d'une notice contiennent le caractère choisi comme séparateur).
Une étape de mise à plat des chaînes d'indexation est également réalisée, ainsi qu'une première exploration des concepts RAMEAU et des labels TEF (issus de la classification décimale de Dewey) sous forme de visualisations graphiques (barplots et wordcloud).

MAJ - 03/05/2023 (Aurélie Thébault - EcoStats)

# Set project

### Packages

In [None]:
# Import librairies
import os
import re
import sys

import csv
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import warnings

from collections import Counter
from wordcloud import WordCloud

In [None]:
# Silence FutureWarning and DeprecationWarning messages for the session
for ignored_category in (FutureWarning, DeprecationWarning):
    warnings.simplefilter(action='ignore', category=ignored_category)

### Graphical parameters

In [None]:
# Graphical parameters
# IPython magic: draw matplotlib figures inline in the notebook output
%matplotlib inline
# matplotlib rc overrides: font family and sizes, plus the number of open
# figures allowed before matplotlib warns
rc = {
    'font.size': 14,
    'font.family': 'Arial',
    'axes.labelsize': 14,
    'legend.fontsize': 12,
    'axes.titlesize': 14,
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
    'figure.max_open_warning': 30}

# Apply the seaborn theme with the Arial font and the rc overrides above
sns.set(font='Arial', rc=rc)
# "whitegrid" style with black axes edges, grid enabled, axes and major ticks of width 1
sns.set_style(
    "whitegrid", {
        'axes.edgecolor': 'k',
        'axes.linewidth': 1,
        'axes.grid': True,
        'xtick.major.width': 1,
        'ytick.major.width': 1
        })
# NOTE(review): set_context runs after sns.set with font_scale=1.1, so it may
# rescale the font sizes given in `rc` — confirm the intended final sizes
sns.set_context(
    "notebook",
    font_scale=1.1,
    rc={"lines.linewidth": 1.5})
# Show every column when a DataFrame is displayed
pd.set_option('display.max_columns', None)

### Paths

In [None]:
# Set paths
# Every project folder is resolved relative to `path`. os.path.join is used
# instead of a hard-coded "\\" so the folders also resolve on non-Windows
# systems (on Windows the resulting values are unchanged).
path = "."
os.chdir(path)
data_path = os.path.join(path, "data")
output_path = os.path.join(path, "outputs")
fig_path = os.path.join(path, "figs")

### Useful functions

In [None]:
def import_data(filename, encoding="utf-8", plot=False):
    """
    Description: read a tab-separated file located under ``data_path``

    Arguments:
        - filename (str): path of the file, relative to ``data_path``
        - encoding (str): encoding used to open the file (default: "utf-8")
        - plot (bool): not used by this function

    Returns :
        - list of rows, each row being the list of fields yielded by csv.reader
    """
    source = os.path.join(data_path, filename)
    with open(source, 'r', newline='', encoding=encoding) as handle:
        rows = [record for record in csv.reader(handle, delimiter='\t')]
    return rows

In [None]:
class checkDataFormat:
    """Check the number of fields of raw extraction rows and split good from bad ones.

    The raw extraction is a list of rows (each row a list of fields) as returned
    by ``csv.reader``; the first well formatted row is used as the header.
    """

    def __init__(self, df_list):
        """Store the raw rows.

        Arguments:
            - df_list (list): list of rows, each row being a list of fields
        """
        self.df = df_list

    def check_format(self, plot=False):
        """Count the rows and the number of fields of each row.

        Arguments:
            - plot (bool): if True, draw the histogram of the number of fields
              per row with seaborn

        Sets the attributes ``nb_notice`` (number of rows), ``len_col`` (number
        of fields of each row) and ``max_number_col`` (largest number of fields,
        0 when there is no row).
        """
        rows = self.df
        nb_notice = len(rows)

        # Number of fields of every row
        len_col = [len(row) for row in rows]
        # default=0 keeps an empty extraction from raising ValueError
        max_number_col = max(len_col, default=0)
        print(f"There are {nb_notice} in this file with till {max_number_col} columns")

        if plot:
            # Show distribution
            sns.histplot(len_col)

        self.nb_notice = nb_notice
        self.max_number_col = max_number_col
        self.len_col = len_col

    def sort_notices(self, official_nb_col=5, save_file="working_data.csv", export_name=None):
        """Separate well formatted rows from the others and save both sets.

        Arguments:
            - official_nb_col (int): expected number of fields per row (default: 5)
            - save_file (str): name of the csv file (under ``data_path``) receiving
              the well formatted notices
            - export_name (str): name of the csv file (under ``data_path``) receiving
              the rows to check; nothing is exported when None

        The resulting DataFrames are also kept in the attributes ``cleaned_data``
        and ``data_to_check``.
        """
        rows = self.df
        # Fall back on a fresh count when check_format() was not called first
        len_col = getattr(self, "len_col", None)
        if len_col is None:
            len_col = [len(row) for row in rows]

        # Checking and removing badly formatted notices
        ids_to_keep = [nb_col == official_nb_col for nb_col in len_col]
        nb_kept = sum(ids_to_keep)
        print(f"There are {len(ids_to_keep) - nb_kept} badly formatted notices")
        # The header row is well formatted too, hence the -1
        print(f"There are {nb_kept-1} well formatted rows")

        kept_rows = [row for row, keep in zip(rows, ids_to_keep) if keep]
        if kept_rows:
            # First kept row is the header, the others are the notices
            cleaned_data = pd.DataFrame(kept_rows[1:], columns=kept_rows[0])
        else:
            # No usable row at all: keep going with an empty dataset
            cleaned_data = pd.DataFrame()
        print(f" ==> Working dataset has {len(cleaned_data)} notices")

        # Check the shape of the cleaned data
        print(f"Cleaned file contains {cleaned_data.shape[0]} notices and  {cleaned_data.shape[1]} columns")

        # Rejected rows come from the rows held by this instance, not from a
        # notebook-level variable
        rejected_rows = [row for row, keep in zip(rows, ids_to_keep) if not keep]
        data_to_check = pd.DataFrame(rejected_rows)
        print(f"Need to check {len(data_to_check)} notices extractions")

        self.cleaned_data = cleaned_data
        self.data_to_check = data_to_check

        # Save the working data
        cleaned_data.to_csv(
            os.path.join(data_path, save_file),
            index=False)
        print(f"Save working data as {save_file}")

        if export_name:
            # Export the rows that need a manual check
            data_to_check.to_csv(
                os.path.join(data_path, export_name),
                index=False)
            print(f"Save data to check data as {export_name}")


In [None]:
def plot_barplot_of_tags(
    tags_list,
    nb_of_tags,
    xlabel="Nombre d'occurences",
    ylabel="",
    figsave=None,
    figsize=(10, 30),
):
    """
    Description: draw a horizontal barplot of the most frequent tags

    Arguments:
        - tags_list (iterable): tags to count, one entry per occurrence
        - nb_of_tags (int): number of most frequent tags shown in the barplot
        - xlabel, ylabel (str): axis labels of the barplot
        - figsave (str): path where the figure is saved (not saved when None)
        - figsize (tuple): figure size (default: (10, 30))

    Returns :
        - None, the barplot is displayed with plt.show()

    """
    # Most frequent tags first, as (tag, count) pairs
    most_frequent = Counter(tags_list).most_common(nb_of_tags)
    labels = [tag for tag, _ in most_frequent]
    occurrences = [count for _, count in most_frequent]

    plt.figure(figsize=figsize)
    sns.barplot(x=occurrences, y=labels, orient="h", palette="viridis")
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    if figsave:
        plt.savefig(figsave, bbox_inches="tight")
    plt.show()


In [None]:
def flatten(list):
    """Concatenate the items of every sub-iterable into one flat list (one level deep)."""
    flat_list = []
    for sublist in list:
        flat_list.extend(sublist)
    return flat_list

In [None]:
def plot_wordcloud(keywords, save_file=None):
    """
    Description: draw a word cloud built from the frequency of each keyword

    Arguments:
        - keywords (iterable): keywords, one entry per occurrence
        - save_file (str): file name (under ``fig_path``) where the figure is
          saved; when None the figure is shown then closed instead

    Returns :
        - None
    """
    plt.figure(figsize=(15, 8))
    frequencies = Counter(keywords)
    cloud = WordCloud(width=1000, height=500, background_color='white')
    cloud.generate_from_frequencies(frequencies)
    plt.imshow(cloud)
    if not save_file:
        plt.show()
        plt.close()
        return
    target = os.path.join(fig_path, save_file)
    plt.savefig(target, dpi=300, bbox_inches="tight")

In [None]:
class removeVedettes:
    """Remove from a DataFrame the rows whose concepts include given vedettes."""

    def __init__(self, df, col_name, vedette_list):
        """Store the dataset and the vedettes to remove.

        Arguments:
            - df (DataFrame): dataset to filter
            - col_name (str): column holding, for each row, the container
              (e.g. list of concepts) tested with ``vedette in cell``
            - vedette_list (list): vedettes whose rows must be removed
        """
        self.df = df
        self.col_name = col_name
        self.vedettes = vedette_list

    def remove_vedette(self):
        """Drop every row containing at least one of the vedettes.

        The result is stored in the attribute ``df_reduced`` as an independent
        copy, so adding columns to it later neither touches ``self.df`` nor
        raises pandas' SettingWithCopyWarning.
        """
        df = self.df
        col = self.col_name
        vedettes = self.vedettes

        # True for rows containing at least one vedette (positional mask)
        contains_vedette = np.zeros(len(df), dtype=bool)
        for ved in vedettes:
            hits = np.fromiter(
                (ved in cell for cell in df[col]), dtype=bool, count=len(df))
            print(f"Nbre de notices contenant le concept '{ved}' : {int(hits.sum())}")
            contains_vedette |= hits

        # Reduce the dataset to the rows without any vedette
        df_reduced = df.loc[~contains_vedette].copy()
        print(f"Les vedettes de {vedettes} ont été retirées du dataset")
        print(f"Le dataset contient maintenant {df_reduced.shape[0]} notices")

        self.df_reduced = df_reduced

In [None]:
# Three-digit class of a Dewey number: not preceded by a digit or a decimal
# point (so the decimal part is never matched) and not followed by a digit.
_DDC_CLASS_PATTERN = re.compile(r'(?<![\d.])\d{3}(?!\d)')
# Classes returned as they are instead of being rounded down to the tenth
_DDC_KEPT_CLASSES = ("944", "796")


def get_domain_from_ddc(ddc):
    """Return the domain code associated with a Dewey code.

    Only the three-digit class is considered; digits of the decimal part are
    ignored (e.g. "843.009" belongs to "840", not "009").

    Arguments:
        - ddc (str): Dewey code (e.g. "843.91"); None and NaN are accepted

    Returns :
        - the class itself when it ends with 0, starts with "00" or is one of
          "944" / "796"; otherwise the class rounded down to the tenth
          (e.g. "541" -> "540"); None when no three-digit class is found
    """
    # Missing values: None, or NaN (the only value different from itself)
    if ddc is None or ddc != ddc:
        return None

    match = _DDC_CLASS_PATTERN.search(str(ddc))
    if match is None:
        return None

    code = match.group()
    if code.endswith("0") or code.startswith("00") or code in _DDC_KEPT_CLASSES:
        return code
    return code[:2] + "0"

# Import data

In [None]:
# Import the extraction (export.dsv)
# os.path.join avoids the invalid "\e" escape sequence and the Windows-only
# separator of the former "extraction\export.dsv" literal
filepath = os.path.join("extraction", "export.dsv")
working_data_filename = "working_data.csv"
export_data_filename = "data_to_check.csv"
encoding = "latin-1"  # (useful only on first extraction)
data = import_data(filepath, encoding)
# File name without folder nor extension, used as prefix of the saved figures
filename = os.path.splitext(os.path.basename(filepath))[0]
merge_with_dewey = True

In [None]:
# Import the extraction (export_sans_dewey.dsv)
# os.path.join avoids the invalid "\e" escape sequence and the Windows-only
# separator of the former "extraction\export_sans_dewey.dsv" literal
filepath = os.path.join("extraction", "export_sans_dewey.dsv")
working_data_filename = "working_data_sans_dewey.csv"
export_data_filename = "data_to_check_sans_dewey.csv"
# encoding = "latin-1"  # (useful only on first extraction)
data = import_data(filepath)  # read with import_data's default encoding
# File name without folder nor extension, used as prefix of the saved figures
filename = os.path.splitext(os.path.basename(filepath))[0]
merge_with_dewey = False

In [None]:
# Clean file: count the fields of every row, then save the well formatted
# notices and export the rows that need a manual check
notice_checker = checkDataFormat(data)
notice_checker.check_format()
notice_checker.sort_notices(save_file=working_data_filename, export_name=export_data_filename)

In [None]:
# Load working data
# Every column is read as text so identifiers and Dewey codes are not turned
# into numbers by type inference (the regex-based steps below expect strings)
df = pd.read_csv(os.path.join(data_path, working_data_filename), dtype=str)
print("Data loaded :", df.shape)

In [None]:
# Preview the first rows of the working data
df.head()

In [None]:
# Add a description column (title + abstract, separated by a space)
titles = df['TITRE']
abstracts = df['RESUME']
df["DESCR"] = titles + ' ' + abstracts

# Explore RAMEAU

In [None]:
# Flag the notices whose RAMEAU field contains " -- ", i.e. an indexing chain
def _has_indexing_chain(rameau):
    return re.search(' -- ', rameau) is not None


df["presence_chaine_indexation"] = df["RAMEAU"].apply(_has_indexing_chain)
n_chaine_index = df["presence_chaine_indexation"].sum()
print(f"Le jeu de données contient {n_chaine_index} notices avec des chaines d'indexation")

In [None]:
# Identifier and RAMEAU field of the notices holding an indexing chain
with_chain = df["presence_chaine_indexation"].eq(True)
df.loc[with_chain, ["PPN", "RAMEAU"]]

In [None]:
# Split every RAMEAU field on ";" to get its list of indexing chains
df["rameau_chaines_index"] = df["RAMEAU"].map(lambda rameau: rameau.split(';'))
preview = df.loc[1:10, "rameau_chaines_index"]
print(preview)

In [None]:
# Number of distinct authority records (pre-built ones included)
# e.g. of a pre-built authority: Science-fiction américaine -- Traductions française
from itertools import chain
rameau_chaine_index = df["rameau_chaines_index"].tolist()
rameau_list_chaines_index = list(chain.from_iterable(rameau_chaine_index))
n_chaines = len(rameau_list_chaines_index)
n_chaines_uniques = len(set(rameau_list_chaines_index))
print(f"{n_chaines} chaines d'indexation rameau, dont {n_chaines_uniques} différentes")

In [None]:
# Barplot of the 20 most frequent indexing chains, saved in the figure folder
chains_fig = os.path.join(fig_path, f"{filename}_barplot_Rameau_ChainesIndex.png")
plot_barplot_of_tags(
    tags_list=rameau_list_chaines_index,
    nb_of_tags=20,
    xlabel="Nombre de references",
    ylabel="RAMEAU - Chaines d'indexation",
    figsave=chains_fig,
    figsize=(8, 8),
)

In [None]:
# Flatten ALL the keywords: split on ";" (and the whitespace after it) unless
# the ";" sits inside parentheses, and on the " -- " chain separator
pattern = r';\s*(?![^()]*\))| -- '
concept_splitter = re.compile(pattern)
df["rameau_concepts"] = df["RAMEAU"].map(concept_splitter.split)
df.loc[1:20, ["RAMEAU", "rameau_concepts"]]

In [None]:
# One entry per concept occurrence, all notices together
keywords = flatten(df['rameau_concepts'])
n_concepts = len(set(keywords))
print(f"Le dataset contient {n_concepts} concepts RAMEAU differents")

In [None]:
# Word cloud of the RAMEAU concepts, saved in the figure folder
plot_wordcloud(keywords, save_file=f"{filename}_rameau_concepts_wordcloud.png")

In [None]:
# Barplot of the 20 most frequent RAMEAU concepts, saved in the figure folder
concepts_fig = os.path.join(fig_path, f"{filename}_barplot_Rameau_concepts.png")
plot_barplot_of_tags(
    tags_list=keywords,
    nb_of_tags=20,
    xlabel="Nombre de references",
    ylabel="RAMEAU - Concepts",
    figsave=concepts_fig,
    figsize=(8, 8),
)

### Remove vedettes

In [None]:
# Vedettes whose notices are removed from the dataset
list_vedettes = ["Ouvrages pour la jeunesse", "Roman pour la jeunesse"]
colonne = "rameau_concepts"
temp = removeVedettes(df=df, col_name=colonne, vedette_list=list_vedettes)
temp.remove_vedette()

### Check cleaned file

In [None]:
# Final dataset: the notices left once the vedettes have been removed
cleaned_df = temp.df_reduced
# Display its (rows, columns) shape
cleaned_df.shape

In [None]:
# Word cloud of the RAMEAU concepts left in the cleaned dataset
keywords2 = flatten(cleaned_df['rameau_concepts'])
cleaned_cloud_file = f"{filename}_rameau_concepts_wordcloud_cleaned.png"
plot_wordcloud(keywords2, save_file=cleaned_cloud_file)

### Explore Dewey

In [None]:
if merge_with_dewey:
    # Find the DDC domain of every notice. assign() builds a new DataFrame, so
    # no column is written in place on a frame that may be a filtered slice
    # (which raises pandas' SettingWithCopyWarning).
    cleaned_df = cleaned_df.assign(
        DDC=cleaned_df["DEWEY"].apply(get_domain_from_ddc))
    # TEF labels, every column read as text (dtype=str)
    ddc = pd.read_csv(os.path.join(data_path, "dewey_label.csv"), index_col=0, dtype=str)

    # Left merge: every notice is kept, with or without a matching label
    cleaned_df = cleaned_df.merge(ddc, on="DDC", how='left')
    print(f"Dimension of the dataframe with TEF labels: {cleaned_df.shape}")
    print("Column headers: ", list(cleaned_df.columns))

    # Visualization of the 20 most frequent TEF labels
    plot_barplot_of_tags(
        tags_list=cleaned_df["TEF_LABEL"],
        nb_of_tags=20,
        xlabel="Nombre de references",
        ylabel="Libellés TEF",
        figsave=os.path.join(fig_path, 'barplot_libelles_TEF.png'),
        figsize=(8, 8))

# Save working file as csv

In [None]:
# Save the cleaned dataset under the working data file name
working_file = os.path.join(data_path, working_data_filename)
cleaned_df.to_csv(working_file)