# Etape 1 - Preprocessing du dataset

In [None]:
# Import librairies
import os
import re
import sys


import csv
import pandas as pd
import numpy as np
import warnings

In [None]:
# Set paths
# The sub-directories are built with os.path.join so that the separator
# matches the host OS; a hard-coded "\\" only yields a valid path on Windows.
path = "."
os.chdir(path)
data_path = os.path.join(path, "data")
output_path = os.path.join(path, "outputs")
fig_path = os.path.join(path, "figures")

In [None]:
# Silence FutureWarning and DeprecationWarning messages
for silenced_category in (FutureWarning, DeprecationWarning):
    warnings.simplefilter(action='ignore', category=silenced_category)

In [None]:
# Load the raw export: a tab-separated file decoded as Latin-1
export_file = os.path.join(data_path, 'export.dsv')
with open(export_file, mode='r', encoding="latin", newline='') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter='\t')
    # Materialise every record while the file is still open
    data = [row for row in csv_reader]

In [None]:
# Check the data layout: the first record is the header, extended with two
# spare columns "A" and "B" to hold the surplus fields of over-long rows
column_names = data[0] + ["A", "B"]
df = pd.DataFrame(data[1:], columns=column_names)
n_rows, n_cols = df.shape
print(f"le Fichier de données contient {n_rows} lignes et  {n_cols} colonnes")

In [None]:
# Preview the first rows of the freshly built table
df.head()

In [None]:
# Look for badly formatted rows: those holding a value in spare column "A" or "B"
has_extra_field = df["A"].notna() | df["B"].notna()
data_to_check = df[has_extra_field]
print("Nombre d'ouvrages avec soucis d'importation :", data_to_check.shape)
data_to_check

In [None]:
# Export the rows that need a manual check
# Create the output folder first: to_csv raises OSError when the target
# directory does not exist.
os.makedirs(output_path, exist_ok=True)
data_to_check.to_csv(
    os.path.join(output_path, "data_to_check.csv"),
    index=False,  # leave the DataFrame index out of the file
    encoding="latin-1",
)

In [None]:
# Keep only the rows that were imported correctly
df = df.drop(labels=data_to_check.index, axis="index")
df

In [None]:
# Check that spare columns A and B no longer hold any value
print("Colonne A: ", bool(df["A"].isna().all()))
print("Colonne B: ", bool(df["B"].isna().all()))

In [None]:
# Drop the empty columns
# how="all" removes only the columns that hold no value at all (the spare
# "A"/"B" columns). The default how="any" would also discard a real column
# as soon as a single row has a missing field in it.
df.dropna(axis=1, how="all", inplace=True)
df.shape

In [None]:
# Preview a sample of the cleaned table
df.head()

In [None]:
# Add a DESCR column: the title, a space, then the summary
df["DESCR"] = df["TITRE"] + ' ' + df["RESUME"]

In [None]:
# Get Domain according to Dewey code
ddc = "154.85"
# Classes kept as they are: every "xx0" tenth, the whole 000-009 range, and
# the two special cases 944 and 796.
pattern_tenth = re.compile(r'\d{2}0|00[0-9]|944|796')
# Any other three-digit class; it is rounded down to its tenth.
pattern_unit= re.compile(r'\d{2}[1-9]')


def get_domain_from_ddc(ddc):
    """Return the three-digit domain code for a Dewey classification string.

    Only the first run of three digits (the main class) is examined, so the
    decimal part can never be mistaken for a class: "305.800" yields "300",
    not "800".

    Args:
        ddc: Dewey code as text, e.g. "154.85".

    Returns:
        The main class itself when it matches ``pattern_tenth``; otherwise the
        main class with its last digit replaced by "0". ``None`` when ``ddc``
        is not a string or contains no run of three digits.
    """
    # Missing values (None / NaN) would make the regex call raise TypeError.
    if not isinstance(ddc, str):
        return None
    main_class = re.search(r'\d{3}', ddc)
    if main_class is None:
        return None
    tef = main_class.group()
    if pattern_tenth.fullmatch(tef):
        return tef
    # Not a kept class, so the last digit is 1-9: round down to the tenth.
    return tef[:-1] + "0"

In [None]:
# Scratch check on a sample Dewey code: take the first "unit" class found
# and replace its last digit with 0
ddc = "154.85"
pattern_tenth = re.compile(r'\d{2}0|00[0-9]|944|796')
pattern_unit = re.compile(r'\d{2}[1-9]')
first_unit_class = pattern_unit.findall(ddc)[0]
first_unit_class[:-1] + "0"

In [None]:
# Derive the domain code of every record from its DEWEY value
df["DDC"] = df["DEWEY"].apply(get_domain_from_ddc)

In [None]:
# Check the column types
df.info()

In [None]:
# Display the first 50 rows
df.head(50)

In [None]:
# Load the label table to merge with; every value is read as text and the
# first CSV column is used as the index
ddc = pd.read_csv(
    os.path.join(data_path, "dewey_label.csv"),
    dtype=str,
    index_col=0,
)
ddc.head()

In [None]:
# Left-join the label table onto the records through the "DDC" column
working_df = pd.merge(df, ddc, how='left', on="DDC")
print(working_df.shape)
working_df.head()

In [None]:
# Save the merged table in the data folder
working_data_file = os.path.join(data_path, 'working_data.csv')
working_df.to_csv(working_data_file)