In [None]:
%reload_ext autoreload
%autoreload 2
from IPython.core.interactiveshell import InteractiveShell
import pathlib

from data_loader import get_pdf_filepaths, load_page_and_line_indexes, load_split_data, load_raw_indexes_list
from pdf_reader import parse_pdf
from data_cleaner import clean_initial_indexes, add_split_data
from data_saver import save_page_and_line_indexes, save_split_data, save_raw_indexes_list, save_aggregated_data
from data_transformer import (
  get_candidates_and_frequencies, 
  add_frequencies_column, 
  add_is_in_toc, 
  add_importance,
  add_position_in_context,
  add_is_named_entity,
  add_length_of_word,
  add_is_named_author,
  add_tfidf,
  get_raw_indexes_list,
  add_is_in_index,
  aggregate_by_candidate
)
#InteractiveShell.ast_node_interactivity = "all"

# Configuration variables

In [None]:
# Root of the data tree, relative to this notebook. The trailing slash matters:
# the sub-directory paths below are built by plain string interpolation.
DATA_DIR_PATH = "../data/"
# Directory that is scanned for the source PDF files.
PDF_SOURCE_DIR_PATH = f"{DATA_DIR_PATH}pdf/"
# Directory handed to the load_*/save_* helpers for processed artefacts.
PROCESSED_DATA_DIR_PATH = f"{DATA_DIR_PATH}processed/"

# Load data

In [None]:
# Ask the data loader for the PDF paths under the source directory, then
# display them so the positions used for indexing/slicing further down can be
# read off the output.
file_paths = get_pdf_filepaths(PDF_SOURCE_DIR_PATH)
file_paths

# Raw line and page data


In [None]:
# Parse each selected PDF, clean the extracted line/page indexes, and persist
# the cleaned result under the processed-data directory.
#
# `[:]` selects every file; narrow it (e.g. `[0:5]`) to process only the
# relevant interval. An empty subscript (`file_paths[]`) is a SyntaxError, so
# a slice must always be present for the notebook to survive "Run All".
for file_path in file_paths[:]:
    raw_line_and_page_indexes = parse_pdf(file_path)

    line_and_page_indexes = clean_initial_indexes(raw_line_and_page_indexes)
    save_page_and_line_indexes(
        processed_data_dir_path=PROCESSED_DATA_DIR_PATH,
        line_and_page_indexes=line_and_page_indexes,
    )

# Data set split


In [None]:
# Pick the single PDF that the rest of the notebook works on.
file_path = file_paths[14]  # insert relevant number

# Reload the line/page indexes previously saved for that PDF.
line_and_page_indexes = load_page_and_line_indexes(
    processed_data_dir_path=PROCESSED_DATA_DIR_PATH,
    pdf_filepath=file_path,
)

# Derive the split data from the indexes and persist it.
with_split_data = add_split_data(
    file_path=file_path,
    line_and_page_indexes=line_and_page_indexes,
)

save_split_data(
    processed_data_dir_path=PROCESSED_DATA_DIR_PATH,
    split_data=with_split_data,
)

# Prepare input data frames

In [None]:
# Inspect the 'by_page_toc' entry of the split data.
with_split_data["by_page_toc"]

In [None]:
# Inspect the 'by_page_biblio' entry of the split data.
with_split_data["by_page_biblio"]

In [None]:
# Inspect the 'by_page_index' entry of the split data.
with_split_data["by_page_index"]

In [None]:
# Reload the persisted split data for the selected PDF, so the cells below
# work from what is stored on disk.
split_data = load_split_data(
    processed_data_dir_path=PROCESSED_DATA_DIR_PATH,
    pdf_filepath=file_path,
)

# Build the candidate table together with its n-gram frequency data.
candidates_df, freq_ngrams = get_candidates_and_frequencies(split_data)

In [None]:
# First feature step: combine the candidates with the n-gram frequencies,
# using the body pages as the reference text.
with_frequencies = add_frequencies_column(
    by_pages_body_df=split_data["by_page_body"],
    candidates_df=candidates_df,
    freq_ngrams=freq_ngrams,
)

In [None]:
# Continue from the frequency-enriched candidates, using the table-of-contents
# lines from the split data.
with_is_in_toc = add_is_in_toc(
    candidates_df=with_frequencies,
    by_line_toc=split_data["by_line_toc"],
)


In [None]:
# Next feature step, fed with the output of the table-of-contents step.
with_position_in_context = add_position_in_context(with_is_in_toc)

In [None]:
# Next feature step, fed with the output of the position-in-context step.
with_importance = add_importance(with_position_in_context)

In [None]:
# Continue the feature pipeline from `with_importance` instead of the bare
# `candidates_df`. Feeding `candidates_df` here would discard the results of the
# frequency / TOC / position / importance steps unless those helpers happen to
# mutate their input in place.
# NOTE(review): assumes add_is_named_entity accepts the enriched frame exactly
# as it accepts `candidates_df` - confirm against data_transformer.
with_is_named_entity = add_is_named_entity(
    candidates_df=with_importance,
    df_pages_body=split_data["by_page_body"],
)

In [None]:
# Continue the feature pipeline from `with_is_named_entity` instead of the bare
# `candidates_df`, so the named-entity result is carried into the later steps
# rather than being computed and then dropped.
# NOTE(review): assumes add_length_of_word accepts the enriched frame exactly
# as it accepts `candidates_df` - confirm against data_transformer.
with_length_of_word = add_length_of_word(with_is_named_entity)

In [None]:
# Continue from the word-length step, using the bibliography pages from the
# split data.
with_is_named_author = add_is_named_author(
    candidates_df=with_length_of_word,
    df_pages_biblio=split_data["by_page_biblio"],
)

In [None]:
# Continue from the named-author step, using the body pages from the split
# data.
with_tfidf = add_tfidf(
    candidates_df=with_is_named_author,
    df_pages_body=split_data["by_page_body"],
)

In [None]:
# Peek at the first rows of the feature table produced so far.
with_tfidf.head()

In [None]:
# Extract the raw index entries from the index lines of the split data. The
# call also returns an updated copy of the by-line index data, which is kept
# for inspection.
raw_indexes_list, updated_by_line_index = get_raw_indexes_list(
    df_cann_lines_index=split_data["by_line_index"],
)

In [None]:
# Persist the raw index entries for the selected PDF.
save_raw_indexes_list(
    processed_data_dir_path=PROCESSED_DATA_DIR_PATH,
    pdf_filepath=file_path,
    raw_indexes_list=raw_indexes_list,
)

In [None]:
# Read the index entries for the selected PDF back from disk.
# NOTE(review): the name suggests the stored list is expected to be cleaned
# between the save and this load - confirm the intended workflow.
clean_indexes = load_raw_indexes_list(
    processed_data_dir_path=PROCESSED_DATA_DIR_PATH,
    pdf_filepath=file_path,
)

In [None]:
# Continue from the TF-IDF step, using the index entries loaded from disk.
with_is_in_index = add_is_in_index(
    candidates_df=with_tfidf,
    indexes_list=clean_indexes,
)

In [None]:
# Aggregate the fully-featured table by candidate.
aggregated_df = aggregate_by_candidate(candidates_df=with_is_in_index)

In [None]:
# Persist the aggregated table, named after the PDF's file name (the last
# path component of `file_path`).
save_aggregated_data(
    processed_data_dir_path=PROCESSED_DATA_DIR_PATH,
    agg_df=aggregated_df,
    file_name=pathlib.Path(file_path).name,
)