In [None]:
# import modules needed
import os
import sys
import pandas as pd
import bibtexparser

In [None]:
# widen the column display so long text cells stay readable (better visibility)
pd.options.display.max_colwidth = 200

In [None]:
# ensure src/ is in the Python path; the parent of the current working
# directory is treated as the project root
BASE_DIR = os.path.abspath(os.path.join(os.getcwd(), ".."))
SRC_DIR = os.path.join(BASE_DIR, "src")

# guard the append so re-running this cell does not pile up duplicate entries
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)
print(sys.path)

In [None]:
# import function from file
from retrieval import extract_text_from_pdf

# define path to documents
documents_dir = os.path.join(BASE_DIR, "data", "documents")

# list all PDF files in the folder; os.listdir returns names in arbitrary
# order, so sort them to keep the document order stable across runs
pdf_files = sorted(f for f in os.listdir(documents_dir) if f.endswith('.pdf'))
pdf_files

In [None]:
# one record (dict) per document; turned into a df in a single step below
text_records = []

# loop through all PDF files and extract text
for pdf_file in pdf_files:
    pdf_path = os.path.join(documents_dir, pdf_file)
    df_pages = extract_text_from_pdf(pdf_path)

    # join every row of the 'content' column into one string
    # (presumably one row per page - confirm against extract_text_from_pdf)
    combined_content = df_pages['content'].str.cat(sep=' ')

    # extract only doc name (file name without its extension)
    doc_name = os.path.splitext(pdf_file)[0]

    text_records.append({"content": combined_content, "file": doc_name})

# build the combined df once; passing the columns explicitly keeps the
# "content"/"file" schema even when no PDF was found, a case in which
# pd.concat on an empty list would raise a ValueError
df_text = pd.DataFrame(text_records, columns=["content", "file"])
df_text.head()

In [None]:
# define the path to metadata
metadata_dir = os.path.join(BASE_DIR, "data", "metadata")

# list all BibTeX files in the metadata folder; os.listdir returns names in
# arbitrary order, so sort them to keep the order stable across runs
bib_files = sorted(f for f in os.listdir(metadata_dir) if f.endswith('.bib'))
bib_files

In [None]:
def load_bibtex(bibtex_file_path):
    '''
    Parses a single BibTeX file and returns its entries as a df.

    Parameters
    ----------
    bibtex_file_path : str or path-like
        Location of the .bib file to read.

    Returns
    -------
    pandas.DataFrame
        Built from ``bib_database.entries``: one row per entry and one column
        per field name that occurs in the file.
    '''
    # read as UTF-8 explicitly: without an encoding, open() falls back to the
    # platform's locale encoding, which can garble or reject non-ASCII text
    with open(bibtex_file_path, 'r', encoding='utf-8') as bibtex_file:
        # parse the BibTeX file
        bib_database = bibtexparser.load(bibtex_file)

    # entries is a list of dictionaries, which pandas turns into rows
    return pd.DataFrame(bib_database.entries)

In [None]:
# collect one df per BibTeX file
bib_dataframes = []

for bib_file in bib_files:
    # keep only the file name without its extension
    doc_name = os.path.splitext(bib_file)[0]

    # parse the file, then tag every entry with the name of its source file
    df_bibtex = load_bibtex(os.path.join(metadata_dir, bib_file))
    df_bibtex['file'] = doc_name
    bib_dataframes.append(df_bibtex)

# stack the per-file dfs into a single df with a fresh index
df_bibtex = pd.concat(bib_dataframes, ignore_index=True)
df_bibtex.head()


In [None]:
# select only relevant columns; reindex (rather than df[[...]]) so that a field
# missing from every BibTeX entry, e.g. 'number' or 'doi', becomes an all-NaN
# column instead of raising a KeyError
df_bibtex = df_bibtex.reindex(columns=['file', 'title', 'author', 'year', 'number', 'volume', 'journal', 'ENTRYTYPE', 'doi'])
df_bibtex.head()

In [None]:
# left-join the BibTeX metadata onto the extracted text via the shared 'file'
# column, so every row of df_text is kept even without a matching entry
df_combined = df_text.merge(df_bibtex, how='left', on='file')
df_combined.head()

In [None]:
# put the bibliographic columns first, then the text, doi and file name
column_order = ['title', 'author', 'year', 'number', 'volume', 'journal', 'ENTRYTYPE', 'content', 'doi', 'file']
df_combined = df_combined.loc[:, column_order]
df_combined.head()

In [None]:
# map the old column names to the new ones, then apply the renaming
renamed_columns = {'ENTRYTYPE': 'type', 'author': 'authors', 'year': 'year_published'}
df_combined = df_combined.rename(columns=renamed_columns)
df_combined.head()

In [None]:
# write the combined df to a csv file in the working directory, without the index
output_csv = 'df_combined.csv'
df_combined.to_csv(output_csv, index=False)