In [1]:
import os
import shutil
from pathlib import Path

import pandas as pd

In [2]:
# all data paths are resolved against the parent of the working directory
relative_root = Path('..')
meta_parsed = pd.read_csv(relative_root / 'data/interim/meta_publications_parsed.csv')
meta_ocr = pd.read_csv(relative_root / 'data/interim/meta_publications_ocr.csv')

# entries of the OCR table that are excluded from the export
garbage_files = [
    "data/raw/UTA publications/UF papers 2014 - 2021 copy/Why study autism BPS.pdf",
    "data/raw/UTA publications/UF papers 2011-2013 copy/Kandel chapter 2012.pdf",
]

# inside `query`, `!=` against a list keeps the rows whose `path` is not in the list
meta_ocr_no_garbage = meta_ocr.query('path != @garbage_files')

# row counts: parsed table, OCR table, OCR table without the garbage files
[len(frame) for frame in (meta_parsed, meta_ocr, meta_ocr_no_garbage)]

[154, 120, 118]

In [3]:
# Copy every file listed in `meta_parsed` into the export tree, keeping the
# sub-directory layout it has below `UTA publications`.
source_root = relative_root.joinpath('data/raw/UTA publications')
destination_root = relative_root.joinpath('data/raw/publications_manual_export/UTA publications')

# only the `path` column is needed, so iterate over it directly
for relative_path in meta_parsed['path']:
    # the recorded path is relative to `relative_root`; the file itself is copied unchanged
    input_path = relative_root.joinpath(relative_path)

    # Directory of the file relative to `UTA publications`. `relative_to` raises
    # ValueError for a path outside that tree, whereas a fixed `parts[4:]` slice
    # would silently mis-file it (and breaks as soon as `relative_root` has a
    # different number of components).
    path_structure = input_path.parent.relative_to(source_root)

    # create destination directory
    destination_dir = destination_root.joinpath(path_structure)
    destination_dir.mkdir(parents=True, exist_ok=True)

    # copy file under its original name
    destination_file_path = destination_dir.joinpath(input_path.name)
    shutil.copy(input_path, destination_file_path)

In [4]:
# Copy the files found in `publications_manual_export/ocr_files_after` into the
# export tree. They are read from directly under `input_root` by filename only;
# the path recorded in `meta_ocr_no_garbage` decides where each one is placed.
input_root = relative_root.joinpath('data/raw/publications_manual_export/ocr_files_after')
source_root = relative_root.joinpath('data/raw/UTA publications')
destination_root = relative_root.joinpath('data/raw/publications_manual_export/UTA publications')

# only the `path` column is needed, so iterate over it directly
for relative_path in meta_ocr_no_garbage['path']:
    # the recorded path is used only to derive the destination directory ...
    imaginary_input_path = relative_root.joinpath(relative_path)
    # ... the file actually copied is the one with the same name in `input_root`
    filename = imaginary_input_path.name
    input_path = input_root.joinpath(filename)

    # Directory of the original relative to `UTA publications`. `relative_to`
    # raises ValueError for a path outside that tree, whereas a fixed
    # `parts[4:]` slice would silently mis-file it (and breaks as soon as
    # `relative_root` has a different number of components).
    path_structure = imaginary_input_path.parent.relative_to(source_root)

    # create destination directory
    destination_dir = destination_root.joinpath(path_structure)
    destination_dir.mkdir(parents=True, exist_ok=True)

    # copy file
    destination_file_path = destination_dir.joinpath(filename)
    shutil.copy(input_path, destination_file_path)

In [11]:
# make sure no files got lost

def get_file_paths(folder_path: str) -> tuple[list, list]:
    """
    Recursively collect the paths of all files below ``folder_path``.

    Parameters
    ----------
    folder_path : str
        Directory that is traversed with ``os.walk``.

    Returns
    -------
    tuple[list, list]
        ``(pdf_paths, other_paths)``. Each entry is the walked directory joined
        with the filename. ``pdf_paths`` holds the files whose name ends in
        ``.pdf`` (compared case-insensitively, so ``.PDF`` counts as well);
        ``other_paths`` holds every remaining file.

    Notes
    -----
    ``os.walk`` ignores directory-listing errors by default, so a missing
    ``folder_path`` yields two empty lists rather than an exception.
    """

    pdf_paths = []
    other_paths = []
    for root, _directories, files in os.walk(folder_path):
        for filename in files:
            filepath = os.path.join(root, filename)
            # lower-case first so `.PDF` / `.Pdf` are not miscounted as non-PDFs
            if filename.lower().endswith('.pdf'):
                pdf_paths.append(filepath)
            else:
                other_paths.append(filepath)

    return pdf_paths, other_paths


datadir_orig = str(relative_root.joinpath('data/raw/UTA publications'))
datadir_reco = str(relative_root.joinpath('data/raw/publications_manual_export/UTA publications'))

pdf_paths_orig, other_paths_orig = get_file_paths(datadir_orig)
pdf_paths_reco, other_paths_reco = get_file_paths(datadir_reco)

# The reconstructed tree must hold every original PDF except the excluded
# garbage files. The allowance is derived from `garbage_files` so the check
# stays correct if that list changes.
expected_pdf_count = len(pdf_paths_orig) - len(garbage_files)
assert len(pdf_paths_reco) == expected_pdf_count, (
    f"expected {expected_pdf_count} PDFs in {datadir_reco}, "
    f"found {len(pdf_paths_reco)}"
)
