# Preparation of sample dataset with 90 PDF documents in a table

In [1]:
import sys
import os
from pathlib import Path
import random

import pandas as pd

### Install `kagglehub` and `pyarrow`

In [2]:
try:
    import pyarrow, kagglehub
except: 
    python = sys.executable
    ! {python} -m pip install kagglehub pyarrow --quiet

### Download the whole dataset for exploration

To learn more about the dataset, visit [its Kaggle page](https://www.kaggle.com/datasets/manisha717/dataset-of-pdf-files).

In [3]:
import kagglehub

# Folder in the local kagglehub cache that holds the datasets of this Kaggle user
path_dataset_kaggle = Path.home() / '.cache/kagglehub/datasets/manisha717'

# Folder with the PDFs of version 1 of the dataset.
# The right-hand operand must be a *relative* path: joining with a string that
# starts with '/' makes pathlib discard the left-hand side, which would send
# `path_folder` to the filesystem (or drive) root instead of the cache.
path_folder = path_dataset_kaggle / 'dataset-of-pdf-files/versions/1/Pdf'

In [4]:
# Location where the PDFs are expected inside the local kagglehub cache. It is
# assembled from relative parts here so that the existence check below looks at
# the folder that is actually read later on, not only at the owner-level cache
# folder (which can exist while the PDFs themselves are missing).
path_folder = path_dataset_kaggle / 'dataset-of-pdf-files' / 'versions' / '1' / 'Pdf'

if not path_folder.exists():
    # download latest version
    print("Downloading dataset from Kaggle...")
    path_dataset_dir = kagglehub.dataset_download("manisha717/dataset-of-pdf-files")
    print("Done!")
    # Use the path reported by kagglehub rather than the expected one above
    path_folder = Path(path_dataset_dir) / 'Pdf'
    print(f"A total of {len(os.listdir(path_folder))} PDFs have been downloaded to '{path_folder.as_posix()}'")
else:
    print(f"'{path_folder.as_posix()}' already exists, no need to download it.")

Downloading dataset from Kaggle...
Downloading from https://www.kaggle.com/api/v1/datasets/download/manisha717/dataset-of-pdf-files?dataset_version_number=1...


100%|███████████████████████████████████████████████████████████████████████████████| 769M/769M [00:42<00:00, 18.9MB/s]

Extracting model files...





Done!
A total of 1078 PDFs have been downloaded to 'C:/Users/carlo/.cache/kagglehub/datasets/manisha717/dataset-of-pdf-files/versions/1/Pdf'


### Pick only 90 PDFs

In [5]:
# Hand-picked identifiers of the documents that make up the sample dataset.
# Each entry is a file name without its '.pdf' extension; the next cell keeps
# only the files of the download folder whose name matches one of these.
ids_to_keep = [
    'U55XYWRRRDDHAJPPHG6QWKZSAVKMQ5W5',
    'R6LHWCGK5EOQHHR7EXRI7ZJLBG2R5QOF',
    'GOCYUTJWJZBXHIUSFYG3LVX5SG6HPEVV',
    'HXOV2RQJRHU5PDC4PRLJJLU2RT4PSSLV',
    'UKNH3OQ2LY6SZKEQ7ITIOEHER4BHZEEN',
    'c31721d6245f5689e5d715b1497b2374df7ae4c6',
    'ZDFWMKCHQYHS6FUNM6VG6IBZUGFGRP2Z',
    '67JSPHIB2F6WUL35OLRAOZ2UEDTI2XZ5',
    'ZEVGJOLU5PKURGKDQ3GMLEWFTBUGJXT6',
    '37WOUPCWLZGR4AFDBNNHKUHNTVLYXVLD',
    'CI7INUGNXV4CY2ETA4PM5IOKZNUM5A2T',
    'UFTWPHWPAMA7G4B6F5SMUFNA3HU6H5UW',
    'I74UK7CA3VD36AJLDMBKXZHMRBRV6KNF',
    'X5LYOK62EFBFVPNGADYEJUVEZJTZDHMC',
    'SZYITDQYKPK6FN4ZP4NZB5QTKLZFT326',
    '887c6fd22c2be24a023105b3fb23d5e29dfd8055',
    'SXHJPFKQMVB3WUA6WOCOATLLVMBWKCCS',
    'QU4F5C3GPRUAV463D7PN5NXHKFCWJETA',
    'DR44GMFR57CROQCOUET6BVBGBYZW5HJ5',
    'EB46IVVKTIHXKYY42XWKUM64HY2USGTZ',
    'FC2VGRC55DZ77QA4O5L7DPBQEUDIQG52',
    'WCILDJ3BDTTDKHJ3HGRQD2SF45E55AXN',
    'O4Z3IEED7ROYYCKDRARK2BBHSZGR4M5Y',
    'P7ZED35W7LHI6YLKM5TVSLOQVACYSHHM',
    'YV3RJXNXP6ZR6V2SSVVQJWYAHBZSZ3GQ',
    '2XJLJAFHRTLAIWFDDAVMV2U4QJMISM4P',
    '65UHUSAELNKVXUYGUXKY2JQLAKWINVC2',
    'J4XIQKGR5I6GKS2RD2IWEWCUNLVEFSZO',
    'CMLZOTTP4BCWEEGZ2OATUKNBQGGIZ4OA',
    'HCPCI34AEBKDUFDP442OTOL2AUGIG3BJ',
    'TYLWGSX5OYKE27DHTQXUJTBMKMHMKY3B',
    'S2M644RTRLI4BC6XSMMWE2MRFB76F4KC',
    'QDT4ROBASSEPQJ22EDUO3HLP3NB2IIBC',
    'TBI3APZQ4LEU3AO3OLYNCZLYYOO4ZOLB',
    'HB2IHQ5EY4MV6YIJNCKSZ5L2EA6BPV75',
    'R7RUWIVERCT2LRQCIKTTVL3SXF4P4HEP',
    'OHTNKYOXYDGU2LJ65NZHLLFUCLSNEGNU',
    'JTJQSY6SJUAI2P54IHQQYCVFMCWPXVTN',
    '2FDPTMT2NZDE6RIJSZZXGBMD7LYL7YHV',
    'SSC5AFOSRRYRFIB7FNROD3UUH2WQL5DV',
    '7GYQLFBVGRTYWE73NKCNCODTJVURFULB',
    'Z3WZ7DRI3LWQUHWWRHOMKHBF3A5L4NTD',
    '3RCHLDD2YCPDNLHEV4AVKEPBYJP5UBZB',
    'TQSAKC5ZXKO27EPBJ7HLTN2K3CW5L4MR',
    'X5J2UFQ52GKUF4JQCQ4DPJ62QF2EJ2XG',
    '6F47YISD72RZCG6OYZQCQLCYJX5E7MBK',
    '3P5D3UKXU2R6I2TK4OJSLL6LGIQJ4NY5',
    'EIR7WZL6LLXE7N7TX5ZY4QKXPPZEISPQ',
    'RZBL4G73Y76GIJWY3BENQNYBTVX5WAR6',
    '44fd6224a81709051890169e2533ac5fd75cd93a',
    'OB5K37VVGHIEXSQPWFJBU7NZA5OEKAOO',
    '2LVOKCURIEQKLK43I6T7QLYYQX3RQUXX',
    'B5PCEYSOV53475ZZQE7OPLN4FUVDRC7Y',
    'JXM2RAH5DJSVW2XCEGXVMU2PFPC4J3P4',
    '2UD7BSKC7GT3V7I6DK5NH5TFKJ6F3EXL',
    '281928eff64137efdd144a833c81ad0ee45284c1',
    'DJB3ZGUGJANSX3J2RPJJU6L4OPGKC2NK',
    '7IYBO6EFOFMF4ETCCIZYJEGPTBQY3EE5',
    'D3BLEFGEYIMPXCDAJJUYBTSMJEAURKTV',
    'L3YHCAZRLQVYA7O4G6SG5BQHUREJTYQO',
    'P27OC5LB5CYK7PNMUUFAMXEQ3THDTNJ4',
    'X2T7N7IH7E45XFYXNJH5Q3AYJU4BEC57',
    'HVP5WMRFWBOCTMMV6NC57S2QJ674CGXG',
    'M6C6DEQ5LEULRCOK2LEADJ3AX3522H3E',
    '76P47DRTKG4I3M56ZPDSWTVW4AC37RWI',
    'SW6G4QILYS7QYXXVHMGTCXFHYU2QHV6Z',
    'ON2LIHQQ5QFF6JIQGMUDF7ALYUDKTTV6',
    '4VXCOS4VUT37JP4BK425KOKWXRYTMJDO',
    'RVF6AZOJLCDEWLY2UNGVFQ4PSVEQYAUH',
    'DGZ2C5RQO4EDW2XQHLWDCCJZUB6IQLCE',
    'OEJKXSDUJIM5XWYEYLA2NWL3K4UI5IZR',
    'JGLEJ4HUZGQBXUZ3ZXUWHSPYIDVKVU2K',
    'OQCH5BPQO3G64HZOGOJKA26NQ23FAXD2',
    'AOZ2LBQOYCC5F3KFBN3FUGN2FODM6DCU',
    'AMTYOKGXFE7QNNQKJ73HZRLDKD3CSQW2',
    'KMELIR3DJDUFB52NSPC42LSIU6OOD77U',
    'KWL6TCYCODD5ODQXOPLOAX46TIM557CL',
    'XTYOD5JGKSMJAATU6RT4WK3LPOYBGYJY',
    'QZTQ7BCZVBR4UKL4M73HH7TX4ZVROQCS',
    'YCYN7CHWTHPZYST4LVE2OHZC2Y4INSNZ',
    'TEEGK5AIZ426F27JXF2QAPKENCTIT2GG',
    'GLXEDNT3LDSIPA6BGJGFIUEXOBO4NRDE',
    'JYMSHAQOPOCHYVWZYLPSNQYDNAGMT5DR',
    'QSPE3BHAI3DOO552RNZ5WXPJVIWAZ4LT',
    'KAX2V43VIPOF5R7SL2YYD2JQFRQCPWQ5',
    '7ODKVGAYKZTRQVTJRVNSXSEK3QC43I3B',
    'POKED5F2QVX266ONHVMGK7AGDJDU7OYQ',
    'HLGUA2TVCFAMJQDFC5IC43JBVUOFXDWB',
    'ILRVVACIV2JDSO4LHLATCOCKSEQYZCMZ',
    'WPSL4I6DPEKQWLMHVD5TRXPBVMN3MUFW'
]

In [6]:
# os.listdir() returns entries in arbitrary, platform-dependent order; sorting
# makes the selection (and the row order of the resulting table) reproducible.
files = sorted(os.listdir(path_folder))
print(f"Number of documents in folder: {len(files)}")

# A set gives constant-time membership tests instead of scanning the list once
# per file in the folder.
ids_wanted = set(ids_to_keep)

# Match on the exact '.pdf' suffix and on the name without it, rather than
# removing the substring '.pdf' wherever it happens to occur in the file name.
files_to_keep = [file for file in files
                 if Path(file).suffix == '.pdf' and Path(file).stem in ids_wanted]

print(f"Keeping only {len(files_to_keep)} files")

Number of documents in folder: 1078
Keeping only 90 files


In [7]:
# Peek at the first five selected file names
files_to_keep[:5]

['281928eff64137efdd144a833c81ad0ee45284c1.pdf',
 '2FDPTMT2NZDE6RIJSZZXGBMD7LYL7YHV.pdf',
 '2LVOKCURIEQKLK43I6T7QLYYQX3RQUXX.pdf',
 '2UD7BSKC7GT3V7I6DK5NH5TFKJ6F3EXL.pdf',
 '2XJLJAFHRTLAIWFDDAVMV2U4QJMISM4P.pdf']

> ### 📝 **Note**:
> 
> If you want to create your own sample dataset with different documents, you can run this:
> ```python
>    import random
>    
>    n_docs = 100
>    # 'path_dataset_dir' is the path returned by the function "kagglehub.dataset_download"
>    path_folder = Path(path_dataset_dir) / 'Pdf'
>    
>    files = os.listdir(path_folder)
>    print(f"Number of documents in folder: {len(files)}")
>    print(f"Keeping only {n_docs} files")
>    
>    random.seed(42)  # for reproducibility of the files to keep
>    random.shuffle(files)
>    files_to_keep = files[:n_docs]
>    ```

### Store the picked PDFs in a dataframe

In [8]:
# Build one record per selected PDF (insertion time, identifier, raw bytes)
# and turn the whole list into a dataframe in a single step.
records = []
for filename in files_to_keep:
    # Taken before the file is read, mimicking an "insertion time"
    time_insert = pd.Timestamp.now().to_numpy()
    path_pdf = Path(path_folder / filename)

    records.append(dict(
        time_insert=time_insert,
        # identifier = file name with the ".pdf" text removed
        document_identifier=path_pdf.name.replace(".pdf", ""),
        # full binary content of the PDF
        content=path_pdf.read_bytes(),
    ))

df_pdf_docs = pd.DataFrame.from_records(records)

In [9]:
# (rows, columns) of the table built above
df_pdf_docs.shape

(90, 3)

In [10]:
# Preview the first rows of the table
df_pdf_docs.head()

Unnamed: 0,time_insert,document_identifier,content
0,2025-05-04 15:46:34.392272,281928eff64137efdd144a833c81ad0ee45284c1,b'%PDF-1.7\r%\xe2\xe3\xcf\xd3\r\n146 0 obj\r<<...
1,2025-05-04 15:46:34.421531,2FDPTMT2NZDE6RIJSZZXGBMD7LYL7YHV,b'%PDF-1.6\r%\xe2\xe3\xcf\xd3\r\n27 0 obj\r<</...
2,2025-05-04 15:46:34.448864,2LVOKCURIEQKLK43I6T7QLYYQX3RQUXX,b'%PDF-1.4\r%\xe2\xe3\xcf\xd3\r\n468 0 obj\r<<...
3,2025-05-04 15:46:34.469973,2UD7BSKC7GT3V7I6DK5NH5TFKJ6F3EXL,b'%PDF-1.5\r%\xe2\xe3\xcf\xd3\r\n1869 0 obj\r<...
4,2025-05-04 15:46:34.486475,2XJLJAFHRTLAIWFDDAVMV2U4QJMISM4P,b'%PDF-1.4\r%\xe2\xe3\xcf\xd3\r\n303 0 obj\r<<...


### Write dataframe to parquet in a Spark-version-safe manner

Depending on your configuration, Spark may raise an error when you use **Spark** to read a parquet file that was created with **Pandas** and that contains timestamp columns. When you run `spark.read.parquet()`, Spark may respond with: `org.apache.spark.sql.AnalysisException: Illegal Parquet type: INT64 (TIMESTAMP(NANOS,false))`. This is caused by an incompatibility in how timestamps are encoded: as the error message shows, the file written from Pandas stores them with nanosecond precision, which the Parquet reader of some Spark versions does not accept. If you encounter that error, you can fix it by writing the parquet file from Pandas with a **timestamp precision compatible with the one Spark expects**:

```python
df_pdf_docs.to_parquet('your_data.parquet', allow_truncated_timestamps=True, coerce_timestamps='ms')
```

To learn more, visit [this StackOverflow post](https://stackoverflow.com/questions/57699926/how-to-fix-illegal-parquet-type-int64-timestamp-micros-error).

In [11]:
def get_path_data_folder():
    """Return the path of the ``data`` folder at the root of the ``articles`` repository.

    The repository root is found by walking up from the current working
    directory (the working directory itself included) and taking the nearest
    folder named ``articles``. The ``data`` folder is not created here.

    Returns:
        pathlib.Path: ``<repository root> / 'data'``.

    Raises:
        FileNotFoundError: if neither the current working directory nor any of
            its parents is named ``articles``.
    """
    NAME_REPO = 'articles'
    cwd = Path.cwd()
    # `Path.parents` does not contain the path itself, so the working directory
    # is checked explicitly; this covers running from the repository root.
    home_repo = next(
        (folder for folder in (cwd, *cwd.parents) if folder.name == NAME_REPO),
        None,
    )
    if home_repo is None:
        raise FileNotFoundError(
            f"Could not find a folder named '{NAME_REPO}' at or above '{cwd.as_posix()}'"
        )
    folder_data = home_repo / 'data'
    return folder_data

# Locate the repository's 'data' folder
folder_data = get_path_data_folder()

# Create the folder if it is missing; exist_ok=True keeps re-runs of this cell
# from failing when it is already there
folder_data.mkdir(exist_ok=True)

In [12]:
# The number of documents is part of the file name, e.g. 'table_90_pdf_documents.parquet'
n_docs = df_pdf_docs.shape[0]
parquet_name = folder_data / f'table_{n_docs}_pdf_documents.parquet'

# `allow_truncated_timestamps` and `coerce_timestamps` are options of the
# pyarrow writer that pandas forwards as-is, so the engine is pinned explicitly
# instead of relying on whichever engine 'auto' would pick.
df_pdf_docs.to_parquet(parquet_name,
                       engine='pyarrow',
                       allow_truncated_timestamps=True,
                       coerce_timestamps='ms')  # to prevent Spark compatibility issues
print(f"Parquet file saved at: '{parquet_name.as_posix()}'")

Parquet file saved at: 'C:/Users/carlo/Projects/articles/data/table_90_pdf_documents.parquet'


In [13]:
# Sanity check: the parquet file must exist on disk at this point
assert parquet_name.exists(), "File wasn't saved properly"

### Optional: Deleting the original dataset

You have saved a sample dataset. If you don't plan to use the full dataset, and want to save some space (it weighs 769 MB), you can delete the folder you downloaded with `kagglehub` by doing: 

```python
import shutil

if path_dataset_kaggle.exists():
    print(f"Deleting folder '{path_dataset_kaggle}' and all its contents ....")
    shutil.rmtree(path_dataset_kaggle)
    print("Done.")
```