# Create Pickle from Local Documents

_You will need JSON documents in a `documents` directory under the notebook's current working directory to run this notebook. You can get them by running the `download-documents` script in your local CLUE instance._


In [1]:
import json
import os
import pickle
from datetime import datetime


# Documents are read from a "documents" directory under the current working directory.
directory_path = os.path.join(os.getcwd(), "documents")

# Collect the names of all JSON document files.
# os.listdir returns entries in arbitrary order, so the names are sorted to make
# the ingestion order (and therefore the contents of the resulting pickle)
# reproducible across runs and machines.
doc_files = sorted(
  name for name in os.listdir(directory_path) if name.endswith(".json")
)

# Ingest every document file into one list of parsed JSON objects (qdocs),
# printing a progress line each time another batch_size files have been read.
batch_size = 100
total_docs = len(doc_files)
count = 0
qdocs = []

for doc_name in doc_files:
  # Decode explicitly as UTF-8 instead of relying on the platform's locale
  # encoding, which can fail or mis-decode non-ASCII text on some systems.
  with open(os.path.join(directory_path, doc_name), 'r', encoding='utf-8') as doc_file:
    qdocs.append(json.load(doc_file))
  count += 1

  # Report only on full batches; the final total is printed after the loop.
  if count % batch_size == 0:
    print("Ingested", count, "documents")

print("Ingested a total of", count, "documents")

# Name the pickle after the current local time (minute resolution); a second
# run within the same minute produces the same name and overwrites the file.
timestamp = datetime.now().strftime("%Y%m%d_%H%M")
new_pickle_name = f"documents_{timestamp}.pkl"
pickle_directory_path = os.path.join(os.getcwd(), "pickles")

# Create the 'pickles' directory if it doesn't exist. exist_ok=True avoids the
# race between a separate existence check and the directory creation.
os.makedirs(pickle_directory_path, exist_ok=True)

# Serialize the full list of ingested documents into a single pickle file.
with open(os.path.join(pickle_directory_path, new_pickle_name), 'wb') as pickle_file:
  pickle.dump(qdocs, pickle_file)

Ingested 100 documents
Ingested 200 documents
Ingested 300 documents
Ingested a total of 329 documents
