In [1]:
# installation
!pip install transformers accelerate datasets outlines

Collecting datasets
  Downloading datasets-3.5.1-py3-none-any.whl.metadata (19 kB)
Collecting outlines
  Downloading outlines-0.2.3-py3-none-any.whl.metadata (18 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Collecting interegular (from outlines)
  Downloading interegular-0.3.3-py37-none-any.whl.metadata (3.0 kB)
Collecting lark (from outlines)
  Downloading lark-1.2.2-py3-none-any.whl.metadata (1.8 kB)
Collecting diskcache (from outlines)
  Downloading diskcache-5.6.3-py3-none-any.whl.metadata (20 kB)
Collecting iso3166

In [2]:
# libraries
import os
import re
from google.colab import files
from pathlib import Path
import json
from dataclasses import dataclass
from tqdm import tqdm
from transformers import AutoTokenizer
from outlines import models, generate

In [8]:
# utilities
def save_json(data, file_name):
    """Serialize ``data`` as JSON and write it to ``file_name``.

    The document is encoded in memory before the file is opened, so a
    serialization error is raised without truncating or partially overwriting
    a file that already exists at ``file_name``.

    Args:
        data: Any object the ``json`` module can serialize.
        file_name: Path of the file to create or overwrite.

    Raises:
        TypeError: If ``data`` contains a value that is not JSON-serializable.
    """
    payload = json.dumps(data)
    # Explicit encoding keeps the written file independent of the locale.
    with open(file_name, 'w', encoding='utf-8') as f:
        f.write(payload)

def load_json(file_name):
    """Read ``file_name`` and return the parsed JSON document.

    The file is opened in binary mode so that the ``json`` module detects the
    encoding itself (UTF-8, UTF-16 or UTF-32, with or without a byte-order
    mark) instead of relying on the locale's default text encoding.

    Args:
        file_name: Path of the JSON file to read.

    Returns:
        The decoded Python object (dict, list, str, number, bool or None).

    Raises:
        FileNotFoundError: If ``file_name`` does not exist.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    with open(file_name, 'rb') as f:
        return json.load(f)

def get_response(raw_output):
  """Return the ``'content'`` of the last message of the first result.

  Reads ``raw_output[0]['generated_text'][-1]['content']``.
  NOTE(review): this looks like the output shape of a chat-style text
  generation pipeline -- confirm against the caller.
  """
  first_result = raw_output[0]
  last_message = first_result['generated_text'][-1]
  return last_message['content']


In [4]:
# !!!!!!!! huggingface login required to use llama model (ADD TOKEN AFTER `--token`) !!!!!!!!
# NOTE(review): a token pasted on the next line is stored in this notebook in
# plain text -- remove it again before sharing or committing the notebook.
!huggingface-cli login --token

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
The token `general` has been saved to /root/.cache/huggingface/stored_tokens
Your token has been saved to /root/.cache/huggingface/token
Login successful.
The current active token is: `general`


In [5]:
# settings
# Hugging Face Hub model id; both the tokenizer and the outlines model are
# loaded from this checkpoint.
checkpoint = 'meta-llama/Llama-3.2-1B-Instruct'

In [6]:
# make directories for data and results
# 'ocr' receives the uploaded OCR JSON files, 'info' the extracted records.
for directory in ('ocr', 'info'):
  os.makedirs(directory, exist_ok=True)

In [9]:
# upload JSON schema from `schema/schema.json`
# Parse the bytes returned by `files.upload()` instead of re-opening
# 'schema.json' from disk: when a file with that name already exists, Colab
# saves the new upload under a different name (e.g. 'schema (1).json'), so
# reading 'schema.json' would silently load the older copy.
uploaded_schema = files.upload()
if len(uploaded_schema) != 1:
    raise ValueError(
        f'Expected exactly one schema file, got {len(uploaded_schema)}.')
schema = json.loads(next(iter(uploaded_schema.values())))
# The generator is built from the schema as a JSON string.
schema_string = json.dumps(schema, indent=2)

Saving schema.json to schema (1).json


In [10]:
# Upload all JSON files from `ocr_results/`
# The upload is run from inside 'ocr' so the files are saved there; the
# previous working directory is restored even if the upload fails.
previous_dir = os.getcwd()
os.chdir('ocr')
try:
    ocr_paths = files.upload()
finally:
    os.chdir(previous_dir)
# Parse the uploaded bytes directly rather than re-reading each file by name:
# on a repeated upload Colab saves the new file under a different name, so
# opening the original name would return the stale copy.
ocr_extractions = [{'key': Path(name).stem,
                    'ocr': json.loads(content)}
                   for name, content in ocr_paths.items()]

Saving 81882169_2170.json to 81882169_2170.json
Saving 81841302.json to 81841302.json
Saving 80909413.json to 80909413.json


In [11]:
# Template for information extraction
@dataclass
class Template:
  """Chat prompt for one OCR'd document plus helpers to run extraction on it."""

  # Identifier of the document (the caller uses it to name the output file).
  key: str
  # OCR text of the document. Annotated as `str`, but `ocr_to_string` also
  # accepts an iterable of text lines -- presumably what the OCR JSON files
  # contain; TODO confirm against the OCR output format.
  ocr_extraction: str

  @classmethod
  def from_dict(cls, dict_):
    """Build a template from a dict with the keys ``'key'`` and ``'ocr'``."""
    return cls(dict_['key'], dict_['ocr'])

  def ocr_to_string(self):
    """Return the OCR text as a single string.

    A plain string is returned unchanged (joining it would put a newline
    between every character); any other iterable of strings is joined with
    newlines.
    """
    if isinstance(self.ocr_extraction, str):
      return self.ocr_extraction
    return '\n'.join(self.ocr_extraction)

  def prompt(self):
    """Return the chat messages: a system instruction and the OCR text as user turn."""
    string = self.ocr_to_string()

    system = '''The following is a document containing one or more emails.

Your task is to read the following emails and extract a JSON object that will capture the common structure of emails.

There may be multiple emails in the document—extract them all if so.

It is of utmost importance that you extract EXACTLY according to the JSON schema.

Now extract the JSON object.
'''

    user = string

    # The role must be spelled "system" for the chat template to treat the
    # instruction as the system message.
    messages = [
      {"role": "system", "content": system},
      {"role": "user", "content": user},
    ]

    return messages

  def hf_to_outlines(self, messages):
    """Render chat ``messages`` into one prompt string.

    Uses the module-level ``tokenizer``, which must be defined before this
    method is called.
    """
    return tokenizer.apply_chat_template(messages, tokenize = False, add_generation_prompt = True)

  def generate(self, generator):
    """Build the prompt for this document and return ``generator``'s output for it."""
    messages = self.prompt()
    outlines_input = self.hf_to_outlines(messages)
    return generator(outlines_input)

In [12]:
# instantiate models
# Tokenizer for `checkpoint`; `Template.hf_to_outlines` reads this
# module-level name to render chat messages into a prompt string.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# Outlines model loaded from the same checkpoint.
model = models.transformers(checkpoint)
# JSON generator built from the model and the uploaded schema (as a string).
generator = generate.json(model, schema_string)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/877 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

In [13]:
# generate records and save
# One extraction per uploaded OCR document; each result is written to
# info/<key>.json as soon as it is produced.
for record in tqdm(ocr_extractions):
  extracted = Template.from_dict(record).generate(generator)
  output_path = f'info/{record["key"]}.json'
  save_json(extracted, output_path)

100%|██████████| 3/3 [03:25<00:00, 68.34s/it]
