In [None]:
!pip install -q kor markdownify requests pydantic openai bs4 rich

In [None]:
import gc
# Force a full garbage-collection pass; gc.collect() returns the number of
# unreachable objects it found, which is what the cell displays.
gc.collect()

40

In [None]:
import requests

url = 'https://raw.githubusercontent.com/bugbytes-io/datasets/master/students_v1.json'

# A timeout stops the cell hanging forever on a stalled connection, and
# raise_for_status() surfaces HTTP errors before we try to parse an error page as JSON.
response = requests.get(url, timeout=10)
response.raise_for_status()
data = response.json()
print(data)

[{'id': 'd15782d9-3d8f-4624-a88b-c8e836569df8', 'name': 'Eric Travis', 'date_of_birth': '1995-05-25', 'GPA': '3.0', 'course': 'Computer Science', 'department': 'Science and Engineering', 'fees_paid': False}, {'id': '4c7b4c43-c863-4855-abc0-3657c078ce23', 'name': 'Mark Smith', 'date_of_birth': '1996-02-10', 'GPA': '2.5', 'course': None, 'department': 'Science and Engineering', 'fees_paid': True}, {'id': '5cd9ad59-fcf1-462c-8863-282a9fb693e4', 'name': 'Marissa Barker', 'date_of_birth': '1996-10-01', 'GPA': '3.5', 'course': 'Biology', 'department': 'Life Sciences', 'fees_paid': False}, {'id': '48dda775-785d-41e3-b0dd-26a4a2f7722f', 'name': 'Justin Holden', 'date_of_birth': '1994-08-22', 'GPA': '3.23', 'course': 'Philosophy', 'department': 'Arts and Humanities', 'fees_paid': True}, {'id': '7ffe2ceb-562b-4edd-b74c-3741e1b08453', 'name': 'Michelle Thompson', 'date_of_birth': '1995-08-05', 'GPA': '3.9', 'course': 'Film Studies', 'department': 'Arts and Humanities', 'fees_paid': True}]


We'll now define a model that inherits from Pydantic's BaseModel class, and has the above fields/types. This will be a simple, initial implementation - we will extend it later to account for optional fields and add additional validation and constraints.

In [None]:
import uuid
from datetime import date
import requests
from pydantic import BaseModel

url = 'https://raw.githubusercontent.com/bugbytes-io/datasets/master/students_v1.json'
# Bound the request time and fail on HTTP errors before parsing the body as JSON.
response = requests.get(url, timeout=10)
response.raise_for_status()
data = response.json()

# define Pydantic model class
class Student(BaseModel):
    """Typed view of one student record from the downloaded JSON.

    Pydantic coerces the raw JSON values to the annotated types when the
    model is constructed.
    """
    id: uuid.UUID
    name: str
    date_of_birth: date
    GPA: float
    # course can be null in the source data
    course: str | None
    department: str
    fees_paid: bool

# Validate every raw record; there is no error handling here, so the first
# record that fails validation stops the loop with an exception.
for student in data:
    # create Pydantic model object by unpacking key/val pairs from our JSON dict as arguments
    model = Student(**student)
    print(model)

id=UUID('d15782d9-3d8f-4624-a88b-c8e836569df8') name='Eric Travis' date_of_birth=datetime.date(1995, 5, 25) GPA=3.0 course='Computer Science' department='Science and Engineering' fees_paid=False
id=UUID('4c7b4c43-c863-4855-abc0-3657c078ce23') name='Mark Smith' date_of_birth=datetime.date(1996, 2, 10) GPA=2.5 course=None department='Science and Engineering' fees_paid=True
id=UUID('5cd9ad59-fcf1-462c-8863-282a9fb693e4') name='Marissa Barker' date_of_birth=datetime.date(1996, 10, 1) GPA=3.5 course='Biology' department='Life Sciences' fees_paid=False
id=UUID('48dda775-785d-41e3-b0dd-26a4a2f7722f') name='Justin Holden' date_of_birth=datetime.date(1994, 8, 22) GPA=3.23 course='Philosophy' department='Arts and Humanities' fees_paid=True
id=UUID('7ffe2ceb-562b-4edd-b74c-3741e1b08453') name='Michelle Thompson' date_of_birth=datetime.date(1995, 8, 5) GPA=3.9 course='Film Studies' department='Arts and Humanities' fees_paid=True


## Custom Validator Functions
Sometimes, our validation logic cannot be expressed as simply as with the built-in constrained types. We also may need to use dynamic values when validating, such as fetching the current datetime. For these purposes, we can use custom validator functions instead.

Let's say we want to ensure that students cannot enrol if they are under 16 years old. We need to implement a validator on our date_of_birth field.

From Pydantic, we will import the validator() decorator, and write a method to validate the date_of_birth field.



In [None]:
import uuid
import requests
from datetime import date, datetime, timedelta
from pydantic import BaseModel, confloat, validator
from enum import Enum


url = 'https://raw.githubusercontent.com/bugbytes-io/datasets/master/students_v1.json'
# Bound the request time and fail on HTTP errors before parsing the body as JSON.
response = requests.get(url, timeout=10)
response.raise_for_status()
data = response.json()


class DepartmentEnum(Enum):
    """The department names accepted for a student's ``department`` field."""
    ARTS_AND_HUMANITIES = 'Arts and Humanities'
    LIFE_SCIENCES = 'Life Sciences'
    SCIENCE_AND_ENGINEERING = 'Science and Engineering'


class Student(BaseModel):
    """A student record with constrained fields.

    On top of type coercion, ``GPA`` must lie between 0 and 4 inclusive,
    ``department`` must match a ``DepartmentEnum`` value, and
    ``date_of_birth`` must be at least 16 years in the past.
    """
    id: uuid.UUID
    name: str
    date_of_birth: date
    GPA: confloat(ge=0, le=4)
    course: str | None
    department: DepartmentEnum
    fees_paid: bool

    @validator('date_of_birth')
    def ensure_16_or_over(cls, value):
        """Reject a date of birth that is less than 16 calendar years ago.

        Raises:
            ValueError: if the 16th birthday has not been reached yet.
        """
        today = date.today()
        try:
            # Same month/day, 16 calendar years back. Subtracting 365*16 days
            # would ignore leap days and place the cutoff a few days too late.
            sixteen_years_ago = today.replace(year=today.year - 16)
        except ValueError:
            # Today is 29 Feb and the target year has no 29 Feb.
            sixteen_years_ago = today.replace(year=today.year - 16, day=28)

        # raise error if DOB is more recent than 16 years past.
        if value > sixteen_years_ago:
            raise ValueError("Too young to enrol, sorry!")
        return value

# Validate every raw record; there is no error handling here, so the first
# record that fails validation stops the loop with an exception.
for student in data:
    # create Pydantic model object by unpacking key/val pairs from our JSON dict as arguments
    model = Student(**student)
    print(model)

id=UUID('d15782d9-3d8f-4624-a88b-c8e836569df8') name='Eric Travis' date_of_birth=datetime.date(1995, 5, 25) GPA=3.0 course='Computer Science' department=<DepartmentEnum.SCIENCE_AND_ENGINEERING: 'Science and Engineering'> fees_paid=False
id=UUID('4c7b4c43-c863-4855-abc0-3657c078ce23') name='Mark Smith' date_of_birth=datetime.date(1996, 2, 10) GPA=2.5 course=None department=<DepartmentEnum.SCIENCE_AND_ENGINEERING: 'Science and Engineering'> fees_paid=True
id=UUID('5cd9ad59-fcf1-462c-8863-282a9fb693e4') name='Marissa Barker' date_of_birth=datetime.date(1996, 10, 1) GPA=3.5 course='Biology' department=<DepartmentEnum.LIFE_SCIENCES: 'Life Sciences'> fees_paid=False
id=UUID('48dda775-785d-41e3-b0dd-26a4a2f7722f') name='Justin Holden' date_of_birth=datetime.date(1994, 8, 22) GPA=3.23 course='Philosophy' department=<DepartmentEnum.ARTS_AND_HUMANITIES: 'Arts and Humanities'> fees_paid=True
id=UUID('7ffe2ceb-562b-4edd-b74c-3741e1b08453') name='Michelle Thompson' date_of_birth=datetime.date(1995,

In [None]:
# # Kor!
# from kor.extraction import create_extraction_chain
# from kor.nodes import Object, Text, Number

# # LangChain Models
# from langchain.chat_models import ChatOpenAI
# from langchain.llms import OpenAI

# Standard Helpers
import pandas as pd
import time
import json

import uuid
import requests
from datetime import date, datetime, timedelta
from pydantic import BaseModel, confloat, validator
from enum import Enum

# Text Helpers
import requests
from bs4 import BeautifulSoup
from markdownify import markdownify as md



def printOutput(output):
    """Pretty-print ``output`` as JSON with sorted keys and a 3-space indent.

    Values the json module cannot encode natively (dates, UUIDs, enums, ...)
    are rendered with ``str()`` instead of raising ``TypeError``.
    """
    print(json.dumps(output, sort_keys=True, indent=3, default=str))

In [None]:
import os

# Never store API keys in the notebook: read the key from the environment.
# The key that was previously hardcoded in this cell has been exposed and
# should be revoked.
openai_api_key = os.environ.get("OPENAI_API_KEY", "")

In [None]:
# llm = ChatOpenAI(
# #     model_name="gpt-3.5-turbo", # Cheaper but less reliable
#     model_name="gpt-3.5-turbo",
#     temperature=0,
#     max_tokens=2000,
#     openai_api_key=openai_api_key
# )

# https://bugbytes.io/posts/pydantic-nested-models-and-json-schemas/

In [None]:
from enum import Enum
from pydantic import BaseModel, validator, confloat
from datetime import date, timedelta, datetime
import uuid
import requests

# Fetch the raw JSON data from Github.
# Bound the request time and fail on HTTP errors before parsing the body as JSON.
url = 'https://raw.githubusercontent.com/bugbytes-io/datasets/master/students_v2.json'
response = requests.get(url, timeout=10)
response.raise_for_status()
data = response.json()

# Define an Enum of acceptable Department values
class DepartmentEnum(str, Enum):
    """The department names accepted for a student's ``department`` field.

    Members also subclass ``str``, so each member compares equal to its text value.
    """
    ARTS_AND_HUMANITIES = 'Arts and Humanities'
    LIFE_SCIENCES = 'Life Sciences'
    SCIENCE_AND_ENGINEERING = 'Science and Engineering'

# Pydantic model to outline structure/types of Modules
class Module(BaseModel):
    """Nested model describing one entry of a student's ``modules`` list."""
    # id may be either an integer or a UUID
    id: int | uuid.UUID
    name: str
    professor: str
    credits: int
    registration_code: str


# Pydantic model to outline structure/types of Students (including nested model)
class Student(BaseModel):
    """A student record, including a nested list of ``Module`` models.

    ``GPA`` must lie between 0 and 4 inclusive, ``department`` must match a
    ``DepartmentEnum`` value, ``modules`` must be empty or hold exactly three
    entries, and ``date_of_birth`` must be at least 16 years in the past.
    """
    id: uuid.UUID
    name: str
    date_of_birth: date
    GPA: confloat(ge=0, le=4)
    course: str | None
    department: DepartmentEnum
    fees_paid: bool
    modules: list[Module] = []

    @validator('modules')
    def validate_module_length(cls, value):
        """Allow an empty module list, otherwise require exactly three modules.

        Raises:
            ValueError: if the list is non-empty and its length is not 3.
        """
        if len(value) and len(value) != 3:
            raise ValueError('List of modules should have length 3')
        return value

    @validator('date_of_birth')
    def ensure_16_or_over(cls, value):
        """Reject a date of birth that is less than 16 calendar years ago.

        Raises:
            ValueError: if the 16th birthday has not been reached yet.
        """
        today = date.today()
        try:
            # Same month/day, 16 calendar years back. Subtracting 365*16 days
            # would ignore leap days and place the cutoff a few days too late.
            sixteen_years_ago = today.replace(year=today.year - 16)
        except ValueError:
            # Today is 29 Feb and the target year has no 29 Feb.
            sixteen_years_ago = today.replace(year=today.year - 16, day=28)

        # Raise an error if DOB is more recent than 16 years past.
        if value > sixteen_years_ago:
            raise ValueError("Too young to enroll, sorry!")
        return value

# Iterate over each student record
for student in data:
    # Create a Pydantic model object by unpacking key/val pairs from our JSON dict as arguments
    try:
        model = Student(**student)
        print(model)
    except ValueError as e:
        # Report the failing record and carry on with the remaining ones
        print(f"Error creating model: {e}")


In [None]:
# JSON Schema of the Module model, returned as a compact JSON string
Module.schema_json()

'{"title": "Module", "type": "object", "properties": {"id": {"title": "Id", "anyOf": [{"type": "integer"}, {"type": "string", "format": "uuid"}]}, "name": {"title": "Name", "type": "string"}, "professor": {"title": "Professor", "type": "string"}, "credits": {"title": "Credits", "type": "integer"}, "registration_code": {"title": "Registration Code", "type": "string"}}, "required": ["id", "name", "professor", "credits", "registration_code"]}'

In [None]:
# Same schema with a 2-space indent; as a bare expression the cell shows the
# string's repr, so the newlines appear as \n escapes
Module.schema_json(indent=2)

'{\n  "title": "Module",\n  "type": "object",\n  "properties": {\n    "id": {\n      "title": "Id",\n      "anyOf": [\n        {\n          "type": "integer"\n        },\n        {\n          "type": "string",\n          "format": "uuid"\n        }\n      ]\n    },\n    "name": {\n      "title": "Name",\n      "type": "string"\n    },\n    "professor": {\n      "title": "Professor",\n      "type": "string"\n    },\n    "credits": {\n      "title": "Credits",\n      "type": "integer"\n    },\n    "registration_code": {\n      "title": "Registration Code",\n      "type": "string"\n    }\n  },\n  "required": [\n    "id",\n    "name",\n    "professor",\n    "credits",\n    "registration_code"\n  ]\n}'

In [None]:
! pip install datamodel-code-generator

Collecting datamodel-code-generator
  Downloading datamodel_code_generator-0.21.5-py3-none-any.whl (90 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m90.3/90.3 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting PySnooper<2.0.0,>=0.4.1 (from datamodel-code-generator)
  Downloading PySnooper-1.2.0-py2.py3-none-any.whl (14 kB)
Collecting argcomplete<4.0,>=1.10 (from datamodel-code-generator)
  Downloading argcomplete-3.1.2-py3-none-any.whl (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.5/41.5 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting black>=19.10b0 (from datamodel-code-generator)
  Downloading black-23.9.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m17.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting genson<2.0,>=1.2.1 (from datamodel-code-generator)
  Downloading genson-1.2.2.tar.gz (34 kB)
  Preparin

In [None]:
%%writefile jsonschema.json
{"title": "Module", "type": "object", "properties": {"id": {"title": "Id", "anyOf": [{"type": "integer"}, {"type": "string", "format": "uuid"}]}, "name": {"title": "Name", "type": "string"}, "professor": {"title": "Professor", "type": "string"}, "credits": {"title": "Credits", "type": "integer"}, "registration_code": {"title": "Registration Code", "type": "string"}}, "required": ["id", "name", "professor", "credits", "registration_code"]}

Writing jsonschema.json


In [None]:
!datamodel-codegen --input jsonschema.json --output models.py

The input file type was determined to be: jsonschema
This can be specificied explicitly with the `--input-file-type` option.


In [None]:
from enum import Enum
from pydantic import BaseModel, validator, confloat, Field,ValidationError,root_validator
from datetime import date, timedelta, datetime
import uuid
import requests
from typing import Optional

# Fetch the raw JSON data from Github.
# Bound the request time and fail on HTTP errors before parsing the body as JSON.
url = 'https://raw.githubusercontent.com/bugbytes-io/datasets/master/students_v3.json'
response = requests.get(url, timeout=10)
response.raise_for_status()
data = response.json()

# Define an Enum of acceptable Department values
class DepartmentEnum(str, Enum):
    """The department names accepted for a student's ``department`` field.

    Members also subclass ``str``, so each member compares equal to its text value.
    """
    ARTS_AND_HUMANITIES = 'Arts and Humanities'
    LIFE_SCIENCES = 'Life Sciences'
    SCIENCE_AND_ENGINEERING = 'Science and Engineering'

# Pydantic model to outline structure/types of Modules
class Module(BaseModel):
    """Nested model describing one entry of a student's ``modules`` list."""
    # id may be either an integer or a UUID
    id: int | uuid.UUID
    name: str
    professor: str
    credits: int
    registration_code: str


# Pydantic model to outline structure/types of Students (including nested model)
class Student(BaseModel):
    """A student record with aliasing, defaults, model config and tag parsing.

    The incoming ``name`` key is exposed as ``student_name``; ``tags`` is
    parsed from a comma-separated string; ``modules`` must be empty or hold
    exactly three entries; ``date_of_birth`` must be at least 16 years ago.
    """
    id: uuid.UUID
    student_name: str = Field(alias="name")
    # NOTE(review): a record without a DOB gets today's date, and validators do
    # not run on defaults unless always=True, so such a record appears to
    # bypass the age check - confirm this is intended.
    date_of_birth: date = Field(default_factory=lambda: datetime.today().date())
    GPA: confloat(ge=0, le=4)
    course: Optional[str]
    department: DepartmentEnum
    fees_paid: bool
    modules: list[Module] = Field(default=[],max_items=10)
    tags: list[str]

    class Config:
        use_enum_values = True
        extra = "ignore"
        anystr_strip_whitespace = True

    @validator('modules')
    def validate_module_length(cls, value):
        """Allow an empty module list, otherwise require exactly three modules.

        Raises:
            ValueError: if the list is non-empty and its length is not 3.
        """
        if len(value) and len(value) != 3:
            raise ValueError('List of modules should have length 3')
        return value

    @validator('date_of_birth')
    def ensure_16_or_over(cls, value):
        """Reject a date of birth that is less than 16 calendar years ago.

        Raises:
            ValueError: if the 16th birthday has not been reached yet.
        """
        today = date.today()
        try:
            # Same month/day, 16 calendar years back. Subtracting 365*16 days
            # would ignore leap days and place the cutoff a few days too late.
            sixteen_years_ago = today.replace(year=today.year - 16)
        except ValueError:
            # Today is 29 Feb and the target year has no 29 Feb.
            sixteen_years_ago = today.replace(year=today.year - 16, day=28)

        # Raise an error if DOB is more recent than 16 years past.
        if value > sixteen_years_ago:
            raise ValueError("Too young to enroll, sorry!")
        return value

    @validator('tags', pre=True)
    def split_tags(cls, value):
        """Turn a comma-separated tag string into a list before type validation."""
        # Only strings need splitting. Anything else is handed on unchanged so
        # Pydantic's own list[str] validation accepts or rejects it, rather
        # than an AttributeError escaping from .split().
        if isinstance(value, str):
            return value.split(",")
        return value

    # @validator('tags', each_item=True)
    # def remove_slackers(cls, value):
    #     if value == 'slacker':
    #         raise ValueError("Student is a slacker and cannot be enrolled!")
    #     return value

# Iterate over each student record
for student in data:
    try:
        model = Student(**student)
        print(model.tags)   # print out the list of tags to the terminal
    except ValidationError as e:
        # Show the validation report for this record and carry on with the rest
        print(e)

# Example: Using Pydantic with Web Scraping
- https://www.youtube.com/watch?v=xzxWLVCUvLo

In [None]:
import requests
from rich import print
from pydantic import BaseModel, validator

class Variant(BaseModel):
  """One entry of a product's ``variants`` list."""
  title: str
  sku: str
  price: str
  grams: int

  @validator("sku")
  def check_sku_len(cls, value):
    """Accept the SKU only when it is exactly 10 characters long.

    Raises:
      ValueError: for any other length.
    """
    if len(value) == 10:
      return value
    raise ValueError("SKU must be 10 chars long")

class Product(BaseModel):
  """A product with its nested, individually validated variants."""
  id: int
  title: str
  variants: list[Variant]

def get_data():
  """Download the Allbirds products JSON and return its "products" entry.

  Raises:
    requests.HTTPError: if the server answers with an error status.
  """
  # Bound the request time and fail on HTTP errors before parsing the body as JSON.
  resp = requests.get("https://www.allbirds.co.uk/products.json", timeout=10)
  resp.raise_for_status()
  return resp.json()["products"]

def main():
  """Fetch the products, validate each one as a Product and print it as a dict."""
  for raw_product in get_data():
    validated = Product(**raw_product)
    print(validated.dict())

# Run the demo only when executed as the main module
if __name__ == "__main__":
    main()