In [61]:
# Please install OpenAI SDK first: `pip3 install openai`
import dotenv
from openai import OpenAI
import os

dotenv.load_dotenv()

client = OpenAI(api_key=os.environ['API_KEY'], base_url="https://api.deepseek.com")

response = client.chat.completions.create(
    model="deepseek-chat",
    messages=[
        {"role": "system", "content": "You are a helpful assistant"},
        {"role": "user", "content": "Hello"},
    ],
    stream=False
)

print(response.choices[0].message.content)

Hello! How can I assist you today? 😊


In [4]:
!pip install langchain_community pypdf



In [8]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from typing import List
    
    

In [9]:

def extract_pdf_with_metadata(file_paths: List[str]):
    data_final_list = []
    for file_path in file_paths:
        if not file_path.endswith(".pdf"):
            raise ValueError("Only PDF files are supported")
        loader = PyPDFLoader(file_path)
        pages = loader.load_and_split()
        data =  [
            {
                "content": page.page_content,
                "metadata": {
                    "page": page.metadata["page"],
                    "source": file_path
                }
            }
            for page in pages
        ]
        data_final_list.extend(data)
    return data_final_list


In [10]:
import os

def get_all_files(file_path: str):
    if not os.path.isdir(file_path):
        raise ValueError("The provided path is not a directory")
    return [os.path.join(file_path, f) for f in os.listdir(file_path) if os.path.isfile(os.path.join(file_path, f))]

# Example usage
file_path = "data/"
all_files = get_all_files(file_path)
print(all_files)

['data/aami-home-building-insurance-pds.pdf', 'data/aami-comprehensive-car-insurance-pds.pdf']


In [11]:
all_pages = extract_pdf_with_metadata(all_files)

In [13]:
all_pages[0]

{'content': 'HOME  \nBUILDING  \nINSURANCE\nPRODUCT  \nDISCLOSURE  \nSTATEMENT',
 'metadata': {'page': 0,
  'source': 'data/aami-home-building-insurance-pds.pdf'}}

In [15]:
from langchain.prompts import PromptTemplate

# Define the prompt template
template = """
You are a helpful assistant. Please summarize the following content by identifying and focusing on the most important topics. Think through the key points step by step before providing the summary.

**Content:**
{content}

**Chain of Thought:**
1. Identify the main themes or topics discussed in the content.
2. Highlight the key points related to each main topic.
3. Condense the highlighted points into a coherent summary.

**Summary:**
"""

# Create a PromptTemplate instance
prompt_template = PromptTemplate(input_variables=["content"], template=template)


In [22]:
import tqdm
import ollama

In [25]:
for page in tqdm.tqdm(all_pages):
    response = ollama.generate(model='deepseek-r1:8b', prompt=prompt_template.format(content=page["content"]))
    page["summary"] = response.response

100%|██████████| 180/180 [36:53<00:00, 12.30s/it]


In [29]:
all_pages[6]

{'content': '7\nWhat we cover\nInsured events\nWe cover the building for loss or damage caused by specific events like \nstorms, floods, and fires (including bushfires).\nLegal liability\nWe cover your legal liability to pay compensation for death, or bodily injury \nto other people (not you), or loss or damage to their property, in certain \nsituations.\nAdditional cover that comes with your policy\nThere are some additional covers that come with your policy for no extra cost. \nSee section 5 ‘Additional cover that comes with your policy’ on page 53 for \nmore information.\nOptional cover you can pay extra for\nThere are some optional covers that you’ll have to pay extra for. If an optional \ncover applies to your policy, it’s shown on your certificate of insurance. \nSee section 6 ‘Optional cover you can pay extra for’ on page 67 for more \ninformation.',
 'metadata': {'page': 6,
  'source': 'data/aami-home-building-insurance-pds.pdf'},
 'summary': '<think>\nOkay, so I\'m trying to s

In [28]:
!pip install --pre --extra-index-url https://pypi.fury.io/lancedb/ lancedb


Looking in indexes: https://pypi.org/simple, https://pypi.fury.io/lancedb/
Collecting lancedb
  Downloading https://pypi.fury.io/lancedb/-/ver_24PuN2/lancedb-0.18.1b2-cp39-abi3-macosx_11_0_arm64.whl (27.0 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.0/27.0 MB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
[?25hCollecting deprecation (from lancedb)
  Using cached deprecation-2.1.0-py2.py3-none-any.whl.metadata (4.6 kB)
Collecting pylance==0.23.0b3 (from lancedb)
  Downloading https://pypi.fury.io/lancedb/-/ver_1zJrzG/pylance-0.23.0b3-cp39-abi3-macosx_11_0_arm64.whl (33.0 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m33.0/33.0 MB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m MB/s[0m eta [36m0:00:01[0m:01[0m
Collecting pyarrow>=14 (from pylance==0.23.0b3->lancedb)
  Downloading pyarrow-19.0.0-cp312-cp312-macosx_12_0_arm64.whl.metadata (3.3 kB)
Using cached deprecation-2.1.0-py2.py3

In [34]:
import lancedb
from lancedb.pydantic import LanceModel, Vector
from lancedb.embeddings import get_registry
from typing import Dict, Any, Optional

In [39]:
db = lancedb.connect("db/temp")

embedding_func = get_registry().get("ollama").create(name="mxbai-embed-large")

In [41]:
class Document(LanceModel):
    content: str = embedding_func.SourceField()
    vector: Vector(embedding_func.ndims()) = embedding_func.VectorField()
    page: Optional[int] = None
    source: Optional[str] = None
    summary: str

In [42]:
table_name = "documents"
table = db.create_table(table_name, schema=Document, mode="overwrite")

[2025-01-30T07:22:28Z WARN  lance::dataset::write::insert] No existing dataset at /Volumes/ssd/deep_seek/db/temp/documents.lance, it will be created


In [59]:
def insert_documents(documents: List[Dict[str, Any]]):
    temp_list = []
    for document in documents:
        temp_list.append({
            "content": document["content"],
            "page": document["metadata"]["page"],
            "source": document["metadata"]["source"],
            "summary": document["summary"]
        })
    table.add(temp_list)

In [60]:
insert_documents(all_pages)

In [1]:
import openai

# Set your OpenAI API key
openai.api_key = ''

# Define the prompt
prompt = "Once upon a time"

# Generate a completion
response = openai.Completion.create(
    engine="gpt-4o-mini",
    prompt=prompt,
    max_tokens=50
)

# Print the generated text
print(response.choices[0].text.strip())


APIRemovedInV1: 

You tried to access openai.Completion, but this is no longer supported in openai>=1.0.0 - see the README at https://github.com/openai/openai-python for the API.

You can run `openai migrate` to automatically upgrade your codebase to use the 1.0.0 interface. 

Alternatively, you can pin your installation to the old version, e.g. `pip install openai==0.28`

A detailed migration guide is available here: https://github.com/openai/openai-python/discussions/742
