In [2]:
from docx import Document
import os
from tqdm.auto import tqdm

from uuid import uuid4
from typing import List, Dict, Tuple, Optional, Any
from dotenv import load_dotenv
from config.config import BASE_DIR, DATA_DIR, EMBEDDING_MODEL_NAME, PINECONE_INDEX_NAME

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
base = os.path.dirname('../data/')
base

'../data'

In [4]:
common = Document(base + '/external/common.docx')

In [5]:
common.paragraphs[0].text

'1. Is senior living the same as a nursing home? '

In [6]:
def getText_docx(file:Document) -> str:
    content = []
    for paragraph in file.paragraphs:
        print(paragraph.text)
        content.append(paragraph.text)
    return '\n'.join(content)

In [7]:
content = getText_docx(common)
type(content)

1. Is senior living the same as a nursing home? 
Think of it this way, senior living is a social model providing care when you need it while preserving your independence. Nursing homes primarily provide nursing services to the chronically ill. It’s important to note, nursing homes often provide a broader range of skilled nursing. Senior living, on the other hand, offers various lifestyle options for older adults who want to maintain their independence while living in their own apartments.
2. Will I lose my independence when I move in?
No, in fact, quite the opposite. At Carlton, we take pride in our philosophy of independence with assistance. In our communities, you have the freedom to live life while knowing that a helping hand is always available when you need it. Whether it’s assistance with daily activities, medication management, or simply having someone there for peace of mind, we are here to provide the support that complements and enhances your independence.
3. Is senior living

str

In [8]:
os.path.join(base, 'raw', 'common.docx')

'../data/raw/common.docx'

In [9]:
def writeText(content:str, filename: str, base_path:Optional[str]=base):
    write_dir = os.path.join(base_path,'raw', filename) 
    with open(write_dir, 'w') as f:
        f.write(content)
    return f'File {filename} written in {write_dir}'

In [10]:
writeText(content, 'common.txt')

'File common.txt written in ../data/raw/common.txt'

In [11]:
import tiktoken

tokenizer = tiktoken.get_encoding('cl100k_base')

# create the length function
def tiktoken_len(text:str) -> int:
    tokens = tokenizer.encode(
        text,
        disallowed_special=()
    )
    return len(tokens)

tiktoken_len(content)

658

In [12]:
from langchain.text_splitter import RecursiveCharacterTextSplitter


text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=600,
    chunk_overlap=20,
    length_function=tiktoken_len,
    separators=["\n\n", "\n", " ", ""]
)

Use local documents to do the splitting without parse it as strings in Python

In [13]:
from langchain.document_loaders import TextLoader
loader = TextLoader("../data/raw/common.txt") #use local documents to split documents
type(loader.load()[0])

langchain.schema.Document

In [14]:
loader.load()

[Document(page_content='1. Is senior living the same as a nursing home? \nThink of it this way, senior living is a social model providing care when you need it while preserving your independence. Nursing homes primarily provide nursing services to the chronically ill. It’s important to note, nursing homes often provide a broader range of skilled nursing. Senior living, on the other hand, offers various lifestyle options for older adults who want to maintain their independence while living in their own apartments.\n2. Will I lose my independence when I move in?\nNo, in fact, quite the opposite. At Carlton, we take pride in our philosophy of independence with assistance. In our communities, you have the freedom to live life while knowing that a helping hand is always available when you need it. Whether it’s assistance with daily activities, medication management, or simply having someone there for peace of mind, we are here to provide the support that complements and enhances your indepe

In [15]:
chunks  = text_splitter.split_documents(loader.load())

In [16]:
chunks

[Document(page_content='1. Is senior living the same as a nursing home? \nThink of it this way, senior living is a social model providing care when you need it while preserving your independence. Nursing homes primarily provide nursing services to the chronically ill. It’s important to note, nursing homes often provide a broader range of skilled nursing. Senior living, on the other hand, offers various lifestyle options for older adults who want to maintain their independence while living in their own apartments.\n2. Will I lose my independence when I move in?\nNo, in fact, quite the opposite. At Carlton, we take pride in our philosophy of independence with assistance. In our communities, you have the freedom to live life while knowing that a helping hand is always available when you need it. Whether it’s assistance with daily activities, medication management, or simply having someone there for peace of mind, we are here to provide the support that complements and enhances your indepe

In [17]:
chunks[0].page_content

'1. Is senior living the same as a nursing home? \nThink of it this way, senior living is a social model providing care when you need it while preserving your independence. Nursing homes primarily provide nursing services to the chronically ill. It’s important to note, nursing homes often provide a broader range of skilled nursing. Senior living, on the other hand, offers various lifestyle options for older adults who want to maintain their independence while living in their own apartments.\n2. Will I lose my independence when I move in?\nNo, in fact, quite the opposite. At Carlton, we take pride in our philosophy of independence with assistance. In our communities, you have the freedom to live life while knowing that a helping hand is always available when you need it. Whether it’s assistance with daily activities, medication management, or simply having someone there for peace of mind, we are here to provide the support that complements and enhances your independence.\n3. Is senior l

In [18]:
from langchain.embeddings import HuggingFaceEmbeddings
embeddings = HuggingFaceEmbeddings(model_name = "sentence-transformers/all-MiniLM-L6-v2")

In [19]:
res = embeddings.embed_documents(chunks[1].page_content)
len(res)

574

In [20]:
from langchain.schema import Document
from langchain.document_loaders import DirectoryLoader

def loadFilesinDirectory(path: str, glob: Optional[str] = None) -> List[Document]:
    if glob is None:
        loader = DirectoryLoader(path = path)
    else:
        loader = DirectoryLoader(path = path, glob = glob)
    docs = loader.load()
    return docs

In [21]:
docs = loadFilesinDirectory(path='../data/raw/')

In [22]:
docs[:10]

[Document(page_content='1. Is senior living the same as a nursing home? Think of it this way, senior living is a social model providing care when you need it while preserving your independence. Nursing homes primarily provide nursing services to the chronically ill. It’s important to note, nursing homes often provide a broader range of skilled nursing. Senior living, on the other hand, offers various lifestyle options for older adults who want to maintain their independence while living in their own apartments. 2. Will I lose my independence when I move in? No, in fact, quite the opposite. At Carlton, we take pride in our philosophy of independence with assistance. In our communities, you have the freedom to live life while knowing that a helping hand is always available when you need it. Whether it’s assistance with daily activities, medication management, or simply having someone there for peace of mind, we are here to provide the support that complements and enhances your independen

In [23]:
len(docs)

184

In [24]:
def embed_documents_batch(docs: List[Document]) -> List[Document]:
    embeded_docs = []
    for doc in tqdm(docs):
        embeded_docs.append(embeddings.embed_documents(doc.page_content))
    return embeded_docs

In [25]:
#   docs_embeded = embed_documents_batch(docs[:10])

In [26]:
# len(docs_embeded[9][0])

In [27]:
load_dotenv()
PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')
PINECONE_ENV = os.getenv('PINECONE_ENV')

In [28]:
import pinecone
def connect_index(index_name: str, API_KEY:str = PINECONE_API_KEY, ENV:str = PINECONE_ENV) -> pinecone.Index:
    pinecone.init(api_key=API_KEY, environment=ENV)
    index = pinecone.Index(index_name)
    return index

In [29]:
index = connect_index('test-docs')
index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

In [30]:
docs[0].metadata

{'source': '../data/raw/common.txt'}

In [31]:
for i, record in enumerate(docs):
    # first get metadata fields for this record
    metadata = {
    'id': uuid4().hex,
    'source': record.metadata,
    }
    record_metadatas = [{
            "chunk": j, "text": text, **metadata
            } for j, text in enumerate(record_texts)]

In [32]:
record_texts = text_splitter.split_documents(docs)

In [33]:
def insert_embedded_documents(documents: List[Document], embeddings, index: pinecone.Index, batch_limit: int =100, **metadata_dict: Optional[Dict[str, Any]]): 
    batch_limit = 100

    texts = []
    metadatas = []

    record_texts = documents
    for i, record in enumerate(tqdm(documents)):
        # first get metadata fields for this record
        if len(metadata_dict)>0:
            metadata = metadata_dict
        else:
            metadata = {
            'id': uuid4().hex,
            'source': record.metadata['source'],
            }
        # now we create chunks from the record text
        record_texts = text_splitter.split_text(record.page_content)
        # create individual metadata dicts for each chunk
        record_metadatas = [{
                    "chunk": j, "text": text, **metadata
                    } for j, text in enumerate(record_texts)]
        # # append these to current batches
        # texts.extend(record_texts)
        # metadatas.extend(record_metadatas)
        # if we have reached the batch_limit we can add texts
        if len(texts) >= batch_limit:
            # ids = [str(uuid4()) for _ in range(len(texts))]
            # embeds = embeddings.embed_documents(texts)
            index.upsert(vectors=zip(ids, embeds, metadatas))
            texts = []
            metadatas = []

    if len(texts) > 0:
        ids = [str(uuid4()) for _ in range(len(texts))]
        embeds = embeddings.embed_documents(texts)
        index.upsert(vectors=zip(ids, embeds, metadatas))

In [34]:
docs

[Document(page_content='1. Is senior living the same as a nursing home? Think of it this way, senior living is a social model providing care when you need it while preserving your independence. Nursing homes primarily provide nursing services to the chronically ill. It’s important to note, nursing homes often provide a broader range of skilled nursing. Senior living, on the other hand, offers various lifestyle options for older adults who want to maintain their independence while living in their own apartments. 2. Will I lose my independence when I move in? No, in fact, quite the opposite. At Carlton, we take pride in our philosophy of independence with assistance. In our communities, you have the freedom to live life while knowing that a helping hand is always available when you need it. Whether it’s assistance with daily activities, medication management, or simply having someone there for peace of mind, we are here to provide the support that complements and enhances your independen

In [36]:
insert_embedded_documents(docs[:10], embeddings, index)

100%|██████████| 10/10 [00:00<00:00, 125.94it/s]


In [3]:
BASE_DIR

PosixPath('/Users/isaac/FundamentlPartners/abinvenv-sol')

In [6]:
from langchain.document_loaders import PyPDFLoader

loader = PyPDFLoader(f"{BASE_DIR}/data/raw/ALQMFHOFCDKV7HIO2VGHCM3ZDDVVZPBW.pdf")
pages = loader.load_and_split()

In [7]:
pages

[Document(page_content='1 \n  \nTestimony of  \n \nBlake Gerard  \n \nBefore the U.S.  House of Representatives   \nCommittee on Agriculture  \n \nHearing to Review the Future of U.S. Farm Policy and the Formulation of the 2012 Farm Bill  \n \nGalesburg, Illinois  \nMarch 23 , 2012  \n \nIntroduction  \n \nChairman Lucas, Ranking Member Peterson , and Members of the Committee, thank you for \nholding this hearing concerning farm policy and the 2012 farm bill.   I appreciate the opportunity \nto offer testimony on farm policy from the perspective of a diversified grain producer.    \n \nMy name is Blake Gerard.   I raise rice , soybean s, wheat, and corn in  Alexander and Union \ncounties in southern Illinois  and I have been farming on my own now for 16 years.   I am the \nfourth generation in my family to farm this land and this is my 13th year  to farm rice in Illinois.  I \nam also co-owner in a seed conditioning facility that does contract seed production, \nconditioning, packaging

In [4]:
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
embeddings = HuggingFaceEmbeddings(model_name = EMBEDDING_MODEL_NAME)

In [12]:
faiss_index = FAISS.from_documents(pages, embeddings)
docs = faiss_index.similarity_search("What does Blake Gerard?", k=2)
for doc in docs:
    print(str(doc.metadata["page"]) + ":", doc.page_content[:300])

0: 1 
  
Testimony of  
 
Blake Gerard  
 
Before the U.S.  House of Representatives   
Committee on Agriculture  
 
Hearing to Review the Future of U.S. Farm Policy and the Formulation of the 2012 Farm Bill  
 
Galesburg, Illinois  
March 23 , 2012  
 
Introduction  
 
Chairman Lucas, Ranking Member P
10: BLAKE GERARD  
 
 
 
EXPERIENCE  
 
2002 to present  GERARD&CRAIN FARMS, INC. dba RIVERBEND RICE SEED CO. – McClure, IL  
   President/Manager  
• Founded this Agribusiness Compa ny specializing in rice seed production.  The primary 
goal is to provide a superior quality seed supply to the Mid -Sout


In [7]:
def loadPDFs(path: str) -> List[Document]:
    docs = []
    for file in os.listdir(path):
        if file.endswith(".pdf"):
            print(os.path.join(path, file))
            loader = PyPDFLoader(os.path.join(path, file))
            pages = loader.load_and_split()
            docs.extend(pages)
    return docs

In [8]:
loadPDFs(f"{BASE_DIR}/data/raw/")

/Users/isaac/FundamentlPartners/abinvenv-sol/data/raw/JI27U7TQTGCH2WWIVTO3GRUAQ2AGBBDH.pdf
/Users/isaac/FundamentlPartners/abinvenv-sol/data/raw/ALQMFHOFCDKV7HIO2VGHCM3ZDDVVZPBW.pdf
/Users/isaac/FundamentlPartners/abinvenv-sol/data/raw/guidance_observationalrainfalldata.pdf


[Document(page_content='Promoter: Dannon Svab  \n \nLocation: Ridgewood High \nSchool, West Lafayette, Ohio \nDate:  April 5, 2008 \n \nCommissioner:   \nInspectors:  \nRandy Jarvis Ohio Athletic Commission \nExecutive Director \nBernie Profato \nOffice: (330) 797-2556 \nwww.aco.ohio.gov\n          MMA \n    Amateur \nCaged Madness 3 Judges: \nR Wince \nW Messer \nB Pethel \nReferees: \nC Snider \nM Matheny \nTimekeepers:   \n K Matheny \nPhysicians: \n  Dr Chlovechuk \n \nSch \nRnds  Contestants Weight Results Remarks \nSteve Roberts 197 Lost  AM \n  3 Jeremiah Street 194 Won UNAN \nMatt Nelson 155 Won UNAN  \nBradford Jordan 156 Lost  \nLarry Shuck 203 Won UNAN  \nJesus Santiago 197 Lost  \nBrian Kerr 154 Lost   \nRyan McLaughlin 154 Won TO Arm bar 1:22 1st Rd \nRobert Hitchcock 181.5 Lost 60 Day suspension –Unsportmanship  \nBill Jones 185 Won TO GUI 2:13 1st Rd \nJason Lampshire 231 Lost 30 day suspension  \nChris Alverson 228 Won TKO 22 secs 1st Rd \nJohn McNeeley 145.5 Lost   \nJ