In [1]:
# Import Libraries

import spacy 
import numpy as np                 
import pandas as pd        
import torch           
import transformers 
from transformers import pipeline    
import tqdm

# Import Libraries
from ollama import generate
from langchain_community.llms import Ollama


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_ollama.llms import OllamaLLM


In [3]:
import ollama

# Smoke test: send a one-word greeting to llama3 through Ollama and show
# only the text of the reply.
hello_reply = ollama.chat(
    model="llama3",
    messages=[{"role": "user", "content": "Hello"}],
)
print(hello_reply["message"]["content"])


Hello! It's nice to meet you. Is there something I can help you with, or would you like to chat?


In [4]:
import ollama

# Ask llama3 a single user question and print only the text of the reply.
pls_question = {"role": "user", "content": "Explain PLS regression simply."}
response = ollama.chat(model="llama3", messages=[pls_question])

print(response["message"]["content"])


The joys of multivariate statistics!

PLS (Partial Least Squares) regression is a type of dimension reduction technique that's commonly used in data analysis, particularly when dealing with high-dimensional data.

**What's the problem it solves?**

Imagine you have a dataset with many variables (e.g., X1 to X100), and you want to explain a target variable Y using some of these X variables. A classic regression approach would be Ordinary Least Squares (OLS) or Multiple Linear Regression (MLR). However, when dealing with high-dimensional data, OLS/MLR can suffer from:

1. **The Curse of Dimensionality**: As the number of predictor variables increases, the number of possible combinations explodes, making it difficult to identify meaningful relationships.
2. **Collinearity**: Many X variables might be highly correlated, leading to unstable estimates and poor model performance.

**How does PLS help?**

PLS regression addresses these issues by:

1. **Selecting relevant variables**: PLS ident

In [23]:
import spacy
# Load spaCy's small English pipeline.
nlp = spacy.load("en_core_web_sm")

# Read the whole article into memory as UTF-8 text.
with open("mosquitoes.txt", encoding="utf-8") as f:
    dext = f.read()

# Run the pipeline over the raw string.
text = nlp(dext)

# Print each sentence found in the first 100 tokens of the processed text.
preview = text[:100]
for sentence in preview.sents:
    print(sentence)

Mosquitoes, the Culicidae, are a family of small flies consisting of 3,600 species.
The word mosquito is Spanish and Portuguese for little fly.
Mosquitoes have a slender segmented body, one pair of wings, three pairs of long hair-like legs, and specialized, highly elongated, piercing-sucking mouthparts.
All mosquitoes drink nectar from flowers; females of many species have adapted to also drink blood.
The group diversified during the Cretaceous period.
Evolutionary biologists view mosquitoes as micropredators, small animals that parasitise larger ones by drinking their blood without immediately killing them.


# Pipeline 🤡🌞🌻

| Capability     | Package                    |
| -------------- | -------------------------- |
| Core           | `langchain-core`           |
| Text splitting | `langchain-text-splitters` |
| Ollama         | `langchain-ollama`         |
| HuggingFace    | `langchain-huggingface`    |
| Chroma         | `langchain-chroma`         |
| OpenAI         | `langchain-openai`         |


In [6]:
from langchain_chroma import Chroma
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter, Language
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from uuid import uuid4

In [7]:
# Load the source article through LangChain's text loader.
text_path = "mosquitoes.txt"
text_data = TextLoader(file_path=text_path, encoding="utf-8")
doc = text_data.load()  # a list, even though only one file is loaded

In [8]:
# Sanity check: the loader returns a list, not a single document.
print(type(doc))

<class 'list'>


# Recursive Splitters for TEXT 🤡🦁

In [9]:
# Character-based recursive splitter for the loaded article.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=300,        # chunk length, in characters
    chunk_overlap=60,      # characters shared between neighbours (20% of chunk_size)
    add_start_index=True,  # record each chunk's offset as metadata["start_index"]
)

# Split the loaded documents into overlapping chunks and show the result.
chunks = splitter.split_documents(doc)
print(chunks)

[Document(metadata={'source': 'mosquitoes.txt', 'start_index': 0}, page_content='Mosquitoes, the Culicidae, are a family of small flies consisting of 3,600 species. The word mosquito is Spanish and Portuguese for little fly. Mosquitoes have a slender segmented body, one pair of wings, three pairs of long hair-like legs, and specialized, highly elongated, piercing-sucking'), Document(metadata={'source': 'mosquitoes.txt', 'start_index': 235}, page_content='legs, and specialized, highly elongated, piercing-sucking mouthparts. All mosquitoes drink nectar from flowers; females of many species have adapted to also drink blood. The group diversified during the Cretaceous period. Evolutionary biologists view mosquitoes as micropredators, small animals that'), Document(metadata={'source': 'mosquitoes.txt', 'start_index': 481}, page_content='view mosquitoes as micropredators, small animals that parasitise larger ones by drinking their blood without immediately killing them. Medical parasitologis

In [21]:
# Preview the first two chunks: a numbered banner followed by the chunk's repr.
for i, piece in enumerate(chunks[:2]):
    print(f"\n -- Chunk -{i} --\n", piece, sep="\n")


 -- Chunk -0 --

page_content='Mosquitoes, the Culicidae, are a family of small flies consisting of 3,600 species. The word mosquito is Spanish and Portuguese for little fly. Mosquitoes have a slender segmented body, one pair of wings, three pairs of long hair-like legs, and specialized, highly elongated, piercing-sucking' metadata={'source': 'mosquitoes.txt', 'start_index': 0}

 -- Chunk -1 --

page_content='legs, and specialized, highly elongated, piercing-sucking mouthparts. All mosquitoes drink nectar from flowers; females of many species have adapted to also drink blood. The group diversified during the Cretaceous period. Evolutionary biologists view mosquitoes as micropredators, small animals that' metadata={'source': 'mosquitoes.txt', 'start_index': 235}


In [None]:
# Show the text and the metadata of the first two chunks separately.
#
# The loop variable is deliberately NOT called `doc`: that name holds the list
# returned by the loader and is the input of `splitter.split_documents(doc)`.
# Rebinding it here would leave `doc` pointing at a single chunk, so re-running
# the splitting cell afterwards would no longer receive the loaded documents.
for i, chunk in enumerate(chunks[:2]):
    print(f"\n--- Chunk {i} ---")
    print("Content:")
    print(chunk.page_content)
    print("Metadata:")
    print(chunk.metadata)


--- Chunk 0 ---
Content:
Mosquitoes, the Culicidae, are a family of small flies consisting of 3,600 species. The word mosquito is Spanish and Portuguese for little fly. Mosquitoes have a slender segmented body, one pair of wings, three pairs of long hair-like legs, and specialized, highly elongated, piercing-sucking
Metadata:
{'source': 'mosquitoes.txt', 'start_index': 0}

--- Chunk 1 ---
Content:
legs, and specialized, highly elongated, piercing-sucking mouthparts. All mosquitoes drink nectar from flowers; females of many species have adapted to also drink blood. The group diversified during the Cretaceous period. Evolutionary biologists view mosquitoes as micropredators, small animals that
Metadata:
{'source': 'mosquitoes.txt', 'start_index': 235}


Loader -> Splitter -> Embeddings -> Vector Database -> Retriever -> LLM

# Embeddings ~ Text Embeddings 📁✍🏾

In [10]:
from langchain_chroma import Chroma
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain_ollama import OllamaEmbeddings

In [None]:
# Sentence-transformers model; this is the embedder passed to the vector store.
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)

# Second embedder, backed by Ollama's "nomic-embed-text" model.
embeddings_model_1 = OllamaEmbeddings(model="nomic-embed-text")


In [26]:
# Build (or refresh) the persistent Chroma collection holding the chunk embeddings.
#
# Every chunk gets a stable id made of its source file and start offset. Without
# explicit ids each run of this cell stores the chunks again under fresh ids in
# ./chroma_db, so searches start returning the same passage several times.
# With stable ids a re-run overwrites the existing entries instead.
#
# NOTE: copies already stored by earlier runs are not removed by this change;
# delete ./chroma_db (or the "my_docs" collection) once to get rid of them.
chunk_ids = [
    f"{chunk.metadata['source']}:{chunk.metadata['start_index']}"
    for chunk in chunks
]
# Fail early with a clear message rather than deep inside the vector store.
assert len(set(chunk_ids)) == len(chunk_ids), "chunk ids must be unique"

vectorstore = Chroma.from_documents(
    documents=chunks,
    embedding=embedding_model,
    ids=chunk_ids,
    persist_directory="./chroma_db",
    collection_name="my_docs",
)


# Chroma -
Chroma is a vector database. Its job is not to generate text or embeddings, but to:
1) store
2) index
3) retrieve

embeddings efficiently.

CHROMA <- STORES + INDEXES + RETRIEVES 

In [38]:
# Wrap the vector store as a retriever that returns the 4 closest chunks.
retriever = vectorstore.as_retriever(search_kwargs={"k": 4})
hits = retriever.invoke("What is this document about?")

# Show each hit's metadata and the first 200 characters of its text.
for hit in hits:
    snippet = hit.page_content[:200]
    print(hit.metadata, snippet, "\n--\n")

{'start_index': 767, 'source': 'mosquitoes.txt'} The mosquito life cycle consists of four stages: egg, larva, pupa, and adult. Eggs are laid on the water surface; they hatch into motile larvae that feed on aquatic algae and organic material. These l 
--

{'source': 'mosquitoes.txt', 'start_index': 767} The mosquito life cycle consists of four stages: egg, larva, pupa, and adult. Eggs are laid on the water surface; they hatch into motile larvae that feed on aquatic algae and organic material. These l 
--

{'source': 'mosquitoes.txt', 'start_index': 767} The mosquito life cycle consists of four stages: egg, larva, pupa, and adult. Eggs are laid on the water surface; they hatch into motile larvae that feed on aquatic algae and organic material. These l 
--

{'source': 'mosquitoes.txt', 'start_index': 481} view mosquitoes as micropredators, small animals that parasitise larger ones by drinking their blood without immediately killing them. Medical parasitologists view mosquitoes as vectors 

In [39]:
# Query the vector store directly with the article's opening sentence and
# keep the 3 closest chunks. Despite its name, `retriever_new` holds the
# list of matching documents, not a retriever object.
opening_sentence = "Mosquitoes, the Culicidae, are a family of small flies consisting of 3,600 species."
retriever_new = vectorstore.similarity_search(opening_sentence, k=3)

# Print each match with its rank.
for i, r in enumerate(retriever_new):
    print(f"{i}: {r.page_content}")

0: Mosquitoes, the Culicidae, are a family of small flies consisting of 3,600 species. The word mosquito is Spanish and Portuguese for little fly. Mosquitoes have a slender segmented body, one pair of wings, three pairs of long hair-like legs, and specialized, highly elongated, piercing-sucking
1: Mosquitoes, the Culicidae, are a family of small flies consisting of 3,600 species. The word mosquito is Spanish and Portuguese for little fly. Mosquitoes have a slender segmented body, one pair of wings, three pairs of long hair-like legs, and specialized, highly elongated, piercing-sucking
2: Mosquitoes, the Culicidae, are a family of small flies consisting of 3,600 species. The word mosquito is Spanish and Portuguese for little fly. Mosquitoes have a slender segmented body, one pair of wings, three pairs of long hair-like legs, and specialized, highly elongated, piercing-sucking


# Use LLM ollama 🐐🌻🤡