# Final Project Tasks.ipynb

In [1]:
# Install required packages
# !pip install langchain_community pypdfium2 ibm-watsonx-ai langchain-ibm chromadb

In [2]:
# To filter out warnings
import warnings
warnings.filterwarnings("ignore")

## Task 1 - PDF Loader

In [3]:
# Import necessary libraries
from langchain_community.document_loaders import PyPDFium2Loader

# Define the URL of the PDF document
paper_url = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/WgM1DaUn2SYPcCg_It57tA/A-Comprehensive-Review-of-Low-Rank-Adaptation-in-Large-Language-Models-for-Efficient-Parameter-Tuning-1.pdf"

# Load the PDF document using PyPDFium2Loader
pdf_loader = PyPDFium2Loader(paper_url)

# Load and split the PDF document into pages
pages = pdf_loader.load_and_split()
print(pages[0].page_content[:1000])

A Comprehensive Review of Low-Rank
Adaptation in Large Language Models for
Efficient Parameter Tuning
September 10, 2024
Abstract
Natural Language Processing (NLP) often involves pre-training large
models on extensive datasets and then adapting them for specific tasks
through fine-tuning. However, as these models grow larger, like GPT-3
with 175 billion parameters, fully fine-tuning them becomes computationally expensive. We propose a novel method called LoRA (Low-Rank
Adaptation) that significantly reduces the overhead by freezing the original model weights and only training small rank decomposition matrices.
This leads to up to 10,000 times fewer trainable parameters and reduces
GPU memory usage by three times. LoRA not only maintains but sometimes surpasses fine-tuning performance on models like RoBERTa, DeBERTa, GPT-2, and GPT-3. Unlike other methods, LoRA introduces
no extra latency during inference, making it more efficient for practical
applications. All relevant code and mo

## Task 2 - Code Splitter

In [4]:
latex_text = """
    \documentclass{article}
    
    \begin{document}
    
    \maketitle
    
    \section{Introduction}
    Large language models (LLMs) are a type of machine learning model that can be trained on vast amounts of text data to generate human-like language. In recent years, LLMs have made significant advances in various natural language processing tasks, including language translation, text generation, and sentiment analysis.
    
    \subsection{History of LLMs}
    The earliest LLMs were developed in the 1980s and 1990s, but they were limited by the amount of data that could be processed and the computational power available at the time. In the past decade, however, advances in hardware and software have made it possible to train LLMs on massive datasets, leading to significant improvements in performance.
    
    \subsection{Applications of LLMs}
    LLMs have many applications in the industry, including chatbots, content creation, and virtual assistants. They can also be used in academia for research in linguistics, psychology, and computational linguistics.
    
    \end{document}
"""

In [5]:
# Import the necessary module for text splitting
from langchain.text_splitter import Language, RecursiveCharacterTextSplitter

# Create a RecursiveCharacterTextSplitter instance for LaTeX documents
latex_splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.LATEX, chunk_size=60, chunk_overlap=0
)

# Split the LaTeX text into documents
latex_docs = latex_splitter.create_documents([latex_text])
latex_docs

[Document(metadata={}, page_content='\\documentclass{article}\n    \n    \x08egin{document}'),
 Document(metadata={}, page_content='\\maketitle\n    \n    \\section{Introduction}\n    Large'),
 Document(metadata={}, page_content='language models (LLMs) are a type of machine learning model'),
 Document(metadata={}, page_content='that can be trained on vast amounts of text data to'),
 Document(metadata={}, page_content='generate human-like language. In recent years, LLMs have'),
 Document(metadata={}, page_content='made significant advances in various natural language'),
 Document(metadata={}, page_content='processing tasks, including language translation, text'),
 Document(metadata={}, page_content='generation, and sentiment analysis.'),
 Document(metadata={}, page_content='\\subsection{History of LLMs}\n    The earliest LLMs were'),
 Document(metadata={}, page_content='developed in the 1980s and 1990s, but they were limited by'),
 Document(metadata={}, page_content='the amount of data 

## Task 3 - Embedding

In [6]:
# Import the necessary module for embedding
from ibm_watsonx_ai.metanames import EmbedTextParamsMetaNames
from langchain_ibm import WatsonxEmbeddings

# Define the embedding parameters
embedding_params = {
    EmbedTextParamsMetaNames.TRUNCATE_INPUT_TOKENS: 3,
    EmbedTextParamsMetaNames.RETURN_OPTIONS: {"input_text": True},
}

# Initialize the WatsonxEmbeddings with the specified model and parameters
watsonx_embedding = WatsonxEmbeddings(
    model_id="ibm/slate-30m-english-rtrvr",
    url="https://us-south.ml.cloud.ibm.com",
    project_id="skills-network",
    params=embedding_params,
)

# Embed the given query using the WatsonxEmbeddings instance
query = "How are you?"
query_result = watsonx_embedding.embed_query(query)
print(query_result[:5])

[0.018171897, -0.018608226, 0.059054308, 0.07260351, 0.08736516]


## Task 4 - Vector DB

In [7]:
# Import the necessary module for vector stores
from langchain_community.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma

# Load the text document using TextLoader
text_loader = TextLoader("data/new-Policies.txt")
data = text_loader.load()

# Split the loaded text document into smaller chunks
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=100,
    chunk_overlap=20,
    length_function=len,
)
chunks = text_splitter.split_documents(data)
ids = [str(i) for i in range(0, len(chunks))]

# Define the embedding parameters for the WatsonxEmbeddings
embedding_params = {
    EmbedTextParamsMetaNames.TRUNCATE_INPUT_TOKENS: 3,
    EmbedTextParamsMetaNames.RETURN_OPTIONS: {"input_text": True},
}

# Initialize the WatsonxEmbeddings with the specified model and parameters
watsonx_embedding = WatsonxEmbeddings(
    model_id="ibm/slate-125m-english-rtrvr",
    url="https://us-south.ml.cloud.ibm.com",
    project_id="skills-network",
    params=embedding_params,
)

# Create a Chroma vector store from the chunks and their embeddings
vectordb = Chroma.from_documents(chunks, watsonx_embedding, ids=ids)

# Perform a similarity search in the vector store using a query
query = "Smoking policy"
docs = vectordb.similarity_search(query, k=5)
for i, doc in enumerate(docs, start=1):
    print(f"Result {i}: {doc.page_content}")

Result 1: This policy encourages the responsible use of mobile devices in line with legal and ethical
Result 2: This policy promotes the safe and responsible use of digital communication tools in line with our
Result 3: This policy lays the foundation for a diverse, inclusive, and talented workforce. It ensures that
Result 4: guidelines. The policy is regularly reviewed to stay current with evolving technology and security
Result 5: We encourage a culture of safety, including reporting any unsafe practices or conditions.


## Task 5 - Retriever

In [8]:
# Import the necessary module for vector stores
from langchain_community.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma

# Load the text document using TextLoader
text_loader = TextLoader("data/new-Policies.txt")
data = text_loader.load()

# Split the loaded text document into smaller chunks
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=100,
    chunk_overlap=20,
    length_function=len,
)
chunks = text_splitter.split_documents(data)
ids = [str(i) for i in range(0, len(chunks))]

# Define the embedding parameters for the WatsonxEmbeddings
embedding_params = {
    EmbedTextParamsMetaNames.TRUNCATE_INPUT_TOKENS: 3,
    EmbedTextParamsMetaNames.RETURN_OPTIONS: {"input_text": True},
}

# Initialize the WatsonxEmbeddings with the specified model and parameters
watsonx_embedding = WatsonxEmbeddings(
    model_id="ibm/slate-125m-english-rtrvr",
    url="https://us-south.ml.cloud.ibm.com",
    project_id="skills-network",
    params=embedding_params,
)

# Create a Chroma vector store from the chunks and their embeddings
vectordb = Chroma.from_documents(chunks, watsonx_embedding, ids=ids)

# Create a retriever from the vector store
retriever = vectordb.as_retriever(search_kwargs={"k": 2})

# Use the retriever to find relevant documents for a given query
query = "Email policy"
docs = retriever.invoke(query)
for i, doc in enumerate(docs, start=1):
    print(f"Result {i}: {doc.page_content}")

Result 1: and email use, including copyright and data protection laws.
Result 2: This policy encourages the responsible use of mobile devices in line with legal and ethical
