<a href="https://colab.research.google.com/github/adharshrj/llmsinedu/blob/main/LLM_in_Education.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Installation

In [None]:
# Pip installation LangChain and Hugginface API
!pip install langchain
!pip install huggingface_hub

# Pip installation of additional needed libraries
!pip install sentence_transformers
!pip install faiss-cpu
!pip install "unstructured[all-docs]"


Env Setup

In [18]:
import os
import requests
from getpass import getpass

# Both variables receive the same Hugging Face access token.
# Do not paste the token into the notebook: reuse one that is already present
# in the environment, otherwise prompt for it without echoing the input.
# (Assigning "" unconditionally would wipe a token that was already configured.)
hf_token = os.environ.get("HUGGINGFACEHUB_API_TOKEN") or os.environ.get("HF_TOKEN")
if not hf_token:
    hf_token = getpass("Hugging Face API token: ")
os.environ["HUGGINGFACEHUB_API_TOKEN"] = hf_token
os.environ["HF_TOKEN"] = hf_token

Connect Google Drive

In [None]:
# Mount Google Drive so files stored there can be read by path.
# Requires the google.colab package, i.e. this cell is meant to run on Colab.
from google.colab import drive
# Mount point is /content/drive/; the default path of loadPDFFromLocal lives under it.
drive.mount('/content/drive/')

Setup Loaders

In [4]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import OnlinePDFLoader

In [25]:
def loadPDFFromLocal(pdf_file_path="/content/drive/MyDrive/LLM_Testing_Docs/clrs.pdf"):
    """Load a PDF from the local filesystem and return it as split documents.

    Args:
        pdf_file_path: Path of the PDF file to read. The default points at a
            file on the mounted Google Drive.

    Returns:
        The list returned by ``PyPDFLoader(pdf_file_path).load_and_split()``.
    """
    loader = PyPDFLoader(pdf_file_path)
    pages = loader.load_and_split()

    # load_and_split() has already completed here, so a per-item
    # "Processing page ..." loop would not track any real progress and would
    # flood the cell output for a large PDF. Report one summary line instead.
    print(f"Loaded {len(pages)} document chunks from {pdf_file_path}")

    return pages


In [40]:
def loadFromUrl(url="https://www.nrel.gov/docs/fy12osti/55871.pdf"):
    """Load a PDF addressed by URL and return it as split documents.

    Args:
        url: Location of the PDF, handed to ``OnlinePDFLoader``.

    Returns:
        The list returned by ``OnlinePDFLoader(url).load_and_split()``.
    """
    onlineLoader = OnlinePDFLoader(url)
    newPg = onlineLoader.load_and_split()

    # Print a short summary only; printing the list itself would dump the
    # full text of every document into the cell output.
    print(f"Loaded {len(newPg)} document chunks from {url}")
    return newPg

Split Documents (LLMs cannot process large amounts of text at once)

In [9]:
from langchain.text_splitter import CharacterTextSplitter

In [10]:
def splitDocument(loaded_docs, chunk_size=1000, chunk_overlap=10):
    """Split loaded documents into smaller chunks.

    Args:
        loaded_docs: Documents to split, e.g. the output of the loader helpers.
        chunk_size: ``chunk_size`` passed to ``CharacterTextSplitter``
            (default 1000, the value this notebook used originally).
        chunk_overlap: ``chunk_overlap`` passed to ``CharacterTextSplitter``
            (default 10, the value this notebook used originally).

    Returns:
        The result of ``CharacterTextSplitter.split_documents(loaded_docs)``.
    """
    splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    return splitter.split_documents(loaded_docs)

Create Embeddings

In [11]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

In [12]:
def createEmbeddings(chunked_docs):
    """Embed the given documents and index them in a FAISS vector store.

    Args:
        chunked_docs: Documents to embed and index.

    Returns:
        The vector store built by ``FAISS.from_documents``.
    """
    # Embedding model constructed with its default settings.
    embedding_model = HuggingFaceEmbeddings()
    return FAISS.from_documents(chunked_docs, embedding_model)

Use the embeddings to retrieve relevant chunks, pass them to the LLM, and answer questions

In [13]:
from langchain.chains.question_answering import load_qa_chain
from langchain import HuggingFaceHub

In [34]:
def loadLLMModel(repo_id="mistralai/Mixtral-8x7B-Instruct-v0.1", model_kwargs=None):
    """Create a question-answering chain backed by a Hugging Face Hub model.

    Args:
        repo_id: Hub repository id of the model to use.
        model_kwargs: Generation settings forwarded to ``HuggingFaceHub``.
            When omitted, ``{"temperature": 0, "max_length": 2048}`` is used.

    Returns:
        The chain produced by ``load_qa_chain(llm, chain_type="stuff")``.
    """
    # None sentinel instead of a dict default, so calls never share one mutable dict.
    if model_kwargs is None:
        model_kwargs = {"temperature": 0, "max_length": 2048}
    llm = HuggingFaceHub(repo_id=repo_id, model_kwargs=model_kwargs)
    chain = load_qa_chain(llm, chain_type="stuff")
    return chain

def askQuestions(vector_store, chain, question):
    """Answer ``question`` from documents retrieved out of ``vector_store``.

    Args:
        vector_store: Object exposing ``similarity_search(question)``.
        chain: Object exposing ``run(input_documents=..., question=...)``.
        question: The question text.

    Returns:
        Whatever ``chain.run`` returns for the retrieved documents.
    """
    # Retrieval first, then hand the matching documents to the QA chain.
    context_docs = vector_store.similarity_search(question)
    return chain.run(input_documents=context_docs, question=question)

In [35]:
# Build the QA chain once; the question cell further down passes this `chain` to askQuestions.
chain = loadLLMModel()

Testing

In [None]:
# Local-PDF pipeline: load -> split into chunks -> embed the chunks.
PDF_loaded_docs = loadPDFFromLocal()
PDF_chunked_docs = splitDocument(PDF_loaded_docs)
# Index the chunked documents. Passing PDF_loaded_docs here would leave
# PDF_chunked_docs unused and skip the splitting step entirely.
PDF_vector_store = createEmbeddings(PDF_chunked_docs)

In [None]:
# URL pipeline: load -> split into chunks -> embed the chunks.
# Running this cell rebinds PDF_vector_store, so questions asked afterwards
# are answered from the document loaded from the URL.
PDF_loaded_docs = loadFromUrl()
PDF_chunked_docs = splitDocument(PDF_loaded_docs)
PDF_vector_store = createEmbeddings(PDF_chunked_docs)

In [None]:
# Ask a question against the current PDF_vector_store (whichever loading cell ran last)
# using the shared `chain`, then print the returned answer.
PDF_response = askQuestions(PDF_vector_store, chain, "Summarize the content of this paper please")
print(PDF_response)