# LangChain crash course

In [1]:
import getpass
import os

from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_chroma import Chroma
from langchain_cohere import ChatCohere
from langchain_cohere import CohereEmbeddings
from langchain_community.document_loaders import PyPDFLoader
from langchain_core.prompts import ChatPromptTemplate
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Keep a COHERE_API_KEY that is already exported in the environment instead of
# overwriting it with an empty string. Only when it is missing (or empty) do we
# prompt for it, with hidden input so the secret never lands in the notebook
# source or its saved outputs.
if not os.environ.get('COHERE_API_KEY'):
    os.environ['COHERE_API_KEY'] = getpass.getpass("Cohere API key: ")

* 'allow_population_by_field_name' has been renamed to 'populate_by_name'
* 'smart_union' has been removed


In [2]:
# Chat model that will generate the final answers.
CHAT_MODEL_NAME = "command-r-plus"
model = ChatCohere(model=CHAT_MODEL_NAME)

In [3]:
# PDFs that make up the knowledge base for this demo.
file_paths = [
    "./data/MTP_Thesis_ArpitDwivedi.pdf",
    "./data/NLP_Report.pdf",
    "./data/MIES TERM PROJECT REPORT.pdf",
]

# Load each PDF in turn and flatten everything the loaders return into a
# single list of documents.
docs = [
    doc
    for pdf_path in file_paths
    for doc in PyPDFLoader(pdf_path).load()
]

print(f"Total documents loaded: {len(docs)}")

Total documents loaded: 49


In [4]:
# Embedding model used to vectorise the document chunks.
EMBEDDING_MODEL_NAME = "embed-english-v3.0"
cohere_embeddings = CohereEmbeddings(model=EMBEDDING_MODEL_NAME)

In [5]:
# Chunking parameters handed to the text splitter.
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 200

# Step 1: split the loaded documents into chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)
splits = text_splitter.split_documents(docs)

# Step 2: embed the chunks with the Cohere embeddings and store them in a
# Chroma vector store.
vectorstore = Chroma.from_documents(documents=splits, embedding=cohere_embeddings)

# Step 3: expose the vector store as a retriever for the RAG chain.
retriever = vectorstore.as_retriever()

In [6]:
# System instructions for the assistant; the retrieved documents are
# substituted into the {context} placeholder.
system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer the question. "
    "If you don't know the answer, say that you don't know. "
    "Use three sentences maximum and keep the answer concise."
    "\n\n{context}"
)

# Two-message chat template: the system instructions above, followed by the
# user's question in the {input} placeholder.
chat_messages = [
    ("system", system_prompt),
    ("human", "{input}"),
]
prompt = ChatPromptTemplate.from_messages(chat_messages)

# Combine the model and prompt into a documents chain, then put the retriever
# in front of it to form the full RAG chain.
question_answer_chain = create_stuff_documents_chain(model, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

# Run a first question through the chain.
question = "Does the ESIM model have better retrieval metrics or BERT? What is the Recall@10 for both?"
results = rag_chain.invoke({"input": question})

In [14]:
# Show only the generated answer from the chain's result.
answer_text = results['answer']
print(answer_text)

The BERT model has better retrieval metrics. The Recall@10 for ESIM is 81.30% and for BERT is 93.27%.


In [15]:
# Other examples: run a few more questions through the same RAG chain and
# print each answer in order.
follow_up_questions = [
    "Can you summarise the imposter detection project in 3 lines?",
    "What was the model architecture used for multilingual news article similarity measurement?",
]
for follow_up in follow_up_questions:
    print(rag_chain.invoke({"input": follow_up})['answer'])

The project aims to design an algorithm to distinguish imposters from authentic users based on mouse dynamics data. It uses self-organizing maps for unsupervised machine learning and can prevent security breaches. The conclusion states that these maps are effective tools for authentication when combined with supervised algorithms.
The model architecture used was an ensemble method, combining six mBERT models and an artificial neural network (ANN) to output an overall similarity score.
