In [1]:
!pip install langchain-community
!pip install tiktoken
!pip install faiss-cpu

Collecting langchain-community
  Downloading langchain_community-0.3.5-py3-none-any.whl.metadata (2.9 kB)
Collecting SQLAlchemy<2.0.36,>=1.4 (from langchain-community)
  Downloading SQLAlchemy-2.0.35-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.6 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting httpx-sse<0.5.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting langchain<0.4.0,>=0.3.6 (from langchain-community)
  Downloading langchain-0.3.7-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain-core<0.4.0,>=0.3.15 (from langchain-community)
  Downloading langchain_core-0.3.15-py3-none-any.whl.metadata (6.3 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.6.1-py3-none-any.whl.metadata (3.5 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from datac

Dataset - https://www.kaggle.com/datasets/radek1/additional-train-data-for-llm-science-exam?select=extra_train_set.csv

In [7]:
import pandas as pd

# Read the question/answer CSV into a DataFrame for a quick inspection.
data = pd.read_csv(filepath_or_buffer='/content/exam-science-llm.csv')

In [8]:
# List the column names of the loaded DataFrame.
data.columns

Index(['prompt', 'C', 'E', 'D', 'B', 'A', 'answer'], dtype='object')

In [9]:
# Count how many times each distinct prompt appears in the dataset.
prompt_counts = data['prompt'].value_counts()
prompt_counts

Unnamed: 0_level_0,count
prompt,Unnamed: 1_level_1
"In relation to Eunice Fay McKenzie's career, which statement accurately reflects her most notable work?",1
"What is the main economic activity of Pierpont Township in Ashtabula County, Ohio?",1
In which family does Gudeodiscus anceyi belong?,1
"What was the recognition received by Eartha Kitt's album ""Back in Business"" at the 38th Annual Grammy Awards in 1996?",1
What happened in 1956 that resulted in the transfer of canal operation to the state-owned Suez Canal Authority?,1
...,...
What is the campus of La Jolla High School surrounded by?,1
"What is the geographical location of IFK Kalix, a Swedish football club?",1
"What is the population of Dickeyville, a village in Grant County, Wisconsin?",1
"Who recorded the song ""You Got It""?",1


Vector Store - A vector store is like a special filing cabinet for the number patterns (vectors) created by embedding models. When you want to find similar text later, the vector store helps you quickly find the stored vectors that are closest to the vector of your query. It's like organizing books by topic, but with numbers instead.

In [15]:
import getpass
import os

from dotenv import load_dotenv
# The langchain.document_loaders / langchain.embeddings / langchain.vectorstores
# modules are deprecated shims; the integrations live in langchain_community.
from langchain_community.document_loaders import TextLoader
from langchain_community.embeddings import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_text_splitters import CharacterTextSplitter

# Pick up OPENAI_API_KEY from a local .env file if one exists.
load_dotenv()

# Never hardcode the key in the notebook, and never overwrite a key that was
# already loaded: prompt for it (input is hidden) only when it is still missing.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API key: ")

# Question & answer CSV file to index.
CSV_PATH = '/content/exam-science-llm.csv'

# Load the document, split it into chunks, embed each chunk and load it into the vector store.
# TextLoader reads the CSV as plain text; the encoding is stated explicitly so the
# result does not depend on the platform's default locale.
raw_documents = TextLoader(CSV_PATH, encoding="utf-8").load()

# Split only on newlines so chunk boundaries fall between CSV rows.
text_splitter = CharacterTextSplitter(chunk_size=200, chunk_overlap=0, separator="\n")
documents = text_splitter.split_documents(raw_documents)

# Embed every chunk with OpenAI embeddings and index the vectors in FAISS.
db = FAISS.from_documents(documents, OpenAIEmbeddings())



In [16]:
# Search the vector store with a question taken from the dataset and
# print the text of the first chunk that comes back.
query = "In relation to Eunice Fay McKenzie's career, which statement accurately reflects her most notable work?"
docs = db.similarity_search(query)
best_match = docs[0]
print(best_match.page_content)

"In relation to Eunice Fay McKenzie's career, which statement accurately reflects her most notable work?",McKenzie gained recognition for her role as a child actress in a series of iconic silent films.,"McKenzie's successful career in sound films continued into adulthood, becoming known for her versatile acting abilities.",McKenzie's collaborations with director Blake Edwards were instrumental in her rise to fame.,McKenzie is primarily remembered for her starring roles opposite Gene Autry in popular Western films of the 1940s.,"McKenzie showcased her singing talents in numerous musical productions, garnering critical acclaim.",B


In [18]:
# Print the chunk at index 1 produced by the splitter, including its metadata.
print(documents[1])

page_content='"In relation to Eunice Fay McKenzie's career, which statement accurately reflects her most notable work?",McKenzie gained recognition for her role as a child actress in a series of iconic silent films.,"McKenzie's successful career in sound films continued into adulthood, becoming known for her versatile acting abilities.",McKenzie's collaborations with director Blake Edwards were instrumental in her rise to fame.,McKenzie is primarily remembered for her starring roles opposite Gene Autry in popular Western films of the 1940s.,"McKenzie showcased her singing talents in numerous musical productions, garnering critical acclaim.",B' metadata={'source': '/content/exam-science-llm.csv'}


Retrievers - A retriever is like a smart librarian. When you ask a question, it goes to the vector store, finds the most relevant information, and brings it back. It helps fetch the most useful pieces of information based on what you're looking for.

In [19]:
from langchain.chains import RetrievalQA
# langchain.llms is a deprecated shim; the OpenAI completion LLM wrapper
# is provided by langchain_community.
from langchain_community.llms import OpenAI

# Expose the FAISS index through the retriever interface used by the QA chain.
retriever = db.as_retriever()

In [20]:
# Build a retrieval QA chain: chunks returned by the retriever are "stuffed"
# into a single prompt that is sent to the OpenAI LLM.
qa = RetrievalQA.from_chain_type(llm=OpenAI(), chain_type="stuff", retriever=retriever)

query = "What big impact in Eunice Fay McKenzie's career?"
# Chain.run is deprecated; invoke takes the question under the "query" key and
# returns a dict whose "result" entry holds the answer text.
qa.invoke({"query": query})["result"]

  qa = RetrievalQA.from_chain_type(llm=OpenAI(), chain_type="stuff", retriever=retriever)
  qa.run(query)


" McKenzie's collaborations with director Blake Edwards were instrumental in her rise to fame."

In [21]:
# Ask a question that appears verbatim in the dataset.
query = "In which family does Gudeodiscus anceyi belong?"
# Chain.run is deprecated; invoke returns a dict with the answer under "result".
qa.invoke({"query": query})["result"]

' Pupillidae'

In [22]:
# Ask a dataset question with an extra instruction appended to it.
query = "What is the geographical location of IFK Kalix, a Swedish football club? What's the answer?"
# Chain.run is deprecated; invoke returns a dict with the answer under "result".
qa.invoke({"query": query})["result"]

' IFK Kalix is located in Norrbotten County, in the region of Norrland, Sweden.'

In [24]:
# Ask about Ray Montgomerie's football career.
query = "Which of the following statements accurately describes Ray Montgomerie's football career?"
# Chain.run is deprecated; invoke returns a dict with the answer under "result".
qa.invoke({"query": query})["result"]

' Ray Montgomerie is a former footballer who played as a defender for Dumbarton, Kilmarnock, and Partick Thistle.'

In [26]:
# Ask about the 2021 film Dreamcatcher.
query = "What is the genre of the 2021 film Dreamcatcher directed by Jacob Johnson?"
# Chain.run is deprecated; invoke returns a dict with the answer under "result".
qa.invoke({"query": query})["result"]

" I don't know."