In [1]:
!pip install langchain langchain_community sentence-transformers faiss-cpu transformers torch accelerate


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
import pandas as pd
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.vectorstores import FAISS
from transformers import pipeline
from langchain.llms import HuggingFacePipeline
from langchain.chains import RetrievalQA

In [3]:
class Config:
    SEED: int = 42
    SAMPLE: int = 10000
    BASE_DATA_PATH: str = '../data/'
    BASE_OUTPUT_PATH: str = '../output/'

In [4]:
file_path = f'{Config.BASE_DATA_PATH}/arXiv_scientific dataset.csv'

df = pd.read_csv(file_path)

In [5]:
df.columns

Index(['id', 'title', 'category', 'category_code', 'published_date',
       'updated_date', 'authors', 'first_author', 'summary',
       'summary_word_count'],
      dtype='object')

In [6]:
df = df.sample(Config.SAMPLE, random_state=Config.SEED)

In [7]:
len(df)

10000

In [34]:
df.columns

Index(['id', 'title', 'category', 'category_code', 'published_date',
       'updated_date', 'authors', 'first_author', 'summary',
       'summary_word_count'],
      dtype='object')

In [44]:
docs = [
    f"""title: {title}
author: {authors}
publish_date: {published_date}
summary: {summary}"""
    for title, authors, published_date, summary in zip(df["title"], df["authors"], df["published_date"], df["summary"])
]

splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=50)
texts = []
for d in docs:
    texts.extend(splitter.split_text(d))

In [45]:
print(f'{len(docs)=}')
print(f'{len(texts)=}')

len(docs)=10000
len(texts)=10000


In [46]:
import pprint as pp

pp.pprint(docs[0])

('title: Machine Learning and the Future of Realism\n'
 "author: ['Giles Hooker', 'Cliff Hooker']\n"
 'publish_date: 4/15/17\n'
 'summary: The preceding three decades have seen the emergence, rise, and '
 'proliferation\n'
 'of machine learning (ML). From half-recognised beginnings in perceptrons,\n'
 'neural nets, and decision trees, algorithms that extract correlations (that '
 'is,\n'
 'patterns) from a set of data points have broken free from their origin in\n'
 'computational cognition to embrace all forms of problem solving, from voice\n'
 'recognition to medical diagnosis to automated scientific research and\n'
 'driverless cars, and it is now widely opined that the real industrial\n'
 'revolution lies less in mobile phone and similar than in the maturation and\n'
 'universal application of ML. Among the consequences just might be the '
 'triumph\n'
 'of anti-realism over realism.')


In [47]:
embed = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
vectorstore = FAISS.from_texts(docs, embed)
vectorstore.save_local(f"{Config.BASE_OUTPUT_PATH}/faiss_index")

In [24]:
hf_pipe = pipeline(
    "text2text-generation",
    model="google/flan-t5-small",
    device_map="cpu"  # CPU by default; or set device_map={"": 0} for GPU
)
llm = HuggingFacePipeline(pipeline=hf_pipe)

Device set to use cpu


In [48]:
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vectorstore.as_retriever(search_kwargs={"k": 5}),
    return_source_documents=True,  # <-- return docs in output
    verbose=True,                  # <-- print chain steps (optional)
)

In [49]:
res = qa({"query": "Who are author of Machine Learning and the Future of Realism?"})
print("🏷️ Answer:\n", res["result"], "\n")
print("📄 Retrieved documents:")
for i, doc in enumerate(res["source_documents"], 1):
    # doc.page_content is the text, doc.metadata might have your CSV row info
    print(f"\n----- Doc #{i} -----")
    print(doc.metadata)
    print(doc.page_content)



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m
🏷️ Answer:
 Giles Hooker 

📄 Retrieved documents:

----- Doc #1 -----
{}
title: Machine Learning and the Future of Realism
author: ['Giles Hooker', 'Cliff Hooker']
publish_date: 4/15/17
summary: The preceding three decades have seen the emergence, rise, and proliferation
of machine learning (ML). From half-recognised beginnings in perceptrons,
neural nets, and decision trees, algorithms that extract correlations (that is,
patterns) from a set of data points have broken free from their origin in
computational cognition to embrace all forms of problem solving, from voice
recognition to medical diagnosis to automated scientific research and
driverless cars, and it is now widely opined that the real industrial
revolution lies less in mobile phone and similar than in the maturation and
universal application of ML. Among the consequences just might be the triumph
of anti-realism over realism.

----- Doc #2 -----
{}
titl

In [50]:
res['result']

'Giles Hooker'