In [None]:
# !pip install --upgrade huggingface_hub langchain InstructorEmbedding sentence_transformers

In [None]:
import os

import pandas as pd
from langchain.chains import RetrievalQA
from langchain.document_loaders import CSVLoader
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.indexes import VectorstoreIndexCreator
from langchain.llms import HuggingFaceHub
from langchain.vectorstores import DocArrayInMemorySearch

In [None]:
# HUB API KEY
# Read the token from the environment so the secret is never saved in the
# notebook source. When the variable is unset this falls back to an empty
# string, which is the placeholder value this cell used before.
api_key = os.environ.get('HUGGINGFACEHUB_API_TOKEN', '')

In [None]:
# File references
src_file = 'dataset/winemag-data-130k-v2.csv'
dest_file = 'dataset/wines_30.csv'

# Build the 30-row sample.
# nrows=30 stops parsing after the rows that are actually kept instead of
# loading the whole source file; .iloc[:, 1:] discards the first column of
# the source (presumably a saved row index — confirm against the dataset).
df = (
    pd.read_csv(src_file, nrows=30)
    .iloc[:, 1:]
    .drop(columns=[
        'designation',
        'points',
        'price',
        'province',
        'region_1',
        'region_2',
        'taster_name',
        'taster_twitter_handle',
    ])
)

# index=False keeps the DataFrame's row index out of the sample file, so it
# is not written as an extra unnamed column alongside the wine fields.
df.to_csv(dest_file, index=False)
df.head()

## Using RetrievalQA chain and HuggingFaceHub model

In [None]:
# Hugging Face Hub repositories used by this section:
# one for the embedding model, one for the text-generation model.
embedding_model = 'hkunlp/instructor-xl'
instruction_model = 'google/flan-t5-xxl'

# LLM served through the Hugging Face Hub, authenticated with api_key
llm = HuggingFaceHub(
    repo_id=instruction_model,
    huggingfacehub_api_token=api_key,
    model_kwargs=dict(temperature=0.1, max_new_tokens=1000),
)

# Load the sample CSV through CSVLoader
dest_file = 'dataset/wines_30.csv'
loader = CSVLoader(file_path=dest_file)
docs = loader.load()

# Embedding model shared by the index and the in-memory store below
embedding = HuggingFaceInstructEmbeddings(model_name=embedding_model)

# Index created directly from the loader.
# NOTE(review): `index` is not referenced by any of the cells visible here,
# and building it looks like it repeats the embedding work done for `db` —
# confirm it is still needed.
index = VectorstoreIndexCreator(
    vectorstore_cls=DocArrayInMemorySearch,
    embedding=embedding,
).from_loaders([loader])

# In-memory vector store built from the loaded documents
db = DocArrayInMemorySearch.from_documents(documents=docs, embedding=embedding)

# Expose the in-memory store as a retriever
retriever = db.as_retriever()

# Build the question-answering chain on top of the LLM and the retriever.
# Other chain_type values noted for this call: map_reduce, refine, map_rerank
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type="stuff",
    verbose=True,
)

In [None]:
# Ask the chain a question and show its answer
query = (
    "List all items from Italy, reply only title and variety, "
    "separate entries using comma."
)
response = qa_chain.run(query)
print(response)