In [17]:
# One-time environment setup: uncomment and run in a fresh environment.
# In a notebook, prefer `%pip` over `!pip` so the install targets the
# environment of the running kernel.
# !pip install --upgrade huggingface_hub langchain InstructorEmbedding sentence_transformers pandas "langchain[docarray]"

In [11]:
import os

import pandas as pd
from langchain.chains import RetrievalQA
from langchain.document_loaders import CSVLoader
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.indexes import VectorstoreIndexCreator
from langchain.llms import HuggingFaceHub
from langchain.vectorstores import DocArrayInMemorySearch

In [12]:
# HUB API KEY
# Read the Hugging Face Hub token from the HUGGINGFACEHUB_API_TOKEN
# environment variable so the real token never has to be pasted into (and
# saved with) the notebook. The '?' placeholder is kept as the fallback
# when the variable is not set.
api_key = os.environ.get('HUGGINGFACEHUB_API_TOKEN', '?')

In [13]:
# File references
src_file = 'dataset/winemag-data-130k-v2.csv'  # source CSV
dest_file = 'dataset/wines_30.csv'             # reduced sample written below

# Build the sample: the first 30 rows, without the leading column and
# without the columns listed in `drop`.
# nrows=30 stops parsing after the rows that are kept, instead of reading
# the entire source file and slicing it afterwards.
df = pd.read_csv(src_file, nrows=30).iloc[:, 1:].drop(columns=[
    'designation',
    'points',
    'price',
    'province',
    'region_1',
    'region_2',
    'taster_name',
    'taster_twitter_handle'])

# index=False keeps the row index out of the saved file; otherwise it is
# written as an extra unnamed first column that reappears whenever the CSV
# is loaded again.
df.to_csv(dest_file, index=False)
df.head()

Unnamed: 0,country,description,title,variety,winery
0,Italy,"Aromas include tropical fruit, broom, brimston...",Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia
1,Portugal,"This is ripe and fruity, a wine that is smooth...",Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos
2,US,"Tart and snappy, the flavors of lime flesh and...",Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm
3,US,"Pineapple rind, lemon pith and orange blossom ...",St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian
4,US,"Much like the regular bottling from 2012, this...",Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks


## Using RetrievalQA chain and HuggingFaceHub model

In [14]:
# Hub repository ids: the model that answers questions and the model that
# embeds the documents.
instruction_model = 'google/flan-t5-xxl'
embedding_model = 'hkunlp/instructor-xl'

# initialize LLM
llm = HuggingFaceHub(
    huggingfacehub_api_token=api_key,
    repo_id=instruction_model,
    model_kwargs={'temperature': 0.1, 'max_new_tokens': 1000}
)

# Initialize documents
dest_file = 'dataset/wines_30.csv'
loader = CSVLoader(file_path=dest_file)
docs = loader.load()

# initialize embeddings and build the vector index from the loader
embedding = HuggingFaceInstructEmbeddings(model_name=embedding_model)
index = VectorstoreIndexCreator(
    embedding=embedding,
    vectorstore_cls=DocArrayInMemorySearch
).from_loaders([loader])

# initialize db
# Reuse the DocArrayInMemorySearch store the index creator has just filled
# rather than embedding every document a second time with the same model.
# NOTE(review): the index creator applies its default text splitter before
# embedding; the rows here look short enough not to be split, so the store
# should hold the same entries as `docs` -- confirm if longer rows are used.
db = index.vectorstore

# initialize retriever (no arguments, so the store's default search
# settings apply)
retriever = db.as_retriever()

# initialize chain
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",  # alternatives: map_reduce, refine, map_rerank
    retriever=retriever,
    verbose=True
)

load INSTRUCTOR_Transformer
max_seq_length  512


In [16]:
# Question for the retrieval chain. The prompt is assembled from adjacent
# string literals, one instruction per line.
# NOTE(review): the chain answers from the documents handed over by the
# retriever, so "all" is presumably limited to what the retriever returns
# -- confirm against the retriever settings.
query = (
    "List all items from Italy, "
    "reply only title, "
    "separate entries using comma."
)

# Run the chain and show the plain-text answer.
response = qa_chain.run(query)
print(response)



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m
Masseria Setteporte 2012 Rosso (Etna), Canicatt 2009 Aynat Nero d'Avola (Sicilia), Stemmari 2013 Dalila White (Terre Siciliane), Terre di Giurfo 2013 Belsito Frappato (Vittoria)
