In [2]:
import os
from langchain.embeddings import HuggingFaceEmbeddings
from dotenv import load_dotenv
from langchain.vectorstores.pinecone import Pinecone
import pinecone

In [3]:

load_dotenv()

True

In [40]:
from datasets import load_dataset

data = load_dataset("wikipedia", "20220301.simple", split='train[:10000]')
data

Found cached dataset wikipedia (/Users/isaac/.cache/huggingface/datasets/wikipedia/20220301.simple/2.0.0/aa542ed919df55cc5d3347f42dd4521d05ca68751f50dbc32bae2a7f1e167559)


Dataset({
    features: ['id', 'url', 'title', 'text'],
    num_rows: 10000
})

In [56]:
import tiktoken

tokenizer = tiktoken.get_encoding('cl100k_base')

# create the length function
def tiktoken_len(text):
    tokens = tokenizer.encode(
        text,
        disallowed_special=()
    )
    return len(tokens)

tiktoken_len("hello I am a chunk of text and using the tiktoken_len function "
             "we can find the length of this chunk of text in tokens")

26

In [57]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=400,
    chunk_overlap=20,
    length_function=tiktoken_len,
    separators=["\n\n", "\n", " ", ""]
)

In [58]:
chunks = text_splitter.split_text(data[6]['text'])[:3]
chunks

['Alan Mathison Turing OBE FRS (London, 23 June 1912 – Wilmslow, Cheshire, 7 June 1954) was an English mathematician and computer scientist. He was born in Maida Vale, London.\n\nEarly life and family \nAlan Turing was born in Maida Vale, London on 23 June 1912. His father was part of a family of merchants from Scotland. His mother, Ethel Sara, was the daughter of an engineer.\n\nEducation \nTuring went to St. Michael\'s, a school at 20 Charles Road, St Leonards-on-sea, when he was five years old.\n"This is only a foretaste of what is to come, and only the shadow of what is going to be.” – Alan Turing.\n\nThe Stoney family were once prominent landlords, here in North Tipperary. His mother Ethel Sara Stoney (1881–1976) was daughter of Edward Waller Stoney (Borrisokane, North Tipperary) and Sarah Crawford (Cartron Abbey, Co. Longford); Protestant Anglo-Irish gentry.\n\nEducated in Dublin at Alexandra School and College; on October 1st 1907 she married Julius Mathison Turing, latter son o

In [14]:
embeddings = HuggingFaceEmbeddings(model_name = "sentence-transformers/all-MiniLM-L6-v2")

In [42]:
texts = [
    'this is the first chunk of text',
    'then another second chunk of text is here'
]

In [43]:
res = embeddings.embed_documents(texts)

In [44]:
res[1]

[0.03048289753496647,
 0.019477777183055878,
 0.04769350215792656,
 0.020642202347517014,
 0.02876034937798977,
 -0.024882949888706207,
 -0.023797957226634026,
 -0.02318163961172104,
 0.1302988976240158,
 -0.028141969814896584,
 0.0786842554807663,
 0.023741433396935463,
 -0.030606720596551895,
 -0.040750179439783096,
 0.005268531385809183,
 0.0037540814373642206,
 0.02434307150542736,
 -0.03420156612992287,
 -0.02976469323039055,
 -0.08451639115810394,
 0.04233452305197716,
 0.0967378318309784,
 0.030192751437425613,
 0.04283454269170761,
 0.01891050487756729,
 0.10174936056137085,
 -0.09383708238601685,
 0.10698720812797546,
 0.06146544963121414,
 0.01342026423662901,
 -0.0481453575193882,
 0.034789733588695526,
 0.05406195670366287,
 0.029579579830169678,
 0.03954707831144333,
 -0.026034638285636902,
 -0.03798598423600197,
 0.09986936300992966,
 0.022789129987359047,
 -0.019318530336022377,
 0.018158327788114548,
 -0.10023137927055359,
 0.002032781019806862,
 0.04046790674328804,
 0

In [4]:

# Save it into pinecone
API_KEY = os.environ.get("PINECONE_API_KEY")
YOUR_ENV = os.environ.get("PINECONE_ENVIRONMENT", "us-west4-gcp-free")
index_name = "test-langchain"

In [5]:
pinecone.init(
    api_key=API_KEY,
    environment=YOUR_ENV
)

if len(pinecone.list_indexes()) == 0:
    pinecone.create_index(name=index_name, metric="cosine", shards=1, dimension=len(res[0]))

In [6]:
pinecone.describe_index(pinecone.list_indexes()[0])

IndexDescription(name='test-langchain', metric='cosine', replicas=1, dimension=384.0, shards=1, pods=1, pod_type='p1', status={'ready': True, 'state': 'Ready'}, metadata_config=None, source_collection='')

In [7]:
index = pinecone.Index(index_name)
index

<pinecone.index.Index at 0x113061030>

In [8]:
index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.1,
 'namespaces': {'': {'vector_count': 28430}},
 'total_vector_count': 28430}

In [None]:
#vector_db = Pinecone.from_texts(texts=texts, embedding=embeddings, index_name="test-langchain")

In [53]:
from tqdm.auto import tqdm
from uuid import uuid4

In [54]:
batch_limit = 100

texts = []
metadatas = []

In [59]:
for i, record in enumerate(tqdm(data)):
    # first get metadata fields for this record
    metadata = {
        'wiki-id': str(record['id']),
        'source': record['url'],
        'title': record['title']
    }
    # now we create chunks from the record text
    record_texts = text_splitter.split_text(record['text'])
    # create individual metadata dicts for each chunk
    record_metadatas = [{
        "chunk": j, "text": text, **metadata
    } for j, text in enumerate(record_texts)]
    # append these to current batches
    texts.extend(record_texts)
    metadatas.extend(record_metadatas)
    # if we have reached the batch_limit we can add texts
    if len(texts) >= batch_limit:
        ids = [str(uuid4()) for _ in range(len(texts))]
        embeds = embeddings.embed_documents(texts)
        index.upsert(vectors=zip(ids, embeds, metadatas))
        texts = []
        metadatas = []

if len(texts) > 0:
    ids = [str(uuid4()) for _ in range(len(texts))]
    embeds = embeddings.embed_documents(texts)
    index.upsert(vectors=zip(ids, embeds, metadatas))

100%|██████████| 10000/10000 [12:17<00:00, 13.55it/s]


In [60]:
index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.1,
 'namespaces': {'': {'vector_count': 28422}},
 'total_vector_count': 28422}

## Using LangChain Pinecone client

In [15]:
from langchain.vectorstores import Pinecone

In [78]:
index_name

'test-langchain'

In [16]:
text_field = "text"

index = pinecone.Index(index_name)

## Querying

In [17]:
vectorstore = Pinecone.from_existing_index(index_name, embeddings, text_field)

In [43]:
query = "I have an issue about writing legislation for drink driving in country Mexico. What should I do"

vectorstore.similarity_search(query, k=3)

[Document(page_content="Drunk driving (Drink driving in the UK and Australia) is the act of driving a motor vehicle (car, truck, etc.) while under the effects of alcohol. Drunk driving is illegal in most areas of the world.  In some places, driving a motorless vehicle such as a bicycle while drunk is also illegal.\n\nMost areas that make laws (jurisdictions) started with DWI (driving while intoxicated) laws, banning just alcohol. Later, most changed them to DUI (driving under the influence) laws, adding other drugs to those banned while driving. The most common blood alcohol content (BAC) limit in the United States is 0.08% for the legal meaning of drunk. Only three states still use the more lax, original standard of 0.10%.  Many jurisdictions add extra penalties (more jail time and/or a longer DUI program) in cases where the driver's BAC is over 0.20%.\n\nUnited States laws \n\nThe first place in the United States to adopt laws against drunk driving was the state of New York in 1910, 

## Generative QA

In [55]:
docs = vectorstore.similarity_search(query, k=3)[0]
body = docs.page_content
source = docs.metadata['source']

In [57]:
question = f"Using only the following documents {docs} answer" + query 
question

'Using only the following documents page_content="Drunk driving (Drink driving in the UK and Australia) is the act of driving a motor vehicle (car, truck, etc.) while under the effects of alcohol. Drunk driving is illegal in most areas of the world.  In some places, driving a motorless vehicle such as a bicycle while drunk is also illegal.\\n\\nMost areas that make laws (jurisdictions) started with DWI (driving while intoxicated) laws, banning just alcohol. Later, most changed them to DUI (driving under the influence) laws, adding other drugs to those banned while driving. The most common blood alcohol content (BAC) limit in the United States is 0.08% for the legal meaning of drunk. Only three states still use the more lax, original standard of 0.10%.  Many jurisdictions add extra penalties (more jail time and/or a longer DUI program) in cases where the driver\'s BAC is over 0.20%.\\n\\nUnited States laws \\n\\nThe first place in the United States to adopt laws against drunk driving wa

In [58]:
from langchain import HuggingFaceHub
llm = HuggingFaceHub(repo_id='bigscience/bloom-1b7')

In [37]:
from langchain.chains import RetrievalQA

In [38]:
retriever = vectorstore.as_retriever()

In [39]:
qa = RetrievalQA.from_chain_type(llm=llm, retriever=retriever, return_source_documents=True)

In [59]:
result = qa(inputs=query)

ValueError: Error raised by inference API: Input validation error: `inputs` must have less than 1000 tokens. Given: 1054

In [25]:
result

{'query': 'Who was Benito Mussolini?',
 'result': '<extra_id_0>. - Please remember that the facts of fascism are very',
 'source_documents': [Document(page_content='Benito Amilcare Andrea Mussolini KSMOM GCTE (29 July 1883 – 28 April 1945) was an Italian politician and journalist. He was also the Prime Minister of Italy from 1922 until 1943. He was the leader of the National Fascist Party.\n\nBiography\n\nEarly life\nBenito Mussolini was named after Benito Juarez, a Mexican opponent of the political power of the Roman Catholic Church, by his anticlerical (a person who opposes the political interference of the Roman Catholic Church in secular affairs) father. Mussolini\'s father was a blacksmith. Before being involved in politics, Mussolini was a newspaper editor (where he learned all his propaganda skills) and elementary school teacher.\n\nAt first, Mussolini was a socialist, but when he wanted Italy to join the First World War, he was thrown out of the socialist party. He \'invented\'