In [1]:
import os
from langchain.embeddings import HuggingFaceEmbeddings
from dotenv import load_dotenv
from langchain.vectorstores.pinecone import Pinecone
import pinecone

  from tqdm.autonotebook import tqdm


In [2]:

load_dotenv()

True

In [3]:
from datasets import load_dataset

data = load_dataset("wikipedia", "20220301.simple", split='train[:10000]')
data

Found cached dataset wikipedia (/Users/isaac/.cache/huggingface/datasets/wikipedia/20220301.simple/2.0.0/aa542ed919df55cc5d3347f42dd4521d05ca68751f50dbc32bae2a7f1e167559)


Dataset({
    features: ['id', 'url', 'title', 'text'],
    num_rows: 10000
})

In [4]:
import tiktoken

tokenizer = tiktoken.get_encoding('cl100k_base')

# create the length function
def tiktoken_len(text):
    tokens = tokenizer.encode(
        text,
        disallowed_special=()
    )
    return len(tokens)

tiktoken_len("hello I am a chunk of text and using the tiktoken_len function "
             "we can find the length of this chunk of text in tokens")

26

In [5]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=400,
    chunk_overlap=20,
    length_function=tiktoken_len,
    separators=["\n\n", "\n", " ", ""]
)

In [6]:
data[6]['text']

'Alan Mathison Turing OBE FRS (London, 23 June 1912 – Wilmslow, Cheshire, 7 June 1954) was an English mathematician and computer scientist. He was born in Maida Vale, London.\n\nEarly life and family \nAlan Turing was born in Maida Vale, London on 23 June 1912. His father was part of a family of merchants from Scotland. His mother, Ethel Sara, was the daughter of an engineer.\n\nEducation \nTuring went to St. Michael\'s, a school at 20 Charles Road, St Leonards-on-sea, when he was five years old.\n"This is only a foretaste of what is to come, and only the shadow of what is going to be.” – Alan Turing.\n\nThe Stoney family were once prominent landlords, here in North Tipperary. His mother Ethel Sara Stoney (1881–1976) was daughter of Edward Waller Stoney (Borrisokane, North Tipperary) and Sarah Crawford (Cartron Abbey, Co. Longford); Protestant Anglo-Irish gentry.\n\nEducated in Dublin at Alexandra School and College; on October 1st 1907 she married Julius Mathison Turing, latter son of

In [7]:
chunks = text_splitter.split_text(data[6]['text'])[:3]
chunks

['Alan Mathison Turing OBE FRS (London, 23 June 1912 – Wilmslow, Cheshire, 7 June 1954) was an English mathematician and computer scientist. He was born in Maida Vale, London.\n\nEarly life and family \nAlan Turing was born in Maida Vale, London on 23 June 1912. His father was part of a family of merchants from Scotland. His mother, Ethel Sara, was the daughter of an engineer.\n\nEducation \nTuring went to St. Michael\'s, a school at 20 Charles Road, St Leonards-on-sea, when he was five years old.\n"This is only a foretaste of what is to come, and only the shadow of what is going to be.” – Alan Turing.\n\nThe Stoney family were once prominent landlords, here in North Tipperary. His mother Ethel Sara Stoney (1881–1976) was daughter of Edward Waller Stoney (Borrisokane, North Tipperary) and Sarah Crawford (Cartron Abbey, Co. Longford); Protestant Anglo-Irish gentry.\n\nEducated in Dublin at Alexandra School and College; on October 1st 1907 she married Julius Mathison Turing, latter son o

In [8]:
embeddings = HuggingFaceEmbeddings(model_name = "sentence-transformers/all-MiniLM-L6-v2")

In [9]:
texts = [
    'this is the first chunk of text',
    'then another second chunk of text is here'
]

In [10]:
res = embeddings.embed_documents(texts)

In [11]:
res[1]

[0.03048289753496647,
 0.019477777183055878,
 0.04769350215792656,
 0.020642202347517014,
 0.02876034937798977,
 -0.024882949888706207,
 -0.023797957226634026,
 -0.02318163961172104,
 0.1302988976240158,
 -0.028141969814896584,
 0.0786842554807663,
 0.023741433396935463,
 -0.030606720596551895,
 -0.040750179439783096,
 0.005268531385809183,
 0.0037540814373642206,
 0.02434307150542736,
 -0.03420156612992287,
 -0.02976469323039055,
 -0.08451639115810394,
 0.04233452305197716,
 0.0967378318309784,
 0.030192751437425613,
 0.04283454269170761,
 0.01891050487756729,
 0.10174936056137085,
 -0.09383708238601685,
 0.10698720812797546,
 0.06146544963121414,
 0.01342026423662901,
 -0.0481453575193882,
 0.034789733588695526,
 0.05406195670366287,
 0.029579579830169678,
 0.03954707831144333,
 -0.026034638285636902,
 -0.03798598423600197,
 0.09986936300992966,
 0.022789129987359047,
 -0.019318530336022377,
 0.018158327788114548,
 -0.10023137927055359,
 0.002032781019806862,
 0.04046790674328804,
 0

In [12]:

# Save it into pinecone
API_KEY = os.environ.get("PINECONE_API_KEY")
YOUR_ENV = os.environ.get("PINECONE_ENVIRONMENT", "us-west4-gcp-free")
index_name = "test-langchain"

In [13]:
pinecone.init(
    api_key=API_KEY,
    environment=YOUR_ENV
)

if len(pinecone.list_indexes()) == 0:
    pinecone.create_index(name=index_name, metric="cosine", shards=1, dimension=len(res[0]))

In [14]:
pinecone.describe_index(pinecone.list_indexes()[0])

IndexDescription(name='test-docs', metric='cosine', replicas=1, dimension=384.0, shards=1, pods=1, pod_type='p1', status={'ready': True, 'state': 'Ready'}, metadata_config=None, source_collection='')

In [43]:
index = pinecone.Index(index_name)
index

<pinecone.index.Index at 0x2cff4d390>

In [44]:
index.describe_index_stats()

In [17]:
#vector_db = Pinecone.from_texts(texts=texts, embedding=embeddings, index_name="test-langchain")

In [18]:
from tqdm.auto import tqdm
from uuid import uuid4

In [19]:
batch_limit = 100

texts = []
metadatas = []

In [20]:
for i, record in enumerate(tqdm(data)):
    # first get metadata fields for this record
    metadata = {
        'wiki-id': str(record['id']),
        'source': record['url'],
        'title': record['title']
    }
    # now we create chunks from the record text
    record_texts = text_splitter.split_text(record['text'])
    # create individual metadata dicts for each chunk
    record_metadatas = [{
        "chunk": j, "text": text, **metadata
    } for j, text in enumerate(record_texts)]
    # append these to current batches
    texts.extend(record_texts)
    metadatas.extend(record_metadatas)
    # if we have reached the batch_limit we can add texts
    if len(texts) >= batch_limit:
        ids = [str(uuid4()) for _ in range(len(texts))]
        embeds = embeddings.embed_documents(texts)
        index.upsert(vectors=zip(ids, embeds, metadatas))
        texts = []
        metadatas = []

if len(texts) > 0:
    ids = [str(uuid4()) for _ in range(len(texts))]
    embeds = embeddings.embed_documents(texts)
    index.upsert(vectors=zip(ids, embeds, metadatas))

  0%|          | 23/10000 [00:02<14:49, 11.22it/s]


In [21]:
index.describe_index_stats()

## Using LangChain Pinecone client

In [22]:
from langchain.vectorstores import Pinecone

In [23]:
index_name

'test-langchain'

In [24]:
text_field = "text"

index = pinecone.Index(index_name)

## Querying

In [25]:
vectorstore = Pinecone.from_existing_index(index_name, embeddings, text_field)

In [26]:
query = "I have an issue about writing legislation for drink driving in country Mexico. What should I do"

vectorstore.similarity_search(query, k=3)

## Generative QA

In [27]:
docs = vectorstore.similarity_search(query, k=3)[0]
body = docs.page_content
source = docs.metadata['source']

In [28]:
question = f"Using only the following documents {docs} answer" + query 
question

In [29]:
from langchain import HuggingFaceHub
llm = HuggingFaceHub(repo_id='bigscience/bloom-1b7')

In [30]:
from langchain.chains import RetrievalQA

In [31]:
retriever = vectorstore.as_retriever()

In [32]:
qa = RetrievalQA.from_chain_type(llm=llm, retriever=retriever, return_source_documents=True)

In [33]:
result = qa(inputs=query)

In [34]:
result