In [20]:
from datasets import load_dataset

# Load only the first 10,000 training rows of the "20220301.simple" Wikipedia
# configuration so the demo stays small.
data = load_dataset(
    "wikipedia",
    "20220301.simple",
    split="train[:10000]",
)
data

Found cached dataset wikipedia (/home/codespace/.cache/huggingface/datasets/wikipedia/20220301.simple/2.0.0/aa542ed919df55cc5d3347f42dd4521d05ca68751f50dbc32bae2a7f1e167559)


Dataset({
    features: ['id', 'url', 'title', 'text'],
    num_rows: 10000
})

In [21]:
# import pandas as pd
# from datasets import load_dataset

# # Load the dataset
# data = load_dataset("wikipedia", "20220301.simple", split='train[:10000]')

# # Convert the dataset to a Pandas DataFrame
# df = pd.DataFrame(data)

# df = df.drop(columns = ["text"])
# # Dump the first 10 records
# print(df.head(10))

In [22]:
# Inspect a single record (index 6) to see the raw fields of one article.
data[6]

{'id': '13',
 'url': 'https://simple.wikipedia.org/wiki/Alan%20Turing',
 'title': 'Alan Turing',
 'text': 'Alan Mathison Turing OBE FRS (London, 23 June 1912 – Wilmslow, Cheshire, 7 June 1954) was an English mathematician and computer scientist. He was born in Maida Vale, London.\n\nEarly life and family \nAlan Turing was born in Maida Vale, London on 23 June 1912. His father was part of a family of merchants from Scotland. His mother, Ethel Sara, was the daughter of an engineer.\n\nEducation \nTuring went to St. Michael\'s, a school at 20 Charles Road, St Leonards-on-sea, when he was five years old.\n"This is only a foretaste of what is to come, and only the shadow of what is going to be.” – Alan Turing.\n\nThe Stoney family were once prominent landlords, here in North Tipperary. His mother Ethel Sara Stoney (1881–1976) was daughter of Edward Waller Stoney (Borrisokane, North Tipperary) and Sarah Crawford (Cartron Abbey, Co. Longford); Protestant Anglo-Irish gentry.\n\nEducated in Dub

Create the tokenizer, which lets us measure how many tokens a piece of text contains.

In [23]:
import tiktoken  # !pip install tiktoken

# Shared encoder used by the length function below.
tokenizer = tiktoken.get_encoding('p50k_base')


def tiktoken_len(text: str) -> int:
    """Return how many tokens ``text`` occupies under the shared ``tokenizer``.

    ``disallowed_special=()`` turns off tiktoken's check for special-token
    strings, so such strings in the input are encoded as ordinary text
    instead of raising an error.
    """
    return len(tokenizer.encode(text, disallowed_special=()))


# Quick sanity check on a short sentence.
tiktoken_len(
    "hello I am a chunk of text and using the tiktoken_len function "
    "we can find the length of this chunk of text in tokens"
)

28

LangChain provides ready-made text splitters that can break long text into chunks for us.

In [24]:
from langchain.text_splitter import RecursiveCharacterTextSplitter


# Chunk sizes are measured in tokens (not characters) because the splitter is
# given tiktoken_len as its length function.
CHUNK_SIZE_TOKENS = 400
CHUNK_OVERLAP_TOKENS = 20

# Separators are tried in order: paragraph break, line break, space, and
# finally the empty string as a last resort.
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " ", ""],
    length_function=tiktoken_len,
    chunk_size=CHUNK_SIZE_TOKENS,
    chunk_overlap=CHUNK_OVERLAP_TOKENS,
)


In [25]:
from pprint import pprint
# Split the sample article and preview the first two chunks.
chunks = text_splitter.split_text(data[6]['text'])
pprint(chunks[:2])

# Print the token count of every chunk, measured with tiktoken_len.
for n_tokens in map(tiktoken_len, chunks):
    print(n_tokens)

['Alan Mathison Turing OBE FRS (London, 23 June 1912 – Wilmslow, Cheshire, 7 '
 'June 1954) was an English mathematician and computer scientist. He was born '
 'in Maida Vale, London.\n'
 '\n'
 'Early life and family \n'
 'Alan Turing was born in Maida Vale, London on 23 June 1912. His father was '
 'part of a family of merchants from Scotland. His mother, Ethel Sara, was the '
 'daughter of an engineer.\n'
 '\n'
 'Education \n'
 "Turing went to St. Michael's, a school at 20 Charles Road, St "
 'Leonards-on-sea, when he was five years old.\n'
 '"This is only a foretaste of what is to come, and only the shadow of what is '
 'going to be.” – Alan Turing.\n'
 '\n'
 'The Stoney family were once prominent landlords, here in North Tipperary. '
 'His mother Ethel Sara Stoney (1881–1976) was daughter of Edward Waller '
 'Stoney (Borrisokane, North Tipperary) and Sarah Crawford (Cartron Abbey, Co. '
 'Longford); Protestant Anglo-Irish gentry.\n'
 '\n'
 'Educated in Dublin at Alexandra School an

Now we need to create embeddings. Remember that the OpenAI client must be configured for Azure.

In [26]:
from langchain.embeddings.openai import OpenAIEmbeddings
import os
from dotenv import load_dotenv, find_dotenv
import openai
# Load the local .env file FIRST so that every os.getenv() call below can see
# its values. Previously the OpenAI settings were read before the .env file
# was loaded, which left them as None on a fresh kernel unless the variables
# were already exported in the environment.
_ = load_dotenv(find_dotenv())

api_key = os.getenv("OPENAI_API_KEY")

# Configure the module-level openai settings for the Azure endpoint.
openai.api_key = api_key
openai.api_base = os.getenv("OPENAI_API_BASE")
openai.api_type = 'azure'
openai.api_version = '2023-05-15'

model_name = 'text-embedding-ada-002'

# Even though the API is Azure, configuring the openai module as above is
# enough; the standard embeddings class can then be used unchanged.
embed = OpenAIEmbeddings(
    openai_api_key=api_key
)



In [27]:
# Smoke-test the embedding client with a single short document.
texts = ['this is the first chunk of text']

res = embed.embed_documents(texts)

# (number of embedded documents, length of the first embedding vector)
len(res), len(res[0])

(1, 1536)

Now integrate with Pinecone.

In [28]:
import pinecone

# Read the local .env file so the Pinecone settings are available.
_ = load_dotenv(find_dotenv())

index_name = 'langchain-retrieval-augmentation'

# The API key is found in the console at app.pinecone.io; the environment
# name is shown next to it.
pinecone.init(
    api_key=os.getenv("PINECONE_KEY"),
    environment=os.getenv("PINECONE_ENV"),
)

# Show which environment was configured. The API key is a secret and must
# never be printed into the notebook output.
print(os.getenv("PINECONE_ENV"))


us-west1-gcp-free


In [29]:

# Create the index only if it does not exist yet, so this cell can be re-run
# safely; creating an index whose name is already taken fails.
if index_name not in pinecone.list_indexes():
    pinecone.create_index(
        name=index_name,
        metric='dotproduct',
        dimension=len(res[0])  # 1536 dim of text-embedding-ada-002
    )


us-west1-gcp-free


In [30]:
import pinecone
# Open a gRPC-based handle to the index named by index_name.
index = pinecone.GRPCIndex(index_name)

# Display the index statistics (dimension, fullness, namespaces, vector count).
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

In [None]:
from tqdm.auto import tqdm
from uuid import uuid4

# Start from empty accumulators. `metadatas` was never defined (NameError on a
# fresh kernel), and `texts` would otherwise still contain the demo sentence
# from the embedding smoke test earlier in the notebook.
texts = []
metadatas = []

for record in tqdm(data):
    # Metadata fields shared by every chunk of this record.
    metadata = {
        'wiki-id': str(record['id']),
        'source': record['url'],
        'title': record['title']
    }
    # Split the article text into token-bounded chunks.
    record_texts = text_splitter.split_text(record['text'])
    # One metadata dict per chunk: chunk position, chunk text, record fields.
    record_metadatas = [{
        "chunk": j, "text": text, **metadata
    } for j, text in enumerate(record_texts)]
    # Accumulate chunks and their metadata across all records.
    texts.extend(record_texts)
    metadatas.extend(record_metadatas)
    # TODO: embed and upsert the chunks. Original note: batching is not
    # possible with the Azure OpenAI API, so calls must be made one by one.
    # Disabled, untested sketch (verify before enabling):
    # for text, chunk_metadata in zip(record_texts, record_metadatas):
    #     chunk_id = str(uuid4())
    #     embeds = embed.embed_documents([text])
    #     index.upsert(vectors=zip([chunk_id], embeds, [chunk_metadata]))

