# Simple Retrieval Augmented Generation

### Import the Needed Packages

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
from datasets import load_dataset
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec
from tqdm.auto import tqdm
from _Utils import Utils

import ast
import os
import pandas as pd

In [3]:
# Create the shared `Utils` helper (imported from the local `_Utils` module) and
# read the Pinecone API key through it, so the key is never hardcoded here.
# NOTE(review): where `Utils` actually loads the key from is not visible in this
# notebook — presumably an environment/.env lookup; confirm in `_Utils`.
utils = Utils()
PINECONE_API_KEY = utils.get_pinecone_api_key()

### Setup Pinecone

In [4]:
# Client handle for all Pinecone control-plane and data-plane calls below.
pinecone = Pinecone(api_key=PINECONE_API_KEY)

# Reuse the `utils` helper created in the API-key cell instead of building a
# second instance; it only supplies the index name here.
INDEX_NAME = utils.create_dlai_index_name('dl-ai')

# Start from a clean slate: drop any index left over from a previous run so
# that re-running the notebook does not accumulate stale vectors.
if INDEX_NAME in [index.name for index in pinecone.list_indexes()]:
    pinecone.delete_index(INDEX_NAME)

# dimension must equal the length of the stored vectors and of the query
# embeddings (1536 is the output size of text-embedding-ada-002, the model
# used for queries later in this notebook).
pinecone.create_index(
    name=INDEX_NAME,
    dimension=1536,
    metric='cosine',
    spec=ServerlessSpec(cloud='aws', region='us-west-2'),
)

# Data-plane handle used for upserts and queries in the following cells.
index = pinecone.Index(INDEX_NAME)

### Load the Dataset

<p style="background-color:#fff1d7; padding:15px; "> <b>(Note: <code>max_articles_num = 500</code>):</b> To give the large language model (LLM) a more comprehensive context, a larger number of articles is generally more beneficial. Once you've done an initial run, consider increasing this value to 750 or 1,000. You'll likely notice that the context provided to the LLM becomes richer and better.</p>

In [5]:
# Upper bound on how many rows are read from the CSV; raise it for richer context.
max_articles_num = 500

# Only the first `max_articles_num` rows are parsed; the rest of the file is skipped.
df = pd.read_csv(filepath_or_buffer='./wiki.csv', nrows=max_articles_num)

# Preview the first rows (id, metadata, values) as the cell output.
df.head()


Unnamed: 0,id,metadata,values
1,1-0,"{'chunk': 0, 'source': 'https://simple.wikiped...","[-0.011254455894231796, -0.01698738895356655, ..."
2,1-1,"{'chunk': 1, 'source': 'https://simple.wikiped...","[-0.0015197008615359664, -0.007858820259571075..."
3,1-2,"{'chunk': 2, 'source': 'https://simple.wikiped...","[-0.009930099360644817, -0.012211072258651257,..."
4,1-3,"{'chunk': 3, 'source': 'https://simple.wikiped...","[-0.011600767262279987, -0.012608098797500134,..."
5,1-4,"{'chunk': 4, 'source': 'https://simple.wikiped...","[-0.026462381705641747, -0.016362832859158516,..."


### Prepare the Embeddings and Upsert to Pinecone

In [6]:
# Number of vectors sent to Pinecone per upsert request.
BATCH_SIZE = 250

prepped = []

for i, row in tqdm(df.iterrows(), total=df.shape[0]):
    # The CSV stores the metadata dict and the embedding list as Python-literal
    # strings; literal_eval turns them back into real objects without
    # executing arbitrary code.
    meta = ast.literal_eval(row['metadata'])
    prepped.append({'id': row['id'],
                    'values': ast.literal_eval(row['values']),
                    'metadata': meta})
    # Send a full batch and start collecting the next one.
    if len(prepped) >= BATCH_SIZE:
        index.upsert(prepped)
        prepped = []

# Flush the final partial batch. Without this, any rows left over when the row
# count is not a multiple of BATCH_SIZE would never reach the index.
if prepped:
    index.upsert(prepped)
    prepped = []


  0%|          | 0/500 [00:00<?, ?it/s]

In [7]:
# Sanity check: the reported total_vector_count should equal the number of rows
# loaded from the CSV and upserted above.
# NOTE(review): stats may lag briefly right after an upsert — re-run the cell
# if the count looks low.
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 500}},
 'total_vector_count': 500}

### Connect to OpenAI

In [8]:
# Read the OpenAI API key through the shared `utils` helper and build the
# client that the embedding and completion calls in later cells go through.
OPENAI_API_KEY = utils.get_openai_api_key()
openai_client = OpenAI(api_key=OPENAI_API_KEY)

def get_embeddings(articles, model="text-embedding-ada-002"):
    """Request embeddings for `articles` from the OpenAI embeddings endpoint.

    Args:
        articles: The input passed straight through to the API as `input`
            (the notebook passes a list of strings).
        model: Name of the embedding model to use.

    Returns:
        The raw response object returned by `openai_client.embeddings.create`;
        callers read individual vectors via `response.data[i].embedding`.
    """
    response = openai_client.embeddings.create(model=model, input=articles)
    return response

### Run Your Query

In [10]:
query = "Describe the edible fruit produced by several types of trees."

# Embed the question with the same model the helper defaults to, then ask the
# index for the three nearest chunks, including their stored metadata.
embed = get_embeddings([query])
res = index.query(
    vector=embed.data[0].embedding,
    top_k=3,
    include_metadata=True,
)

# The chunk text lives under metadata['text'] of every match.
text = [hit['metadata']['text'] for hit in res['matches']]
print(*text, sep='\n')


Apple is the edible fruit of a number of trees, known for this juicy, green or red fruits. The tree (Malus spp.) is grown worldwide. Its fruit is low-cost and popular, and is harvested all over the world. 

Applewood is a type of wood that comes from this tree.

The apple tree comes from southern Kazakhstan, Kyrgyzstan, Uzbekistan, and northwestern part of China. Apples have been grown for thousands of years in Asia and Europe. They were brought to North America by European settlers. Apples have religious and mythological significance in many cultures.

Apples are generally propagated by grafting, although wild apples grow readily from seed. Apple trees are large if grown from seed, but small if grafted onto roots (rootstock). There are more than 10000 known cultivars of apples, with a range of desired characteristics. Different cultivars are bred for various tastes and uses: cooking, eating raw and cider production are the most common uses. 

Trees and fruit are attacked by fungi, bac

### Build the Prompt

In [14]:
query = "write an article titled: Apple, the edible fruit of various trees?"

# Retrieve the three chunks closest to the request; they become the context.
embed = get_embeddings([query])
res = index.query(
    vector=embed.data[0].embedding,
    top_k=3,
    include_metadata=True,
)
contexts = [hit['metadata']['text'] for hit in res['matches']]

# Fixed header that precedes the retrieved context.
prompt_start = "Answer the question based on the context below.\n\nContext:\n"

# Footer that restates the request and cues the model to start answering.
prompt_end = f"\n\nQuestion: {query}\nAnswer:"

# Chunks are separated by a horizontal-rule style divider so the model can
# tell where one retrieved passage ends and the next begins.
prompt = "".join([
    prompt_start,
    "\n\n---\n\n".join(contexts),
    prompt_end,
])

print(prompt)

Answer the question based on the context below.

Context:
Apple is the edible fruit of a number of trees, known for this juicy, green or red fruits. The tree (Malus spp.) is grown worldwide. Its fruit is low-cost and popular, and is harvested all over the world. 

Applewood is a type of wood that comes from this tree.

The apple tree comes from southern Kazakhstan, Kyrgyzstan, Uzbekistan, and northwestern part of China. Apples have been grown for thousands of years in Asia and Europe. They were brought to North America by European settlers. Apples have religious and mythological significance in many cultures.

Apples are generally propagated by grafting, although wild apples grow readily from seed. Apple trees are large if grown from seed, but small if grafted onto roots (rootstock). There are more than 10000 known cultivars of apples, with a range of desired characteristics. Different cultivars are bred for various tastes and uses: cooking, eating raw and cider production are the most

### Get the Summary 

In [15]:
# Send the assembled prompt to the completions endpoint. The sampling arguments
# are passed explicitly rather than left to the API defaults.
res = openai_client.completions.create(
    prompt=prompt,
    model="gpt-3.5-turbo-instruct",
    max_tokens=636,
    temperature=0,
    top_p=1,
    presence_penalty=0,
    frequency_penalty=0,
    stop=None,
)

# Divider line first, then the generated text of the first (only) choice.
print('-' * 80, res.choices[0].text, sep='\n')

--------------------------------------------------------------------------------


Apple is a widely popular and beloved fruit that is enjoyed by people all over the world. It is known for its juicy, green or red fruits that are both low-cost and delicious. The apple tree, scientifically known as Malus spp., is grown worldwide and its fruit is harvested in various countries.

The history of apples dates back thousands of years, with evidence of their cultivation in Asia and Europe. They were brought to North America by European settlers in the 1600s and have since become a staple in the American diet. Apples have also played a significant role in various cultures, with religious and mythological significance in many societies.

There are over 10,000 known cultivars of apples, each with its own unique characteristics and uses. Some are bred for cooking, while others are best eaten raw or used for cider production. Apples are generally propagated by grafting, although wild apples can als