## Install Requirements

In [None]:
!pip3 install -U cohere pinecone-client datasets
!pip3 install python-dotenv
!pip3 install cohere

# Imports

In [18]:
import os
import cohere
from pinecone import Pinecone, PodSpec
from datasets import Dataset

from dotenv import load_dotenv
# Load variables from a .env file (if one is found) into the process environment.
load_dotenv()

# API keys are read from the environment so they never appear in the notebook.
# Either value is None when the corresponding variable is not set.
cohereAPIkey = os.environ.get('cohereAPIKey')
pineconeAPIkey = os.environ.get('pineconeAPIKey')

# Cohere client used for every embedding request below.
co = cohere.Client(api_key=cohereAPIkey)

# Import dataset

In [19]:
# Read the Q&A CSV into a Hugging Face Dataset.
csv_path = 'mh_chatbotQnA.csv'
mh_dataset = Dataset.from_csv(csv_path)

In [20]:
# Cohere only takes 96 texts per embed request, so the rows are embedded in
# two slices: rows 0-59 and rows 60-118.
first_rows = range(0, 60)
second_rows = range(60, 119)
mh_dataset_one = mh_dataset.select(first_rows)
mh_dataset_two = mh_dataset.select(second_rows)

In [21]:
def getEmbeddings(dataset, batch_size=96):
    """Embed a sequence of texts with Cohere as search documents.

    Args:
        dataset: Iterable of strings to embed (e.g. one column of the dataset).
        batch_size: Maximum number of texts sent in a single ``co.embed``
            request. Defaults to 96, the most texts Cohere accepts per request.

    Returns:
        A list with one embedding per input text, in input order. An empty
        input yields an empty list without calling the API.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    # Materialise the input so any iterable (list, dataset column, generator)
    # can be measured and sliced.
    texts = list(dataset)

    embeds = []
    for start in range(0, len(texts), batch_size):
        response = co.embed(
            texts=texts[start:start + batch_size],
            model='embed-english-v3.0',
            input_type='search_document',
            truncate='END'
        )
        embeds.extend(response.embeddings)

    return embeds

In [22]:
# Embed the 'Question' column of each slice, then join the two result lists
# so the combined list follows the original row order.
question_slices = (mh_dataset_one['Question'], mh_dataset_two['Question'])
embeds_one, embeds_two = map(getEmbeddings, question_slices)

total_embeds = embeds_one + embeds_two

In [23]:
import numpy as np

# Convert to an array only to read off (number of vectors, embedding dimension);
# `shape` is reused below for the index dimension and the upsert loop.
embeds_array = np.asarray(total_embeds)
shape = embeds_array.shape
print(shape)


(119, 1024)


In [24]:
pc = Pinecone(api_key=pineconeAPIkey)

# Name can contain only lowercase letters, numbers and hyphens
index_name = 'mentalhealth-embeddings'

# Create the index only if it does not exist yet.
# list_indexes() yields index descriptions rather than bare names, so the
# membership test has to run against .names(); comparing against the raw
# listing is always "not in", which makes create_index fail on a re-run.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=shape[1],  # must match the embedding width
        metric='cosine',
        spec=PodSpec(
            environment="gcp-starter"
        )
    )

# connect to index
index = pc.Index(index_name)

In [25]:
batch_size = 50

n_vectors = shape[0]

# String ids "0", "1", ... — one per embedding.
ids = list(map(str, range(n_vectors)))

# One metadata dict per row, holding the question/answer pair.
meta = [
    {'question': q, 'answer': a}
    for q, a in zip(mh_dataset['Question'], mh_dataset['Answer'])
]

# (id, vector, metadata) tuples in the form expected by index.upsert
to_upsert = list(zip(ids, total_embeds, meta))

# Send the tuples in chunks of `batch_size`; slicing clamps at the end of the
# list, so the final chunk may be shorter.
for start in range(0, n_vectors, batch_size):
    index.upsert(vectors=to_upsert[start:start + batch_size])

In [29]:
query = "what are some services I can get for free?"

# Create the query embedding. input_type='search_query' is used for the query,
# while the stored vectors were embedded with 'search_document'.
xq = co.embed(
    texts=[query],
    model='embed-english-v3.0',
    input_type='search_query',
    truncate='END'
).embeddings

# One text in, so this is a single row of embeddings.
print(np.array(xq).shape)

# Query the index, returning the single most similar result (top_k=1).
# xq holds one embedding per input text; pass the first (and only) one so the
# index receives a flat list of floats rather than a nested list.
res = index.query(vector=xq[0], top_k=1, include_metadata=True)

# Print similarity score and the matched stored question.
for match in res['matches']:
    print(f"{match['score']:.2f}: {match['metadata']['question']}")

(1, 1024)
0.47: What services are provided at no cost to currently registered GVSU students?
