<a href="https://colab.research.google.com/github/amodsgit/AmodTheCoder/blob/main/HairChat_Completed.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -qU \
  tiktoken==0.4.0 \
  openai==0.27.7 \
  langchain==0.0.179 \
  pinecone-client==2.2.1 \
  datasets==2.13.1

In [2]:
from datasets import load_dataset

data = load_dataset("Amod/hair_medical_sit", split='train')
data



Dataset({
    features: ['Hair Disease', 'Medication', 'Duration', 'Side Effects', 'Symptoms', ' Severity of Disease', 'Disease Description', 'Medication Description'],
    num_rows: 91
})

In [3]:
data[45] # sanmple of a data record

{'Hair Disease': 'Telogen effluvium',
 'Medication': 'Niacin supplements',
 'Duration': '3 months',
 'Side Effects': 'Flushing, Headache, Itching',
 'Symptoms': 'Sudden hair shedding, often in large amounts and usually from the scalp. The hair may appear thinner, but there are usually no bald patches.',
 ' Severity of Disease': 'Mild',
 'Disease Description': 'Telogen effluvium is a scalp disorder characterized by the thinning or shedding of hair resulting from the early entry of hair in the telogen phase (the resting phase of the hair follicle). Emotional or physiological stress may result in an alteration of the normal hair cycle and cause the disorder.',
 'Medication Description': 'Also known as vitamin B3, niacin can improve blood circulation in the scalp, which can stimulate hair growth.'}

In [4]:
import tiktoken

tiktoken.encoding_for_model('gpt-3.5-turbo')

<Encoding 'cl100k_base'>

In [5]:
import tiktoken

tokenizer = tiktoken.get_encoding('cl100k_base')

# create the length function
def tiktoken_len(text):
    tokens = tokenizer.encode(
        text,
        disallowed_special=()
    )
    return len(tokens)

In [6]:
tiktoken_len(' '.join(data[45].values()))

134

In [7]:

from getpass import getpass
OPENAI_API_KEY = getpass('Enter your OpenAI API key')


Enter your OpenAI API key··········


In [8]:
from langchain.embeddings.openai import OpenAIEmbeddings

model_name = 'text-embedding-ada-002'

embed = OpenAIEmbeddings(
    model=model_name,
    openai_api_key=OPENAI_API_KEY
)

In [9]:
index_name = 'haircare-chat-sit'

In [10]:
import pinecone

# initialize connection to pinecone (get API key at app.pinecone.io)
api_key = getpass('Enter your API key')
# find your environment next to the api key in pinecone console
env = getpass('Enter your ENV')

pinecone.init(api_key=api_key, environment=env)
pinecone.whoami()

Enter your API key··········
Enter your ENV··········


WhoAmIResponse(username='24b6755', user_label='default', projectname='fdd0d9a')

In [11]:
import time

# check if index already exists (it shouldn't if this is first time)
if index_name not in pinecone.list_indexes():
    # if does not exist, create index
    pinecone.create_index(
        index_name,
        dimension=1536,
        metric='cosine'
    )
    # wait for index to be initialized
    while not pinecone.describe_index(index_name).status['ready']:
        time.sleep(1)

# connect to index
index = pinecone.Index(index_name)
# view index stats
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

In [12]:
from tqdm.auto import tqdm
from uuid import uuid4
# prepare texts and metadata from dataset
texts = []
metadatas = []

for i, record in enumerate(data):
    # Prepare metadata for this record
    metadata = {
        'severity': record[' Severity of Disease'],
        'hair_disease': record['Hair Disease'],
        'side_effects': record['Side Effects'],
        'medication': record['Medication'],
        'symptoms': record['Symptoms']
    }

    # Prepare texts for this record
    # Add more structure to the text by prefixing each field with its name
    text = "Severity: " + record[' Severity of Disease'] + \
           ". Hair Disease: " + record['Hair Disease'] + \
           ". Duration: " + record['Duration'] + \
           ". Medication Description: " + record['Medication Description'] + \
           ". Disease Description: " + record['Disease Description'] + \
           ". Side Effects: " + record['Side Effects'] + \
           ". Medication: " + record['Medication'] + \
           ". Symptoms: " + record['Symptoms']

    texts.append(text)
    metadatas.append(metadata)

# create embeddings
embeds = embed.embed_documents(texts)

# indexing
ids = [str(uuid4()) for _ in range(len(texts))]
index.upsert(vectors=zip(ids, embeds, metadatas))

# check number of vectors in the index
index.describe_index_stats()


{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 91}},
 'total_vector_count': 91}

In [14]:
from langchain.vectorstores import Pinecone
# Initialize the vector store
vectorstore = Pinecone(index, embed.embed_query, "text")

# Perform a search
query = "what is the side effect of Clobetasol propionate lotion?"
vectorstore.similarity_search(query, k=3)



[]

In [25]:
import openai
import pprint

# Embedding model
embed_model = "text-embedding-ada-002"

# Your question
query = "What are the side effects of Clobetasol propionate lotion?"

# Get the query embedding
res = openai.Embedding.create(
    input=[query],
    engine=embed_model
)

# Retrieve the embedding from the response
xq = res['data'][0]['embedding']

# Use Pinecone to find the top 5 relevant contexts
res = index.query(xq, top_k=1, include_metadata=True)

# Extract the retrieved contexts
contexts = [item['metadata'] for item in res['matches']]

# Convert dictionaries to strings
contexts = ["\n".join(f"{key}: {value}" for key, value in context.items()) for context in contexts]

# System primer message
primer = {
    "role": "system",
    "content": "You are an intelligent assistant with extensive medical knowledge. You provide information and answer user questions to the best of your ability based on the information provided. If the information is not available in the given context, you truthfully say 'I don't know'."
}

# User message
user_message = {
    "role": "user",
    "content": query
}

# Context message
context_messages = [{
    "role": "assistant",
    "content": context
} for context in contexts]

# Messages to be sent to the API
messages = [primer, *context_messages, user_message]

# Print the formatted messages
pprint.pprint(messages)

# Create a conversation with GPT-3.5-turbo
response = openai.ChatCompletion.create(
    model="gpt-3.5-turbo",
    messages=messages,
    temperature=0.8,
    max_tokens=150
)

# Display the assistant's response
print(response['choices'][0]['message']['content'])


[{'content': 'You are an intelligent assistant with extensive medical '
             'knowledge. You provide information and answer user questions to '
             'the best of your ability based on the information provided. If '
             'the information is not available in the given context, you '
             "truthfully say 'I don't know'.",
  'role': 'system'},
 {'content': 'hair_disease: Alopecia Areata\n'
             'medication: Clobetasol propionate lotion\n'
             'severity: Moderate to severe\n'
             'side_effects: Skin rash, Burning sensation, Itching\n'
             'symptoms: Patchy hair loss, often on the scalp. The patches are '
             'usually several centimeters or less. Hair loss might also occur '
             'on other parts of the body.',
  'role': 'assistant'},
 {'content': 'What are the side effects of Clobetasol propionate lotion?',
  'role': 'user'}]
The side effects of Clobetasol propionate lotion can include skin rash, burning sens