<a href="https://colab.research.google.com/github/Zeaxanthin80/CAI2300C/blob/main/Assignments/Assignment%202.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Assignment 2
### &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; <font color="#abb2b9">Building a Semantic Search Engine for a Healthcare Dataset with OpenAI.</font>

## Step 2: Setting Up the Environment










In [20]:
from openai import OpenAI  # Import the OpenAI library to interact with OpenAI's API
from scipy.spatial import distance  # Import distance from scipy.spatial for computing vector distances
import numpy as np  # Import NumPy for numerical operations

from google.colab import userdata
# Read the API key from the Colab `userdata` entry named 'OPENAI_KEY' so the key
# is not written into the notebook source.
# NOTE(review): this variable holds the key string, not the `openai` package —
# the name is easy to confuse with the library; consider renaming it.
openai = userdata.get('OPENAI_KEY')

# Initialize the OpenAI client used for the embedding requests below.
client = OpenAI(api_key=openai)  # The key comes from `userdata` above; do not paste a literal key here.

# Function to generate embeddings for a list of input texts
def create_embeddings(texts, model="text-embedding-3-small", batch_size=512):
    """
    Generate one embedding vector per input text using the specified OpenAI model.

    Texts are sent to the API in batches instead of one request per text, so
    embedding a large dataset needs len(texts) / batch_size requests rather
    than len(texts) requests.

    Parameters:
    texts (list of str): Input texts to be embedded. A single bare string is
        treated as a one-element list rather than being split into characters.
    model (str): The name of the embedding model to use (default is "text-embedding-3-small").
    batch_size (int): Maximum number of texts sent per API request. Must be >= 1
        and should stay within the API's per-request input limit.

    Returns:
    list of lists: Embedding vectors in the same order as `texts`
        (an empty list when `texts` is empty).

    Raises:
    ValueError: If `batch_size` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    # Guard against the easy mistake of passing one string instead of a list.
    if isinstance(texts, str):
        texts = [texts]
    else:
        texts = list(texts)  # Materialize so generators can be sliced into batches

    embeddings = []  # Collected vectors, in input order
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        response = client.embeddings.create(input=batch, model=model)
        # Each returned item carries the position of its input within the batch;
        # sort on it so the output order never depends on the response order.
        ordered_items = sorted(response.data, key=lambda item: item.index)
        embeddings.extend(item.embedding for item in ordered_items)
    return embeddings


## Step 3: Data Preparation



### <font color="#abb2b9">Retrieving the Healthcare Dataset from Kaggle and moving it to the current working directory.</font>

In [21]:
import kagglehub  # Import the kagglehub library for interacting with Kaggle datasets.
import shutil  # Import the shutil library for file and directory operations (moving).
import os  # Import the os library for interacting with the operating system (paths, directories).

# Kaggle handle ("<owner>/<dataset>") of the dataset used in this notebook.
DATASET_HANDLE = "prasad22/healthcare-dataset"

# Download the latest version of the dataset.
# The return value `path` is the directory that holds the downloaded files.
path = kagglehub.dataset_download(DATASET_HANDLE)

# Take the dataset name from the handle itself instead of from a fixed position
# in the download path: that position depends on the download directory layout
# and on "/" being the path separator.
dataset_name = DATASET_HANDLE.split("/")[-1]

# Destination: ./Kaggle-Dataset/<dataset name>
destination_directory = os.path.join(".", "Kaggle-Dataset", dataset_name)

# Copy (rather than move) the whole download directory into the destination.
# Copying leaves the downloaded files where kagglehub put them, and
# `dirs_exist_ok=True` lets this cell be re-run: existing files are overwritten
# and existing sub-directories are merged instead of being nested or raising.
# copytree also creates the destination and any missing parent directories.
shutil.copytree(path, destination_directory, dirs_exist_ok=True)

# Report where the dataset files can now be found.
print(f"Dataset content copied to: {destination_directory}")

Dataset content moved to: ./Kaggle-Dataset/healthcare-dataset


### <font color="#abb2b9">Convert the comma-separated values (CSV) file to a list of dictionaries, one per record.</font>


In [22]:
import csv

def csv_to_list_of_dicts(csv_file_path):
    """
    Converts a CSV file to a list of dictionaries, where each dictionary
    represents a record (row) in the CSV, keyed by the column headers taken
    from the first row.

    Args:
        csv_file_path (str): The path to the CSV file.

    Returns:
        list: A list of dictionaries, one per data row, in file order.
            Empty if the file has no data rows.

    Raises:
        OSError: If the file cannot be opened (e.g. it does not exist).
    """
    # newline='' hands line-ending handling to the csv module, so line breaks
    # inside quoted fields are preserved exactly as written.
    # 'utf-8-sig' reads plain UTF-8 unchanged but drops a leading byte-order
    # mark, which would otherwise become part of the first column name.
    with open(csv_file_path, 'r', encoding='utf-8-sig', newline='') as file:
        return list(csv.DictReader(file))

In [23]:
# Location of the dataset CSV relative to the working directory.
csv_file_path = "./Kaggle-Dataset/healthcare-dataset/healthcare_dataset.csv"  # Replace with your actual CSV file path
dataset_of_records = csv_to_list_of_dicts(csv_file_path)

# `dataset_of_records` is a list with one dictionary per CSV row.
# You can access data like this:
# print(dataset_of_records[0])  # Print the first record (dictionary)
# print(dataset_of_records[0]['column_name'])  # Access a specific column value

In [26]:
# Generate embeddings for the patient records
# NOTE(review): 'Medical_Specialty' must match a CSV column header exactly. If the
# dataset has no such column, every record falls back to "Unknown" and all
# embeddings are identical — confirm against dataset_of_records[0].keys().

# Extract the 'Medical_Specialty' field for embedding generation, if it does not exist replace it with "Unknown"
medical_specialties = [record.get('Medical_Specialty', "Unknown") for record in dataset_of_records]

# Embed each distinct text only once: rows that share the same text get the same
# embedding, so there is no need to request it again for every row.
# dict.fromkeys() removes duplicates while keeping first-seen order.
unique_specialties = list(dict.fromkeys(medical_specialties))
unique_embeddings = create_embeddings(unique_specialties, model="text-embedding-3-small")
embedding_by_specialty = dict(zip(unique_specialties, unique_embeddings))

# One embedding per record, in dataset order. Records with the same text share
# the same list object, so treat the embeddings as read-only.
embeddings = [embedding_by_specialty[text] for text in medical_specialties]

# Pair every patient record with its embedding.
records = [
    {"patient record": record, "embedding": embedding}
    for record, embedding in zip(dataset_of_records, embeddings)
]

KeyboardInterrupt: 

## Step 4: Implementing Semantic Search

In [None]:
# Search query
# NOTE(review): the lookup below returns the single nearest record; it does not
# aggregate over records, so it cannot by itself compute a percentage.
search_text = "What percentage of patients over the age of 45 have diabetes?"

# Generate the embedding for the query
search_embedding = create_embeddings([search_text])[0]

# There is nothing to search if the embedding cell was skipped or interrupted
# before it filled `records`; fail with a clear message instead of an opaque
# argmin error.
if not records:
    raise ValueError("No embedded records to search; run the embedding cell to completion first.")

# Calculate cosine distances between the query and records
distances = [distance.cosine(search_embedding, record["embedding"]) for record in records]

# Find the closest record (smallest cosine distance to the query)
min_dist_ind = np.argmin(distances)
closest_record = records[min_dist_ind]

print(f"Search Query: {search_text}")
# The original row is stored under the "patient record" key when `records` is built.
print(f"Closest Patient Record: {closest_record['patient record']}")