# Convert CSV to JSON

In [1]:
import pandas as pd
import json

def csv_to_json_dynamic(csv_file_path, output_json_path):
    # Load the CSV file
    df = pd.read_csv(csv_file_path)
    
    # Initialize the JSON data dictionary
    json_data = {}
    
    # Iterate through each row in the DataFrame
    for index, row in df.iterrows():
        entry = {}
        for column in df.columns:
            # Using column names directly from CSV
            entry[column] = row[column]
        
        # Use the first column as the key for JSON entries
        key = str(row[df.columns[0]])
        json_data[key] = entry
    
    # Save the converted data to a JSON file
    with open(output_json_path, 'w') as json_file:
        json.dump(json_data, json_file, indent=4)
    
    return output_json_path

# Path to the input CSV file
csv_file_path = 'data_csv/Data Capstone - Raw.csv'
# Path to the output JSON file
output_json_path = 'data_json/Capstone_RAG3.json'

# Convert CSV to JSON dynamically
converted_json_path = csv_to_json_dynamic(csv_file_path, output_json_path)
converted_json_path


'data_json/Capstone_RAG3.json'

# Load JSON data

In [2]:
import json

# Define the file path to the JSON file
file_path = r'data_json/Capstone_RAG3.json'

# Open the JSON file for reading with UTF-8 encoding
with open(file_path, 'r', encoding='utf-8') as file:
    # Load the JSON data from the file into a Python dictionary
    data = json.load(file)
    
    # Convert the values of the dictionary to a list
    eskwe_data = list(data.values())

    # Print the loaded JSON data in a nicely formatted way
    print("Loaded JSON data:")
    print(json.dumps(data, indent=4))


Loaded JSON data:
{
    "1": {
        "No.": 1,
        "Title": "Overview and Topics of Sprint 1",
        "Link": "https://docs.google.com/spreadsheets/d/1TuJEyzLfSoFf7ukmVg1pNSWApJHQNNhL_NtvxwN_nF8/edit?gid=1163492588#gid=1163492588",
        "Body": "Main Topic: Introduction to Data Science and Machine Learning\nSubtopics: \n-Python Fundamentals\n-Pandas: Data Wrangling Techniques\n-Data Distribtutions\n-Data Visualizations\n-Exploratory Data Analysis\n-Data Story Telling\n-Github\n-Deployment using Streamlit Cloud\n-Introduction to Machine Learning\n-RFM Clustering\n-Introduction to Linear Regression and Logistic Regression",
        "Author": "Eskwelabs",
        "Date Published": "Aug 1, 2024",
        "Sprint": "Sprint Topics",
        "Notes": "All sprints"
    },
    "2": {
        "No.": 2,
        "Title": "Overview and Topics of Sprint 2",
        "Link": "https://docs.google.com/spreadsheets/d/1TuJEyzLfSoFf7ukmVg1pNSWApJHQNNhL_NtvxwN_nF8/edit?gid=1163492588#gid=116349258

In [3]:
# Show the list of values
eskwe_data

[{'No.': 1,
  'Title': 'Overview and Topics of Sprint 1',
  'Link': 'https://docs.google.com/spreadsheets/d/1TuJEyzLfSoFf7ukmVg1pNSWApJHQNNhL_NtvxwN_nF8/edit?gid=1163492588#gid=1163492588',
  'Body': 'Main Topic: Introduction to Data Science and Machine Learning\nSubtopics: \n-Python Fundamentals\n-Pandas: Data Wrangling Techniques\n-Data Distribtutions\n-Data Visualizations\n-Exploratory Data Analysis\n-Data Story Telling\n-Github\n-Deployment using Streamlit Cloud\n-Introduction to Machine Learning\n-RFM Clustering\n-Introduction to Linear Regression and Logistic Regression',
  'Author': 'Eskwelabs',
  'Date Published': 'Aug 1, 2024',
  'Sprint': 'Sprint Topics',
  'Notes': 'All sprints'},
 {'No.': 2,
  'Title': 'Overview and Topics of Sprint 2',
  'Link': 'https://docs.google.com/spreadsheets/d/1TuJEyzLfSoFf7ukmVg1pNSWApJHQNNhL_NtvxwN_nF8/edit?gid=1163492588#gid=1163492588',
  'Body': 'Main topic: Machine Learning Techniques and Model Evaluation\nSubtopics:\n-Introduction to Credit 

# Vector Embeddings & Storage

In [2]:
# Import necessary libraries
from dotenv import load_dotenv  # For loading environment variables from a .env file
import os  # For interacting with the operating system
import openai  # For interacting with OpenAI's API
import json  # For working with JSON data
import chromadb  # For using ChromaDB
from chromadb.utils import embedding_functions  # Utility functions for embeddings in ChromaDB

# Load environment variables from .env file
load_dotenv()

# Set the OpenAI API key from environment variable
openai.api_key = os.getenv("OPENAI_API_KEY")

# Load eskwe_data JSON file
file_path = r'data_json/Capstone_RAG3.json'
with open(file_path, 'r', encoding='utf-8') as file:
    data = json.load(file)  # Load the JSON data from the file into a dictionary
    eskwe_data = list(data.values())  # Convert the values of the dictionary to a list

# Constants and Initializing ChromaDB
CHROMA_DATA_PATH = 'eskwe'  # Path for storing ChromaDB data
COLLECTION_NAME = "eskwe_embeddings"  # Name of the ChromaDB collection

# Initialize ChromaDB client
client = chromadb.PersistentClient(path=CHROMA_DATA_PATH)  # Create a persistent client for ChromaDB
openai_ef = embedding_functions.OpenAIEmbeddingFunction(api_key=os.getenv("OPENAI_API_KEY"), model_name="text-embedding-ada-002")  # Define the OpenAI embedding function

# Create or get the collection in ChromaDB
collection = client.get_or_create_collection(
    name=COLLECTION_NAME,
    embedding_function=openai_ef,
    metadata={"hnsw:space": "cosine"}  # Using cosine space for HNSW (Hierarchical Navigable Small World)
)

# Function to generate embeddings and add to collection
def add_embeddings_to_collection(eskwelabs):
    for i, eskdata in enumerate(eskwelabs):
        # Prepare the data for embedding
        eskwe_data_final = f"Title: {eskdata.get('Title', 'N/A')}\nLink: {eskdata.get('Link', 'N/A')}\nPublish Date: {eskdata.get('Date Published', 'N/A')}\nSprint: {eskdata.get('Sprint', 'N/A')}\nBody: {eskdata.get('Body', 'N/A')}"
        
        # Use OpenAI API to create embeddings
        response = openai.Embedding.create(
            input=eskwe_data_final, 
            model="text-embedding-ada-002"
        )
        embedding = response.data[0].embedding  # Extract the embedding from the response
        print(f"Generated embedding for eskdata {eskdata.get('Title', 'Unknown Data')}: {embedding[:5]}...")

        try:
            # Add the embedding to the ChromaDB collection
            collection.add(
                ids=[str(i)],
                documents=[eskwe_data_final],
                embeddings=[embedding],
                metadatas=[{"eskdata": eskdata.get('Title', 'Unknown Data')}]
            )
            print(f"Stored embedding for eskdata {eskdata.get('Title', 'Unknown Data')}")
        except Exception as e:  # Catch potential errors in collection storage
            print(f"Error storing embedding for eskdata {eskdata.get('Title', 'Unknown Data')}: {e}")

# Function to retrieve and print embedding and metadata by ID
def retrieve_embedding_by_id(doc_id):
    try:
        query_result = collection.get(ids=[str(doc_id)], include=['embeddings', 'metadatas', 'documents'])
        print(f"Query result for ID {doc_id}: {query_result}")

        if not query_result or 'ids' not in query_result or not query_result['ids']:
            print(f"No results found for the given ID: {doc_id}.")
            return None, None, None  # Return None for all if not found

        embedding = query_result['embeddings'][0] if query_result['embeddings'] else None
        metadata = query_result['metadatas'][0] if query_result['metadatas'] else None
        document = query_result['documents'][0] if query_result['documents'] else None
        return embedding, metadata, document

    except Exception as e:
        print(f"Error retrieving embedding for ID {doc_id}: {str(e)}")
        return None, None, None  # Return None for all in case of error

# Add embeddings to the collection
add_embeddings_to_collection(eskwe_data)

# Verify retrieval
print("\nStored Embeddings in ChromaDB:")
for i in range(min(10, len(eskwe_data))):  # Iterate up to 10 or the number of data, whichever is smaller
    embedding, metadata, document = retrieve_embedding_by_id(i)
    if embedding and metadata:
        print(f"ID: {i}, Embedding (first 5): {embedding[:5]}..., Metadata: {metadata}, Document(first 100 chars): {document[:100]}")
    else:
        print(f"Failed to retrieve embedding and metadata for ID: {i}")


Add of existing embedding ID: 0
Insert of existing embedding ID: 0


Generated embedding for eskdata Overview and Topics of Sprint 1: [-0.003084039082750678, -0.001510584494099021, 0.016013214364647865, 0.006871545221656561, -0.016951169818639755]...
Stored embedding for eskdata Overview and Topics of Sprint 1


Add of existing embedding ID: 1
Insert of existing embedding ID: 1


Generated embedding for eskdata Overview and Topics of Sprint 2: [-0.014404241926968098, 0.013667123392224312, 0.022343480959534645, 0.007276508957147598, -0.009088873863220215]...
Stored embedding for eskdata Overview and Topics of Sprint 2


Add of existing embedding ID: 2
Insert of existing embedding ID: 2


Generated embedding for eskdata Overview and Topics of Sprint 3: [-0.0011725821532309055, 0.011545424349606037, 0.018900081515312195, 0.009318212047219276, -0.001614902401342988]...
Stored embedding for eskdata Overview and Topics of Sprint 3


Add of existing embedding ID: 3
Insert of existing embedding ID: 3


Generated embedding for eskdata Overview and Topics of Sprint 4: [-0.009051410481333733, -0.002994192996993661, 0.017565932124853134, 0.0004327813221607357, 0.007309960667043924]...
Stored embedding for eskdata Overview and Topics of Sprint 4


Add of existing embedding ID: 4
Insert of existing embedding ID: 4


Generated embedding for eskdata How to Install Anaconda to Run Python for Data Science  by Mayank Aggarwal  Medium: [0.007355670910328627, -0.0011169660137966275, 0.020184392109513283, -0.03880889713764191, -0.008996241725981236]...
Stored embedding for eskdata How to Install Anaconda to Run Python for Data Science  by Mayank Aggarwal  Medium


Add of existing embedding ID: 5
Insert of existing embedding ID: 5


Generated embedding for eskdata Installing Anaconda on Windows Tutorial: [-0.006038852035999298, -0.006417734548449516, 0.016883527860045433, -0.008568057790398598, -0.00540738133713603]...
Stored embedding for eskdata Installing Anaconda on Windows Tutorial


Add of existing embedding ID: 6
Insert of existing embedding ID: 6


Generated embedding for eskdata Installing Anaconda on Mac OS X: [-0.0009095656569115818, -0.00429855240508914, 0.025624573230743408, -0.012565512210130692, 0.007583326194435358]...
Stored embedding for eskdata Installing Anaconda on Mac OS X


Add of existing embedding ID: 7
Insert of existing embedding ID: 7


Generated embedding for eskdata Managing environments: [0.017091797664761543, -0.0058974893763661385, 0.017296573147177696, 0.002766154706478119, -0.00024764848058111966]...
Stored embedding for eskdata Managing environments


Add of existing embedding ID: 8
Insert of existing embedding ID: 8


Generated embedding for eskdata Introduction to Credit Card Fraud and Outlier Detection: [-0.009500113315880299, 0.00757214892655611, 0.007725827395915985, -0.01739358901977539, -0.01598254404962063]...
Stored embedding for eskdata Introduction to Credit Card Fraud and Outlier Detection


Add of existing embedding ID: 9
Insert of existing embedding ID: 9


Generated embedding for eskdata Introduction to Credit Card Fraud and Outlier Detection codes: [-0.00422069663181901, 0.005614067893475294, 0.003128320910036564, -0.014231323264539242, -0.00937481690198183]...
Stored embedding for eskdata Introduction to Credit Card Fraud and Outlier Detection codes


Add of existing embedding ID: 10
Insert of existing embedding ID: 10


Generated embedding for eskdata ML | Credit Card Fraud Detection: [-0.004786971025168896, 0.007091301027685404, 0.015609092079102993, -0.012982429936528206, -0.010678100399672985]...
Stored embedding for eskdata ML | Credit Card Fraud Detection


Add of existing embedding ID: 11
Insert of existing embedding ID: 11


Generated embedding for eskdata How to Use Python for Credit Card Fraud Detection: Python Pandas: [0.010969659313559532, 0.015165925025939941, 0.028631754219532013, -0.032949455082416534, -0.015246881172060966]...
Stored embedding for eskdata How to Use Python for Credit Card Fraud Detection: Python Pandas


Add of existing embedding ID: 12
Insert of existing embedding ID: 12


Generated embedding for eskdata Simple Machine Learning Model: [-0.0035731566604226828, 0.014092391356825829, 0.006190019194036722, -0.006221089977771044, -0.0005359735223464668]...
Stored embedding for eskdata Simple Machine Learning Model


Add of existing embedding ID: 13
Insert of existing embedding ID: 13


Generated embedding for eskdata Simple Machine Learning Model codes: [0.001383452326990664, 0.01995428465306759, 0.007711219601333141, -0.030051805078983307, -0.013340622186660767]...
Stored embedding for eskdata Simple Machine Learning Model codes


Add of existing embedding ID: 14
Insert of existing embedding ID: 14


Generated embedding for eskdata Train Test Split and its importance: [0.014548911713063717, 0.002796733286231756, 0.04902065545320511, -0.016593623906373978, -0.012628717347979546]...
Stored embedding for eskdata Train Test Split and its importance


Add of existing embedding ID: 15
Insert of existing embedding ID: 15


Generated embedding for eskdata Train Test Split Documentation Scikit Learn: [-0.025563757866621017, 0.002033799886703491, 0.047393325716257095, -0.01231861487030983, -0.022419176995754242]...
Stored embedding for eskdata Train Test Split Documentation Scikit Learn


Add of existing embedding ID: 16
Insert of existing embedding ID: 16


Generated embedding for eskdata Tree-based Ensembles: [-0.026372963562607765, -0.008972245268523693, 0.01665303111076355, -0.017659010365605354, -0.002147901337593794]...
Stored embedding for eskdata Tree-based Ensembles


Add of existing embedding ID: 17
Insert of existing embedding ID: 17


Generated embedding for eskdata Tree-based Ensembles code: [-0.017479142174124718, -0.009495880454778671, 0.007030873093754053, -0.0167788565158844, -0.009698963724076748]...
Stored embedding for eskdata Tree-based Ensembles code


Add of existing embedding ID: 18
Insert of existing embedding ID: 18


Generated embedding for eskdata Decision Trees, Explained: [-0.012529783882200718, -0.00037198825157247484, 0.01941753923892975, -0.017202911898493767, -0.003687750780954957]...
Stored embedding for eskdata Decision Trees, Explained


Add of existing embedding ID: 19
Insert of existing embedding ID: 19


Generated embedding for eskdata Decision Trees: [-0.020096229389309883, 0.0018921111477538943, 0.01709882542490959, -0.00677821971476078, -0.01064759586006403]...
Stored embedding for eskdata Decision Trees


Add of existing embedding ID: 20
Insert of existing embedding ID: 20


Generated embedding for eskdata Machine Learning Beyond Accuracy: Advanced Model Evaluation Metrics and Techniques: [-0.00453291367739439, 0.002593484241515398, 0.013381889089941978, -0.017586026340723038, -0.008212408982217312]...
Stored embedding for eskdata Machine Learning Beyond Accuracy: Advanced Model Evaluation Metrics and Techniques


Add of existing embedding ID: 21
Insert of existing embedding ID: 21


Generated embedding for eskdata What is A Confusion Matrix in Machine Learning? The Model Evaluation Tool Explained: [-0.002040470251813531, 0.023104006424546242, 0.028016487136483192, -0.02081407606601715, -0.00382508197799325]...
Stored embedding for eskdata What is A Confusion Matrix in Machine Learning? The Model Evaluation Tool Explained


Add of existing embedding ID: 22
Insert of existing embedding ID: 22


Generated embedding for eskdata Precision-Recall Curve in Python Tutorial: [0.0040290467441082, 0.003221515566110611, 0.041571494191884995, -0.013994910754263401, 0.0120595907792449]...
Stored embedding for eskdata Precision-Recall Curve in Python Tutorial


Add of existing embedding ID: 23
Insert of existing embedding ID: 23


Generated embedding for eskdata What is Recall in Machine Learning?: [-0.0038793468847870827, 0.01961101032793522, 0.041064150631427765, -0.020237846300005913, 0.0035147585440427065]...
Stored embedding for eskdata What is Recall in Machine Learning?


Add of existing embedding ID: 24
Insert of existing embedding ID: 24


Generated embedding for eskdata Precision and Recall in Machine Learning: [0.003247014246881008, 0.01822340302169323, 0.044989023357629776, -0.022054458037018776, -0.006186638027429581]...
Stored embedding for eskdata Precision and Recall in Machine Learning


Add of existing embedding ID: 25
Insert of existing embedding ID: 25


Generated embedding for eskdata EDA and data preparation for NLP project: a hands-on example, step by step: [-0.0006359215476550162, 0.013509267009794712, 0.016323411837220192, -0.016213860362768173, -0.009585896506905556]...
Stored embedding for eskdata EDA and data preparation for NLP project: a hands-on example, step by step


Add of existing embedding ID: 26
Insert of existing embedding ID: 26


Generated embedding for eskdata Creating Everyday Apps with Streamlit: A Data Scientist’s Guide: [-0.0026456215418875217, 0.018472755327820778, 0.018983585759997368, -0.012218526564538479, 0.01638801209628582]...
Stored embedding for eskdata Creating Everyday Apps with Streamlit: A Data Scientist’s Guide


Add of existing embedding ID: 27
Insert of existing embedding ID: 27


Generated embedding for eskdata A Quick Overview of Large Language Models (LLM): [-0.009040660224854946, 0.029505306854844093, -0.004423798993229866, -0.02844013273715973, -0.009180464781820774]...
Stored embedding for eskdata A Quick Overview of Large Language Models (LLM)


Add of existing embedding ID: 28
Insert of existing embedding ID: 28


Generated embedding for eskdata Text Summarisation with ChatGPT API: A Python Implementation: [-0.004596380051225424, -0.008125304244458675, 0.020604703575372696, -0.01681242696940899, 0.011145079508423805]...
Stored embedding for eskdata Text Summarisation with ChatGPT API: A Python Implementation


Add of existing embedding ID: 29
Insert of existing embedding ID: 29


Generated embedding for eskdata NLTK Sentiment Analysis Tutorial for Beginners: [-0.012482166290283203, -0.001407270086929202, 0.02546200528740883, -0.027116429060697556, 0.002390845911577344]...
Stored embedding for eskdata NLTK Sentiment Analysis Tutorial for Beginners


Add of existing embedding ID: 30
Insert of existing embedding ID: 30


Generated embedding for eskdata How to use GPT-4 and OpenAI’s functions for text classification: [0.012674856930971146, 0.013379410840570927, -0.01740746758878231, -0.027456259354948997, -0.0031740518752485514]...
Stored embedding for eskdata How to use GPT-4 and OpenAI’s functions for text classification


Add of existing embedding ID: 31
Insert of existing embedding ID: 31


Generated embedding for eskdata From Keywords to Insights: Extracting and Classifying Short-Text Data with OpenAI API: [0.005815125070512295, 0.019636936485767365, 0.0010940378997474909, -0.03185949847102165, -0.005271590314805508]...
Stored embedding for eskdata From Keywords to Insights: Extracting and Classifying Short-Text Data with OpenAI API


Add of existing embedding ID: 32
Insert of existing embedding ID: 32


Generated embedding for eskdata Named Entity Recognition to Enrich Text: [-0.0003796011151280254, 4.7478199121542275e-05, 0.014912868849933147, -0.011938915587961674, 0.008397468365728855]...
Stored embedding for eskdata Named Entity Recognition to Enrich Text


Add of existing embedding ID: 33
Insert of existing embedding ID: 33


Generated embedding for eskdata Understanding and Using OpenAI's Text Generation Models: [-0.014831261709332466, 0.004249914083629847, -0.00016658238018862903, -5.0658851250773296e-05, 0.0046808128245174885]...
Stored embedding for eskdata Understanding and Using OpenAI's Text Generation Models


Add of existing embedding ID: 34
Insert of existing embedding ID: 34


Generated embedding for eskdata Prompt Chaining Tutorial: What Is Prompt Chaining and How to Use It?: [-0.012264908291399479, 0.011590621434152126, -0.004184122197329998, -0.0222159493714571, -0.00026904899277724326]...
Stored embedding for eskdata Prompt Chaining Tutorial: What Is Prompt Chaining and How to Use It?


Add of existing embedding ID: 35
Insert of existing embedding ID: 35


Generated embedding for eskdata Understanding and Mitigating Bias in Large Language Models (LLMs): [-0.007359533570706844, 0.026502475142478943, 0.009350617416203022, -0.03142242133617401, -0.0021592723205685616]...
Stored embedding for eskdata Understanding and Mitigating Bias in Large Language Models (LLMs)


Add of existing embedding ID: 36
Insert of existing embedding ID: 36


Generated embedding for eskdata LLM Evaluation: Metrics, Methodologies, Best Practices: [-0.001768471673130989, 0.020123988389968872, 0.010089096613228321, -0.031141363084316254, 0.0032252585515379906]...
Stored embedding for eskdata LLM Evaluation: Metrics, Methodologies, Best Practices


Add of existing embedding ID: 37
Insert of existing embedding ID: 37


Generated embedding for eskdata What is Design Thinking? A Beginner’s Guide: [1.2216991308378056e-05, 0.018330246210098267, 0.00924246571958065, -0.008778408169746399, 0.0043440875597298145]...
Stored embedding for eskdata What is Design Thinking? A Beginner’s Guide


Add of existing embedding ID: 38
Insert of existing embedding ID: 38


Generated embedding for eskdata How Would I Learn to Code with ChatGPT if I Had to Start Again?: [-4.178859308012761e-05, 0.03221014514565468, 0.0279474388808012, -0.04772583767771721, 0.01598171330988407]...
Stored embedding for eskdata How Would I Learn to Code with ChatGPT if I Had to Start Again?


Add of existing embedding ID: 39
Insert of existing embedding ID: 39


Generated embedding for eskdata How to build a storyboard: [-0.0015698589850217104, -0.010086242109537125, -0.009668700397014618, -0.023799877613782883, 0.0038622610736638308]...
Stored embedding for eskdata How to build a storyboard


Add of existing embedding ID: 40
Insert of existing embedding ID: 40


Generated embedding for eskdata Become A Master Storyteller: 5 ChatGPT Prompts To Build Your Audience: [-0.002791930688545108, -0.004795352462679148, 0.016216887161135674, -0.020480927079916, -0.000788931967690587]...
Stored embedding for eskdata Become A Master Storyteller: 5 ChatGPT Prompts To Build Your Audience


Add of existing embedding ID: 41
Insert of existing embedding ID: 41


Generated embedding for eskdata Full Rag Demo codes: [0.00635345745831728, 0.031165456399321556, 0.01443672738969326, -0.024909300729632378, 0.0004367326037026942]...
Stored embedding for eskdata Full Rag Demo codes


Add of existing embedding ID: 42
Insert of existing embedding ID: 42


Generated embedding for eskdata Full Rag Demo: [-0.014182958751916885, 0.006695230957120657, 0.004580767825245857, -0.011060794815421104, -9.623327059671283e-05]...
Stored embedding for eskdata Full Rag Demo


Add of existing embedding ID: 43
Insert of existing embedding ID: 43


Generated embedding for eskdata What is Retrieval Augmented Generation (RAG)?: [-0.01464703120291233, 0.01174828503280878, 0.018047483637928963, -0.01108631119132042, 0.007985488511621952]...
Stored embedding for eskdata What is Retrieval Augmented Generation (RAG)?


Add of existing embedding ID: 44
Insert of existing embedding ID: 44


Generated embedding for eskdata Retrieval-Augmented Generation (RAG) from basics to advanced: [-0.02460929937660694, 0.0013945030514150858, 0.01399887166917324, -0.01883745566010475, 0.010122261010110378]...
Stored embedding for eskdata Retrieval-Augmented Generation (RAG) from basics to advanced


Add of existing embedding ID: 45
Insert of existing embedding ID: 45


Generated embedding for eskdata JSON Data in Python: [0.005208699963986874, 0.016771746799349785, 0.02941383421421051, -0.02443159930408001, -0.011789511889219284]...
Stored embedding for eskdata JSON Data in Python


Add of existing embedding ID: 46
Insert of existing embedding ID: 46


Generated embedding for eskdata Working with JSON files with Python: [0.0030381856486201286, 0.034373559057712555, 0.03255031257867813, -0.026651574298739433, -0.004732396919280291]...
Stored embedding for eskdata Working with JSON files with Python


Add of existing embedding ID: 47
Insert of existing embedding ID: 47


Generated embedding for eskdata Introduction to JSONL: [-0.005133396480232477, 0.04360116273164749, 0.025078242644667625, -0.0038905018009245396, -0.007925606332719326]...
Stored embedding for eskdata Introduction to JSONL


Add of existing embedding ID: 48
Insert of existing embedding ID: 48


Generated embedding for eskdata JSONL: [-0.011531481519341469, 0.03516613692045212, 0.007065995130687952, -0.0040994626469910145, -0.0013874031137675047]...
Stored embedding for eskdata JSONL


Add of existing embedding ID: 49
Insert of existing embedding ID: 49


Generated embedding for eskdata An Introduction to Vector Databases For Machine Learning: A Hands-On Guide With Examples: [-0.026362279430031776, 0.005450225435197353, 0.006352854426950216, -0.016578055918216705, -0.002191114705055952]...
Stored embedding for eskdata An Introduction to Vector Databases For Machine Learning: A Hands-On Guide With Examples


Add of existing embedding ID: 50
Insert of existing embedding ID: 50


Generated embedding for eskdata Introduction to Text Embeddings with the OpenAI API: [0.001523384591564536, -0.005314613226801157, 0.018776921555399895, -0.0053318459540605545, 0.009023124352097511]...
Stored embedding for eskdata Introduction to Text Embeddings with the OpenAI API


Add of existing embedding ID: 51
Insert of existing embedding ID: 51


Generated embedding for eskdata Understanding Text Classification in Python: [0.018245596438646317, 0.01677120476961136, 0.015797054395079613, -0.015007201582193375, 0.01792965643107891]...
Stored embedding for eskdata Understanding Text Classification in Python


Add of existing embedding ID: 52
Insert of existing embedding ID: 52


Generated embedding for eskdata An Introduction to Bag of Words (BoW): [-0.01455308124423027, 0.016220903024077415, 0.020834309980273247, 0.010914810933172703, -0.005359891802072525]...
Stored embedding for eskdata An Introduction to Bag of Words (BoW)


Add of existing embedding ID: 53
Insert of existing embedding ID: 53


Generated embedding for eskdata Pandas Documentation - Installation and Dependencies: [0.01730203814804554, 0.008977603167295456, 0.037077780812978745, 0.006344080436974764, 0.0016598490765318274]...
Stored embedding for eskdata Pandas Documentation - Installation and Dependencies


Add of existing embedding ID: 54
Insert of existing embedding ID: 54


Generated embedding for eskdata What Machine Learning Role is Right for You?: [-0.008556254208087921, 0.01167954970151186, 0.01936968043446541, -0.0374532975256443, -0.027296025305986404]...
Stored embedding for eskdata What Machine Learning Role is Right for You?


Add of existing embedding ID: 55
Insert of existing embedding ID: 55


Generated embedding for eskdata 10 Clustering Algorithms With Python: [0.004806830082088709, 0.008870510384440422, 0.01073514111340046, -0.041129980236291885, 0.015457522124052048]...
Stored embedding for eskdata 10 Clustering Algorithms With Python


Add of existing embedding ID: 56
Insert of existing embedding ID: 56


Generated embedding for eskdata 8 Clustering Algorithms in Machine Learning that All Data Scientists Should Know
: [0.0025764978490769863, 0.024297481402754784, 0.016645047813653946, -0.036533333361148834, 0.00555840041488409]...
Stored embedding for eskdata 8 Clustering Algorithms in Machine Learning that All Data Scientists Should Know



Add of existing embedding ID: 57
Insert of existing embedding ID: 57


Generated embedding for eskdata Unsupervised Learning and Data Clustering: [0.002182227559387684, 0.025530723854899406, 0.013394592329859734, -0.017136510461568832, 0.009860187768936157]...
Stored embedding for eskdata Unsupervised Learning and Data Clustering


Add of existing embedding ID: 58
Insert of existing embedding ID: 58


Generated embedding for eskdata Top 50 Matplotlib Visualizations – The Master Plots (with full Python code)
: [-0.0027891010977327824, -0.004495021887123585, 0.0073035284876823425, 0.0018205896485596895, -0.004135137889534235]...
Stored embedding for eskdata Top 50 Matplotlib Visualizations – The Master Plots (with full Python code)



Add of existing embedding ID: 59
Insert of existing embedding ID: 59


Generated embedding for eskdata Introduction to Data Preprocessing in Machine Learning: [-0.004934465046972036, 0.027075255289673805, 0.019480645656585693, -0.009956925176084042, -0.012765982188284397]...
Stored embedding for eskdata Introduction to Data Preprocessing in Machine Learning

Stored Embeddings in ChromaDB:
Query result for ID 0: {'ids': ['0'], 'embeddings': [[-0.003084039082750678, -0.001510584494099021, 0.016013214364647865, 0.006871545221656561, -0.016951169818639755, -0.010439855046570301, 0.0036226839292794466, 0.0018351307371631265, -0.03610449656844139, -0.038061968982219696, 0.024454815313220024, 0.020104875788092613, -0.026167605072259903, 0.014816981740295887, 0.0005424680421128869, 0.004108653869479895, 0.015197601169347763, -0.03208080306649208, -0.006045736372470856, -0.023571234196424484, -0.015782123431563377, -0.008428007364273071, -0.0015352227492257953, 0.0022072545252740383, -0.011343826539814472, 0.010738912969827652, 0.014246052131056786, -0.02005050145

In [3]:
# Retrieve Embedding by ID
# Verify a specific ID (5 in this case)
'''This line calls the retrieve_embedding_by_id function with the argument 5, which is the ID of the data we want to verify. 
The function returns three values: embedding, metadata, and document.'''
embedding, metadata, document = retrieve_embedding_by_id(5)  # Unpack all three values

# Check if Data Exists:
'''This line checks if all three values (embedding, metadata, and document) are not None. 
If any of these values is None, it means the data for the specified ID was not found or an error occurred.'''
if embedding and metadata and document:
    # Print Data if Exists:
    '''If the data exists, this line prints the ID, the first five elements of the embedding, the metadata, and the first 100 characters of the document.'''
    print(f"ID: 5, Embedding: {embedding[:5]}..., Metadata: {metadata}, Document (first 100 chars): {document[:100]}...")  # Include document

# Print Error Message if Data Not Found:
# If the data does not exist, this line prints an error message indicating the failure to retrieve data for the specified ID.
else:
    print(f"Failed to retrieve embedding and metadata for ID: 5")

Query result for ID 5: {'ids': ['5'], 'embeddings': [[-0.006038852035999298, -0.006417734548449516, 0.016883527860045433, -0.008568057790398598, -0.00540738133713603, -0.015155292116105556, -0.012330292724072933, -0.022493643686175346, -0.004892234690487385, -0.02037987858057022, 0.014570350758731365, 0.0006792462081648409, -0.029619289562106133, 0.013187762349843979, -0.013599880039691925, 0.0015670438297092915, 0.016125762835144997, -0.005852734204381704, -0.01389235071837902, -0.00820246897637844, -0.025298701599240303, 0.01068846881389618, 0.014689997769892216, -0.013407115824520588, -0.016338467597961426, 0.008435116149485111, 0.009066587314009666, -0.03778187558054924, 0.011373115703463554, -0.014105056412518024, 0.019422702491283417, -0.015873173251748085, -0.015407879836857319, -0.029645878821611404, -0.01018993929028511, 0.007757116574794054, 0.0076308222487568855, 0.003964969888329506, 0.0319058783352375, 0.022932348772883415, 0.025976702570915222, 0.005540322512388229, -0.01

# Query Vector Storage

In [3]:
# Query the collection for texts similar to "RAG" and get the top 5 results
# This line queries the ChromaDB collection to find the top 5 results that are most similar to the query text "RAG".
query_result = collection.query(query_texts=["RAG"], n_results=5)

# Print the structure of the query result
# These lines print the structure of the query result to see what keys are available in the returned dictionary.
print("\nQuery Result Structure:")
print(query_result.keys())

# Print the details of the query results
print("\nQuery Results:")

# Enumerate through the Results:
'''The enumerate function adds a counter (idx) to the loop. The zip function combines the corresponding elements from the lists of IDs, metadata, documents, and distances from the query result. 
The get method ensures that if the key is not found, an empty list is returned, preventing errors.'''
for idx, (ids, metadatas, documents, distances) in enumerate(zip(
        query_result.get('ids', [[]])[0],   
        query_result.get('metadatas', [[]])[0],
        query_result.get('documents', [[]])[0],
        query_result.get('distances', [[]])[0]
    )):

    # Print Each Result
    # For each result, it prints the ID, Eskdata (a specific field in metadata), the first 100 characters of the document, and the distance (a measure of similarity).
    print(f"\nResult {idx + 1}:")
    print(f"ID: {ids}")
    print(f"Eskdata: {metadatas.get('eskdata', 'Unknown')}")  # Access the 'eskdata' field in metadata
    print(f"Document (first 100 chars): {documents[:100]}...")  # Print the first 100 characters of the document
    print(f"Distance: {distances:.4f}")

    # Check if embeddings exist before accessing them
    # This part checks if embeddings exist and prints the first 5 values of the embedding for the current result.
    embeddings = query_result.get('embeddings', [])
    if embeddings and embeddings[0]:  # Check both levels
        print(f"Embedding (first 5): {embeddings[0][idx][:5]}...")



Query Result Structure:
dict_keys(['ids', 'distances', 'metadatas', 'embeddings', 'documents', 'uris', 'data', 'included'])

Query Results:

Result 1:
ID: 4
Eskdata: What is Retrieval Augmented Generation (RAG)?
Document (first 100 chars): Title: What is Retrieval Augmented Generation (RAG)?
Link: https://www.datacamp.com/blog/what-is-ret...
Distance: 0.1683

Result 2:
ID: 5
Eskdata: Retrieval-Augmented Generation (RAG) from basics to advanced
Document (first 100 chars): Title: Retrieval-Augmented Generation (RAG) from basics to advanced
Link: https://medium.com/@tejpal...
Distance: 0.1860

Result 3:
ID: 10
Eskdata: An Introduction to Vector Databases For Machine Learning: A Hands-On Guide With Examples
Document (first 100 chars): Title: An Introduction to Vector Databases For Machine Learning: A Hands-On Guide With Examples
Link...
Distance: 0.2848

Result 4:
ID: 15
Eskdata: What Machine Learning Role is Right for You?
Document (first 100 chars): Title: What Machine Learning Role is 

In [4]:
'''This line defines a function named return_best_eskdata that takes three parameters: user_input, collection, and n_results (default is 1). 
The function will find and return the best matching data from the collection based on the user input.'''
def return_best_eskdata(user_input, collection, n_results=1):
    # Query the collection for texts similar to user_input and get the top result
    # This line queries the ChromaDB collection to find the top results that are most similar to the user_input. It returns a dictionary containing the query results.
    query_result = collection.query(query_texts=[user_input], n_results=n_results)
    
    # Check if there are no results
    '''These lines check if there are no results by verifying if the 'ids' key is empty or the first element in the 'ids' list is empty.
    If no results are found, it prints a message and returns None.'''
    if not query_result['ids'] or not query_result['ids'][0]:
        print("No data found matching the query.")
        return None, None  # No results found

    # Get the top result's ID, metadata, and document
    # These lines extract the ID, metadata, and document of the top result from the query results.
    top_result_id = query_result['ids'][0][0]
    top_result_metadata = query_result['metadatas'][0][0]
    top_result_document = query_result['documents'][0][0]
    
    # Print the top result in a readable and formatted manner
    print("Top Data Found:")
    print("---------------")
    print(f"Title: {top_result_metadata.get('eskdata', 'Unknown Data')}")
    print("\nBody:")
    print("-----------------")
    print(top_result_document)
    print("\nRecommendation:")
    print("----------------")
    
    # This line returns the name and document of the top result.
    return top_result_metadata.get('eskdata', 'Unknown Data'), top_result_document


In [5]:
# Example function usage
user_input = "Bag of Words"
top_name, top_document = return_best_eskdata(user_input, collection, n_results=1)

# Print the result
if top_name and top_document:
    print("\nBest Match Found:")
    print(f"Title: {top_name}")
    print(f"Document: {top_document[:100]}...")  # Print the first 100 characters of the document

Top Data Found:
---------------
Title: An Introduction to Bag of Words (BoW)

Body:
-----------------
Title: An Introduction to Bag of Words (BoW)
Link: https://medium.com/@vamshiprakash001/an-introduction-to-bag-of-words-bow-c32a65293ccc
Publish Date: Jun 27, 2023
Sprint: Sprint 4
Body: Vamshi Prakash
Jun 27, 2023

Using Natural Language Processing, we make use of the text data available across the internet to generate insights for the business. In order to understand this huge amount of data and make insights from them, we need to make them usable. Natural language processing helps us to do so.

What is a Bag of Words in NLP?
Bag of words is a Natural Language Processing technique of text modelling. In technical terms, we can say that it is a method of feature extraction with text data. This approach is a simple and flexible way of extracting features from documents.

A bag of words is a representation of text that describes the occurrence of words within a document. We just keep tra

In [6]:
# Import the OpenAI library and initialize the client with the API key
from openai import OpenAI

api_key = os.getenv("OPENAI_API_KEY")
client = OpenAI(api_key=api_key)

'''This line defines a function named generate_conversational_response that takes two parameters: user_input and collection. 
The function will generate a conversational response based on the user input and the collection of data.'''
def generate_conversational_response(user_input, collection):
    # Generate a conversational response based on the user input and collection
    
    # Find the most relevant article based on the user input
    '''This line calls the return_best_eskdata function to find the most relevant article based on the user input and the collection. 
    It returns the name and document of the most relevant article.'''
    relevant_name, relevant_document = return_best_eskdata(user_input, collection)
    
    # If no relevant article is found, return a message indicating so
    # These lines check if no relevant article was found. If no article is found, it returns a message indicating so.
    if not relevant_name:
        return "I couldn't find any relevant articles based on your input."

    # Prepare the messages for the conversational AI
    '''These lines prepare a list of messages to be sent to the OpenAI API. 
    The system message sets the context, the user message contains the user input, and the assistant message contains the recommendation based on the relevant article.'''
    messages = [
        {"role": "system", "content": "You are a bot that makes recommendations for each Sprint 1 to 4 for the Data Science bootcamp."},
        {"role": "user", "content": user_input},
        {"role": "assistant", "content": f"This is the recommended article: {relevant_name}. Here is a brief about the article: {relevant_document}"}
    ]
    
    # Generate a response from the OpenAI API
    # This line calls the OpenAI API to generate a response based on the prepared messages. It uses the gpt-3.5-turbo model and limits the response to 200 tokens.
    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=messages,
        max_tokens=200
    )
    
    # Return the generated response
    return response.choices[0].message.content


In [7]:
# Example usage
user_input = "I need to learn Data Visualization"
print(generate_conversational_response(user_input, collection))

Top Data Found:
---------------
Title: Top 50 Matplotlib Visualizations – The Master Plots (with full Python code)


Body:
-----------------
Title: Top 50 Matplotlib Visualizations – The Master Plots (with full Python code)

Link: https://www.machinelearningplus.com/plots/top-50-matplotlib-visualizations-the-master-plots-python/
Publish Date: nan
Sprint: Sprint 1
Body: Correlation

Scatter plot
Scatterplot is a classic and fundamental plot used to study the relationship between two variables. If you have multiple groups in your data you may want to visualise each group in a different color. In matplotlib, you can conveniently do this using plt.scatterplot().

python
Copy code
import matplotlib.pyplot as plt
import seaborn as sns

df = sns.load_dataset('iris')
sns.scatterplot(x='sepal_length', y='sepal_width', hue='species', data=df)
plt.show()
Bubble plot with Encircling
Sometimes you want to show a group of points within a boundary to emphasize their importance. In this example, you g