In [None]:
!pip install llama-index

!pip install llama-index-vector-stores-mongodb

!pip install llama-index-embeddings-openai

!pip install pymongo

!pip install datasets

!pip install pandas


In [None]:
# Provide the OpenAI API key without writing it into the notebook.
# `%env OPENAI_API_KEY=<key>` keeps the literal key in the cell source and echoes
# it into the cell output, so it leaks when the notebook is shared.
# A key that is already present in the environment is reused; otherwise it is
# requested through a hidden prompt.
import os
from getpass import getpass

if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("Enter your OpenAI API key: ")

In [None]:
import pandas as pd
import uuid  # Optional for random UUIDs
import re

from datasets import load_dataset

# Load the IT-support dataset through datasets.load_dataset.
dataset = load_dataset("aakash0017/It-support-synthetic-data")

# Capture the text between the "### Human: " and "### Assistant: " markers.
# The match is non-greedy, so it stops at the first assistant marker.
pattern = r"(?<=### Human: )(.*?)(?=### Assistant: )"

# Parallel lists: questions[i] belongs to answers[i].
questions = []
answers = []

for row in dataset["train"]:
  text = row["text"]
  # re.DOTALL lets "." match newlines; without it a question spanning several
  # lines never matches and the whole row is silently skipped.
  match = re.search(pattern, text, flags=re.DOTALL)
  if match:
    questions.append(match.group(1).strip())  # Drop surrounding whitespace
    # The answer is everything after the last assistant marker.
    answers.append(text.split("### Assistant: ")[-1])

# Create DataFrame with extracted data
dataset_df = pd.DataFrame({"question": questions, "answer": answers})

# Ticket ids: "IDC" followed by the first 8 hex characters of a random UUID.
# Those 8 characters are always present, so no zero padding is needed.
# NOTE: a truncated UUID is random, not guaranteed unique; collisions are
# unlikely but possible.
dataset_df["ticket-number"] = [
    "IDC" + uuid.uuid4().hex[:8] for _ in range(len(dataset_df))
]

# Text that is later embedded: question and answer joined by a single space.
dataset_df["text_content"] = dataset_df["question"] + " " + dataset_df["answer"]

# Reorder columns to make "ticket-number" the first column
reordered_cols = ["ticket-number", "question", "answer", "text_content"]
dataset_df = dataset_df[reordered_cols]
dataset_df.head(10)


In [None]:
# Disabled cell: everything inside the triple-quoted string below is inert text,
# not executed code. The text describes trimming dataset_df to its first 10 rows
# so that the rest of the notebook runs faster during testing.
""" For testing purpose. Takes 10 rows from the data so it takes less time.
import pandas as pd
import re

from datasets import load_dataset

# Assuming you have already run the code that creates the DataFrame (df) with questions and answers

# Extract only the first 10 rows
dataset_df = dataset_df.head(10)

dataset_df.head(100)
"""

Unnamed: 0,ticket-number,question,answer,text_content
0,IDC79659407,What is the newest learning management system ...,Blackboard Learn.,What is the newest learning management system ...
1,IDCfc1b32d0,Does Boston University offer a laptop loan pro...,"Yes, they have a laptop loaner program for act...",Does Boston University offer a laptop loan pro...
2,IDC5385384b,What is Blackboard Learn?,Blackboard Learn is the primary learning manag...,What is Blackboard Learn? Blackboard Learn is ...
3,IDCb586e780,What are the key features of Blackboard Learn?,The key features of Blackboard Learn include i...,What are the key features of Blackboard Learn?...
4,IDC1309c0a1,What to expect from Blackboard Learn?,"Blackboard Learn is normally available 24/7, e...",What to expect from Blackboard Learn? Blackboa...
5,IDCf1f27cca,How do I get started with Blackboard Learn?,"To get started with Blackboard Learn, faculty ...",How do I get started with Blackboard Learn? To...
6,IDC53714075,What does the IT Help Center provide?,The IT Help Center provides centralized techno...,What does the IT Help Center provide? The IT H...
7,IDC87909b25,How do I contact the IT Help Center?,You can contact the IT Help Center online or b...,How do I contact the IT Help Center? You can c...
8,IDCbd8115e6,What services does the IT Help Center offer?,The IT Help Center offers a variety of service...,What services does the IT Help Center offer? T...
9,IDCa2be0617,Do I need a current BU login name to use some ...,"Yes, some services require a current BU login ...",Do I need a current BU login name to use some ...


In [None]:
from llama_index.core.settings import Settings

from llama_index.llms.openai import OpenAI

from llama_index.embeddings.openai import OpenAIEmbedding

# Chat model used for answer generation, registered as the global default.
llm = OpenAI(model="gpt-4-turbo")
Settings.llm = llm

# Embedding model, registered as the global default.
# NOTE(review): the vector index presumably has to be defined with the same
# number of dimensions (256) - confirm against the index definition.
embed_model = OpenAIEmbedding(
    model="text-embedding-3-small",
    dimensions=256,
)
Settings.embed_model = embed_model


In [None]:
import json
from llama_index.core import Document
from llama_index.core.schema import MetadataMode

# Convert the DataFrame to a JSON string representation
documents_json = dataset_df.to_json(orient="records")
# Load the JSON string into a Python list of dictionaries
documents_list = json.loads(documents_json)

# Value for metadata must be one of (str, int, float, None).
# Only values outside these types are serialised: calling json.dumps on a value
# that is already a string would wrap it in literal quote characters.
metadata_scalar_types = (str, int, float, type(None))

# Metadata keys kept on the document but hidden from the LLM and the embedding
# model. "text_content" is hidden as well because it is already the document
# text; leaving it visible would repeat the whole text in the metadata section.
hidden_metadata_keys = ["question", "answer", "text_content"]

llama_documents = []

for document in documents_list:
  metadata = {
      key: value if isinstance(value, metadata_scalar_types) else json.dumps(value)
      for key, value in document.items()
  }

  # Create a Document object with the text and excluded metadata for llm and embedding models
  llama_document = Document(
      text=document["text_content"],
      metadata=metadata,
      excluded_llm_metadata_keys=list(hidden_metadata_keys),
      excluded_embed_metadata_keys=list(hidden_metadata_keys),
      metadata_template="{key}=>{value}",
      text_template="Metadata: {metadata_str}\n-----\nContent: {content}",
      )

  llama_documents.append(llama_document)

# Observing an example of what the LLM and Embedding model receive as input
print(
    "\nThe LLM sees this: \n",
    llama_documents[0].get_content(metadata_mode=MetadataMode.LLM),
)
print(
    "\nThe Embedding model sees this: \n",
    llama_documents[0].get_content(metadata_mode=MetadataMode.EMBED),
)

In [None]:
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.schema import MetadataMode

# Split the documents into nodes using the splitter's default settings.
parser = SentenceSplitter()
nodes = parser.get_nodes_from_documents(llama_documents)

# Embed the text in MetadataMode.EMBED so that the metadata keys listed in
# excluded_embed_metadata_keys are really left out; metadata_mode="all" would
# ignore those exclusions.
node_texts = [node.get_content(metadata_mode=MetadataMode.EMBED) for node in nodes]

# One batched call instead of one embedding request per node.
node_embeddings = embed_model.get_text_embedding_batch(node_texts)

for node, node_embedding in zip(nodes, node_embeddings):
    node.embedding = node_embedding


In [None]:
# Add a Colab secret named MONGO_URI (the MongoDB connection string) before running this cell.
import pymongo
from google.colab import userdata


def get_mongo_client(mongo_uri):
  """Create a MongoDB client and verify that the server can be reached.

  Constructing MongoClient does not contact the server, so a "ping" command is
  sent to force a real round trip before success is reported.

  Args:
    mongo_uri: MongoDB connection string.

  Returns:
    The connected pymongo.MongoClient, or None when the server cannot be
    reached (the failure reason is printed).
  """
  client = None
  try:
    client = pymongo.MongoClient(mongo_uri)
    # Raises ConnectionFailure (or a subclass) when no server answers.
    client.admin.command("ping")
    print("Connection to MongoDB successful")
    return client
  except pymongo.errors.ConnectionFailure as e:
    print(f"Connection failed: {e}")
    if client is not None:
      # Release the sockets and background threads of the unusable client.
      client.close()
    return None

# Read the connection string from the Colab secret store.
mongo_uri = userdata.get('MONGO_URI')
if not mongo_uri:
  # Stop here: continuing without a URI would only fail later and less clearly.
  raise ValueError("MONGO_URI is not set; add it as a Colab secret before running this cell")

mongo_client = get_mongo_client(mongo_uri)
if mongo_client is None:
  # get_mongo_client returns None on connection failure; indexing None below
  # would raise an unrelated TypeError.
  raise ConnectionError("Could not connect to MongoDB; check MONGO_URI and network access")

# Database and collection that hold the embedded support tickets.
DB_NAME="itdata"
COLLECTION_NAME="it_support_data"

db = mongo_client[DB_NAME]
collection = db[COLLECTION_NAME]

Connection to MongoDB successful


In [None]:
# To ensure we are working with a fresh collection
# delete any existing records in the collection.
# The empty filter {} matches every document, so this call empties the whole
# collection referenced by `collection`. It is destructive; re-run with care.

collection.delete_many({})


DeleteResult({'n': 0, 'electionId': ObjectId('7fffffff000000000000030b'), 'opTime': {'ts': Timestamp(1714584301, 16), 't': 779}, 'ok': 1.0, '$clusterTime': {'clusterTime': Timestamp(1714584301, 16), 'signature': {'hash': b'[O\xaa\xe8\x1dCM.\xba\xe4\x18\x9b^\xef=\xa7\xb0K|\x0b', 'keyId': 7311276111621521410}}, 'operationTime': Timestamp(1714584301, 16)}, acknowledged=True)

In [None]:
from llama_index.vector_stores.mongodb import MongoDBAtlasVectorSearch

# Vector store backed by the MongoDB collection configured above.
# NOTE(review): "vector_index" is presumably the name of a vector search index
# that must already exist on the collection - confirm.
vector_store = MongoDBAtlasVectorSearch(
    mongo_client,
    db_name=DB_NAME,
    collection_name=COLLECTION_NAME,
    index_name="vector_index",
)

# Write the nodes (with the embeddings attached earlier) into the store.
vector_store.add(nodes)


In [None]:
from llama_index.core import VectorStoreIndex, StorageContext
# Build the index object on top of the vector store populated in the previous cell.
# NOTE(review): presumably this reuses the stored embeddings rather than
# re-embedding the documents - confirm.
index = VectorStoreIndex.from_vector_store(vector_store)

In [None]:

import pprint
from llama_index.core.response.notebook_utils import display_response

# Query engine over the index; similarity_top_k=3 presumably limits retrieval
# to the three most similar nodes per query - confirm.
query_engine = index.as_query_engine(similarity_top_k=3)
# Sample question, sent to the engine verbatim.
query = "Do I need a current BU login name to use some services??"
response = query_engine.query(query)
# Rendered answer first, then the raw source nodes the response carries.
display_response(response)
pprint.pprint(response.source_nodes)
