In [None]:
import json
import os
from typing import Any, Dict, Iterable, List, Optional

#import chromadb
import pandas as pd
import requests
import tqdm
#from chromadb.api.types import EmbeddingFunction
from dotenv import load_dotenv
from genai import Model
from genai.model import Credentials
from genai.schemas import GenerateParams
#from rouge import Rouge
from datasets import Dataset
#from sentence_transformers import SentenceTransformer
import numpy as np

from bs4 import BeautifulSoup
import re

import torch
from transformers import AutoTokenizer, AutoModel
from chunkipy import TextChunker, TokenEstimator
from pymilvus import connections, FieldSchema, CollectionSchema, DataType, Collection, utility
from milvus import default_server
from pymilvus import connections, utility

from torch.nn.functional import normalize
from torch import clamp, sum

In [None]:
def get_genai_creds():
    """Build GenAI ``Credentials`` from environment variables.

    The ``.env`` file is (re)loaded with ``override=True`` first, then
    ``GENAI_KEY`` and ``GENAI_API`` are read. A warning is printed when either
    one is missing; the ``Credentials`` object is still constructed in that
    case. Trailing slashes are removed from the endpoint.

    Returns:
        The ``Credentials`` instance built from the key and endpoint.
    """
    load_dotenv(override=True)
    key = os.environ.get("GENAI_KEY")
    endpoint = os.environ.get("GENAI_API")

    if None in (key, endpoint):
        print("Either api_key or api_url is None. Please make sure your credentials are correct.")

    # Normalise the endpoint so later f-strings can append "/path" safely.
    endpoint = endpoint.rstrip("/") if endpoint is not None else None
    return Credentials(key, endpoint)

# Build the credentials once for the whole notebook; later cells read `creds.api_endpoint`.
creds = get_genai_creds()
# Echo the endpoint (never the key) as a quick sanity check.
if creds.api_endpoint:
    print(f"Your API endpoint is: {creds.api_endpoint}")

In [None]:
# get the list of supported models from the API
# The timeout stops the cell from hanging on an unreachable endpoint, and
# raise_for_status() reports HTTP errors directly instead of failing later on a
# missing "results" key.
models_response = requests.get(f"{creds.api_endpoint}/models", timeout=30)
models_response.raise_for_status()

# Parse the JSON response
models_data = json.loads(models_response.content)

# Collect the ids (the list used to be created but never filled), then print them.
model_ids = [model_n["id"] for model_n in models_data["results"]]
for model_id in model_ids:
    print(model_id)

In [None]:
def remove_html_tags(html_text):
    """Strip markup from ``html_text`` and return only its text content.

    The input is parsed with BeautifulSoup's ``html.parser`` and the text
    pieces are joined with a newline separator.
    """
    return BeautifulSoup(html_text, "html.parser").get_text(separator="\n")

In [None]:
import re

def cap_consecutive_newlines(input_str):
    """Cap every run of newlines at two, i.e. at most one blank line.

    Runs of three or more newlines are replaced by exactly two; shorter runs
    are left untouched. The previous replacement was a single newline, which
    contradicted the documented cap and made a 3-newline run shorter than a
    2-newline run.

    Args:
        input_str: Text to clean.

    Returns:
        The text with long newline runs collapsed to two newlines.
    """
    # Use a regular expression to replace consecutive newlines with a maximum of two
    return re.sub(r'\n{3,}', '\n\n', input_str)

def remove_extra_spaces(input_str):
    """Collapse each run of space characters to one space and trim both ends.

    Only the literal space character is squeezed; tabs and newlines inside the
    text are left alone. The final ``strip()`` removes any leading or trailing
    whitespace.
    """
    squeezed = re.compile(r' +').sub(' ', input_str)
    return squeezed.strip()

def preprocess_text_input(txt):
    """Clean whitespace in ``txt``: squeeze spaces first, then cap newline runs."""
    despaced = remove_extra_spaces(txt)
    return cap_consecutive_newlines(despaced)

In [None]:
def load_data_v1(filename):
    """Load a tabular data file into a DataFrame.

    Files whose name ends in ``.csv`` (matched case-insensitively, so
    ``DATA.CSV`` works too) are read with ``pd.read_csv``; everything else is
    handed to ``pd.read_excel``.

    Args:
        filename: Path of the file to read (string or path-like).

    Returns:
        The loaded ``pandas.DataFrame``.
    """
    # Compare on a lowercased name so the extension check does not depend on case.
    if str(filename).lower().endswith('.csv'):
        return pd.read_csv(filename, header=0, low_memory=False)
    return pd.read_excel(filename)

In [None]:
# Load the two raw inputs: the ticket export (Excel) and the knowledge-base export (CSV).
tickets = load_data_v1("ExampleTicketData.xlsx")
knowledge = load_data_v1("ExampleKbData.csv")

In [None]:
def refine_ticket_data(tickets):
    """Drop tickets that carry no usable comment or resolution text.

    A row is removed when both ``additional_comments`` and ``resolution`` are
    missing, or when both hold a "short" value (string form under 10
    characters). A row with at least one longer value is kept.

    Args:
        tickets: DataFrame with ``additional_comments`` and ``resolution`` columns.

    Returns:
        The filtered DataFrame (original index labels are preserved).
    """
    def _short_values(column):
        # Extend the returned list if more values should be treated as uninformative.
        return [value for value in column.unique() if len(str(value)) < 10]

    short_comments = _short_values(tickets['additional_comments'])
    short_resolutions = _short_values(tickets['resolution'])

    both_missing = tickets['additional_comments'].isna() & tickets['resolution'].isna()
    kept = tickets[~both_missing]

    both_short = (
        kept['additional_comments'].isin(short_comments)
        & kept['resolution'].isin(short_resolutions)
    )
    return kept[~both_short]

In [None]:
# Drop tickets without usable comment/resolution text. Note this rebinds `tickets`,
# so the unfiltered frame is only recoverable by re-running the load cell.
tickets = refine_ticket_data(tickets)

In [None]:
# Embedding model and its matching tokenizer. Both names are used as globals by the
# tokenization and embedding cells further down.
model = AutoModel.from_pretrained('intfloat/e5-base-v2')
tokenizer = AutoTokenizer.from_pretrained('intfloat/e5-base-v2')

# Chunk size, in tokens, handed to the text chunker below.
TOKENIZER_MAX_SIZE = 512 # may have to change this if embedding model is changed

In [None]:
class BertTokenEstimator(TokenEstimator):
    """Token estimator that counts tokens with a Hugging Face tokenizer.

    Args:
        model_name: Checkpoint whose tokenizer is loaded. Defaults to
            ``'intfloat/e5-base-v2'``, the embedding model used in this
            notebook, so existing ``BertTokenEstimator()`` calls are unchanged.
    """

    def __init__(self, model_name='intfloat/e5-base-v2'):
        # Load the tokenizer once per estimator instance.
        self.bert_tokenizer = AutoTokenizer.from_pretrained(model_name)

    def estimate_tokens(self, text):
        """Return the number of token ids the tokenizer's ``encode`` yields for ``text``."""
        return len(self.bert_tokenizer.encode(text))

# One estimator is enough: build it once and hand the same instance to the chunker
# (previously a second instance, and a second tokenizer load, was created here).
bert_token_estimator = BertTokenEstimator()

# NOTE(review): chunks are sized to TOKENIZER_MAX_SIZE tokens, but the tokenization
# cells later prepend "passage: " and tokenize with truncation=True. A full-size
# chunk may therefore lose its tail at embedding time. Confirm whether the chunk
# size should leave headroom for the prefix.
text_chunker = TextChunker(TOKENIZER_MAX_SIZE, tokens=True, token_estimator=bert_token_estimator, overlap_percent=0.3)

In [None]:
import unicodedata

def remove_non_ascii(text):
    """Return ``text`` reduced to ASCII.

    The text is NFKD-normalised first, so accented letters decompose into a
    base letter plus combining marks. Every character that still cannot be
    encoded as ASCII is then dropped.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')

def remove_emails(text):
    """Delete every whitespace-delimited token that has an ``@`` between non-space characters."""
    email_like = re.compile(r'\S+@\S+')
    return email_like.sub('', text)

def chunkTicketData(row):
    """Render a ticket row as index text and return the chunks ``text_chunker`` produces for it."""
    return text_chunker.chunk(createIndexStrTickets(row))

def chunkKbData(row):
    """Render a knowledge-base row as index text and return the chunks ``text_chunker`` produces for it."""
    return text_chunker.chunk(createIndexStrKb(row))

def convert_to_lower(inp):
    """Return ``inp.lower()``; ``inp`` must provide a ``lower`` method (e.g. a ``str``)."""
    lowered = inp.lower()
    return lowered

In [None]:
def createIndexStrTickets(row):
    """Render one ticket row as the text that is chunked and embedded.

    The output has a ``Subject`` and a ``Description`` section. ``Resolution
    notes`` and ``Additional Comments`` are added only when the field is
    non-null and its string form has at least 10 characters.

    Earlier versions rebuilt the list of "short" values from the global
    ``tickets`` frame on every call. For a row taken from that frame, a value
    is in that list exactly when its own string form is shorter than 10
    characters, so the check is now done on the row value directly. This
    removes the per-row scan and the dependency on the global.

    Args:
        row: Mapping/Series with ``short_description``, ``long_description``,
            ``resolution`` and ``additional_comments``.

    Returns:
        The cleaned text: spaces and newlines normalised, e-mail-like tokens
        and carriage returns removed, non-ASCII characters stripped.
    """
    def _strip_trailing_phone(value):
        # Removes a "+91" followed by digits when it sits at the very end of the field.
        return re.sub(r"\+91\s?\d+$", "", str(value).strip(), flags=re.DOTALL)

    def _is_informative(value):
        # Same threshold refine_ticket_data uses to call a value "short".
        return pd.notnull(value) and len(str(value)) >= 10

    result = 'Subject: ' + str(row['short_description']) + '\n\n'
    result += 'Description: ' + _strip_trailing_phone(row['long_description']) + '\n\n'

    if _is_informative(row['resolution']):
        result += 'Resolution notes: ' + _strip_trailing_phone(row['resolution']) + '\n\n'
    if _is_informative(row['additional_comments']):
        result += 'Additional Comments: ' + _strip_trailing_phone(row['additional_comments'])

    result = preprocess_text_input(result)
    result = remove_emails(result)
    return remove_non_ascii(result.replace('\r',''))

In [None]:
# Keep only the columns used downstream. The explicit .copy() makes this an
# independent frame, so the column assignment below does not write into a slice
# of the previous `tickets` (which triggers SettingWithCopyWarning).
tickets = tickets[['assignment_group', 'short_description', 'long_description', 'resolution', 'additional_comments']].copy()
tickets['assignment_group'] = tickets['assignment_group'].apply(convert_to_lower)
# Name the index 'ID' without changing its values. This is what the former
# reset_index / rename / set_index round trip did, and it is safe to re-run.
tickets = tickets.rename_axis('ID')

In [None]:
# Chunk every ticket, then explode so each chunk gets its own row. reset_index()
# turns the ticket index into a regular column alongside the chunk text.
# NOTE(review): a ticket that yields no chunks would explode to a missing `text`
# value; confirm whether such rows can occur and should be dropped.
ticket_chunks = tickets.apply(chunkTicketData, axis=1).explode().reset_index()

ticket_chunks.columns = ['ID', 'text']

# Build a mapping from ticket 'ID' (the index of `tickets`) to its 'assignment_group'.
class_mapping = tickets['assignment_group'].to_dict()

# Attach the 'assignment_group' of the parent ticket to every chunk using this mapping.
ticket_chunks['assignment_group'] = ticket_chunks['ID'].map(class_mapping)

ticket_chunks.head()

In [None]:
def createIndexStrKb(row):
    """Render one knowledge-base row as the text that is chunked and embedded.

    The ``question``, ``answer`` and ``tags`` fields are stripped of HTML and
    laid out as ``Topic`` / ``Answer`` / ``Tags`` sections. Phone-number-like
    ``+91 <digits>`` runs and e-mail-like tokens are removed, whitespace is
    normalised, and carriage returns and non-ASCII characters are dropped.

    Args:
        row: Mapping/Series with ``question``, ``answer`` and ``tags``.

    Returns:
        The cleaned index text.
    """
    result = 'Topic: ' + remove_html_tags(str(row['question'])) + '\n\n'
    result += 'Answer: ' + remove_html_tags(str(row['answer'])) + '\n\n'
    result += 'Tags: ' + remove_html_tags(str(row['tags']))
    # re.sub returns a new string; the result used to be discarded, so the
    # phone numbers were never actually removed.
    result = re.sub(r"\+91\s\d+\b", "", result, flags=re.DOTALL)
    result = remove_emails(result)
    result = preprocess_text_input(result)
    return remove_non_ascii(result.replace('\r',''))

In [None]:
# Chunk every knowledge-base row, then explode so each chunk gets its own row.
# reset_index() turns the row index into a regular column alongside the chunk text.
knowledge_chunks = knowledge.apply(chunkKbData, axis=1).explode().reset_index()

knowledge_chunks.columns = ['ID', 'text']

# Map each knowledge row index to its 'tags' value (this rebinds the
# `class_mapping` name used for tickets earlier).
class_mapping = knowledge['tags'].to_dict()

# Attach the parent row's tags to every chunk.
knowledge_chunks['tags'] = knowledge_chunks['ID'].map(class_mapping)

knowledge_chunks.head()

In [None]:
# Account name; it is only used to derive the collection name below.
account = "example"

EMBEDDING_DIMENSION = 768 # may need to change this if embedding model changes
COLLECTION_NAME = f"AMS_{account}"
# NOTE(review): nothing in this notebook calls default_server.start(); presumably a
# Milvus server is already listening on this port. Confirm before a fresh run.
connections.connect(host='127.0.0.1', port=default_server.listen_port)

# Check if the server is ready.
print(utility.get_server_version())

# Remove collection if it already exists (destructive: re-running this cell wipes
# everything that was inserted before).
if utility.has_collection(COLLECTION_NAME):
    utility.drop_collection(COLLECTION_NAME)

# Create the collection schema: an auto-generated primary key, the source ticket id,
# the assignment group / tags string, a two-character source type, the raw chunk
# text and its embedding. The insert functions supply one column per field, in this
# order, excluding the auto-generated `id`.
fields = [
    FieldSchema(name='id', dtype=DataType.INT64, is_primary=True, auto_id=True),
    FieldSchema(name='ticket_id', dtype=DataType.INT64),
    FieldSchema(name='assignment_id', dtype=DataType.VARCHAR, max_length=128),
    FieldSchema(name='type', dtype=DataType.VARCHAR, max_length=2),
    FieldSchema(name='chunk', dtype=DataType.VARCHAR, max_length=6000),
    FieldSchema(name='chunk_embedding', dtype=DataType.FLOAT_VECTOR, dim=EMBEDDING_DIMENSION)
]

schema = CollectionSchema(fields=fields)
collection = Collection(name=COLLECTION_NAME, schema=schema)

# Create a FLAT index with the inner-product metric. The insert functions
# L2-normalise the embeddings first, so inner product on these vectors equals
# cosine similarity.
index_params = {
    'metric_type':'IP',
    'index_type':"FLAT"
}

collection.create_index(field_name="chunk_embedding", index_params=index_params)
collection.load()

In [None]:
def tokenize_ticket_data(batch):
    """Tokenize a batch of chunk texts for the embedding model.

    Every text in ``batch['text']`` is prefixed with ``"passage: "`` and run
    through the global ``tokenizer`` with truncation and max-length padding.
    The resulting ``input_ids``, ``token_type_ids`` and ``attention_mask`` are
    stored on the batch, which is then returned.
    """
    prefixed_texts = []
    for passage in batch['text']:
        prefixed_texts.append("passage: " + passage)

    encoded = tokenizer(
        prefixed_texts,
        add_special_tokens=True,
        truncation=True,
        padding="max_length",
        return_attention_mask=True,
        return_tensors="pt",
    )

    for field in ('input_ids', 'token_type_ids', 'attention_mask'):
        batch[field] = encoded[field]
    return batch

TOKENIZATION_BATCH_SIZE = 256 # may need to lower this for larger embedding models

# Wrap the chunk DataFrame in a `datasets.Dataset` so it can be processed in batches.
ticket_dataset = Dataset.from_pandas(ticket_chunks)

# Generate the tokens for each entry.
ticket_dataset = ticket_dataset.map(tokenize_ticket_data, batch_size=TOKENIZATION_BATCH_SIZE, batched=True)
# Set the output format to torch so it can be pushed into the embedding model.
# output_all_columns=True keeps the remaining columns (ID, text, ...) available too.
ticket_dataset.set_format('torch', columns=['input_ids', 'token_type_ids', 'attention_mask'], output_all_columns=True)

In [None]:
EMBEDDING_BATCH_SIZE = 16

# Embed the tokenized data and take the mean pool with respect to attention mask of hidden layer.
def embed(batch):
    """Compute one embedding per row of a tokenized batch.

    The global ``model`` is run on ``input_ids`` / ``token_type_ids`` /
    ``attention_mask``. Its first output is averaged over the sequence
    dimension, counting only positions where the attention mask is set.

    Args:
        batch: Mapping holding the three tokenizer outputs as tensors.

    Returns:
        The same batch with a ``question_embedding`` entry added.
    """
    # Inference only: without no_grad() every forward pass records an autograd
    # graph that is never used, which only costs memory.
    with torch.no_grad():
        sentence_embs = model(
                    input_ids=batch['input_ids'],
                    token_type_ids=batch['token_type_ids'],
                    attention_mask=batch['attention_mask']
                    )[0]
    input_mask_expanded = batch['attention_mask'].unsqueeze(-1).expand(sentence_embs.size()).float()
    # torch.sum / torch.clamp are spelled out so this does not rely on a `sum`
    # that shadows the Python builtin. The clamp guards against an all-zero mask.
    batch['question_embedding'] = torch.sum(sentence_embs * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
    return batch

# Embed in small batches and drop the token columns, which are no longer needed afterwards.
ticket_dataset = ticket_dataset.map(embed, remove_columns=['input_ids', 'token_type_ids', 'attention_mask'], batched = True, batch_size=EMBEDDING_BATCH_SIZE)

In [None]:
# Display the dataset summary (columns and row count) as a quick check after embedding.
ticket_dataset

In [None]:
def insert_tickets(batch):
    """Insert one batch of embedded ticket chunks into the global ``collection``.

    The columns are supplied in schema order (without the auto-generated id):
    ticket id, lower-cased assignment group, the literal type ``'td'``, the
    chunk text, and the L2-normalised embedding.

    Args:
        batch: Mapping with ``ID``, ``assignment_group``, ``text`` and
            ``question_embedding``.
    """
    # torch.as_tensor accepts tensors and plain Python lists alike, so this works
    # whichever form the dataset formatter hands back for these columns
    # (insert_kb already treats them as plain sequences).
    ticket_ids = torch.as_tensor(batch['ID']).tolist()
    embeddings = torch.as_tensor(batch['question_embedding'], dtype=torch.float32)
    insertable = [
        ticket_ids,
        [x.lower().strip() for x in batch['assignment_group']],
        ['td' for _ in range(len(batch['text']))],
        batch['text'],
        normalize(embeddings, dim=1).tolist()
    ]
    collection.insert(insertable)

# map() is used here only for batched iteration; insert_tickets returns nothing and
# the result of map() is discarded. flush() is then called once after all batches.
ticket_dataset.map(insert_tickets, batched=True, batch_size=64)
collection.flush()

In [None]:
def tokenize_kb_chunk_data(batch):
    """Tokenize a batch of knowledge-base chunk texts.

    Knowledge chunks are tokenized exactly like ticket chunks (``"passage: "``
    prefix, truncation, max-length padding). This delegates to
    ``tokenize_ticket_data`` instead of repeating its body, so the two cannot
    drift apart.

    Args:
        batch: Mapping with a ``text`` column.

    Returns:
        The batch with ``input_ids``, ``token_type_ids`` and ``attention_mask`` added.
    """
    return tokenize_ticket_data(batch)

TOKENIZATION_BATCH_SIZE = 256

# Wrap the knowledge chunk DataFrame in a `datasets.Dataset` for batched processing.
knowledge_dataset = Dataset.from_pandas(knowledge_chunks)

# Generate the tokens for each entry.
knowledge_dataset = knowledge_dataset.map(tokenize_kb_chunk_data, batch_size=TOKENIZATION_BATCH_SIZE, batched=True)
# Set the output format to torch so it can be pushed into the embedding model
knowledge_dataset.set_format('torch', columns=['input_ids', 'token_type_ids', 'attention_mask'], output_all_columns=True)

In [None]:
# Embed the knowledge chunks with the same `embed` function used for tickets and
# drop the token columns afterwards.
knowledge_dataset = knowledge_dataset.map(embed, remove_columns=['input_ids', 'token_type_ids', 'attention_mask'], batched = True, batch_size=EMBEDDING_BATCH_SIZE)

In [None]:
def insert_kb(batch):
    """Insert one batch of embedded knowledge-base chunks into the global ``collection``.

    Knowledge rows have no ticket, so the ticket-id column is filled with
    ``-1``. The lower-cased tags go into the assignment column, the type is the
    literal ``'kb'``, and the embeddings are L2-normalised before insertion.
    """
    row_count = len(batch['text'])
    placeholder_ids = [-1 for _ in batch['ID']]
    lowered_tags = [tag.lower() for tag in batch['tags']]
    unit_embeddings = normalize(torch.FloatTensor(batch['question_embedding']), dim=1)

    collection.insert([
        placeholder_ids,
        lowered_tags,
        ['kb'] * row_count,
        batch['text'],
        unit_embeddings.tolist(),
    ])
    
# map() is used only for batched iteration; insert_kb returns nothing and the result
# of map() is discarded. flush() is then called once after all batches.
knowledge_dataset.map(insert_kb, batched=True, batch_size=64)
collection.flush()

In [None]:
# Show how many entities the collection reports after the ticket and KB inserts.
collection.num_entities

# Functions for ingesting text-based KB files (.docx, .pdf, .pptx) follow

In [7]:
import docx
from docx import Document
import os
from pdfminer.high_level import extract_text
from tika import parser

def extract_text_from_docx(docx_file):
    """Return the text of a .docx file, one paragraph per line.

    Only the entries of ``document.paragraphs`` are read. This function used to
    be defined twice in a row with equivalent bodies, the second silently
    shadowing the first; a single definition is kept.

    Args:
        docx_file: Path (or file-like object) accepted by ``docx.Document``.

    Returns:
        The paragraph texts joined with newlines.
    """
    document = Document(docx_file)
    return "\n".join(paragraph.text for paragraph in document.paragraphs)

def append_to_text_instances(text_instances, text):
    """Append ``text`` to the ``text_instances`` collection in place; returns nothing."""
    text_instances.append(text)
    return None

def extract_text_from_pptx(input_pptx_file):
    """Extract the text of a presentation through Tika.

    Args:
        input_pptx_file: Path of the file handed to ``parser.from_file``.

    Returns:
        The parsed ``"content"`` value, or an empty string when that value is
        empty or ``None``, so callers always receive a ``str`` they can chunk.
    """
    parsed = parser.from_file(input_pptx_file)
    # Tika can report no content for a file (presumably as None; see the
    # tika-python docs). Normalise that to "" rather than pass None downstream.
    return parsed["content"] or ""

# Module-level state shared with process_file():
# base names (file name without extension) that have already been handled
processed_base_filenames = set()
folder_path = "adani_source_files/GenAI/ISU/"  # Replace with the folder containing source files
# extracted document texts, one entry per processed file
text_instances = []

def process_file(file_path):
    """Extract the text of one source file and record it in ``text_instances``.

    Supported extensions (case-insensitive) are ``.docx``, ``.pdf`` and
    ``.pptx``. A file is skipped when another file with the same base name has
    already been processed, so the same document saved in two formats is only
    ingested once.

    A base name is marked as processed only after text was actually extracted.
    Previously an unsupported file (e.g. ``report.txt``) claimed the base name
    and stopped ``report.docx`` from being ingested.

    Args:
        file_path: Path of the file to process.
    """
    base_filename, file_extension = os.path.splitext(os.path.basename(file_path))

    if base_filename in processed_base_filenames:
        return

    extractors = {
        ".docx": extract_text_from_docx,
        ".pdf": extract_text,
        ".pptx": extract_text_from_pptx,
    }
    extractor = extractors.get(file_extension.lower())
    if extractor is None:
        # Unsupported type: leave the base name unclaimed.
        return

    append_to_text_instances(text_instances, extractor(file_path))
    processed_base_filenames.add(base_filename) # prevent duplicate files from being processed with diff extensions

# Walk the source folder recursively and process every matching file.
# NOTE(review): process_file() also handles ".pptx", but this filter only lets
# ".docx" and ".pdf" through, so presentations are never ingested. Confirm whether
# that is intentional.
for root, dirs, files in os.walk(folder_path):
    for file in files:
        file_path = os.path.join(root, file)
        if file.lower().endswith((".docx", ".pdf")):
            process_file(file_path)

In [None]:
# Flatten the chunks of every extracted document into one list.
text_chunks = []

for text_instance in text_instances:
    chunks = text_chunker.chunk(text_instance)
    for chunk in chunks:
        text_chunks.append(chunk)

# Single-column frame ("text") so the same tokenization/embedding steps can be reused.
text_chunks_df = pd.DataFrame()
text_chunks_df["text"] = text_chunks

In [None]:
from datasets import Dataset

def tokenize_kb_chunk_data(batch):
    """Tokenize a batch of KB file chunk texts.

    This name is defined more than once in the notebook with the same body as
    ``tokenize_ticket_data``. It now delegates to that function so there is a
    single implementation of the tokenization settings (``"passage: "``
    prefix, truncation, max-length padding).

    Args:
        batch: Mapping with a ``text`` column.

    Returns:
        The batch with ``input_ids``, ``token_type_ids`` and ``attention_mask`` added.
    """
    return tokenize_ticket_data(batch)

# Rebinds the batch-size constant set in the earlier tokenization cells.
TOKENIZATION_BATCH_SIZE = 512

dataset_chunks = Dataset.from_pandas(text_chunks_df)

# Tokenize, then expose the token columns as torch tensors for the embedding model.
dataset_chunks = dataset_chunks.map(tokenize_kb_chunk_data, batch_size=TOKENIZATION_BATCH_SIZE, batched=True)
dataset_chunks.set_format('torch', columns=['input_ids', 'token_type_ids', 'attention_mask'], output_all_columns=True)

In [None]:
from torch.nn.functional import normalize
from torch import clamp, sum

def embed(batch):
    """Compute one embedding per row of a tokenized batch.

    The global ``model`` is run on ``input_ids`` / ``token_type_ids`` /
    ``attention_mask``. Its first output is averaged over the sequence
    dimension, counting only positions where the attention mask is set.

    Compared with the previous body, the loop that reassigned each key to
    itself (a no-op) is gone, and the forward pass runs under
    ``torch.no_grad()``.

    Args:
        batch: Mapping holding the three tokenizer outputs as tensors.

    Returns:
        The same batch with a ``question_embedding`` entry added.
    """
    # Inference only: skip autograd bookkeeping to save memory.
    with torch.no_grad():
        sentence_embs = model(
                    input_ids=batch['input_ids'],
                    token_type_ids=batch['token_type_ids'],
                    attention_mask=batch['attention_mask']
                    )[0]
    input_mask_expanded = batch['attention_mask'].unsqueeze(-1).expand(sentence_embs.size()).float()
    # torch.sum / torch.clamp are spelled out so this does not rely on a `sum`
    # that shadows the Python builtin. The clamp guards against an all-zero mask.
    batch['question_embedding'] = torch.sum(sentence_embs * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
    return batch

# Embed the KB file chunks and drop the token columns afterwards.
dataset_chunks = dataset_chunks.map(embed, remove_columns=['input_ids', 'token_type_ids', 'attention_mask'], batched = True, batch_size=EMBEDDING_BATCH_SIZE)

In [None]:
from torch.nn.functional import normalize
from torch import clamp, sum

def insert_text_chunk(batch):
    """Insert one batch of embedded KB file chunks into the global ``collection``.

    The collection schema has five non-auto fields: ``ticket_id``,
    ``assignment_id``, ``type``, ``chunk`` and ``chunk_embedding``. The earlier
    version supplied six columns (an extra list of empty strings before the
    chunk text), which does not line up with that schema. Exactly one column
    per field is supplied now.

    Args:
        batch: Mapping with ``text`` and ``question_embedding``.
    """
    row_count = len(batch['text'])
    insertable = [
        [-1 for _ in range(row_count)],    # ticket_id: file chunks have no ticket
        ['' for _ in range(row_count)],    # assignment_id: none for file chunks
        ['kb' for _ in range(row_count)],  # type
        batch['text'], # chunk itself - raw text
        normalize(batch['question_embedding'], dim=1).tolist() # embedding of the chunk - vector representation (for searching)
    ]
    collection.insert(insertable)

# The embedded dataset built above is named `dataset_chunks`; the previous
# `data_dataset_chunks` was never defined and raised a NameError.
dataset_chunks.map(insert_text_chunk, batched=True, batch_size=256)
collection.flush()