In [9]:
# ========================HEADER_START=================================
# %%
#
# COPYING, REPRODUCTION OR DISTRIBUTION SHOULD BE LIMITED AND ONLY TO
# EMPLOYEES WITH A "NEED TO KNOW" TO DO THEIR JOB. ANY DISCLOSURE OF 
# THIS DOCUMENT TO THIRD PARTIES IS STRICTLY PROHIBITED.
#
# FILE: SUMMARIZE_TICKETS.IPYNB
# DESCRIPTION: CREATE SUMMARY FOR ALL THE HISTORICAL TICKETS DOWNLOADED
# FROM THE TEAMSUPPORT SYSTEM. COLUMNS USED FOR CREATING THE SUMMARY ARE
# TICKET NAME & ACTION DESCRIPTION. USER COMMENTS ARE DULY UPDATED 
# TIME-WISE UNDER NAME ACTION DESCRIPTION. THIS DATE-WISE ACTION UPDATES
# ARE FED INTO A LLM AND A SUMMARY IS CREATED FOR EACH TICKET.
# 
# =========================HEADER_END==================================
#
# @AUTHOR YUVARAJ KUMAR NA
# 14/JAN/2025


In [1]:
#INSTALL ALL REQUIRED LIBRARIES
#!pip install requests
#!pip install xlsxwriter
#!pip install tensorflow==2.12.0
#!pip install transformers==4.30.2
#!pip install sentence-transformers
#!pip install tf-keras
#!pip install huggingface_hub==0.14.1
#!pip install --upgrade optree>=0.13.0
#!pip install scikit-learn

In [2]:
#IMPORT ALL GENERIC MODULES
import logging
import os
import subprocess
import numpy as np
import pandas as pd
from datetime import datetime
from sklearn.neighbors import NearestNeighbors


#IMPORT ALL LIBRARIES FOR LLM COMMUNICATIONS
import openai
from openai.types.chat import ChatCompletion
import requests
import json
import ast


#IMPORT ALL LIBRARIES FOR EMBEDDINGS CREATION
from sentence_transformers import SentenceTransformer
from concurrent.futures import ThreadPoolExecutor, as_completed


#IMPORT ALL LIBRARIES FOR MONGODB OPERATIONS
from pymongo import MongoClient, ASCENDING
from pymongo.errors import BulkWriteError
from bson import ObjectId


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
#GET API KEYS SET FOR MISTRAL LLM
bat_file_path = r"C:\Yuvpersonal\AIML\M.Tech BITS-Pilani\Sem 4\Project Docs\APIKEYS.bat"

#RUN THE BATCH FILE TO SET THE API KEYS
try :
    result = subprocess.run(bat_file_path, shell=True, check=True, text=True, capture_output=True)
    print(result.stdout)
    if result.stderr :
        print(result.stderr)
except Exception as e:
    print(f"An unexpected error occurred {e}")

#GET THE API KEY FROM THE ENVIRONMENT.
try :
    MSTRLAPI_KEY = os.getenv("MSTRL_API_KEY")
    if not MSTRLAPI_KEY:
        raise ValueError("API keys not set")
except Exception as e:
    print(f"An unexpected error occurred {e}")

#SET MODEL FOR EMBEDDING ENCODER
model = SentenceTransformer('all-MiniLM-L6-v2')
 


c:\Yuvpersonal\AIML\M.Tech BITS-Pilani\Sem 4\Project Docs>setx MSTRL_API_KEY "ARjMS7D7bvpevisbN915oNCZnHgGTSPt" 

SUCCESS: Specified value was saved.



In [4]:
#KICK-START MISTRAL LLM COMMUNICATION FOR THE SUMMARIZATION STEPS REQUIRED
endpoint_url = 'https://api.mistral.ai/v1/chat/completions'


headers = {
    'Authorization': f'Bearer {MSTRLAPI_KEY}',
    'Content-Type': 'application/json'
}

payload = {
    'model': 'mistral-large-latest',  # Replace with the appropriate model name if different
    'messages': [
        {'role': 'user', 'content': 'Hello Mistral...I will feed you nearly 1000 text entries of actions taken by users on Incident tickets. Pls create a summary of the text which will be used for similarity checks with their embeddings!'}
    ]
}

response = requests.post(endpoint_url, headers=headers, json=payload)

if response.status_code == 200:
    data = response.json()
    print('Assistant Response:', data['choices'][0]['message']['content'])
else:
    print('Error:', response.status_code, response.text)

Assistant Response: Sure, I can help with that. When you provide the text entries, I will extract the most significant information from each entry to create a summary. This summary will then be used to generate embeddings, which can be utilized for similarity checks. Please go ahead and share the text entries one by one or in batches, and I will start processing them.


In [15]:
###STEP TO HAVE ALL THE FUNCTIONS REQUIRED FOR LLM ACTIVITIES, EMBEDDING GENERATION
##TASKS, MONGO-DB DEFINITIONS ETC., TO BE CREATED
##
##

#ALL MISTRAL LLM AND EMBEDDING FUNCTIONS ARE DEFINED BELOW
# FUNCTION TO GET TICKET SUMMARY
#PARAMETERS:
# - ticket_story (str): The text of the ticket story to summarize.
# - max_length (int): The maximum length of the summary.
# - min_length (int): The minimum length of the summary.
# - temperature (float): Controls the randomness of the output.
# - num_beams (int): The number of beams for beam search.
# - repetition_penalty (float): Penalty for repeating phrases.
# - length_penalty (float): Penalty for longer summaries.
def get_ticket_summary(ticket_story,max_length=4000, min_length=20, temperature=0.7, num_beams=4, repetition_penalty=2.0, length_penalty=1.0):
    headers = {
        'Authorization': f'Bearer {MSTRLAPI_KEY}',
        'Content-Type': 'application/json'
    }
    payload = {
        'model': 'mistral-large-latest',
        'messages': [{'role': 'user', 'content': ticket_story}]
    }
    try:
        response = requests.post(endpoint_url, headers=headers, json=payload)
        response.raise_for_status()  # Raise an error for bad status codes
        return response.json().get('choices', [{'message': {'content': ''}}])[0]['message']['content']
    except (requests.RequestException, ValueError, KeyError) as e:
        print(f'Error processing ticket story: {e}')
        return ""  # Return an empty string or a default value in case of error

# FUNCTION TO GET TICKET EMBEDDING
def get_ticket_embedding(ticket_summary):
    if ticket_summary:
        try:
            # Preprocess the text (if needed, e.g., lowercasing, removing special characters)
            preprocessed_summary = ticket_summary.lower().strip()
            # Generate embedding
            embedding = model.encode(preprocessed_summary)
            # Replace out-of-range float values with a suitable default
            embedding = np.nan_to_num(embedding, nan=0.0, posinf=0.0, neginf=0.0)
            # Normalize the embedding to unit length
            embedding_norm = embedding / np.linalg.norm(embedding)

            return embedding_norm.tolist()

        except Exception as e:
            print(f'Error encoding ticket summary: {e}')
            return [0.0] * model.get_sentence_embedding_dimension()  # Return a default embedding of zeros
    return None

### MONGODB CONNECTION DETAILS
mongo_uri = 'mongodb://localhost:27017/'  # Example for a local MongoDB instance
database_name = 'TSS-Percentage'
collection_name = 'TSSP-Project'

# CONNECT TO MONGODB, DATABASE & CREATE THE COLLECTION WITH UNIQUE INDEX ON "TICKET NUMBER"
client = MongoClient(mongo_uri)
db = client[database_name]
collection = db[collection_name]

# Create a unique index on the 'Ticket Number' field
# This ensures that each 'Ticket Number' is unique in the collection
# Define the index specification
index_name = 'Ticket Number_1'  # MongoDB automatically appends '_1' for single-field indexes
index_info = collection.index_information()

# CHECK IF THE INDEX ALREADY EXISTS
if index_name not in index_info:
    # Create the unique index
    collection.create_index('Ticket Number', unique=True, name=index_name)
    print(f"Index '{index_name}' created.")
else:
    print(f"Index '{index_name}' already exists.")

print(f"Collection '{collection_name}' created with a unique index on 'Ticket Number'.")


Index 'Ticket Number_1' created.
Collection 'TSSP-Project' created with a unique index on 'Ticket Number'.


In [6]:
###PRINT MIN STATISTICS OF THE INPUT FILE DOWNLOADED FROM THE TICKET SITE
##

file_path="C:\Yuvpersonal\AIML\M.Tech BITS-Pilani\Sem 4\Project Docs\InvestOne_1Y_Report.csv"
df = pd.read_csv(file_path)

print("\nColumn Names:")
print(df.columns.tolist())

print("\nBasic statistics")
tkt_count = df["Ticket Number"].nunique()
min_value = df["Ticket Number"].min()
max_value = df["Ticket Number"].max()
print("Total Tickets in Excel:", tkt_count)
print("Ticket Number range in Excel:", min_value , "~" , max_value)

print("\nData Types:")
print(df.dtypes)



Column Names:
['Group Name', 'Ticket Name', 'Ticket Number', 'Action Creator Name', 'Action Description', 'Date Action Created', 'Action Type']

Basic statistics
Total Tickets in Excel: 838
Ticket Number range in Excel: 3017645 ~ 3708324

Data Types:
Group Name             object
Ticket Name            object
Ticket Number           int64
Action Creator Name    object
Action Description     object
Date Action Created    object
Action Type            object
dtype: object


In [7]:
# COLUMNS UNWANTED FOR SUMMARIZATION ARE DROPPED
cols_to_drop=['Group Name','Action Creator Name']
df=df.drop(columns=cols_to_drop,axis=1)

print(df.columns.tolist())

['Ticket Name', 'Ticket Number', 'Action Description', 'Date Action Created', 'Action Type']


In [8]:
####STEP TO CREATE "TICKET STORY" BY CONCATENATING "ACTION DESCRIPTION" AND "COMMENT" ITEMS OF THE TICKET.
##THIS CONCATENATION RECORDS ALL THE ACTIONS THAT ARE PERFORMED FOR THE TICKET RESOLUTION. THIS TEXT WILL
##BE DESCRIPTIVE DETAIL IN PLAIN ENGLISH, OR FUNCTIONAL COMMANDS EXECUTED IN TECHNICAL JARGON, OR MIX OF
##DIFFERENT SUCH ACTIONS PERFORMED BY THE ENGINEERS. THIS WILL BE FED INTO THE LLM TO GENERATE "TICKET SUMMMARY"
##IN PLAIN ENGLISH.
##point to note :  ONLY STRING TEXT ARE TAKEN FOR THIS USE-CASE. IMAGES, SCREENSHOTS ETC., ARE DISCARDED
##FOR EASE OF THE USE-CASE

# COLUMNS WANTED FOR SUMMARIZATION PER TICKET ARE GROUPED "TICKET NAME","" ACTION DESCRIPTION" SORTED ON "DATE ACTION CREATED" ARE UPDATED INTO A NEW FILE WITH NEW  COLUMN NAMED "TICKET STORY" TO CREATE "TICKET SUMMARY"
filtered_df = df[df['Action Type'].isin(['Description','Comment'])]

# CONVERT 'DATE ACTION CREATED' TO DATETIME IF IT'S NOT ALREADY
#FILTERED_DF['DATE ACTION CREATED'] = PD.TO_DATETIME(FILTERED_DF['DATE ACTION CREATED'], FORMAT='%M/%D/%Y %I:%M %P')

# SORT VALUES BY 'DATE ACTION CREATED'
filtered_df = filtered_df.sort_values(by='Date Action Created')

# GROUP BY 'TICKET NUMBER' AND CONCATENATE 'ACTION DESCRIPTION'
ticket_story = filtered_df.groupby('Ticket Number').apply(
    lambda group: (
        f"{group['Ticket Name'].iloc[0]}\n" +  # Start with the unique 'Ticket Name'
        "\n".join(map(str, group['Action Description']))  # Join 'Action Description' with new lines
    )
).reset_index()

# RENAME THE COLUMNS FOR CLARITY
ticket_story.columns = ['Ticket Number', 'Ticket Story']

# REMOVE DUPLICATE ENTRIES BASED ON 'TICKET NUMBER'
ticket_story = ticket_story.drop_duplicates(subset='Ticket Number')

# PRINT THE NUMBER OF ROWS BEFORE FILTERING
print(f"Number of rows before filtering non-numeric 'Ticket Number': {ticket_story.shape[0]}")

# DROP ROWS WHERE 'TICKET NUMBER' IS NOT NUMERIC
ticket_story['Ticket Number'] = pd.to_numeric(ticket_story['Ticket Number'], errors='coerce')
ticket_story = ticket_story.dropna(subset=['Ticket Number'])

# PRINT THE NUMBER OF ROWS AFTER FILTERING
print(f"Number of rows after filtering non-numeric 'Ticket Number': {ticket_story.shape[0]}")

# WRITE TO CSV FILE WITH CLEAR COLUMN NAMES
output_file_path = r"C:\Yuvpersonal\AIML\M.Tech BITS-Pilani\Sem 4\Project Docs\Ticket_Story.csv"
ticket_story.to_csv(output_file_path, index=False)

print(ticket_story.head(2))

# PRINT THE RECORD COUNT
print(f"The CSV file has {(pd.read_csv(output_file_path)).shape[0]} records.")


Number of rows before filtering non-numeric 'Ticket Number': 838
Number of rows after filtering non-numeric 'Ticket Number': 838


  ticket_story = filtered_df.groupby('Ticket Number').apply(


   Ticket Number                                       Ticket Story
1        3018629  AMPBLUSAGEF abort\nJob aborted with following:...
The CSV file has 838 records.


In [12]:
#################     DANGER - TIME CONSUMING STEP AS LLM CONNECTION IS MADE ################
#                     DANGER - TIME CONSUMING STEP AS LLM CONNECTION IS MADE
#                     DANGER - TIME CONSUMING STEP AS LLM CONNECTION IS MADE

#####THIS IS A TIME CONSUMING STEP WHERE FOR ALL TICKETS, LLM CONNECTION IS MADE FOR CREATING
##SUMMARY, GENERATING EMBEDDINGS ETC., WE MAY RUN INTO LONG RUNS/DELAYS/ERRORS WITH NEW OR
##UNPROCESSED DATA. TIMLINES OF THIS STEP ARE GIVEN BELOW.
## 802  TICKETS (1 YR TICKETs) - 5h 35m
## 2567 TICKETS (3 YR TICKETs) - 17h 58m (High Chances of getting into 429 - RateLimitError with free version)
## 5744 TICKETS (5 YR TICKETs) - xh ym (got into 429 - RateLimitError due to free version of Mistral LLM)

# CREATE SUMMARY AND EMBEDDING FROM "TICKET STORY" FOR EACH TICKET
# USING LLMS AND VECTOR EMBEDDING TRANSFORMER MODELS
#############REAL DATASET#################
#file_path="C:\Yuvpersonal\AIML\M.Tech BITS-Pilani\Sem 4\Project Docs\Ticket_Story.csv" 
#output_file_path = "C:\Yuvpersonal\AIML\M.Tech BITS-Pilani\Sem 4\Project Docs\Ticket_output.csv"   

#############DEMO DATASET#################
file_path="C:\Yuvpersonal\AIML\M.Tech BITS-Pilani\Sem 4\Project Docs\Ticket_Story_sample_5.csv"    
output_file_path = "C:\Yuvpersonal\AIML\M.Tech BITS-Pilani\Sem 4\Project Docs\Ticket_output_sample_5.csv"

# PROCESS THE CSV FILE IN BATCHES
processed_data = []     # Initialize an empty list to store processed data
iteration = 0           # Initialize a counter for iterations
batch_size = 25         # Define the batch size

# INITIALIZE AN EMPTY LIST TO STORE PROCESSED DATA
processed_data = []

# INITIALIZE A COUNTER FOR ITERATIONS
iteration = 0

# READ THE CSV FILE IN CHUNKS
for chunk in pd.read_csv(file_path, chunksize=batch_size):
    # Increment the iteration counter
    iteration += 1
    current_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    print(f'Processing batch {iteration} at {current_time}...')

    # Process each chunk
    chunk['Ticket Summary'] = chunk['Ticket Story'].apply(get_ticket_summary)
    chunk['Ticket Embedding'] = chunk['Ticket Summary'].apply(get_ticket_embedding)

    # Append the processed chunk to the list
    processed_data.append(chunk)

# CONCATENATE ALL PROCESSED CHUNKS INTO A SINGLE DATAFRAME
final_data = pd.concat(processed_data, ignore_index=True)

# WRITE THE DATA TO A NEW CSV FILE
final_data.to_csv(output_file_path, index=False)

print('Data has been written to', output_file_path)

Processing batch 1 at 2025-03-14 11:53:36...
Data has been written to C:\Yuvpersonal\AIML\M.Tech BITS-Pilani\Sem 4\Project Docs\Ticket_output_sample_5.csv


In [None]:
####STEP TO LOAD THE CSV FILE {'TICKET NUMBER','TICKET STORY','TICKET SUMMARY','TICKET EMBEDDING'}
##INTO MONGODB WHICH GOT CREATED IN ABOVE STEP.
##

#output_file_path="C:\Yuvpersonal\AIML\M.Tech BITS-Pilani\Sem 4\Project Docs\Ticket_output.csv"   #REAL DATASET
output_file_path="C:\Yuvpersonal\AIML\M.Tech BITS-Pilani\Sem 4\Project Docs\Ticket_output_sample_5.csv"  #DEMO DATASET


print(f"The CSV file has {(pd.read_csv(output_file_path)).shape[0]} records.")

# READ THE CSV FILE INTO A DATAFRAME
df = pd.read_csv(output_file_path)

# CONVERT THE DATAFRAME TO A LIST OF DICTIONARIES
records = df.to_dict(orient='records')

# INSERT THE RECORDS INTO THE MONGODB COLLECTION
try:
    # Insert the records into the MongoDB collection, ignoring duplicates
    result = collection.insert_many(records, ordered=False)
    print(f'{len(result.inserted_ids)} records inserted into MongoDB.')
except BulkWriteError as bwe:
    # Handle duplicate key errors and other bulk write errors
    print(f'{len(bwe.details["writeErrors"])} records failed to insert due to duplicate key errors.')

# CLOSE THE MONGODB CONNECTION
#client.close()


The CSV file has 838 records.
838 records inserted into MongoDB.


In [17]:
####PREPARATORY STEP TO PICK ALL EMBEDDINGS FROM TSSP-PROJECT MONGODB COLLECTION WHICH GOT  
# INSERTED IN EARLIER STEPS AND GET IT MAPPED ON 384 DIMENSIONAL EUCLIDEAN SPACE TO FIND OUT THE 
##NEAREST KNN ELEMENTS WHICH CORRESPONDS TO SIMILAR TICKETS
##

# FETCH ALL EMBEDDINGS AND THEIR CORRESPONDING IDS AND TICKET NUMBERS FROM MONGODB
documents = list(collection.find({}, {'_id': 1, 'Ticket Embedding': 1, 'Ticket Number': 1}))

# ENSURE ALL EMBEDDINGS ARE NUMERIC, NON-EMPTY, AND HAVE THE SAME LENGTH
valid_embeddings = []
valid_ids = []
valid_ticket_numbers = []
dimension = None

for doc in documents:
    embedding = doc.get('Ticket Embedding', [])
    doc_id = str(doc['_id'])
    ticket_number = doc.get('Ticket Number', '')

    if not embedding:
        print(f"Skipping empty embedding for document ID: {doc_id}")
        continue

    try:
        # Convert string representation of embedding to a list of floats
        embedding_list = ast.literal_eval(embedding)
        embedding_np = np.array(embedding_list, dtype='float32')

        if embedding_np.ndim != 1:
            print(f"Skipping malformed embedding for document ID: {doc_id}")
            continue

        if dimension is None:
            dimension = embedding_np.shape[0]
        elif embedding_np.shape[0] != dimension:
            print(f"Skipping embedding with incorrect dimension for document ID: {doc_id}")
            continue

        valid_embeddings.append(embedding_np)
        valid_ids.append(doc_id)
        valid_ticket_numbers.append(ticket_number)
    except (ValueError, SyntaxError):
        print(f"Skipping non-numeric embedding for document ID: {doc_id}")

# CHECK IF THERE ARE ANY VALID EMBEDDINGS
if not valid_embeddings:
    raise ValueError("No valid embeddings found.")

# CONVERT VALID EMBEDDINGS TO A NUMPY ARRAY
embeddings_np = np.vstack(valid_embeddings)

# INITIALIZE A NEARESTNEIGHBORS MODEL
n_neighbors = len(valid_embeddings)  # Retrieve all to filter later
knn = NearestNeighbors(n_neighbors=n_neighbors, metric='euclidean')
knn.fit(embeddings_np)



In [21]:
####RUN STEP TO PICK 5NN NEAREST NEIGHBOURS IN THE 384 DIMENSIONAL EUCLIDEAN SPACE
##AND DISPLAY THE TICKET NUMBERS AND THEIR SIMILARITY DISTANCES
##
##TEST TYPE 1 : FINDING Knn - K nearest neigbours

# PROMPT THE USER FOR A TICKET NUMBER AND CONVERT IT TO AN INTEGER
ticket_number_input = input("Enter the Ticket Number: ")
try:
    ticket_number = int(ticket_number_input)
except ValueError:
    print("Invalid Ticket Number. Please enter a valid integer.")
    exit()

# FETCH THE EMBEDDING FOR THE GIVEN TICKET NUMBER
ticket_document = collection.find_one({'Ticket Number': ticket_number})

if ticket_document is None:
    print(f"No embedding found for Ticket Number: {ticket_number}")
else:
    try:
        # Convert the embedding to a numpy array
        query_vector = np.array(ast.literal_eval(ticket_document['Ticket Embedding']), dtype='float32')

        # Perform the search
        distances, indices = knn.kneighbors([query_vector])

        # Retrieve the corresponding ticket numbers and distances
        similar_ticket_numbers = [valid_ticket_numbers[i] for i in indices[0]]
        similar_distances = distances[0]

        # Exclude the given ticket from the results
        similar_ticket_numbers = [tn for tn, dist in zip(similar_ticket_numbers, similar_distances) if tn != ticket_number]
        similar_distances = [dist for tn, dist in zip(similar_ticket_numbers, similar_distances) if tn != ticket_number]

        # Calculate similarity percentages
        similar_tickets_with_similarity = [
            (tn, max(0, (1 - dist) * 100)) for tn, dist in zip(similar_ticket_numbers, similar_distances)
        ]

        # Filter tickets with similarity greater than or equal to 40%
        similar_tickets_with_similarity = [
            (tn, similarity) for tn, similarity in similar_tickets_with_similarity if similarity >= 40
        ]

        # Sort by similarity and limit to 5 results
        similar_tickets_with_similarity = sorted(similar_tickets_with_similarity, key=lambda x: x[1], reverse=True)[:5]

        # Print the results
        print(f"Given Ticket # is: '{ticket_number}'")
        print("Similar Tickets found are:")
        if similar_tickets_with_similarity:
            for tn, similarity in similar_tickets_with_similarity:
                print(f"{tn} - {similarity:.2f}%")
        else:
            print("No similar tickets found with 40% or higher similarity.")
    except (ValueError, SyntaxError):
        print(f"Invalid embedding format for Ticket Number: {ticket_number}")

# CLOSE THE MONGODB CONNECTION
#client.close()

Given Ticket # is: '3692365'
Similar Tickets found are:
3133798 - 100.00%


In [14]:
####DEFINE STEP TO GET A NEW TICKET NUMBER AND FIND CLOSEST Knn
##DISPLAY THE SIMILAR TICKET NUMBERS AND THEIR DISTANCES
##
##TEST TYPE 2 : FOR A NEW TICKET FIND Knn
def find_similar_tickets(ticket_number, ticket_description, knn, valid_ticket_numbers, top_n=5):
    # Generate embedding for the ticket description
    query_vector = get_ticket_embedding(ticket_description)

    # Convert the list to a NumPy array and reshape it
    query_vector = np.array(query_vector).reshape(1, -1)

    # Perform the k-NN search
    distances, indices = knn.kneighbors(query_vector)

    # Retrieve the corresponding ticket numbers and distances
    similar_ticket_numbers = [valid_ticket_numbers[i] for i in indices[0]]
    similar_distances = distances[0]

    # Exclude the given ticket from the results
    similar_ticket_numbers = [tn for tn, dist in zip(similar_ticket_numbers, similar_distances) if tn != ticket_number]
    similar_distances = [dist for tn, dist in zip(similar_ticket_numbers, similar_distances) if tn != ticket_number]

    # Calculate similarity percentages
    similar_tickets_with_similarity = [
        (tn, max(0, (1 - dist) * 100)) for tn, dist in zip(similar_ticket_numbers, similar_distances)
    ]

    # Filter tickets with similarity greater than or equal to 40%
    similar_tickets_with_similarity = [
        (tn, similarity) for tn, similarity in similar_tickets_with_similarity if similarity >= 1
    ]

    # Sort by similarity and limit to top_n results
    similar_tickets_with_similarity = sorted(similar_tickets_with_similarity, key=lambda x: x[1], reverse=True)[:top_n]

    return similar_tickets_with_similarity

In [15]:
####RUN STEP TO GET A NEW TICKET NUMBER AND FIND CLOSEST Knn
##DISPLAY THE SIMILAR TICKET NUMBERS AND THEIR DISTANCES
##
##TEST TYPE 2 : FOR A NEW TICKET FIND Knn
# Prompt the user for a ticket number and description
ticket_number_input = input("Enter the Ticket Number: ")
ticket_description_input = input("Enter the Ticket Description: ")

try:
    ticket_number = int(ticket_number_input)
except ValueError:
    print("Invalid Ticket Number. Please enter a valid integer.")
    exit()

# Find similar tickets
similar_tickets = find_similar_tickets(ticket_number, ticket_description_input, knn, valid_ticket_numbers)

# Print the results
print(f"Given Ticket # is: '{ticket_number}'")
print("Similar Tickets found are:")
if similar_tickets:
    for tn, similarity in similar_tickets:
        print(f"{tn} - {similarity:.2f}%")
else:
    print("No similar tickets found.")

Given Ticket # is: '3987654'
Similar Tickets found are:
3614378 - 5.97%
3513922 - 5.89%
3628470 - 4.23%
3163198 - 3.38%
3680806 - 3.14%
