In [101]:
import pandas as pd
import google.generativeai as genai
import os
import chromadb
import uuid
from collections import Counter 
import random
from dotenv import load_dotenv
import openai
import time
import numpy as np
import json
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# READ FIRST:
# Create a ./.env file holding the API keys this notebook reads from the environment:
#   GOOGLE_API_KEY=...   (used for genai.configure)
#   GROQ_KEY=...         (used for the Groq OpenAI-compatible client)
load_dotenv()

In [49]:
# init: load the labelled complaint CSVs from the working directory
train_data = pd.read_csv("./train.csv")
test_data = pd.read_csv("./test.csv")

# Google Generative AI SDK: key comes from the environment (see the .env note above)
genai.configure(api_key=os.environ['GOOGLE_API_KEY'])

# Groq is reached through the OpenAI client by pointing it at Groq's base URL
groq_client = openai.OpenAI(
    api_key=os.environ["GROQ_KEY"],
    base_url="https://api.groq.com/openai/v1",
)

In [None]:
# prepare dataset for embeddings
# new doc
def mod_func(row):
    """Attach a labelled ``document`` text to a training row and return that row.

    The text opens with a sentence naming the row's ``category`` and
    ``sub_category`` and continues with the complaint body
    (``crimeaditionalinfo``) on the following line. The row object passed in
    is modified and returned.
    """
    heading = (
        "The following complaint document written by a user is categorized as "
        f"'{row.category}' and sub-categorized as '{row.sub_category}':"
    )
    # The text ends with a newline followed by four spaces.
    row['document'] = f"{heading}\n{row.crimeaditionalinfo}\n    "
    return row

# Rows without a sub-category fall back to their category label.
train_data['sub_category'] = train_data['sub_category'].fillna(train_data['category'])
# prepared_train_data =  train_data.apply(mod_func, axis=1)

# Character count of the three text fields (missing values count as empty strings).
train_data['combined_length'] = sum(
    train_data[text_column].fillna('').str.len()
    for text_column in ('category', 'sub_category', 'crimeaditionalinfo')
)
train_data

In [None]:
# Sanity check: report every row whose combined text exceeds the embedding context length.
# NOTE(review): combined_length is a character count (str.len); confirm that the
# 2048 limit is expressed in the same unit for the embedding model in use.
embeddings_context_length = 2048
rows_over_limit = train_data[train_data.combined_length > embeddings_context_length]
if len(rows_over_limit) > 0:
    # Printed explicitly: a bare expression inside an `if` body is not rendered by Jupyter.
    print(f"{len(rows_over_limit)} row(s) exceed the context length of {embeddings_context_length}:")
    print(rows_over_limit[['category', 'sub_category', 'combined_length']])
else:
    print("Success! All rows are within the context length.")

In [None]:
# prepare vectordb

# start the chromadb server first:
# chroma run --host localhost --port 8000 --path ./cyber_gaurd_ai_embeddings_vector_data

def collection_exists(collection_name):
    """Return True when the Chroma server already holds a collection with this name.

    Each entry of ``client.list_collections()`` is compared through its ``name``
    attribute when it has one and as a plain value otherwise, so the check works
    whether the client returns collection objects or bare names.
    NOTE(review): which shape is returned depends on the installed chromadb version.
    """
    return any(
        getattr(collection, "name", collection) == collection_name
        for collection in client.list_collections()
    )

client = chromadb.HttpClient()
collection_name = "cyber-ai-embeddings"
summarised_collection_name = "cyber-ai-embeddings-summed"

# get_or_create_collection performs the exists-check and the create/get in one call,
# so re-running this cell is safe whether or not the collections already exist.
embeddings_collection = client.get_or_create_collection(collection_name)

# summarised collection
embeddings_collection_summed = client.get_or_create_collection(summarised_collection_name)

print("Collection created...")

In [7]:
def _collect_metadata_values(query_result, key):
    """Return the `key` value of every hit of the first query in a Chroma result (None when absent)."""
    metadatas = query_result['metadatas'] or [[]]
    return [(metadata or {}).get(key) for metadata in metadatas[0]]


def predict_categories(document,sample_size = 1,collection=None):
    """Predict (category, sub-category) for a complaint by nearest-neighbour voting.

    The document is embedded with ``models/text-embedding-004``; the collection is
    queried for the ``sample_size`` nearest entries and the most common
    ``category`` metadata value wins. A second query restricted to that category
    votes on the ``scategory`` metadata value the same way.

    Args:
        document: Text to classify.
        sample_size: Number of neighbours requested from the collection per query.
        collection: Chroma collection to query. ``None`` (default) resolves to the
            notebook-level ``embeddings_collection`` at call time, so the function
            follows the collection even if that cell is re-run later.

    Returns:
        tuple: ``(predicted_category, predicted_scategory, sampled_categories,
        sampled_scategories)``. The predictions are ``None`` when the query
        returns no hits or the winning metadata value is missing.
    """
    if collection is None:
        collection = embeddings_collection

    result = genai.embed_content(model="models/text-embedding-004", content=[document])
    test_embeddings = result['embedding'][0]

    # First pass: vote on the category over every returned neighbour
    # (the collection may return fewer than sample_size hits).
    query_result = collection.query(query_embeddings=test_embeddings,n_results=sample_size)
    sampled_categories = _collect_metadata_values(query_result, 'category')
    if not sampled_categories:
        return (None, None, [], [])
    predicted_category = Counter(sampled_categories).most_common(1)[0][0]
    if predicted_category is None:
        # Without a category there is nothing to filter the second query on.
        return (None, None, sampled_categories, [])

    # Second pass: vote on the sub-category among neighbours of the winning category.
    query_result = collection.query(query_embeddings=test_embeddings,n_results=sample_size,where={"category":predicted_category})
    sampled_scategories = _collect_metadata_values(query_result, 'scategory')
    predicted_scategory = None
    if sampled_scategories:
        predicted_scategory = Counter(sampled_scategories).most_common(1)[0][0]

    return (predicted_category,predicted_scategory,sampled_categories,sampled_scategories)

In [51]:
%%script true
# PLEASE READ
# This cell is meant to be executed ONCE: it embeds every training row and stores
# the vectors in `embeddings_collection`.
# The "%%script true" line above prevents the cell from running; remove it to run the cell.
#
# Rows are processed in slices of `step` so each embed_content call receives one batch.
step = 200
max_length = len(train_data)
for i in range(0, max_length, step):
    start_idx = i
    end_idx = i + step
    documents = []
    metadatas = []
    document_ids = []
    for index, row in train_data[start_idx:end_idx].iterrows():
        # Stored text carries the labels followed by the complaint body.
        doc_ = f"""The following complaint document written by a user is categorized as '{row.category}' and sub-categorized as '{row.sub_category}':
{row.crimeaditionalinfo}"""
        documents.append(doc_)
        # These metadata keys are the ones predict_categories reads back.
        metadatas.append({"category": row.category, "scategory": row.sub_category})
        # Random ids: running the cell again adds new entries rather than overwriting.
        document_ids.append(str(uuid.uuid4()))

    # One embedding request per batch of documents.
    result = genai.embed_content(model="models/text-embedding-004", content=documents)

    print("inserting results in vectordb")
    embeddings = result["embedding"]
    # The value returned by add() is not used afterwards.
    data_to_insert = embeddings_collection.add(
        documents=documents,
        metadatas=metadatas,
        ids=document_ids,
        embeddings=embeddings,
    )
    print(f"data inserted from index {start_idx} to {end_idx}")
print("Vector embeddings prepared...")

In [None]:
# TEST SINGLE
# Classify one randomly chosen test row against the default collection and
# print the expected labels next to the prediction.
test_idx = int(len(test_data)*random.random())
row = test_data.iloc[test_idx]
print(f"Expected Category: {row.category}")
print(f"Expected Sub-category: {row.sub_category}")
print(f"Complaint: {row.crimeaditionalinfo}")
print()
# The query text carries no labels; 10 is the sample_size (neighbours requested per query).
predict_category,predict_scategory,sampled_categories,sampled_scategories  = predict_categories(f"""The following complaint document written by a user:
{row.crimeaditionalinfo}""",10)
print(sampled_categories)
print(sampled_scategories)
print()
print(f"Predicted Category: {predict_category}")
print(f"Predicted Sub-category: {predict_scategory}")

In [None]:
# TEST ALL
# Evaluate predict_categories (default collection) on a random sample of test rows.
sample_size = 10
test_test_data = test_data.sample(n=sample_size).reset_index(drop=True)
failure_indexes = []               # every row that failed on category or sub-category
failure_indexes_category = []      # rows whose category was wrong
failure_indexes_scategory = []     # rows with the right category but wrong sub-category
failure_predicted_categories = []  # [category, sub-category] predicted for each failed row
for index, row in test_test_data.iterrows():
    print(f"{index+1}/{sample_size}",end='\r')
    document = f"""The following complaint document written by a user:
{row.crimeaditionalinfo}"""
    predict_category,predict_scategory,sampled_categories,sampled_scategories = predict_categories(document,10)
    
    if row.category == predict_category:
        if row.sub_category == predict_scategory or pd.isna(row.sub_category) or row.sub_category == None:
            # Full success; a missing expected sub-category also counts as a match.
            continue
        else:
            # print("Failed Sub-category---------")
            # print(f"Index: {index}")
            # print(f"Expected: {row.sub_category}")
            # print(f"Predicted: {predict_scategory}")
            # print("----------------------------")
            failure_indexes_scategory.append(index)
    else:
        print("Failed Category-------------")
        print(f"Index: {index}")
        print(f"Expected: {row.category}")
        print(f"Predicted: {predict_category}")
        print("----------------------------")
        failure_indexes_category.append(index)
    
    # Only reached for failures: successful rows hit `continue` above.
    failure_predicted_categories.append([predict_category,predict_scategory])
    failure_indexes.append(index)
        
# Sub-category failures are recorded only when the category matched, so the
# sub-category rate does not include rows that already failed on category.
print(f"Success Rate Overall: {100 - len(failure_indexes)/sample_size*100}%") 
print(f"Success Rate (Category only): {100 - len(failure_indexes_category)/sample_size*100}%") 
print(f"Success Rate (Sub-category only): {100 - len(failure_indexes_scategory)/sample_size*100}%") 

# Summarised Embeddings

In [12]:
def prepare_paragraph(category,sub_category,sample_size=10):
    """Build the summarisation prompt for one (category, sub-category) pair.

    Up to ``sample_size`` training complaints with the given labels are sampled
    at random, numbered, and appended to a fixed instruction prompt.

    Args:
        category: Value matched against ``train_data.category``.
        sub_category: Value matched against ``train_data.sub_category``.
        sample_size: Maximum number of complaints to include. When fewer rows
            match, all of them are used instead of raising.

    Returns:
        str: The full prompt. Rows with a missing complaint text are skipped, so
        the complaint numbering can have gaps.
    """
    matching_rows = train_data[(train_data.category == category) & (train_data.sub_category == sub_category)]
    # DataFrame.sample raises when n exceeds the number of rows, so cap the request.
    rows_to_take = min(sample_size, len(matching_rows))
    if rows_to_take > 0:
        cat_1 = matching_rows.sample(n=rows_to_take).reset_index(drop=True)
    else:
        cat_1 = matching_rows.iloc[0:0]

    complaint_blocks = []
    for index, row in cat_1.iterrows():
        if pd.isna(row.crimeaditionalinfo):
            continue
        # str() keeps the concatenation safe if a complaint was parsed as a non-string value.
        complaint_blocks.append(f"\nComplaint {index+1}:\n" + str(row.crimeaditionalinfo) + "\n\n")
    complaints = "".join(complaint_blocks)

    processed = f"""You are a cybersecurity incident analyst tasked with summarizing (in pure text) multiple user complaints about potential cyber issues written in english or hinglish. The complaints are written by users in India and the context of the complains are exclusively Indian. Your goal is to create a concise summary that captures the main concerns and patterns across all complaints while retaining the essential details.

Please provide a summary of the following user complaints, highlighting:

1. Common issues or themes
2. Specific types of cyber threats mentioned
3. Any patterns in affected systems or devices
4. Potential impact on users or organizations
5. Any unusual or standout complaints

Instructions to follow:
1. Please respond only with text, NO markdown.
2. If the language is not english then preserve the language elements.

Combine similar complaints and present the information in a clear, organized manner. Your summary should give a comprehensive overview of the situation without losing important details from individual complaints. 

User complaints:

{complaints}
    """

    return processed

In [13]:
def genai_predict(input_text):
    """Generate text with Gemini (``gemini-1.5-flash``) and strip asterisks from it.

    Args:
        input_text: Prompt passed to ``generate_content``.

    Returns:
        tuple: ``(text, error)`` where ``text`` is the first part of the first
        candidate with every ``*`` removed, and ``error`` is the response's
        ``_error`` attribute (``None`` when the response has no such attribute).
    """
    model = genai.GenerativeModel('gemini-1.5-flash')
    response = model.generate_content(input_text)
    res_text = response.candidates[0].content.parts[0].text
    # str.replace returns a new string, so the result has to be assigned back.
    res_text = res_text.replace("*","")
    # `_error` is a private attribute of the response; read it defensively.
    return res_text,getattr(response, "_error", None)

In [14]:
def groq_predict(input_text):
    """Generate text through the Groq chat-completions client and strip asterisks from it.

    Args:
        input_text: Prompt sent as the user message.

    Returns:
        tuple: ``(text, None)``. The second element is always ``None``; it is
        there so the return shape matches ``genai_predict``.
    """
    # system_instruction = "Do not respond in markdown, respond only with pure text, it can have paragraphs."
    system_instruction = ""
    chat_completion = groq_client.chat.completions.create(
        messages=[
            {
                "role": "system",
                "content": system_instruction,
            },
            {
                "role": "user",
                "content": input_text,
            },
        ],
        model="llama-3.2-90b-text-preview",
    )
    res_text = chat_completion.choices[0].message.content
    # str.replace returns a new string, so the result has to be assigned back.
    res_text = res_text.replace("*","")
    return res_text,None

In [15]:
def sample_and_summarise_categories(category,sub_category,sample_size=10,llm_channel="genai"):
    """Summarise sampled complaints of one label pair and store the summary's embedding.

    A prompt is built from up to ``sample_size`` training complaints, sent to the
    chosen LLM, and the returned summary is embedded with
    ``models/text-embedding-004`` and added to ``embeddings_collection_summed``
    under a fresh random id.

    Args:
        category: Category label, stored as ``category`` metadata.
        sub_category: Sub-category label, stored as ``scategory`` metadata.
        sample_size: Number of complaints to sample for the prompt.
        llm_channel: ``"genai"`` or ``"groq"``.

    Raises:
        ValueError: If ``llm_channel`` is not one of the supported values.
    """
    predictors = {"genai": genai_predict, "groq": groq_predict}
    if llm_channel not in predictors:
        raise ValueError(f"Unsupported llm_channel {llm_channel!r}; expected 'genai' or 'groq'")

    # Forward sample_size so the caller's value controls how many complaints are sampled.
    prompt = prepare_paragraph(category,sub_category,sample_size)
    summary_,err = predictors[llm_channel](prompt)

    if err is not None:
        print(err)

    documents = [summary_]
    metadatas = [{"category": category, "scategory": sub_category}]
    document_ids = [str(uuid.uuid4())]

    result = genai.embed_content(model="models/text-embedding-004", content=documents)

    print("inserting results in summed collection")
    embeddings = result["embedding"]
    embeddings_collection_summed.add(
        documents=documents,
        metadatas=metadatas,
        ids=document_ids,
        embeddings=embeddings,
    )
    print(f"data inserted")

In [None]:
# %%script true
# PLEASE READ
# This cell is meant to be executed ONCE: it stores one LLM-written summary embedding
# per (category, sub-category) pair in the summarised collection.
# The "%%script true" guard on the first line is commented out, so the cell RUNS as written;
# restore it as a real first line to skip the cell.
#
# prepare data based on summarisation
skip_categories = ['RapeGang Rape RGRSexually Abusive Content','Sexually Explicit Act','Sexually Obscene material','Child Pornography CPChild Sexual Abuse Material CSAM']
# Maps each category to the sorted list of its distinct sub-categories.
categories = train_data.groupby('category')['sub_category'].agg(lambda x: sorted(set(x))).to_dict()
# sub_categories = train_data.sub_category.unique()
failures = []  # (category, sub_category) pairs whose summarisation raised

for cat in categories:
    # llm_channel = "genai"
    llm_channel = "groq"
    # NOTE(review): both branches select "groq", so skip_categories currently has no effect;
    # presumably the default was meant to be "genai" - confirm.
    if cat in skip_categories:
        llm_channel = "groq"
    for scat in categories[cat]:
        print(f"Doing for {cat} and {scat} using {llm_channel} ...")
        try:
            # One summary is generated per pair on each run of this cell.
            sample_and_summarise_categories(cat,scat,10,llm_channel=llm_channel)
            # Pause one second between requests.
            time.sleep(1)
        except Exception as e:
            # Best effort: report the error, remember the pair, and move on to the next one.
            print(e)
            print(f"FAILED!")
            failures.append((cat,scat))

In [None]:
# (category, sub_category) pairs whose summarisation raised in the loop above; empty when all succeeded.
failures

In [None]:
# TEST SINGLE
# Classify one randomly chosen test row against the summarised collection and
# print the expected labels next to the prediction.
test_idx = int(len(test_data)*random.random())
row = test_data.iloc[test_idx]
print(f"Index: {test_idx}")
print(f"Expected Category: {row.category}")
print(f"Expected Sub-category: {row.sub_category}")
print(f"Complaint: {row.crimeaditionalinfo}")
print()
# 10 is the sample_size (neighbours requested per query).
predict_category,predict_scategory,sampled_categories,sampled_scategories  = predict_categories(f"""The following complaint document written by a user:
{row.crimeaditionalinfo}""",10,collection=embeddings_collection_summed)
print(sampled_categories)
print(sampled_scategories)
print()
print(f"Predicted Category: {predict_category}")
print(f"Predicted Sub-category: {predict_scategory}")

In [None]:
# TEST ALL
# Evaluate predict_categories against the summarised collection on a random sample of test rows.
sample_size = 100
test_test_data = test_data.sample(n=sample_size).reset_index(drop=True)
failure_indexes = []               # every row that failed on category or sub-category
failure_indexes_category = []      # rows whose category was wrong
failure_indexes_scategory = []     # rows with the right category but wrong sub-category
failure_predicted_categories = []  # [category, sub-category] predicted for each failed row
for index, row in test_test_data.iterrows():
    print(f"{index+1}/{sample_size}",end='\r')
    document = f"""The following complaint document written by a user:
{row.crimeaditionalinfo}"""
    predict_category,predict_scategory,sampled_categories,sampled_scategories = predict_categories(document,10,collection=embeddings_collection_summed)
    
    if row.category == predict_category:
        if row.sub_category == predict_scategory or pd.isna(row.sub_category) or row.sub_category == None:
            # Full success; a missing expected sub-category also counts as a match.
            continue
        else:
            # print("Failed Sub-category---------")
            # print(f"Index: {index}")
            # print(f"Expected: {row.sub_category}")
            # print(f"Predicted: {predict_scategory}")
            # print("----------------------------")
            failure_indexes_scategory.append(index)
    else:
        print("Failed Category-------------")
        print(f"Index: {index}")
        print(f"Expected: {row.category}")
        print(f"Predicted: {predict_category}")
        print("----------------------------")
        failure_indexes_category.append(index)
    
    # Only reached for failures: successful rows hit `continue` above.
    failure_predicted_categories.append([predict_category,predict_scategory])
    failure_indexes.append(index)
        
# Sub-category failures are recorded only when the category matched, so the
# sub-category rate does not include rows that already failed on category.
print(f"Success Rate Overall: {100 - len(failure_indexes)/sample_size*100}%") 
print(f"Success Rate (Category only): {100 - len(failure_indexes_category)/sample_size*100}%") 
print(f"Success Rate (Sub-category only): {100 - len(failure_indexes_scategory)/sample_size*100}%") 

In [None]:
# Show the complaint text of the row at position 2 of the most recent `test_test_data` sample.
test_test_data.iloc[2].crimeaditionalinfo

# Embedding Average

In [72]:
# Maps each category to the sorted list of its distinct sub-categories.
categories = train_data.groupby('category')['sub_category'].agg(lambda x: sorted(set(x))).to_dict()
# Excludes this category from the averaged embeddings; raises KeyError if it is absent.
# NOTE(review): the reason for the exclusion is not recorded here - the original TODO is unresolved.
del categories['Any Other Cyber Crime'] # TODO: !!???

# categories = {"Child Pornography CPChild Sexual Abuse Material CSAM":['Child Pornography CPChild Sexual Abuse Material CSAM']}

In [125]:
%%script true
# >>>>PLEASE READ!!!!!<<<<
# THIS CELL EXECUTION SHOULD BE A ONE TIME PROCESS
# REMOVE THE ABOVE "%%SCRIPT TRUE" TO RUN THIS CELL
# THIS CELL PREPARES AND POPULATES ALL THE AVERGAE VECTOR EMBEDDINGS OF THE TRAIN DATA
# average_embeddings.json in the repo already contains the averaged data so no need to execute this cell
# 
final_embeddings = {}
step = 200
for cat in categories:
    for scat in categories[cat]:
        print(f"Doing for {cat} and {scat} ...")
        specific_data = train_data[(train_data.category == cat) & (train_data.sub_category == scat)]
        max_length = len(specific_data)
        embeddings_all = []
        for i in range(0, max_length, step):
            start_idx = i
            end_idx = i + step
            documents = []
            metadatas = []
            document_ids = []
            print(f"{start_idx}-{end_idx}/{max_length}",end='\r')
            for index, row in specific_data[start_idx:end_idx].iterrows():
                doc_ = f"""The following complaint document written by a user is categorized as '{row.category}' and sub-categorized as '{row.sub_category}':
{row.crimeaditionalinfo}"""
                documents.append(doc_)
                metadatas.append({"category": row.category, "scategory": row.sub_category})
                document_ids.append(str(uuid.uuid4()))

            result = genai.embed_content(model="models/text-embedding-004", content=documents)

            embeddings_all.extend(result['embedding'])
        print()
        # do mean
        average_embeddings = np.mean(embeddings_all,axis=0)
        if cat not in final_embeddings:
            final_embeddings[cat] = {}
        if scat not in final_embeddings[cat]:
            final_embeddings[cat][scat] = []
            
        final_embeddings[cat][scat] = average_embeddings.tolist()
    
# save to file
f = open("./average_embeddings.json",'w')
json.dump(final_embeddings,f)
f.close()

In [109]:
def get_closest_embeddings(query_embedding, embedding_list,n_sorted=10):
    """
    Rank a list of embeddings by cosine similarity to a query embedding.

    Args:
    query_embedding: The embedding to compare against (any 1-D sequence of numbers).
    embedding_list (list): Candidate embeddings to rank.
    n_sorted (int): How many top-ranked entries to include in the two ranked outputs.

    Returns:
    tuple: (closest_embedding, closest_index, similarity_score, top_entries, top_indices)
        where top_entries is a list of (embedding, index, similarity) tuples and
        top_indices is the matching array of indices, both in descending
        similarity order and limited to n_sorted items.
    """
    # cosine_similarity works on 2-D inputs: one query row against all candidate rows.
    query_row = np.array(query_embedding).reshape(1, -1)
    candidates = np.array(embedding_list)
    scores = cosine_similarity(query_row, candidates)[0]

    # argsort is ascending, so reverse it to put the most similar candidate first.
    ranking = np.argsort(scores)[::-1]
    top_indices = ranking[:n_sorted]
    best = ranking[0]

    top_entries = [(embedding_list[position], position, scores[position]) for position in top_indices]

    return embedding_list[best], best, scores[best], top_entries, top_indices


In [96]:
# read final embeddings and prepare list
# The context manager closes the file even if json.load raises.
with open("./average_embeddings.json",'r') as f:
    average_embeddings_dict = json.load(f)

# Two parallel lists: position k of the pair list labels the embedding at position k.
average_embeddings_embeddings_list = []
average_embeddings_pair_list = []
for category_name, sub_category_embeddings in average_embeddings_dict.items():
    for sub_category_name, embedding in sub_category_embeddings.items():
        average_embeddings_embeddings_list.append(embedding)
        average_embeddings_pair_list.append((category_name, sub_category_name))

In [97]:
def predict_from_average_embeddings(document,embeddings_list=average_embeddings_embeddings_list,pair_list=average_embeddings_pair_list,sample_size = 10):
    """Predict a (category, sub-category) pair from the averaged label embeddings.

    The document is embedded with ``models/text-embedding-004``, the
    ``sample_size`` most similar entries of ``embeddings_list`` are looked up, and
    the most common pair among the corresponding ``pair_list`` entries is returned.

    Args:
        document: Text to classify.
        embeddings_list: Candidate embeddings, parallel to ``pair_list``.
        pair_list: ``(category, sub_category)`` tuple for each candidate embedding.
        sample_size: Number of nearest candidates that take part in the vote.

    Returns:
        tuple: The winning ``(category, sub_category)`` pair.
    """
    response = genai.embed_content(model="models/text-embedding-004", content=[document])
    document_vector = response['embedding'][0]

    # Only the ranked indices (last element of the result) are needed here.
    nearest_indices = get_closest_embeddings(document_vector,embeddings_list,sample_size)[4]

    # Counter keeps first-seen order among equal counts, so ties go to the closest candidate.
    votes = Counter(pair_list[position] for position in nearest_indices)
    return votes.most_common(1)[0][0]
    

In [None]:
# TEST SINGLE
# Classify one randomly chosen test row with the averaged embeddings and
# print the expected labels next to the prediction.
test_idx = int(len(test_data)*random.random())
row = test_data.iloc[test_idx]
print(f"Index: {test_idx}")
print(f"Expected Category: {row.category}")
print(f"Expected Sub-category: {row.sub_category}")
print(f"Complaint: {row.crimeaditionalinfo}")
print()
# predicted_pair is a (category, sub_category) tuple.
predicted_pair  = predict_from_average_embeddings(f"""The following complaint document written by a user:
{row.crimeaditionalinfo}""",sample_size=10)
print()
print(f"Predicted Category: {predicted_pair[0]}")
print(f"Predicted Sub-category: {predicted_pair[1]}")

In [None]:
# TEST ALL
# Evaluate predict_from_average_embeddings on a random sample of test rows.
sample_size = 100
test_test_data = test_data.sample(n=sample_size).reset_index(drop=True)
failure_indexes = []               # every row that failed on category or sub-category
failure_indexes_category = []      # rows whose category was wrong
failure_indexes_scategory = []     # rows with the right category but wrong sub-category
failure_predicted_categories = []  # [category, sub-category] predicted for each failed row
for index, row in test_test_data.iterrows():
    print(f"{index+1}/{sample_size}",end='\r')
    document = f"""The following complaint document written by a user:
{row.crimeaditionalinfo}"""
    # predicted_pair is a (category, sub_category) tuple.
    predicted_pair = predict_from_average_embeddings(document,sample_size=10)
    predict_category = predicted_pair[0]
    predict_scategory= predicted_pair[1]
    if row.category == predict_category:
        if row.sub_category == predict_scategory or pd.isna(row.sub_category) or row.sub_category == None:
            # Full success; a missing expected sub-category also counts as a match.
            continue
        else:
            # print("Failed Sub-category---------")
            # print(f"Index: {index}")
            # print(f"Expected: {row.sub_category}")
            # print(f"Predicted: {predict_scategory}")
            # print("----------------------------")
            failure_indexes_scategory.append(index)
    else:
        print("Failed Category-------------")
        print(f"Index: {index}")
        print(f"Expected: {row.category}")
        print(f"Predicted: {predict_category}")
        print("----------------------------")
        failure_indexes_category.append(index)
    
    # Only reached for failures: successful rows hit `continue` above.
    failure_predicted_categories.append([predict_category,predict_scategory])
    failure_indexes.append(index)
        
# Sub-category failures are recorded only when the category matched, so the
# sub-category rate does not include rows that already failed on category.
print(f"Success Rate Overall: {100 - len(failure_indexes)/sample_size*100}%") 
print(f"Success Rate (Category only): {100 - len(failure_indexes_category)/sample_size*100}%") 
print(f"Success Rate (Sub-category only): {100 - len(failure_indexes_scategory)/sample_size*100}%") 