In [None]:
'''
Do multiple runs of experiment + confusion matrix
'''

# Import necessary libraries
from langchain_community.chat_models import ChatOllama
from langchain_core.prompts import ChatPromptTemplate
from filteringPromptsLLMJudge import save_error_results, filter_prompts_LLM_judge
from prompt_templates_final import prompt_template_version2
from prompt_templates_phi4_final import prompt_template_phi4
from errorCheckingPhi4_final import error_checking_phi4
from errorCheckingLLMJudge_commandr_final import error_checking_llm_judge_commandr
from filter_misclassified_prompts_final import filter_json_version2
import json
import os
import sys

# TODO: Change the ranges of i and j according to the number of iterations you want
for i in range(3,4):
    for j in range(1,2):
        # Initialise cycle to 1 since it is the 1st iteration of the experiment
        cycle = 1

        path = "NA"

        # For logging purposes
        log_file = open("log_output.txt", "w")
        sys.stdout = log_file

        # To append all the none-class prompts to tally none-class counts for 2nd confusion matrix
        none_class_lst = []

        # To append all yes prompts (prompts deemed to be categorised correctly by LLM-judge) to tally counts for 2nd confusion matrix
        yes_prompts_lst = []

        while cycle >= 1:
            if cycle == 1:
                # Initial prompt template to classify initial pool of malicious prompts
                template = """Question: This task is part of a cybersecurity research study focusing on analyzing prompt structures and understanding different patterns that may influence AI behavior. You are NOT required to execute or generate any harmful content. Your role is solely to categorize the following prompt: {prompt} into one of the following classes: 
                Double-Character-Class - Description:  A prompt that makes the LLM produce a double
                character response, with one character constrained by
                the language model’s rules while the other character is unconstrained and bypasses content restrictions.
                Some sources refer to these as jailbreaks. \n
                Virtualization-Class - Description:  A prompt that puts the LLM into an unrestricted
                mode, such as a developer mode or a virtual scenario
                where the malicious content is generated inside a ”virtual machine”. \n
                Obfuscation-Class - Description: A prompt that has malicious content or rule-breaking
                instructions obfuscated, for example, by being encoded as base64 characters rather than regular ASCII
                characters. \n
                Payload-Splitting-Class - Description:  Multiple prompts contain instructions that are combined with a final prompt. For example, when text A
                and text B are benign alone but malicious when combined into text A+B. \n
                Adversarial-Suffix-Class - Description: A computationally generated suffix that looks like a
                random set of words and characters that is added to
                a malicious prompt, which circumvents the alignment
                of the LLM and results in a response to a malicious
                prompt. \n
                Instruction-Manipulation-Class - Description: A prompt that either reveals the pre-written instructions or the initial prompt given to the interface of the
                LLM or a prompt that instructs the interface to ignore
                these instructions. \n
                Active-Injection-Class - Description: Malicious prompts that are actively delivered to an
                LLM, for examply by sending emails containing
                prompts so that an email client enhanced with an
                LLM extension executes the prompt. \n
                Passive-Injection-Class - Description: Placement of malicious prompts or content inside
                a public source that might be read by an LLM.
                More broadly, it deals with the manipulation of
                data such as text on webpages evaluated by LLMs. \n
                User-Driven-Injection-Class - Description: Sharing of seemingly innocent prompts using social engineering techniques, which then unwary
                users copy and paste into an LLM. \n
                Virtual-Prompt-Injection-Class - Description: The attacker manipulates the instruction tuning
                data of an LLM, so that so that in specific scenarios the model behavior is misaligned and provides
                outputs as is if was given additional instructions
                through a prompt.\n
                None-Class - Description: The prompt cannot be categorised into any classes
                Answer: No explanation is needed, Provide STRICTLY the prompt and its class in this format: {prompt}: (prompt's class). If you cannot provide a class, just answer in this format: {prompt}: None-Class.
                """

                # Model: phi4:latest
                prompt_template_Phi4 = ChatPromptTemplate.from_template(template)
                model = ChatOllama(model="phi4:latest")

                # Output directory for Outputs from phi4:latest
                output_phi_dir = f"Outputs for {cycle} iteration (PHI4)"

                # Function to save results periodically
                def save_results(results, output_file, output_dir):
                    """Write ``results`` as indented JSON to ``output_dir/output_file``.

                    The directory is created when missing and an existing file of the
                    same name is overwritten.

                    Args:
                        results: JSON-serialisable object to persist.
                        output_file: Name of the file to write.
                        output_dir: Directory that receives the file.

                    Returns:
                        The path of the written file.

                    Raises:
                        TypeError: If ``results`` is not JSON-serialisable. The file
                            on disk is left untouched in that case.
                    """
                    os.makedirs(output_dir, exist_ok=True)
                    full_path = os.path.join(output_dir, output_file)

                    # Serialise before opening: open(..., 'w') truncates immediately, so
                    # dumping straight into the handle would wipe the previous
                    # checkpoint whenever serialisation fails part-way.
                    serialised = json.dumps(results, indent=4)
                    with open(full_path, 'w', encoding='utf-8') as f:
                        f.write(serialised)

                    return full_path

                # Classify every prompt of the initial pool with phi4:latest and collect
                # the results in a single output file.
                with open('all_prompts_200subset.json', 'r') as file:
                    data = json.load(file)
                prompts = data["malicious_prompts"]

                # File name for Outputs from phi4:latest
                output_phi_file = f'output-{cycle}-iter.json'
                resultsPhi4 = []

                for prompt_text in prompts:
                    # Variables substituted into the prompt template
                    formatted_prompt = {"prompt": prompt_text}

                    try:
                        class_name = "NA"
                        error_flag = 1

                        # Keep re-prompting phi4:latest until error_checking_phi4 stops
                        # reporting error_flag == 1 for the extracted class name.
                        while True:
                            chain = prompt_template_Phi4 | model
                            response = chain.invoke(formatted_prompt)
                            content = response.content

                            print(f"phi4 content: {content}")

                            # The answer is expected as "Prompt: Class"; keep the text after
                            # the last colon and drop anything from the first full stop on.
                            class_name = content.rpartition(":")[2].strip().partition(".")[0].strip()

                            # Check for erroneous output
                            class_name, error_flag = error_checking_phi4(class_name)
                            print(class_name, error_flag)

                            if error_flag != 1:
                                break

                        resultsPhi4.append({"prompt": prompt_text, "class": class_name})

                        # Checkpoint after every prompt so progress survives an interruption
                        save_results(resultsPhi4, output_phi_file, output_phi_dir)

                    except Exception as e:
                        print(f"Error processing prompt '{prompt_text}': {e}")

                # Final save; also yields the path used by the following steps
                path = save_results(resultsPhi4, output_phi_file, output_phi_dir)

                print()
                print(f"Outputs from phi4:latest has been written to {path}.")

                # Both directory names carry the number of the iteration that just
                # finished, so they are built before the counter is advanced.
                # Directory for error file where phi4 has weird/illegal outputs
                output_error_dir_filter_prompts_LLMJudge = f"Outputs for {cycle} iteration (PHI4)"
                # Directory for prompts which phi4 has classified into their injection classes
                output_noerror_dir_filter_prompts_LLMJudge = f"Categorised Outputs for {cycle} iteration (LLM-judge)"

                cycle += 1

                print("Directory where outputs from phi4:latest would be categorised into their respective injection classes for LLM-judge approach: ", path)

                # File to be loaded for the next step of processing, which relies on the LLM-judge to sieve out misclassification
                target_file_filter_prompts_LLMJudge = path

                # Classified prompts from phi4:latest are split into one json file per
                # injection class, to be sent to the LLM-judge for further processing.
                filter_prompts_LLM_judge(
                    target_file_filter_prompts_LLMJudge,
                    output_error_dir_filter_prompts_LLMJudge,
                    output_noerror_dir_filter_prompts_LLMJudge,
                )

                # Initialisation of 1st confusion matrix
                # Class name -> row/column index in the confusion matrix
                injection_classes_dict = {
                    "Virtualization-Class": 0,
                    "Passive-Injection-Class": 1,
                    "User-Driven-Injection-Class": 2,
                    "Adversarial-Suffix-Class": 3,
                    "Payload-Splitting-Class": 4,
                    "Active-Injection-Class": 5,
                    "Double-Character-Class": 6,
                    "Obfuscation-Class": 7,
                    "Instruction-Manipulation-Class": 8,
                    "Virtual-Prompt-Injection-Class": 9,
                    "None-Class": 10,
                }

                # Square zero matrix sized from the class map, so adding or removing a
                # class cannot leave the matrix out of step with the indices above.
                num_classes = len(injection_classes_dict)
                first_confusion_matrix = [[0] * num_classes for _ in range(num_classes)]

                with open('labelled_prompts_200subset.json', 'r') as file:
                    labelled_data = json.load(file)
                    prompts = labelled_data["labelled_prompts"]

                with open(target_file_filter_prompts_LLMJudge, 'r') as file2:
                    predicted_data = json.load(file2)

                # Create a dictionary for quick lookup of predicted_class based on prompts
                predicted_dict = {entry["prompt"]: entry["class"] for entry in predicted_data}

                print()
                for item in prompts:
                    prompt = item["prompt"]
                    labelled_class = item["labelled_class"]

                    # Prompts whose phi4 call failed never reach the output file
                    predicted_class = predicted_dict.get(prompt, None)

                    if predicted_class is None:
                        print(f"Prompt not found in predicted data: {prompt}\n")
                    elif labelled_class == predicted_class:
                        index_first_confusion_matrix = injection_classes_dict[labelled_class]
                        first_confusion_matrix[index_first_confusion_matrix][index_first_confusion_matrix] += 1
                        print(f"Predicted class matches Labelled class\n")
                    else:
                        # Rows are indexed by the predicted class, columns by the labelled class
                        predicted_index_first_confusion_matrix = injection_classes_dict[predicted_class]
                        actual_index_first_confusion_matrix = injection_classes_dict[labelled_class]
                        first_confusion_matrix[predicted_index_first_confusion_matrix][actual_index_first_confusion_matrix] += 1
                        print(f"Mismatch: {prompt}\n  Labelled: {labelled_class}\n  Predicted: {predicted_class}\n")

                # Print out the 1st confusion matrix
                print("Confusion matrix for Approach 1: Categorising malicious prompts using phi4:latest (without LLM-judge)")
                for row in first_confusion_matrix:
                    print(row)

                # Correct predictions are the diagonal; the denominator is the number of
                # labelled prompts actually loaded rather than a fixed 200.
                true_positive_rate_1st_confusion_matrix = sum(
                    first_confusion_matrix[index][index] for index in range(num_classes)
                )
                print(f"True Positive Rate: {true_positive_rate_1st_confusion_matrix}/{len(prompts)}")
                print()
            
                # Directory holding the per-class files produced from the phi4 output
                LLM_judge_directory = output_noerror_dir_filter_prompts_LLMJudge

                # One input file per injection class, in the order the LLM-judge processes them
                LLM_judge_files = [
                    "active_injection.json",
                    "adversarial_suffix.json",
                    "double_character.json",
                    "instruction_manipulation.json",
                    "obfuscation.json",
                    "passive_injection.json",
                    "payload_splitting.json",
                    "user_driven_injection.json",
                    "virtual_prompt_injection.json",
                    "virtualization.json",
                ]

                # Individual names are kept because later code compares against them
                (
                    active_injection_LLMJudge_filepath,
                    adversarial_suffix_LLMJudge_filepath,
                    double_character_LLMJudge_filepath,
                    instruction_manipulation_LLMJudge_filepath,
                    obfuscation_LLMJudge_filepath,
                    passive_injection_LLMJudge_filepath,
                    payload_splitting_LLMJudge_filepath,
                    user_driven_injection_LLMJudge_filepath,
                    virtual_prompt_injection_LLMJudge_filepath,
                    virtualization_LLMJudge_filepath,
                ) = LLM_judge_files

                # Initialise none-class filepath to tally none-class count for 2nd confusion matrix
                none_class_LLMJudge_filepath = "none_class.json"
                LLM_judge_none_class_input_filepath = os.path.join(LLM_judge_directory, none_class_LLMJudge_filepath)

                # Best-effort load: a failure here must not abort the experiment, but it
                # is reported instead of being swallowed silently.
                try:
                    with open(LLM_judge_none_class_input_filepath, 'r') as infile:
                        print(f"\nLoading {LLM_judge_none_class_input_filepath} ....")
                        data = json.load(infile)
                        data = data["prompts"]

                        # Accumulate across cycles; the list is created once per run
                        none_class_lst.extend(data)

                    # Sanity check
                    print("Sanity check")
                    print(none_class_lst)
                    print()

                except FileNotFoundError:
                    # presumably no prompt was put into None-Class this cycle — TODO confirm
                    # against filter_prompts_LLM_judge
                    print(f"\n{LLM_judge_none_class_input_filepath} not found, skipping None-Class tally.")
                except Exception as e:
                    print(f"\nCould not load {LLM_judge_none_class_input_filepath}: {e}")

                template = ""

                # Retrieving the prompt template for the LLM-judge, tailored to the injection class that phi4:latest assigned to the prompt
                for LLM_judge_input_file in LLM_judge_files:
                    # Letter code passed to prompt_template_version2 for each class file
                    judge_class_codes = {
                        active_injection_LLMJudge_filepath: "F",
                        adversarial_suffix_LLMJudge_filepath: "D",
                        double_character_LLMJudge_filepath: "G",
                        instruction_manipulation_LLMJudge_filepath: "I",
                        obfuscation_LLMJudge_filepath: "H",
                        passive_injection_LLMJudge_filepath: "B",
                        payload_splitting_LLMJudge_filepath: "E",
                        user_driven_injection_LLMJudge_filepath: "C",
                        virtual_prompt_injection_LLMJudge_filepath: "J",
                        virtualization_LLMJudge_filepath: "A",
                    }

                    # An unknown file name leaves the previous template in place
                    if LLM_judge_input_file in judge_class_codes:
                        injection_class = judge_class_codes[LLM_judge_input_file]
                        template = prompt_template_version2(injection_class)

                    prompt_template_LLM_judge = ChatPromptTemplate.from_template(template)

                    # Optimal set of hyperparameters, determined by experiments conducted (it is elaborated in the report)
                    temperature = 0.1
                    top_p = 0.7
                    presence_penalty = 1.6
                    frequency_penalty = 1.4

                    model_LLM_judge = ChatOllama(
                        model="command-r:latest",
                        temperature=temperature,
                        top_p=top_p,
                        presence_penalty=presence_penalty,
                        frequency_penalty=frequency_penalty,
                    )

                    # Input file of the class currently being judged
                    LLM_judge_input_filepath = os.path.join(LLM_judge_directory, LLM_judge_input_file)

                    # Directory for LLM-judge outputs, created on first use
                    LLM_judge_output_dir = f"LLM-judge {cycle-1} outputs"
                    os.makedirs(LLM_judge_output_dir, exist_ok=True)

                    # Send every prompt of one class file to the LLM-judge, stream the
                    # verdicts into a JSON array on disk, then split them into yes/no files.
                    try:
                        with open(LLM_judge_input_filepath, 'r') as file:
                            data = json.load(file)

                        # Extract prompts to be processed by the LLM-judge
                        malicious_prompts = data.get("prompts", [])

                        injection_class_name = LLM_judge_input_file.split(".")[0]

                        # Naming convention: {injection-class}-with-COT-definition-first.json
                        injection_class_filename = f"{injection_class_name}-with-COT-definition-first.json"
                        output_file_path = os.path.join(LLM_judge_output_dir, injection_class_filename)

                        # The template/model pipeline is the same for every prompt in this file
                        chain = prompt_template_LLM_judge | model_LLM_judge

                        # 'w', not 'a': the array is written from its opening bracket, so
                        # appending to a file left by an earlier run yields "[...][...]",
                        # which is not valid JSON.
                        with open(output_file_path, 'w', encoding='utf-8') as output_file:
                            output_file.write('[')  # Start the JSON array
                            records_written = 0

                            # Iterate through each malicious prompt and pass it to the LLM-judge for validation
                            for idx, prompt_text in enumerate(malicious_prompts):
                                formatted_prompt = {
                                    "prompt": prompt_text,
                                }

                                try:
                                    error_flag = 1

                                    # Re-prompt until the checker no longer flags the output
                                    while error_flag == 1:
                                        response = chain.invoke(formatted_prompt)
                                        content = response.content

                                        # The judge model is command-r, not phi4
                                        print(f"LLM-judge content: {content}")

                                        # Checking LLM-judge outputs for erroneous outputs
                                        error_flag = error_checking_llm_judge_commandr(content)

                                    # NOTE(review): prompt_text is indexed as a dict here while the
                                    # whole object is passed to the template above — confirm the
                                    # schema written by filter_prompts_LLM_judge.
                                    resultLLMJudge = {
                                        "prompt": prompt_text['prompt'],
                                        "response": content
                                    }

                                    # Serialise first and separate by records actually written,
                                    # so a failed prompt can never leave a stray comma behind.
                                    serialised = json.dumps(resultLLMJudge)
                                    if records_written:
                                        output_file.write(',')
                                    output_file.write(serialised)
                                    records_written += 1

                                except Exception as e:
                                    print(f"Error processing prompt {idx}: {e}")

                            output_file.write(']')  # End the JSON array

                        print(f"Outputs from LLM-Judge are incrementally written to {output_file_path}")

                        LLM_judge_yes_output_file_path = f"Categorised Outputs for {cycle} iteration (PHI4)/{injection_class_name}-yes.json"
                        LLM_judge_no_output_file_path = f"Categorised Outputs for {cycle} iteration (PHI4)/{injection_class_name}-no.json"

                        # File stem -> class label; unknown stems are passed through unchanged
                        class_labels = {
                            "active_injection": "Active-Injection-Class",
                            "adversarial_suffix": "Adversarial-Suffix-Class",
                            "double_character": "Double-Character-Class",
                            "instruction_manipulation": "Instruction-Manipulation-Class",
                            "obfuscation": "Obfuscation-Class",
                            "passive_injection": "Passive-Injection-Class",
                            "payload_splitting": "Payload-Splitting-Class",
                            "user_driven_injection": "User-Driven-Injection-Class",
                            "virtual_prompt_injection": "Virtual-Prompt-Injection-Class",
                            "virtualization": "Virtualization-Class",
                        }
                        injection_class_name = class_labels.get(injection_class_name, injection_class_name)

                        # Split the verdicts into a "yes" file (judge agrees with phi4) and a
                        # "no" file (judge disagrees); the call returns the agreed prompts.
                        correctly_categorised_prompts_lst = filter_json_version2(output_file_path, LLM_judge_yes_output_file_path, LLM_judge_no_output_file_path, injection_class_name)

                        # Appending the correctly categorised prompts by LLM-judge to the main list
                        yes_prompts_lst.extend(correctly_categorised_prompts_lst)

                    except FileNotFoundError as e:
                        # Still non-fatal, but no longer silent
                        print(f"Skipping {LLM_judge_input_file}: file not found ({e.filename})")
                    except Exception as e:
                        print(f"Error while running the LLM-judge on {LLM_judge_input_file}: {e}")

            # 2nd iteration and onwards
            else:                
                # Files holding the prompts the LLM-judge marked as misclassified, one per class
                Phi_files = [
                    "active_injection-no.json",
                    "adversarial_suffix-no.json",
                    "double_character-no.json",
                    "instruction_manipulation-no.json",
                    "obfuscation-no.json",
                    "passive_injection-no.json",
                    "payload_splitting-no.json",
                    "user_driven_injection-no.json",
                    "virtual_prompt_injection-no.json",
                    "virtualization-no.json",
                ]

                # Individual names are kept alongside the list
                (
                    active_injection_Phi_filepath,
                    adversarial_suffix_Phi_filepath,
                    double_character_Phi_filepath,
                    instruction_manipulation_Phi_filepath,
                    obfuscation_Phi_filepath,
                    passive_injection_Phi_filepath,
                    payload_splitting_Phi_filepath,
                    user_driven_injection_Phi_filepath,
                    virtual_prompt_injection_Phi_filepath,
                    virtualization_Phi_filepath,
                ) = Phi_files

                # Reset; set again once a re-classification file has been written
                path = ""

                print("\n")

                # Retrieving prompts templates for phi4:latest based on the injection classes that it was deemed to be misclassified by the LLM-judge
                for Phi_input_file in Phi_files:
                    # Letter code passed to prompt_template_phi4 for each "-no" file
                    phi_class_codes = {
                        "active_injection-no.json": "F",
                        "adversarial_suffix-no.json": "D",
                        "double_character-no.json": "G",
                        "instruction_manipulation-no.json": "I",
                        "obfuscation-no.json": "H",
                        "passive_injection-no.json": "B",
                        "payload_splitting-no.json": "E",
                        "user_driven_injection-no.json": "C",
                        "virtual_prompt_injection-no.json": "J",
                        "virtualization-no.json": "A",
                    }

                    # An unknown file name leaves the previous template in place
                    if Phi_input_file in phi_class_codes:
                        injection_class = phi_class_codes[Phi_input_file]
                        template = prompt_template_phi4(injection_class)

                    prompt_template_Phi4 = ChatPromptTemplate.from_template(template)
                    model_Phi = ChatOllama(model="phi4:latest")

                    # Directory that receives this cycle's phi4 outputs
                    output_phi_dir = f"Outputs for {cycle} iteration (PHI4)"

                    # Function to save results periodically
                    def save_results(results, output_file, output_dir):
                        """Write ``results`` as indented JSON to ``output_dir/output_file``.

                        The directory is created when missing and an existing file of
                        the same name is overwritten.

                        Args:
                            results: JSON-serialisable object to persist.
                            output_file: Name of the file to write.
                            output_dir: Directory that receives the file.

                        Returns:
                            The path of the written file.

                        Raises:
                            TypeError: If ``results`` is not JSON-serialisable. The
                                file on disk is left untouched in that case.
                        """
                        os.makedirs(output_dir, exist_ok=True)

                        full_path = os.path.join(output_dir, output_file)

                        # Serialise before opening: open(..., 'w') truncates immediately,
                        # so dumping straight into the handle would wipe the previous
                        # checkpoint whenever serialisation fails part-way.
                        serialised = json.dumps(results, indent=4)
                        with open(full_path, 'w', encoding='utf-8') as f:
                            f.write(serialised)

                        return full_path

                    # Re-classify with phi4:latest the prompts of one class that the
                    # LLM-judge marked as misclassified.
                    input_Phi_dir = f"Categorised Outputs for {cycle} iteration (PHI4)"
                    input_Phi_filepath = os.path.join(input_Phi_dir, Phi_input_file)

                    try:
                        with open(input_Phi_filepath, 'r') as file:
                            data = json.load(file)

                        # Output file is named after the class part of the input file name;
                        # results always start empty, no earlier checkpoint is loaded.
                        output_injection_class = Phi_input_file.split("-")[0]

                        output_phi_file = f'{output_injection_class}-{cycle}-iter.json'
                        resultsPhi4 = []

                        print()

                        # The template/model pipeline is the same for every prompt and retry
                        chain = prompt_template_Phi4 | model_Phi

                        for prompt_text in data:
                            # Each entry is indexed by its "prompt" key
                            prompt_text = prompt_text["prompt"]
                            formatted_prompt = {
                                "prompt": prompt_text,
                            }

                            try:
                                class_name = "NA"
                                error_flag = 1

                                # Re-prompt until error_checking_phi4 stops flagging the output
                                while error_flag == 1:
                                    response = chain.invoke(formatted_prompt)
                                    content = response.content

                                    print(f"phi4 content: {content}")

                                    # Keep the text after the last colon, up to the first full stop
                                    class_name = content.split(":")[-1].strip()
                                    class_name = class_name.split(".")[0].strip()

                                    class_name, error_flag = error_checking_phi4(class_name)
                                    print(class_name, error_flag)

                                # Append the result to the results list
                                resultsPhi4.append({
                                    "prompt": prompt_text,
                                    "class": class_name
                                })

                                # Save progress after each iteration
                                save_results(resultsPhi4, output_phi_file, output_phi_dir)

                            except Exception as e:
                                print(f"Error processing prompt '{prompt_text}': {e}")

                        # Final save
                        path = save_results(resultsPhi4, output_phi_file, output_phi_dir)

                        print(f"Outputs from phi4:latest has been written to {path}.")

                    except FileNotFoundError as e:
                        # Still non-fatal, but no longer silent
                        print(f"Skipping {Phi_input_file}: file not found ({e.filename})")
                    except Exception as e:
                        print(f"Error while re-classifying prompts from {input_Phi_filepath}: {e}")

                # Merge every per-class phi4 output of this cycle into one main.json
                # Directory containing output from phi4
                directory_path = output_phi_dir

                combined_data = []

                output_file = "main.json"
                output_file_full_path = os.path.join(output_phi_dir, output_file)
                combined_abs_path = os.path.abspath(output_file_full_path)

                # Iterate through the directory
                for root, _, files in os.walk(directory_path):
                    for file_name in files:
                        if not file_name.endswith(".json"):  # Only process JSON files
                            continue

                        file_path = os.path.join(root, file_name)

                        # The combined file lives in the directory being walked; a copy
                        # left by an earlier run must not be merged into itself, or
                        # every record would be duplicated.
                        # NOTE(review): other stale files in this directory are still
                        # merged — confirm what filter_prompts_LLM_judge writes here.
                        if os.path.abspath(file_path) == combined_abs_path:
                            continue

                        print(f"Processing file: {file_path}")

                        # Read the JSON file
                        with open(file_path, 'r', encoding='utf-8') as file:
                            try:
                                data = json.load(file)  # Load JSON data
                                if isinstance(data, list):
                                    combined_data.extend(data)  # Append list to combined_data
                                else:
                                    print(f"Unexpected format in {file_path}, skipping.")
                            except json.JSONDecodeError:
                                print(f"Error decoding JSON in {file_path}, skipping.")

                # Write the prompts re-classified by phi4:latest into one common file
                with open(output_file_full_path, 'w', encoding='utf-8') as output:
                    json.dump(combined_data, output, ensure_ascii=False, indent=2)

                print(f"Combined data written to {output_file_full_path}\n")

                # Split the combined phi4 output into per-class files for the LLM-judge
                categorised_outputs_LLM_judge_dir = f"Categorised Outputs for {cycle} iteration (LLM-judge)"

                filter_prompts_LLM_judge(
                    output_file_full_path,
                    output_phi_dir,
                    categorised_outputs_LLM_judge_dir,
                )

                LLM_judge_directory = categorised_outputs_LLM_judge_dir

                # One input file per injection class, in the order the LLM-judge processes them
                LLM_judge_files = [
                    "active_injection.json",
                    "adversarial_suffix.json",
                    "double_character.json",
                    "instruction_manipulation.json",
                    "obfuscation.json",
                    "passive_injection.json",
                    "payload_splitting.json",
                    "user_driven_injection.json",
                    "virtual_prompt_injection.json",
                    "virtualization.json",
                ]

                # Individual names are kept alongside the list
                (
                    active_injection_LLMJudge_filepath,
                    adversarial_suffix_LLMJudge_filepath,
                    double_character_LLMJudge_filepath,
                    instruction_manipulation_LLMJudge_filepath,
                    obfuscation_LLMJudge_filepath,
                    passive_injection_LLMJudge_filepath,
                    payload_splitting_LLMJudge_filepath,
                    user_driven_injection_LLMJudge_filepath,
                    virtual_prompt_injection_LLMJudge_filepath,
                    virtualization_LLMJudge_filepath,
                ) = LLM_judge_files

                # Initialise none-class filepath to tally none-class count for 2nd confusion matrix
                none_class_LLMJudge_filepath = "none_class.json"
                LLM_judge_none_class_input_filepath = os.path.join(LLM_judge_directory, none_class_LLMJudge_filepath)

                # Tally the none-class prompts of this cycle. This stays best-effort:
                # a missing or malformed none-class file must not abort the cycle, but
                # the reason is reported instead of being swallowed, and unrelated
                # errors (e.g. KeyboardInterrupt) are no longer hidden.
                try:
                    with open(LLM_judge_none_class_input_filepath, 'r') as infile:
                        print(f"\nLoading {LLM_judge_none_class_input_filepath} ....")
                        data = json.load(infile)

                    # The entries to tally live under the "prompts" key
                    data = data["prompts"]
                    none_class_lst.extend(data)

                    # Sanity check
                    print("Sanity check")
                    print(none_class_lst)
                    print()
                except (OSError, ValueError, KeyError, TypeError) as none_class_error:
                    # OSError: file missing/unreadable; ValueError: invalid JSON or encoding;
                    # KeyError/TypeError: the document does not have the expected "prompts" entry.
                    print(f"Skipping none-class tally for {LLM_judge_none_class_input_filepath}: {none_class_error}")

                template = ""

                for LLM_judge_input_file in LLM_judge_files:
                    # Letter code passed to prompt_template_version2 for each class file.
                    class_code_by_file = {
                        "active_injection.json": "F",
                        "adversarial_suffix.json": "D",
                        "double_character.json": "G",
                        "instruction_manipulation.json": "I",
                        "obfuscation.json": "H",
                        "passive_injection.json": "B",
                        "payload_splitting.json": "E",
                        "user_driven_injection.json": "C",
                        "virtual_prompt_injection.json": "J",
                        "virtualization.json": "A",
                    }

                    # Unknown file names leave injection_class/template untouched,
                    # exactly as an if/elif chain without a final else would.
                    if LLM_judge_input_file in class_code_by_file:
                        injection_class = class_code_by_file[LLM_judge_input_file]
                        template = prompt_template_version2(injection_class)

                    # Sampling settings handed to the LLM-judge model
                    temperature = 0.1
                    top_p = 0.7
                    presence_penalty = 1.6
                    frequency_penalty = 1.4

                    # Prompt template for the current class and the judge model itself
                    prompt_template_LLM_judge = ChatPromptTemplate.from_template(template)
                    model_LLM_judge = ChatOllama(
                        model="command-r:latest",
                        temperature=temperature,
                        top_p=top_p,
                        top_k=80,
                        presence_penalty=presence_penalty,
                        frequency_penalty=frequency_penalty,
                    )

                    # Input file for this class and the per-cycle directory for judge outputs
                    LLM_judge_input_filepath = os.path.join(LLM_judge_directory, LLM_judge_input_file)
                    LLM_judge_output_dir = f"LLM-judge {cycle} outputs"

                    # Make sure both directories exist before anything is read or written
                    for required_dir in (LLM_judge_output_dir, LLM_judge_directory):
                        os.makedirs(required_dir, exist_ok=True)

                    # Validate every prompt of the current class file with the LLM-judge,
                    # write the raw verdicts to a JSON array, then split them into
                    # "-yes"/"-no" files for the next cycle via filter_json_version2.
                    # The whole step is best-effort per class file: any failure is
                    # reported and the loop moves on to the next class.
                    try:
                        with open(LLM_judge_input_filepath, 'r') as file:
                            data = json.load(file)

                        malicious_prompts = data.get("prompts", [])

                        injection_class_name = LLM_judge_input_file.split(".")[0]

                        injection_class_filename = f"{injection_class_name}-with-COT-definition-first.json"
                        output_file_path = os.path.join(LLM_judge_output_dir, injection_class_filename)

                        # The chain is the same for every prompt and every retry, so build it once.
                        chain = prompt_template_LLM_judge | model_LLM_judge

                        # Write results incrementally.
                        # Mode 'w' (not 'a'): the file is written as one complete JSON
                        # array, so appending to a file left over from an earlier run
                        # would produce "[...][...]", which is not valid JSON.
                        with open(output_file_path, 'w', encoding='utf-8') as output_file:
                            output_file.write('[')  # Start the JSON array

                            # Number of results actually written; drives the comma
                            # separator so a failed prompt can never leave "[," behind.
                            results_written = 0

                            # Iterate through each malicious prompt and pass it to the LLM-judge for validation
                            for idx, prompt_text in enumerate(malicious_prompts):
                                formatted_prompt = {
                                    "prompt": prompt_text,
                                }

                                try:
                                    error_flag = 1

                                    # Re-ask the judge until error_checking_llm_judge_commandr
                                    # stops returning 1 for the response.
                                    while error_flag == 1:
                                        response = chain.invoke(formatted_prompt)
                                        content = response.content

                                        # NOTE(review): the label says "phi4" although this is the
                                        # LLM-judge chain's response; text kept unchanged so logs
                                        # stay comparable with earlier runs.
                                        print(f"phi4 content: {content}")

                                        error_flag = error_checking_llm_judge_commandr(content)

                                    resultLLMJudge = {
                                        "prompt": prompt_text['prompt'],
                                        "response": content
                                    }

                                    # Serialise first so a failure cannot leave a dangling
                                    # comma or half-written object in the output file.
                                    serialised_result = json.dumps(resultLLMJudge)
                                    if results_written > 0:
                                        output_file.write(',')
                                    output_file.write(serialised_result)
                                    results_written += 1

                                except Exception as e:
                                    print(f"Error processing prompt {idx}: {e}")

                            output_file.write(']')  # End the JSON array

                        print(f"Outputs from LLM-Judge are incrementally written to {output_file_path}")

                        LLM_judge_yes_output_file_path = f"Categorised Outputs for {cycle+1} iteration (PHI4)/{injection_class_name}-yes.json"
                        LLM_judge_no_output_file_path = f"Categorised Outputs for {cycle+1} iteration (PHI4)/{injection_class_name}-no.json"

                        # Translate the file stem into the class label passed to filter_json_version2.
                        class_label_by_stem = {
                            "active_injection": "Active-Injection-Class",
                            "adversarial_suffix": "Adversarial-Suffix-Class",
                            "double_character": "Double-Character-Class",
                            "instruction_manipulation": "Instruction-Manipulation-Class",
                            "obfuscation": "Obfuscation-Class",
                            "passive_injection": "Passive-Injection-Class",
                            "payload_splitting": "Payload-Splitting-Class",
                            "user_driven_injection": "User-Driven-Injection-Class",
                            "virtual_prompt_injection": "Virtual-Prompt-Injection-Class",
                            "virtualization": "Virtualization-Class",
                        }
                        # Unknown stems are passed through unchanged.
                        injection_class_name = class_label_by_stem.get(injection_class_name, injection_class_name)

                        correctly_categorised_prompts_lst = filter_json_version2(output_file_path, LLM_judge_yes_output_file_path, LLM_judge_no_output_file_path, injection_class_name)

                        yes_prompts_lst.extend(correctly_categorised_prompts_lst)

                    except Exception as e:
                        # Still best-effort (e.g. no input file for this class in this
                        # cycle), but say why the class was skipped instead of hiding it.
                        print(f"Skipping {LLM_judge_input_file}: {e}")

                # Important: Stopping conditions to stop the iterations
                # Case 1: There are no files in "Categorised Outputs for xxx+1 iteration (PHI4)"
                terminate_iteration_dir = f"Categorised Outputs for {cycle+1} iteration (PHI4)"
                if not os.path.exists(terminate_iteration_dir) or not os.listdir(terminate_iteration_dir):
                    print(f"No files in {terminate_iteration_dir}. Stopping iteration.")
                    print("Prompts that were deemed to be categorised correctly by LLM-Judge:")
                    print(yes_prompts_lst)

                    # Initialisation of 2nd confusion matrix
                    # Create an 11x11 matrix initialized with zeros
                    second_confusion_matrix = [[0 for _ in range(11)] for _ in range(11)]

                    with open('labelled_prompts_200subset.json', 'r') as file:
                        labelled_data = json.load(file)
                        prompts = labelled_data["labelled_prompts"]

                    # Combining the 2 lists declared at the very start of the code: 
                    none_class_lst.extend(yes_prompts_lst)

                    # Writing all the correctly categorised prompts deemed by LLM-judge into a common file
                    with open("2nd_confusion_matrix.json", 'w') as matrix_file:
                        json.dump(none_class_lst, matrix_file, indent=4)

                    # Reading the correctly categorised prompts deemed by LLM-judge
                    with open("2nd_confusion_matrix.json", 'r') as matrix_file:
                        predicted_data = json.load(matrix_file)

                    # Create a dictionary for quick lookup of predicted_class based on prompts
                    predicted_dict = {entry["prompt"]: entry["class"] for entry in predicted_data}

                    injection_classes_dict = {"Virtualization-Class":0, "Passive-Injection-Class":1, "User-Driven-Injection-Class":2, "Adversarial-Suffix-Class":3, "Payload-Splitting-Class":4, "Active-Injection-Class":5, "Double-Character-Class":6, "Obfuscation-Class":7, "Instruction-Manipulation-Class": 8, "Virtual-Prompt-Injection-Class": 9, "None-Class": 10}

                    for item in labelled_data["labelled_prompts"]:
                        prompt = item["prompt"]
                        labelled_class = item["labelled_class"]

                        # Check if the prompt exists in the predicted data
                        predicted_class = predicted_dict.get(prompt, None)

                        if predicted_class is None:
                             print(f"Prompt not found in predicted data: {prompt}")
                        elif labelled_class == predicted_class:
                            index_second_confusion_matrix = injection_classes_dict[labelled_class]
                            second_confusion_matrix[index_second_confusion_matrix][index_second_confusion_matrix] += 1
                            print(f"Predicted class matches Labelled class")
                        else:
                            predicted_index_second_confusion_matrix  = injection_classes_dict[predicted_class]
                            actual_index_second_confusion_matrix = injection_classes_dict[labelled_class]
                            second_confusion_matrix[predicted_index_second_confusion_matrix][actual_index_second_confusion_matrix] += 1
                            print(f"Mismatch: {prompt}\n  Labelled: {labelled_class}\n  Predicted: {predicted_class}\n")

                    # Print out the 2nd confusion matrix
                    print("Confusion matrix for Approach 2: Categorising malicious prompts using phi4:latest (with LLM-judge)")
                    for row in second_confusion_matrix:
                        print(row)
                    true_positive_rate_2nd_confusion_matrix = 0
                    for index in range(11):
                        true_positive_rate_2nd_confusion_matrix += second_confusion_matrix[index][index]
                    print(f"True Positive Rate: {true_positive_rate_2nd_confusion_matrix}/200")

                    break
                # Case 2: There is no directory for "Categorised Outputs for xxx+1 iteration (PHI4)"
                if not os.path.exists(terminate_iteration_dir):
                    print(f"Directory {terminate_iteration_dir} does not exist. Stopping iteration.")
                    print("Prompts that were deemed to be categorised correctly by LLM-Judge:")
                    print(yes_prompts_lst)

                    # Initialisation of 2nd confusion matrix
                    # Create an 11x11 matrix initialized with zeros
                    second_confusion_matrix = [[0 for _ in range(11)] for _ in range(11)]

                    with open('labelled_prompts_200subset.json', 'r') as file:
                        labelled_data = json.load(file)
                        prompts = labelled_data["labelled_prompts"]

                    # Combining the 2 lists declared at the very start of the code: 
                    none_class_lst.extend(yes_prompts_lst)

                    # Writing all the correctly categorised prompts deemed by LLM-judge into a common file
                    with open("2nd_confusion_matrix.json", 'w') as matrix_file:
                        json.dump(none_class_lst, matrix_file, indent=4)

                    # Reading the correctly categorised prompts deemed by LLM-judge
                    with open("2nd_confusion_matrix.json", 'r') as matrix_file:
                        predicted_data = json.load(matrix_file)

                    # Create a dictionary for quick lookup of predicted_class based on prompts
                    predicted_dict = {entry["prompt"]: entry["class"] for entry in predicted_data}

                    injection_classes_dict = {"Virtualization-Class":0, "Passive-Injection-Class":1, "User-Driven-Injection-Class":2, "Adversarial-Suffix-Class":3, "Payload-Splitting-Class":4, "Active-Injection-Class":5, "Double-Character-Class":6, "Obfuscation-Class":7, "Instruction-Manipulation-Class": 8, "Virtual-Prompt-Injection-Class": 9, "None-Class": 10}

                    for item in labelled_data["labelled_prompts"]:
                        prompt = item["prompt"]
                        labelled_class = item["labelled_class"]

                        # Check if the prompt exists in the predicted data
                        predicted_class = predicted_dict.get(prompt, None)

                        if predicted_class is None:
                             print(f"Prompt not found in predicted data: {prompt}")
                        elif labelled_class == predicted_class:
                            index_second_confusion_matrix = injection_classes_dict[labelled_class]
                            second_confusion_matrix[index_second_confusion_matrix][index_second_confusion_matrix] += 1
                            print(f"Predicted class matches Labelled class")
                        else:
                            predicted_index_second_confusion_matrix  = injection_classes_dict[predicted_class]
                            actual_index_second_confusion_matrix = injection_classes_dict[labelled_class]
                            second_confusion_matrix[predicted_index_second_confusion_matrix][actual_index_second_confusion_matrix] += 1
                            print(f"Mismatch: {prompt}\n  Labelled: {labelled_class}\n  Predicted: {predicted_class}\n")

                    # Print out the 2nd confusion matrix
                    print("Confusion matrix for Approach 2: Categorising malicious prompts using phi4:latest (with LLM-judge)")
                    for row in second_confusion_matrix:
                        print(row)
                    true_positive_rate_2nd_confusion_matrix = 0
                    for index in range(11):
                        true_positive_rate_2nd_confusion_matrix += second_confusion_matrix[index][index]
                    print(f"True Positive Rate: {true_positive_rate_2nd_confusion_matrix}/200")

                    break

                # Case 3: There are "-no" json files and "-yes" json files but ALL "-no" json files are EMPTY    
                flag = 1

                if os.path.exists(terminate_iteration_dir) and os.listdir(terminate_iteration_dir):
                    for file_name in os.listdir(terminate_iteration_dir):
                        if file_name.endswith("-no.json"):
                            file_path = os.path.join(terminate_iteration_dir, file_name)
                            with open(file_path, 'r', encoding='utf-8') as file:
                                try:
                                    data = json.load(file)
                                    if data: # If the file is not an empty list
                                        flag = 0
                                except:
                                    pass
                if flag == 1:
                    print(f"Stopping iteration.")
                    print("Prompts that were deemed to be categorised correctly by LLM-Judge:")
                    print(yes_prompts_lst)

                    # Initialisation of 2nd confusion matrix
                    # Create an 11x11 matrix initialized with zeros
                    second_confusion_matrix = [[0 for _ in range(11)] for _ in range(11)]

                    with open('labelled_prompts_200subset.json', 'r') as file:
                        labelled_data = json.load(file)
                        prompts = labelled_data["labelled_prompts"]

                    # Combining the 2 lists declared at the very start of the code: 
                    none_class_lst.extend(yes_prompts_lst)

                    # Writing all the correctly categorised prompts deemed by LLM-judge into a common file
                    with open("2nd_confusion_matrix.json", 'w') as matrix_file:
                        json.dump(none_class_lst, matrix_file, indent=4)

                    # Reading the correctly categorised prompts deemed by LLM-judge
                    with open("2nd_confusion_matrix.json", 'r') as matrix_file:
                        predicted_data = json.load(matrix_file)

                    # Create a dictionary for quick lookup of predicted_class based on prompts
                    predicted_dict = {entry["prompt"]: entry["class"] for entry in predicted_data}

                    injection_classes_dict = {"Virtualization-Class":0, "Passive-Injection-Class":1, "User-Driven-Injection-Class":2, "Adversarial-Suffix-Class":3, "Payload-Splitting-Class":4, "Active-Injection-Class":5, "Double-Character-Class":6, "Obfuscation-Class":7, "Instruction-Manipulation-Class": 8, "Virtual-Prompt-Injection-Class": 9, "None-Class": 10}

                    for item in labelled_data["labelled_prompts"]:
                        prompt = item["prompt"]
                        labelled_class = item["labelled_class"]

                        # Check if the prompt exists in the predicted data
                        predicted_class = predicted_dict.get(prompt, None)

                        if predicted_class is None:
                             print(f"Prompt not found in predicted data: {prompt}")
                        elif labelled_class == predicted_class:
                            index_second_confusion_matrix = injection_classes_dict[labelled_class]
                            second_confusion_matrix[index_second_confusion_matrix][index_second_confusion_matrix] += 1
                            print(f"Predicted class matches Labelled class")
                        else:
                            predicted_index_second_confusion_matrix  = injection_classes_dict[predicted_class]
                            actual_index_second_confusion_matrix = injection_classes_dict[labelled_class]
                            second_confusion_matrix[predicted_index_second_confusion_matrix][actual_index_second_confusion_matrix] += 1
                            print(f"Mismatch: {prompt}\n  Labelled: {labelled_class}\n  Predicted: {predicted_class}\n")

                    # Print out the 2nd confusion matrix
                    print("Confusion matrix for Approach 2: Categorising malicious prompts using phi4:latest (with LLM-judge)")
                    for row in second_confusion_matrix:
                        print(row)

                    true_positive_rate_2nd_confusion_matrix = 0
                    for index in range(11):
                        true_positive_rate_2nd_confusion_matrix += second_confusion_matrix[index][index]
                    print(f"True Positive Rate: {true_positive_rate_2nd_confusion_matrix}/200")

                    break

                cycle += 1

        # Reset stdout back to default (console)
        sys.stdout = sys.__stdout__
        print("Logging complete. Check log_output.txt for details.")
        log_file.close()

        # Archive every artefact of this run into one directory named after the
        # current (i, j) pair. Done with the os module instead of shell "mkdir"/"mv"
        # so it also works without a POSIX shell, does not fail when the directory
        # already exists, and reports moves that did not succeed.
        final_file_directory = f"frequencyPenalty_0{i}_{j}"
        os.makedirs(final_file_directory, exist_ok=True)

        # Same selection as the globs "LLM-judge*", "Categorised*", "Outputs*"
        # plus the log file itself.
        archived_prefixes = ("LLM-judge", "Categorised", "Outputs")
        failed_moves = []

        for entry_name in os.listdir("."):
            if entry_name == final_file_directory:
                continue
            if not (entry_name.startswith(archived_prefixes) or entry_name == "log_output.txt"):
                continue

            try:
                os.replace(entry_name, os.path.join(final_file_directory, entry_name))
            except OSError as e:
                failed_moves.append(entry_name)
                print(f"Could not move {entry_name} to {final_file_directory}: {e}")

        if failed_moves:
            print(f"Some items could not be saved to {final_file_directory}: {failed_moves}")
        else:
            print(f"Everything has been saved to {final_file_directory}")


