In [8]:
import re
from collections import defaultdict
import pandas as pd

def clean_text(text):
    """Strip leading decoration from ``text`` and trim surrounding whitespace.

    Everything before the first letter (markdown asterisks, bullets, list
    numbering, spaces, ...) is dropped.  Letters are recognised Unicode-wide,
    so a value such as "Überwachung" keeps its first character; an ASCII-only
    character class would have turned it into "berwachung".
    """
    # [\W\d_] is "any character that is not a letter" under Unicode matching.
    return re.sub(r'^[\W\d_]+', '', text).strip()


def divide_classifications(text) -> dict:
    """Split one free-text classification answer into per-risk-class sections.

    ``text`` is searched for blocks that start with "AI Use Case:" and contain
    the labelled lines "Use Case Description:", "Risk Classification:",
    "Reason:", "Requires Additional Information:" and
    "What additional Information:".

    Returns:
        ``{}`` when ``text`` is the literal string "nan".  Otherwise a dict
        with one key per expected risk class plus "Unknown" and
        "Total Counts".  Each class value holds the re-formatted use cases of
        that class (field labels wrapped in ``#####`` markers, use cases
        separated by four newlines); "Total Counts" is a printable summary.
        When no use case is found at all, every value is
        "No Classification Found".
    """
    if text == "nan":
        return {}

    expected_risk_classes = [
        "Prohibited AI system",
        "High-risk AI system under Annex I",
        "High-risk AI system under Annex III",
        "System with transparency obligations",
        "High-risk AI system with transparency obligations",
        "Low-risk AI system"
    ]
    # Case-insensitive lookup back to the canonical spelling of each class.
    canonical_by_key = {name.casefold(): name for name in expected_risk_classes}

    use_case_dict = defaultdict(list)

    use_case_blocks = re.findall(r"AI Use Case:(.*?)(?=AI Use Case:|$)", text, re.DOTALL)

    for block in use_case_blocks:
        use_case = re.search(r"^(.+)", block.strip())
        description = re.search(r"Use Case Description:\s*(.*?)\n", block, re.DOTALL)
        risk = re.search(r"Risk Classification:\s*(.*?)\n", block)
        reason = re.search(r"Reason:\s*(.*?)\n(?:Requires Additional Information|$)", block, re.DOTALL)
        additional_info = re.search(r"Requires Additional Information:\s*(.*?)(?:\n|$)", block)
        what_additional_info = re.search(r"What additional Information:\s*(.*?)(?=\n[A-Z]|$)", block, re.DOTALL)

        if risk:
            risk_classification = clean_text(risk.group(1))
            # clean_text only removes *leading* decoration.  Trailing markdown or
            # punctuation ("...system**", "...system.") or a different letter case
            # used to push an otherwise valid class into "Unknown"; normalise
            # before the lookup and fall back to the cleaned text if unmatched.
            lookup_key = re.sub(r'[\W_]+$', '', risk_classification).casefold()
            risk_classification = canonical_by_key.get(lookup_key, risk_classification)
        else:
            risk_classification = "Unknown"

        requires_additional_info = clean_text(additional_info.group(1)) if additional_info else 'Unknown'
        what_additional_info_text = what_additional_info.group(1).strip() if what_additional_info else ''

        # A "No" answer must not carry any follow-up text.
        if requires_additional_info.lower() == "no":
            what_additional_info_text = ""

        use_case_data = (
            f"#####AI Use Case:##### {clean_text(use_case.group(1)) if use_case else 'Unknown'}\n"
            f"#####Use Case Description:##### {clean_text(description.group(1)) if description else 'Missing Description'}\n"
            f"#####Risk Classification:##### {risk_classification}\n"
            f"#####Reason:##### {clean_text(reason.group(1)) if reason else 'Missing Reason'}\n"
            f"#####Requires Additional Information:##### {requires_additional_info}\n"
            f"#####What additional Information:##### {what_additional_info_text}"
        )

        # Anything that is not one of the expected classes is kept under "Unknown".
        bucket = risk_classification if risk_classification in expected_risk_classes else "Unknown"
        use_case_dict[bucket].append(use_case_data)

    ordered_use_case_dict = {
        risk_class: "\n\n\n\n".join(use_case_dict.get(risk_class, []))
        for risk_class in expected_risk_classes
    }
    # "Unknown" is always present, even when empty.
    ordered_use_case_dict["Unknown"] = "\n\n\n\n".join(use_case_dict.get("Unknown", []))

    # Per-class totals; "Unknown" only shows up when at least one use case landed there.
    counts = {key: 0 for key in expected_risk_classes}
    for key, value in use_case_dict.items():
        counts[key] = len(value)
    totals_string = "Risk Classification Counts:\n" + "".join(
        f"\n{key}: {value}" for key, value in counts.items()
    )
    ordered_use_case_dict["Total Counts"] = totals_string

    # All counts zero means no use case block was found in the text.
    if all(value == 0 for value in counts.values()):
        ordered_use_case_dict = {key: "No Classification Found" for key in ordered_use_case_dict.keys()}

    return ordered_use_case_dict

# Read the workbook that holds the raw model answers
df = pd.read_excel("Updated Claude Sonnet Results.xlsx")

# Parse every cell of the classification column into a dict of sections.
# Cells are cast to str first; divide_classifications treats the literal
# string "nan" as "nothing to parse" and returns an empty dict for it.
divided_data = (
    df['EU AI Act Risk Classification']
    .astype(str)
    .apply(divide_classifications)
)

# One column per dict key, one row per input row; keep a copy on disk
expanded_df = pd.DataFrame(divided_data.tolist())
expanded_df.to_excel("Structured Results.xlsx", index=False)

# Preview the first two rows
expanded_df.head(2)

Unnamed: 0,Prohibited AI system,High-risk AI system under Annex I,High-risk AI system under Annex III,System with transparency obligations,High-risk AI system with transparency obligations,Low-risk AI system,Unknown,Total Counts
0,,,,,,#####AI Use Case:##### Predictive Battery Anal...,,Risk Classification Counts:\n\nProhibited AI s...


In [9]:
import re

def extract_response(text):
    """Pull a Yes/No answer, and for "Yes" the explanation after it, out of ``text``.

    The first standalone word "yes" or "no" (any letter case) is the answer.

    Returns:
        ``(answer, reason)``.  ``answer`` is the matched word exactly as it
        appears in ``text``, or ``None`` when no such word exists or ``text``
        is not a string (e.g. a NaN cell).  ``reason`` is the text that
        follows a "Yes" answer, starting at its first letter and stripped of
        surrounding whitespace; it is ``""`` for "No", for a bare "Yes", and
        when there is no answer.
    """
    if not isinstance(text, str):
        return None, ""

    match = re.search(r'\b(Yes|No)\b', text, re.IGNORECASE)
    if match is None:
        return None, ""

    response = match.group(0)
    if response.lower() != "yes":
        return response, ""

    # Continue from the end of the answer that was actually matched.  Searching
    # the whole text again for "Yes" could latch onto "yes" inside an earlier
    # word (e.g. "Eyes") and return the wrong remainder.
    # [\W\d_]* skips separators such as ". - "; [^\W\d_] is the first letter of
    # the reason (Unicode-aware, so a leading non-ASCII letter is not dropped).
    reason_pattern = re.compile(r'[\W\d_]*([^\W\d_].*)', re.DOTALL)
    reason_match = reason_pattern.match(text, match.end())
    reason = reason_match.group(1).strip() if reason_match else ""
    return response, reason

# # Example usage
# text = "Yes. - Additional information about the specific defects being detected and the products being inspected would help determine if the system might alternatively be classified under Annex I. If the system is used for products covered by Union harmonization laws listed in Annex I that require third-party conformity assessment, it could be classified as a High-risk AI system under Annex I instead."

# requires_additonal_info, what_other_info = extract_response(text)
# print("requires_additonal_info:", requires_additonal_info)
# print("what_other_info:", what_other_info)

In [10]:
import re
import pandas as pd
import json

# Risk-class columns to process, in priority order: earlier entries win when
# the highest-risk use case of a row is selected.
# NOTE(review): "System with transparency obligations" is ranked above
# "High-risk AI system with transparency obligations" - confirm this is the
# intended priority.
columns_to_process = [
    "Prohibited AI system",
    "High-risk AI system under Annex I",
    "High-risk AI system under Annex III",
    "System with transparency obligations",
    "High-risk AI system with transparency obligations",
    "Low-risk AI system"
]


def _first_group(pattern, block, flags=0):
    """Return the stripped first capture group of ``pattern`` in ``block``, or None if there is no match."""
    found = re.search(pattern, block, flags)
    return found.group(1).strip() if found else None


# Function to extract structured AI system data for a single row
def process_row(row):
    """Parse the ``#####``-delimited use cases of one row into a nested dict.

    Args:
        row: mapping (e.g. a DataFrame row) from column name to cell value.
            Only the columns listed in ``columns_to_process`` are read, and
            only when the cell is not null.

    Returns:
        ``{column: {"requires-additional-info": {"no": [...], "yes": [...]}}}``
        with one entry per processed column.  Every list item is a dict with
        the keys "AI Use Case", "Use Case Description", "Risk Classification",
        "Reason", "Requires Additional Information" and
        "What Additional Information".  Fields that cannot be found are
        ``None``, except "Requires Additional Information" (defaults to "No")
        and "What Additional Information" (defaults to "").
    """
    ai_systems_dict = {}

    for column in columns_to_process:
        if column not in row or not pd.notna(row[column]):
            continue

        buckets = {"no": [], "yes": []}
        ai_systems_dict[column] = {"requires-additional-info": buckets}

        # Each use case starts at a full "#####AI Use Case:#####" marker.  The
        # lookahead must use the complete marker as well: stopping at the bare
        # "AI Use Case:" left the next marker's leading "#####" dangling at the
        # end of every non-final block, which leaked into the last field.
        use_case_blocks = re.findall(
            r"#####AI Use Case:#####(.*?)(?=#####AI Use Case:#####|$)",
            row[column],
            re.DOTALL,
        )

        for block in use_case_blocks:
            block = block.strip()

            # The use case title is the first line of the block.
            use_case = _first_group(r"^(.+)", block)
            description = _first_group(r"#####Use Case Description:#####\s*(.*?)\n", block, re.DOTALL)
            risk = _first_group(r"#####Risk Classification:#####\s*(.*?)\n", block)
            reason = _first_group(
                r"#####Reason:#####\s*(.*?)\n(?:#####Requires Additional Information:#####|$)",
                block,
                re.DOTALL,
            )
            additional_info = _first_group(r"#####Requires Additional Information:#####\s*(.*?)(?:\n|$)", block)
            what_additional_info = _first_group(
                r"#####What additional Information:#####\s*(.*?)(?=\n[A-Z]|$)",
                block,
                re.DOTALL,
            )

            # Defaults apply only when the label is missing; an empty value stays empty.
            if additional_info is None:
                additional_info = "No"
            if what_additional_info is None:
                what_additional_info = ""

            # Anything other than an exact "no" (case-insensitive) counts as "yes";
            # a "no" answer never carries follow-up text.
            if additional_info.lower() == "no":
                category = "no"
                what_additional_info = ""
            else:
                category = "yes"

            buckets[category].append({
                "AI Use Case": use_case,
                "Use Case Description": description,
                "Risk Classification": risk,
                "Reason": reason,
                "Requires Additional Information": additional_info,
                "What Additional Information": what_additional_info
            })

    return ai_systems_dict

# Function to find the highest-risk use case for a single row
def find_highest_risk_use_case(ai_dict):
    """Pick the first use case of the highest-priority risk class present in ``ai_dict``.

    Risk classes are visited in the order of ``columns_to_process``.  Within a
    class, use cases that need no additional information ("no") are preferred
    over those that do ("yes").

    Returns:
        ``{"Category": <risk class>, "Use Case": <use case dict>}`` for the
        first hit, or ``None`` when every list is empty.
    """
    for risk_class in columns_to_process:
        if risk_class not in ai_dict:
            continue
        for answer in ("no", "yes"):
            candidates = ai_dict[risk_class]["requires-additional-info"][answer]
            if candidates:
                return {"Category": risk_class, "Use Case": candidates[0]}
    return None

def _strip_trailing_marker(value):
    r"""Remove trailing newlines and at most one dangling ``#####`` delimiter.

    ``value.rstrip('\n#####')`` is not equivalent: rstrip treats its argument
    as a set of characters, so it also removes legitimate trailing ``#``
    characters (a text ending in "C#" would become "C").
    """
    value = value.rstrip('\n')
    if value.endswith('#####'):
        value = value[:-len('#####')].rstrip('\n')
    return value


# Process each row and store results (one list entry per row, in row order)
json_outputs = []
highest_risk_full_data = []

highest_use_case_strings = []
highest_risk_classifications = []
requires_additonal_infos = []
what_additional_infos = []

for _, row in expanded_df.iterrows():
    ai_data = process_row(row)
    highest_risk = find_highest_risk_use_case(ai_data)

    if highest_risk is not None:
        top_use_case = highest_risk['Use Case']
        highest_risk_classification = top_use_case['Risk Classification']
        requires_additonal_info_full_string = top_use_case['Requires Additional Information']

        # A "No" answer never carries follow-up text
        if requires_additonal_info_full_string.lower() == "no":
            what_additional_info_full_string = ""
        else:
            what_additional_info_full_string = _strip_trailing_marker(
                top_use_case.get("What Additional Information", "")
            )

        # Re-serialise the selected use case in the #####-delimited text format
        highest_use_case_string = _strip_trailing_marker(
            "".join(f"#####{key}:##### {value}\n" for key, value in top_use_case.items())
        )
    else:
        # No use case could be parsed for this row
        highest_risk_classification = "PARSE ERROR - No Classification found."
        highest_use_case_string = "PARSE ERROR - No Classification found."
        requires_additonal_info_full_string = "PARSE ERROR - No Classification found."
        what_additional_info_full_string = "PARSE ERROR - No Classification found."

    json_outputs.append(json.dumps(ai_data, indent=4))  # Store structured JSON
    highest_risk_full_data.append(json.dumps(highest_risk, indent=4) if highest_risk else "No Classification found.")
    highest_use_case_strings.append(highest_use_case_string)
    highest_risk_classifications.append(highest_risk_classification)
    requires_additonal_infos.append(requires_additonal_info_full_string)
    what_additional_infos.append(what_additional_info_full_string)

# Add results to the dataframe
expanded_df["json_output"] = json_outputs
expanded_df["highest_risk_use_case"] = highest_risk_full_data
expanded_df["highest_use_case_string"] = highest_use_case_strings
expanded_df["highest_risk_classification"] = highest_risk_classifications
expanded_df["requires_additional_info"] = requires_additonal_infos
expanded_df["what_additional_info"] = what_additional_infos

# Save to Excel
output_file = "ai_risk_analysis.xlsx"
expanded_df.to_excel(output_file, index=False)

print(f"Excel file saved as {output_file}")


Excel file saved as ai_risk_analysis.xlsx
