In [None]:


import pandas as pd
import numpy as np
import io
from google.colab import files
from collections import defaultdict


# Prompt for a file upload; the result is indexed by file name below.
uploaded = files.upload()

if not uploaded:
    print("\nNo file uploaded.")
else:

    # Only the first uploaded entry is used.
    file_name = next(iter(uploaded))



    # The uploaded content is wrapped in BytesIO and parsed as CSV.
    df = pd.read_csv(io.BytesIO(uploaded[file_name]))

    df.drop_duplicates(inplace=True)

    # Renumber the rows after de-duplication so that 'id' equals the row
    # position; code further down looks rows up by id with df.iloc.
    df.reset_index(drop=True, inplace=True)
    df['id'] = df.index


    # Similarity Calculation
    # Each list names the columns that make up one category; the lists are
    # passed to calculate_category_similarity by calculate_total_similarity.


    SUBJECT_ATTRS = ['Subjects', 'role', 'isEq-subjUserId-resUserId', 'isSubjectsMeeting', 'isConflicted']
    ACTION_ATTRS = ['Actions', 'Effect']
    RESOURCE_ATTRS = ['Resources', 'isReviewContentInPlace']
    ENVIRONMENT_ATTRS = ['isMeeting']


    # Weight applied to each of the four category similarities when they are
    # added up in calculate_total_similarity (equal weights summing to 1).
    CATEGORY_PROBABILITY = 1/4

    def calculate_attribute_similarity(val1, val2):
        """Return the similarity of two attribute values as a float in [0, 1].

        Two missing values score 1.0 and exactly one missing value scores 0.0.
        Otherwise each value is converted to a string and placed in a
        one-element set; the score is |intersection| / |union| of those sets,
        which is 1.0 when the strings are equal and 0.0 when they differ.
        """
        missing1, missing2 = pd.isna(val1), pd.isna(val2)
        if missing1 or missing2:
            return 1.0 if (missing1 and missing2) else 0.0

        left = frozenset([str(val1)])
        right = frozenset([str(val2)])

        shared_count = len(left & right)
        distinct_count = len(left | right)

        if distinct_count == 0:
            return 0.0
        return shared_count / distinct_count

    def calculate_category_similarity(rule1, rule2, attributes):
        """Return the mean attribute similarity of two rules over one category.

        Only attributes that are non-missing in *both* rules are scored. When
        no such attribute exists the result is 1.0 if neither rule has any
        value for the category and 0.0 if at least one of them does.

        Args:
            rule1, rule2: rule records indexable by attribute name.
            attributes: attribute names belonging to the category.
        """
        shared = []
        for name in attributes:
            if pd.isna(rule1[name]) or pd.isna(rule2[name]):
                continue
            shared.append(name)

        if shared:
            scores = [
                calculate_attribute_similarity(rule1[name], rule2[name])
                for name in shared
            ]
            return np.mean(scores)

        # Nothing to compare attribute-by-attribute: the category only counts
        # as matching when it is empty on both sides.
        populated_somewhere = (
            any(pd.notna(rule1[name]) for name in attributes)
            or any(pd.notna(rule2[name]) for name in attributes)
        )
        return 0.0 if populated_somewhere else 1.0

    def calculate_total_similarity(rule1, rule2):
        """Return the weighted sum of the four category similarities.

        The subject, action, resource and environment similarities are each
        multiplied by CATEGORY_PROBABILITY and added in that order.
        """
        category_groups = (
            SUBJECT_ATTRS,
            ACTION_ATTRS,
            RESOURCE_ATTRS,
            ENVIRONMENT_ATTRS,
        )

        total_sim = None
        for group in category_groups:
            weighted = calculate_category_similarity(rule1, rule2, group) * CATEGORY_PROBABILITY
            # Plain left-to-right accumulation keeps the floating-point result
            # identical to writing the four terms out in one expression.
            total_sim = weighted if total_sim is None else total_sim + weighted
        return total_sim

    def check_priority_match(rule1, rule2):
        """Tell whether two rules have equal PolicySet, Policy and Rule levels.

        The levels are compared in that order and the comparison stops at the
        first level that is not equal.
        """
        for level in ('PolicySet_Level', 'Policy_Level', 'Rule_Level'):
            if not (rule1[level] == rule2[level]):
                return False
        return True



    def rsca_cluster(df, similarity_threshold=0.8):
        """Group rules into clusters of mutually similar, same-priority rules.

        Rows are compared pairwise in row order. Two rules are linked only when
        check_priority_match holds for them and calculate_total_similarity is
        strictly greater than ``similarity_threshold``. Two linked, unclustered
        rules start a new cluster. A linked, unclustered rule joins the other
        rule's cluster only if it is also linked with every rule already in
        that cluster. Two rules that are both already clustered are left where
        they are (clusters are never merged). Every rule that ends up in no
        cluster receives a singleton cluster of its own.

        Args:
            df: DataFrame with an 'id' column plus the columns read by
                check_priority_match and calculate_total_similarity.
            similarity_threshold: exclusive lower bound for linking two rules.

        Returns:
            dict mapping cluster id -> sorted list of rule ids. numpy scalar
            ids are converted to plain Python values.
        """
        num_rules = len(df)

        # Fetch each row once instead of rebuilding a Series with df.iloc on
        # every iteration of the nested loops.
        rows = [df.iloc[pos] for pos in range(num_rules)]

        def _plain(value):
            # numpy scalars expose .item(); converting keeps the returned ids
            # printable as 199 rather than np.int64(199).
            return value.item() if hasattr(value, 'item') else value

        ids = [_plain(row['id']) for row in rows]

        # Cluster members are resolved by id. Looking them up by position
        # (df.iloc[id]) is only correct while every id equals its row position.
        row_by_id = dict(zip(ids, rows))

        clusters = {}
        rule_to_cluster_id = {}
        next_cluster_id = 0

        def _fits_cluster(candidate, cluster_id, anchor_id):
            """Check candidate against every cluster member except anchor_id.

            The anchor is skipped because the caller has already established
            that the candidate is linked with it.
            """
            for member_id in clusters[cluster_id]:
                if member_id == anchor_id:
                    continue
                member = row_by_id[member_id]
                if not check_priority_match(candidate, member):
                    return False
                if calculate_total_similarity(candidate, member) <= similarity_threshold:
                    return False
            return True

        print(f"clustering {num_rules} rules")

        for i in range(num_rules):
            rule_i, id_i = rows[i], ids[i]

            for j in range(i + 1, num_rules):
                rule_j, id_j = rows[j], ids[j]

                if not check_priority_match(rule_i, rule_j):
                    continue
                if calculate_total_similarity(rule_i, rule_j) <= similarity_threshold:
                    continue

                # Looked up per pair: rule i may have been clustered by an
                # earlier j of this same outer iteration.
                cluster_id_i = rule_to_cluster_id.get(id_i)
                cluster_id_j = rule_to_cluster_id.get(id_j)

                if cluster_id_i is None and cluster_id_j is None:
                    clusters[next_cluster_id] = {id_i, id_j}
                    rule_to_cluster_id[id_i] = next_cluster_id
                    rule_to_cluster_id[id_j] = next_cluster_id
                    next_cluster_id += 1
                elif cluster_id_j is None:
                    if _fits_cluster(rule_j, cluster_id_i, id_i):
                        clusters[cluster_id_i].add(id_j)
                        rule_to_cluster_id[id_j] = cluster_id_i
                elif cluster_id_i is None:
                    if _fits_cluster(rule_i, cluster_id_j, id_j):
                        clusters[cluster_id_j].add(id_i)
                        rule_to_cluster_id[id_i] = cluster_id_j
                # Both rules already clustered: nothing is changed.

        # Give every still-unclustered rule a singleton cluster.
        for rule_id in ids:
            if rule_id not in rule_to_cluster_id:
                clusters[next_cluster_id] = {rule_id}
                rule_to_cluster_id[rule_id] = next_cluster_id
                next_cluster_id += 1

        print("Clustering complete.")

        return {cid: sorted(members) for cid, members in clusters.items()}



    # Cluster the loaded rules with the default threshold.
    final_clusters = rsca_cluster(df)

    print(f"\n{len(final_clusters)} clusters.")

    # Largest clusters first; the sort is stable, so equally sized clusters
    # keep their creation order.
    sorted_clusters = sorted(final_clusters.items(), key=lambda item: -len(item[1]))

    for cluster_id, rule_indices in sorted_clusters:
        print(f"Cluster {cluster_id} (Size: {len(rule_indices)}):")
        print(f"  Rule Indices: {rule_indices}")

        # Singleton clusters get no per-rule listing.
        if len(rule_indices) <= 1:
            continue

        print("  Rules in this cluster:")
        for rule_idx in rule_indices:
            rule_info = df.iloc[rule_idx]
            print(f"    - Index {rule_idx}: {rule_info['Effect']} {rule_info['Actions']} '{rule_info['Subjects']}' '{rule_info['Resources']}'")
        print()

Saving xacml_policies_transformed2.csv to xacml_policies_transformed2 (1).csv
clustering 358 rules
Clustering complete.

249 clusters.
Cluster 37 (Size: 5):
  Rule Indices: [np.int64(199), np.int64(200), np.int64(201), np.int64(202), np.int64(203)]
  Rules in this cluster:
    - Index 199: Permit read 'pc-member' 'urn:oasis:names:tc:xacml:1.0:resource:resource-id=paper-review_rc'
    - Index 200: Permit read 'pc-member' 'urn:oasis:names:tc:xacml:1.0:resource:resource-id=paper-review_rc'
    - Index 201: Permit read 'pc-member' 'urn:oasis:names:tc:xacml:1.0:resource:resource-id=paper-review_rc'
    - Index 202: Permit read 'pc-member' 'urn:oasis:names:tc:xacml:1.0:resource:resource-id=paper-review_rc'
    - Index 203: Deny read 'pc-member' 'urn:oasis:names:tc:xacml:1.0:resource:resource-id=paper-review_rc'

Cluster 40 (Size: 5):
  Rule Indices: [np.int64(217), np.int64(218), np.int64(219), np.int64(220), np.int64(221)]
  Rules in this cluster:
    - Index 217: Permit read 'pc-member' 'u

In [None]:
import pandas as pd
import numpy as np
import itertools

# Attribute columns compared by the domain checks below.
ALL_ATTRS = [
    'Subjects',
    'Resources',
    'isMeeting',
    'isReviewContentInPlace',
    'isSubjectsMeeting',
    'isConflicted',
    'isEq-subjUserId-resUserId',
]


def get_existing_attrs(df):
    """Return the ALL_ATTRS names that are columns of df, in ALL_ATTRS order."""
    available = df.columns
    return list(filter(lambda name: name in available, ALL_ATTRS))

def parse_actions(action_string):
    """Split a comma-separated action list into a set of action names.

    Missing values (None/NaN) and empty strings give an empty set. Whitespace
    around each name is stripped, and empty tokens (from an empty string, a
    trailing comma or a doubled comma) are dropped so that ``''`` never shows
    up as an action. Non-string values are converted with ``str`` first.
    """
    if pd.isna(action_string):
        return set()
    tokens = (token.strip() for token in str(action_string).split(','))
    return {token for token in tokens if token}

def do_domains_intersect(rule1, rule2, existing_attrs):
    """Return False if some attribute is set in both rules to different values.

    Attributes that are missing (None/NaN) in either rule never cause a
    mismatch; only the names in existing_attrs are examined.
    """
    def _clashes(name):
        left, right = rule1.get(name), rule2.get(name)
        return pd.notna(left) and pd.notna(right) and left != right

    return not any(_clashes(name) for name in existing_attrs)

def is_domain_subset(rule1, rule2, existing_attrs):
    """Return True when rule1's domain is contained in rule2's domain.

    A missing (None/NaN) value is read as "this attribute is not constrained",
    the same way do_domains_intersect treats it. rule1 is therefore a subset
    of rule2 only if every attribute that rule2 constrains is constrained by
    rule1 to the same value; attributes rule2 leaves open impose nothing.

    The check is deliberately asymmetric: a rule that leaves an attribute
    open is broader than one that pins it, so it is not a subset of it.
    """
    for attr in existing_attrs:
        val1, val2 = rule1.get(attr), rule2.get(attr)
        if pd.isna(val2):
            # rule2 accepts any value here, whatever rule1 says.
            continue
        if pd.isna(val1) or val1 != val2:
            return False
    return True

def _drop_actions_from_rule(nodes, logs, rule_id, actions_to_drop, prefix, label):
    """Remove actions from one rule in place; delete the rule if none remain."""
    remaining = parse_actions(nodes[rule_id].get('Actions')) - actions_to_drop
    nodes[rule_id]['Actions'] = ",".join(sorted(remaining))
    logs.append(f"{prefix}RESOLUTION ({label}): Modifying Rule {rule_id} actions to '{nodes[rule_id]['Actions']}'.")
    if not remaining:
        logs.append(f"{prefix}Rule {rule_id} now has no actions and is removed.")
        del nodes[rule_id]


def detect_and_resolve_cluster_anomalies(cluster_ids, full_df, resolution_strategy='permissive'):
    """Detect and resolve redundancies and conflicts among one cluster's rules.

    Every unordered pair of rules is examined once, in the order produced by
    itertools.combinations. Pairs involving a rule that has already been
    removed are skipped.

    * Same effect: if one rule's actions and domain are contained in the
      other's, the contained rule is removed as redundant.
    * Different effect: if the rules share actions and their domains
      intersect, the shared actions are removed from the Deny rule
      ('permissive') or from the Permit rule ('restrictive'); a rule left
      with no actions is removed.

    Args:
        cluster_ids: rule ids, used as labels for ``full_df.loc``. Repeated
            ids are collapsed so that a rule is never compared with itself.
        full_df: DataFrame holding the rules, indexed by rule id.
        resolution_strategy: 'permissive' or 'restrictive'. Any other value
            leaves conflicts untouched and records that in the log.

    Returns:
        (nodes, logs): the surviving rules as {rule id: rule dict} and the
        list of log messages.
    """
    existing_attrs = get_existing_attrs(full_df)

    # Collapse repeated ids (order kept). A repeated id would otherwise be
    # paired with itself and removed as "redundant" to itself.
    unique_ids = list(dict.fromkeys(cluster_ids))

    if len(unique_ids) < 2:
        return ({k: full_df.loc[k].to_dict() for k in unique_ids}, ["Cluster has only one rule, no anomalies possible."])

    logs = []
    nodes = {rule_id: full_df.loc[rule_id].to_dict() for rule_id in unique_ids}
    # combinations() yields each unordered pair exactly once, so a plain loop
    # suffices; no queue or "already processed" set is needed.
    edges = list(itertools.combinations(unique_ids, 2))
    logs.append(f"Initializing graph with {len(nodes)} nodes and {len(edges)} edges.")

    for id1, id2 in edges:
        # Either rule may have been removed while handling an earlier pair.
        if id1 not in nodes or id2 not in nodes:
            continue

        rule1, rule2 = nodes[id1], nodes[id2]
        effect1, effect2 = rule1.get('Effect'), rule2.get('Effect')
        # Re-parsed for every pair because resolution may have edited 'Actions'.
        actions1, actions2 = parse_actions(rule1.get('Actions')), parse_actions(rule2.get('Actions'))

        anomaly_found = False

        if effect1 == effect2:
            if actions1.issubset(actions2) and is_domain_subset(rule1, rule2, existing_attrs):
                logs.append(f"REDUNDANCY: Rule {id1} is redundant to Rule {id2}. Removing Rule {id1}.")
                del nodes[id1]
                anomaly_found = True
            elif actions2.issubset(actions1) and is_domain_subset(rule2, rule1, existing_attrs):
                logs.append(f" REDUNDANCY: Rule {id2} is redundant to Rule {id1}. Removing Rule {id2}.")
                del nodes[id2]
                anomaly_found = True

        if not anomaly_found and effect1 != effect2:
            common_actions = actions1.intersection(actions2)
            if common_actions and do_domains_intersect(rule1, rule2, existing_attrs):
                logs.append(f"CONFLICT : Between Rule {id1} ('{effect1}') and Rule {id2} ('{effect2}') on actions: {common_actions}.")
                anomaly_found = True

                # Whichever rule is not the 'Permit' one is treated as the deny side.
                if effect1 == 'Permit':
                    permit_id, deny_id = id1, id2
                else:
                    permit_id, deny_id = id2, id1

                if resolution_strategy == 'permissive':
                    _drop_actions_from_rule(nodes, logs, deny_id, common_actions, "    ", "Permissive")
                elif resolution_strategy == 'restrictive':
                    _drop_actions_from_rule(nodes, logs, permit_id, common_actions, "   -> ", "Restrictive")
                else:
                    logs.append(f"    UNRESOLVED: unknown resolution strategy '{resolution_strategy}'; no rule was modified.")

        if not anomaly_found:
            logs.append(f" ({id1}, {id2}) no  anomalies.")

    logs.append("Finished processing. ")
    return nodes, logs


if __name__ == "__main__":

    # Make 'id' the index (keeping the column) so df.loc[...] resolves rule ids.
    if 'id' not in df.columns:
        df['id'] = df.index
        df.set_index('id', drop=False, inplace=True)
    elif df.index.name != 'id':
        df.set_index('id', drop=False, inplace=True)

    all_resolved_rules = {}

    for cluster_id, rule_indices in final_clusters.items():
        if not rule_indices:
            continue

        # A singleton cluster cannot contain an anomaly; keep its rule as is.
        if len(rule_indices) == 1:
            rule_id = rule_indices[0]
            all_resolved_rules[cluster_id] = {rule_id: df.loc[rule_id].to_dict()}
            continue

        print(f"\n Processing Cluster {cluster_id} (size: {len(rule_indices)}) ")

        resolved_rules, logs = detect_and_resolve_cluster_anomalies(
            cluster_ids=rule_indices,
            full_df=df,
            resolution_strategy='permissive',
        )
        all_resolved_rules[cluster_id] = resolved_rules

        print("Resolution Log ")
        for log_entry in logs:
            print(log_entry)

        print("Final Rules for this Cluster")
        if resolved_rules:
            resolved_df = pd.DataFrame.from_dict(resolved_rules, orient='index')
            wanted_cols = ('Effect', 'Actions', 'Subjects', 'Resources', 'isMeeting')
            display_cols = [col for col in wanted_cols if col in resolved_df.columns]
            print(resolved_df[display_cols].to_string())
        else:
            print("All rules in this cluster were removed due to resolution.")

    # Flatten the per-cluster results into one list of rule dicts.
    final_rules_list = [
        rule_dict
        for cluster_rules_dict in all_resolved_rules.values()
        for rule_dict in cluster_rules_dict.values()
    ]

    if final_rules_list:
        final_df = pd.DataFrame.from_records(final_rules_list)
        if 'id' in final_df.columns:
            # Put 'id' first, other columns in their existing order.
            cols = ['id'] + [col for col in final_df.columns if col != 'id']
            final_df = final_df[cols]

        output_filename = 'resolved_policy.csv'
        final_df.to_csv(output_filename, index=False)
        print(f"\n{'='*20}\nProcessing complete. All {len(final_df)} final rules saved to '{output_filename}'.")
    else:
        print("\nNo rules remained after resolution. CSV file not created.")


 Processing Cluster 0 (size: 2) 
Resolution Log 
Initializing graph with 2 nodes and 1 edges.
 (0, 1) no  anomalies.
Finished processing. 
Final Rules for this Cluster
   Effect Actions Subjects                                                        Resources  isMeeting
0  Permit    read    admin  urn:oasis:names:tc:xacml:1.0:resource:resource-id=conference_rc        NaN
1  Permit   write    admin  urn:oasis:names:tc:xacml:1.0:resource:resource-id=conference_rc        NaN

 Processing Cluster 1 (size: 2) 
Resolution Log 
Initializing graph with 2 nodes and 1 edges.
 (5, 6) no  anomalies.
Finished processing. 
Final Rules for this Cluster
   Effect Actions Subjects                                                            Resources  isMeeting
5  Permit    read    admin  urn:oasis:names:tc:xacml:1.0:resource:resource-id=conferenceInfo_rc        NaN
6  Permit   write    admin  urn:oasis:names:tc:xacml:1.0:resource:resource-id=conferenceInfo_rc        NaN

 Processing Cluster 2 (size: 2)

In [None]:
import csv
import os


# Condition columns that is_more_general compares between two rules.
CONDITION_COLUMNS = [
    'isMeeting', 'isEq-subjUserId-resUserId', 'isSubjectsMeeting',
    'isConflicted', 'isReviewContentInPlace'
]

def is_attribute_more_general(attr_a, attr_b):
    """Return True if attr_a is empty (falsy) or equal to attr_b."""
    return (not attr_a) or attr_a == attr_b

def parse_resource_attributes(resource_str):
    """Split a ';'-separated resource string into a set of stripped parts.

    An empty or missing string gives an empty set.
    """
    if resource_str:
        return set(map(str.strip, resource_str.split(';')))
    return set()

def is_resource_more_general(res_a, res_b):
    """Return True if every parsed part of res_a also appears in res_b."""
    return parse_resource_attributes(res_a) <= parse_resource_attributes(res_b)

def is_more_general(rule_a, rule_b):
    """Return True if rule_a is at least as general as rule_b.

    rule_a is more general when each of its Subjects and Actions values is
    empty or equal to rule_b's, its resource parts are a subset of rule_b's,
    and every condition column it sets is set to the same value in rule_b.

    Any non-empty condition value counts as a constraint, not just 'True':
    a rule pinned to 'False' is narrower than a rule that leaves the
    condition open and so must not be reported as more general than it.
    A condition column missing from the rule is treated as not set.
    """
    for field in ('Subjects', 'Actions'):
        if not is_attribute_more_general(rule_a[field], rule_b[field]):
            return False

    if not is_resource_more_general(rule_a['Resources'], rule_b['Resources']):
        return False

    for col in CONDITION_COLUMNS:
        # .get: tolerate input files that lack some of the condition columns.
        if not is_attribute_more_general(rule_a.get(col), rule_b.get(col)):
            return False

    return True


def detect_shadowing(input_filename, output_filename):
    """Remove rules shadowed by an earlier, more general rule of equal effect.

    Rules are read from the CSV file ``input_filename``. Going through them in
    file order, a later rule is marked as shadowed when an earlier rule that
    has not itself been marked has the same 'Effect' and is_more_general says
    the earlier rule covers it. A report is printed (ordered by the shadowed
    rule's index) and the remaining rules are written to ``output_filename``
    with the input's column order.

    If the input file does not exist an error is printed and nothing is
    written.
    """
    if not os.path.exists(input_filename):
        print(f"Error: Input file '{input_filename}' not found.")
        return

    with open(input_filename, mode='r', newline='', encoding='utf-8') as infile:
        reader = csv.DictReader(infile)
        rules = list(reader)
        # fieldnames is None for a completely empty file.
        fieldnames = reader.fieldnames or []

    # Maps the index of each shadowed rule to the index of the rule shadowing it.
    shadowed_by = {}

    for i, rule_i in enumerate(rules):
        if i in shadowed_by:
            continue

        for j in range(i + 1, len(rules)):
            if j in shadowed_by:
                continue

            rule_j = rules[j]
            if rule_i['Effect'] == rule_j['Effect'] and is_more_general(rule_i, rule_j):
                shadowed_by[j] = i

    print("\n--- Shadowing Detection ---")
    if not shadowed_by:
        print("No shadowed rules found.")
    else:
        print(f"Found {len(shadowed_by)} shadowed rules to be removed:")
        # Order numerically by index; sorting the formatted text would put
        # index 143 ahead of index 20.
        for j in sorted(shadowed_by):
            i = shadowed_by[j]
            # RuleId is only used for the report, so a missing column must
            # not abort the run.
            shadowed_name = rules[j].get('RuleId', '')
            shadowing_name = rules[i].get('RuleId', '')
            print(
                f"  - Rule at index {j} ('{shadowed_name}') is shadowed by "
                f"rule at index {i} ('{shadowing_name}')"
            )

    final_rules = [rule for idx, rule in enumerate(rules) if idx not in shadowed_by]

    print(f"\nOriginal rule count: {len(rules)}")
    print(f"Removed rule count: {len(shadowed_by)}")
    print(f"Final rule count: {len(final_rules)}")

    with open(output_filename, mode='w', newline='', encoding='utf-8') as outfile:
        writer = csv.DictWriter(outfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(final_rules)



if __name__ == "__main__":
    # Input: the CSV written by the anomaly-resolution step.
    INPUT_FILE = "/content/resolved_policy.csv"
    # Output path is relative to the current working directory.
    OUTPUT_FILE = "output_cleaned.csv"
    detect_shadowing(input_filename=INPUT_FILE, output_filename=OUTPUT_FILE)


--- Shadowing Detection ---
Found 82 shadowed rules to be removed:
  - Rule at index 11 ('RPSlist.3.0.3.1.r.1') is shadowed by rule at index 10 ('RPSlist.3.0.3.4.0.r.1')
  - Rule at index 143 ('RPSlist.2.0.2.r.1') is shadowed by rule at index 141 ('RPSlist.2.0.4.3.r.1')
  - Rule at index 144 ('RPSlist.3.0.3.4.1.r.1') is shadowed by rule at index 14 ('RPSlist.3.0.0.r.1')
  - Rule at index 148 ('RPSlist.3.0.3.2.r.1') is shadowed by rule at index 146 ('RPSlist.3.0.3.4.3.r.1')
  - Rule at index 149 ('RPSlist.3.0.1.r.1') is shadowed by rule at index 147 ('RPSlist.3.0.3.0.r.1')
  - Rule at index 150 ('RPSlist.3.0.2.r.1') is shadowed by rule at index 146 ('RPSlist.3.0.3.4.3.r.1')
  - Rule at index 151 ('RPSlist.4.0.3.4.1.r.1') is shadowed by rule at index 21 ('RPSlist.4.0.0.r.1')
  - Rule at index 155 ('RPSlist.4.0.3.2.r.1') is shadowed by rule at index 153 ('RPSlist.4.0.3.4.3.r.1')
  - Rule at index 156 ('RPSlist.4.0.1.r.1') is shadowed by rule at index 154 ('RPSlist.4.0.3.0.r.1')
  - Rule 

In [None]:
import pandas as pd

# CSV read by the clean-up script below, and the CSV it writes after
# shortening the 'Resources' column.
input_file_path = '/content/output_cleaned.csv'
output_file_path = '/content/cleaned_resources_aft_shdow.csv'

def clean_resource_value(resource_string):
    """Return the text that follows 'resource-id=' up to the next ';'.

    Missing values are returned unchanged, as are values that do not contain
    'resource-id=' or that are not strings.
    """
    if pd.isna(resource_string):
        return resource_string

    try:
        pieces = resource_string.split('resource-id=')
        after_marker = pieces[1]
    except (IndexError, AttributeError):
        # No marker in the text (IndexError) or not a string (AttributeError).
        return resource_string

    identifier, _, _ = after_marker.partition(';')
    return identifier

# Load the de-shadowed rules, shorten the 'Resources' column, save the result
# and show the first rows.
try:
    print(f"Reading data from: {input_file_path}")
    df = pd.read_csv(input_file_path)

    shortened_resources = df['Resources'].apply(clean_resource_value)
    df['Resources'] = shortened_resources

    df.to_csv(output_file_path, index=False)

    preview = df.head(10)
    print(preview)

except FileNotFoundError:
    print(f"\n ERROR: The file was not found at '{input_file_path}'.")
    print("Please make sure the file is uploaded and the path is correct.")
except Exception as e:
    # Anything else (e.g. a missing 'Resources' column) is reported, not raised.
    print(f"\n An error occurred: {e}")

Reading data from: /content/output_cleaned.csv
   id      PolicySetId           PolicyId                 RuleId  Effect  \
0   0      RPSlist.0.0      RPSlist.0.0.0      RPSlist.0.0.0.r.1  Permit   
1   1      RPSlist.0.0      RPSlist.0.0.0      RPSlist.0.0.0.r.1  Permit   
2   5    RPSlist.1.0.1    RPSlist.1.0.1.0    RPSlist.1.0.1.0.r.1  Permit   
3   6    RPSlist.1.0.1    RPSlist.1.0.1.0    RPSlist.1.0.1.0.r.1  Permit   
4  11    RPSlist.2.0.4    RPSlist.2.0.4.0    RPSlist.2.0.4.0.r.1  Permit   
5  12    RPSlist.2.0.4    RPSlist.2.0.4.0    RPSlist.2.0.4.0.r.1  Permit   
6  18      RPSlist.2.0      RPSlist.2.0.1      RPSlist.2.0.1.r.1  Permit   
7  20      RPSlist.2.0      RPSlist.2.0.3      RPSlist.2.0.3.r.1  Permit   
8  21  RPSlist.3.0.3.4  RPSlist.3.0.3.4.0  RPSlist.3.0.3.4.0.r.1  Permit   
9  22  RPSlist.3.0.3.4  RPSlist.3.0.3.4.0  RPSlist.3.0.3.4.0.r.1  Permit   

  Subjects Actions                Resources   role isMeeting  \
0    admin    read            conference_rc  admin  

In [None]:
import pandas as pd
from itertools import product
import time

def detect_incompleteness(file_path):
    """Report (role, action, resource) requests that no rule in the file covers.

    The candidate requests are the Cartesian product of the distinct non-null
    values of the 'role', 'Actions' and 'Resources' columns. A request is
    covered when some rule has the same resource and, for role and action
    each, either the same value or no value at all.

    Results are printed; at most the first 20 uncovered requests are listed.
    Read errors and missing required columns are reported as messages and
    end the function early.
    """
    print("--- Incompleteness Anomaly Detection ---")

    try:
        df = pd.read_csv(file_path)
    except FileNotFoundError:
        print(f" ERROR: File not found at '{file_path}'. Please ensure the path is correct.")
        return
    except Exception as e:
        print(f" ERROR: Could not read the file. {e}")
        return
    print(f"Successfully loaded {file_path}. Found {len(df)} rules.")

    # Report missing columns the same way as read failures instead of letting
    # a KeyError escape.
    missing_columns = [col for col in ('role', 'Actions', 'Resources') if col not in df.columns]
    if missing_columns:
        print(f" ERROR: Required column(s) missing from the file: {missing_columns}")
        return

    unique_roles = df['role'].dropna().unique()
    unique_actions = df['Actions'].dropna().unique()
    unique_resources = df['Resources'].dropna().unique()

    print(f"\nFound {len(unique_roles)} unique roles, {len(unique_actions)} unique actions, and {len(unique_resources)} unique resources.")

    all_possible_requests = list(product(unique_roles, unique_actions, unique_resources))
    print(f"Generated {len(all_possible_requests)} possible request combinations to check for coverage.")

    # These masks do not depend on the request, so build them once.
    role_unset = df['role'].isnull()
    action_unset = df['Actions'].isnull()

    incompleteness_anomalies = []

    print("\nChecking each request against the policy rules")
    for role, action, resource in all_possible_requests:
        covering_rules = (
            (df['Resources'] == resource)
            & ((df['role'] == role) | role_unset)
            & ((df['Actions'] == action) | action_unset)
        )
        if not covering_rules.any():
            incompleteness_anomalies.append((role, action, resource))

    print("\n--- Incompleteness Anomaly Detection ---")
    if not incompleteness_anomalies:
        print("\nNo incompleteness anomalies found.")

    else:
        print(f"\nDetected {len(incompleteness_anomalies)} incompleteness anomalies (uncovered requests):")
        for r, a, res in incompleteness_anomalies[:20]:
            print(f"  - Request: [Role: '{r}', Action: '{a}', Resource: '{res}'] is not covered.")
        if len(incompleteness_anomalies) > 20:
            print(f"  ... and {len(incompleteness_anomalies) - 20} more.")

# Run the coverage check on the CSV written by the resource clean-up step.
file_path = '/content/cleaned_resources_aft_shdow.csv'
detect_incompleteness(file_path)

--- Incompleteness Anomaly Detection ---
Successfully loaded /content/cleaned_resources_aft_shdow.csv. Found 234 rules.

Found 4 unique roles, 4 unique actions, and 25 unique resources.
Generated 400 possible request combinations to check for coverage.

Checking each request against the policy rules

--- Incompleteness Anomaly Detection ---

No incompleteness anomalies found.
