# File counter for Legal Document Discrepancy Detection Dataset
This notebook counts the files in each subfolder of the benchmark dataset.


In [108]:
# Root folder whose subfolders are counted by count_folder_files(folder_path).
# Change this to point at the dataset you want to inspect.
folder_path = "Intext_Doc_test"

In [2]:
import os

def get_all_folders(root_folder="benchmark_dataset_v1", skip_folder=".ipynb_checkpoints"):
    """Collect every directory found below ``root_folder``.

    The root itself is not reported, and any directory named ``skip_folder``
    is pruned together with everything beneath it.

    Args:
        root_folder: Directory to walk, given as a ``str`` or ``os.PathLike``.
        skip_folder: Directory name to exclude at every depth.

    Returns:
        list[str]: Paths of all nested directories, each ending with the
        platform path separator, in the order ``os.walk`` visits them.
        Empty when ``root_folder`` does not exist or has no subdirectories.
    """
    # os.walk() yields ``dirpath`` as a plain string even when it is handed a
    # pathlib.Path, so normalise the root once to keep the comparison below
    # like-for-like (a str never equals a Path object).
    root_folder = os.fspath(root_folder)
    end_folders = []

    for dirpath, dirnames, _ in os.walk(root_folder, topdown=True):
        # Prune in place: with topdown=True, os.walk only descends into the
        # names that are still present in ``dirnames``.
        dirnames[:] = [d for d in dirnames if d != skip_folder]

        # Nested-folder layout: report every directory except the root.
        if dirpath != root_folder:
            # Joining with "" appends a trailing path separator.
            end_folders.append(os.path.join(dirpath, ""))

        # Dual-folder layout alternative (kept for reference): report only
        # leaf directories, i.e. those with no subdirectories left.
        # if not dirnames:
        #     end_folders.append(os.path.join(dirpath, ""))

    return end_folders

# Example usage:
# result = get_all_folders(folder_path)
# print(len(result))

In [111]:
def count_folder_files(root_folder):
    """Print one ``<folder>: <count>`` line for each folder under ``root_folder``.

    Folders come from ``get_all_folders(root_folder)``; the count is the number
    of entries ``os.listdir`` reports for that folder.

    NOTE(review): ``os.listdir`` also lists sub-directories, so a folder that
    contains e.g. ``modified_files/`` is counted one higher than its number of
    regular files -- confirm this is intended.
    """
    for subfolder in get_all_folders(root_folder):
        entry_count = len(os.listdir(subfolder))
        print(f"{subfolder}: {entry_count}")

# Report the per-folder counts for the dataset selected in `folder_path`.
print("Folders count:")
count_folder_files(folder_path)

Folders count:
Intext_Doc_test/omission_InText/: 29
Intext_Doc_test/omission_InText/modified_files/: 28
Intext_Doc_test/ambiguity_InText/: 30
Intext_Doc_test/ambiguity_InText/modified_files/: 29
Intext_Doc_test/misaligned_term_InText/: 30
Intext_Doc_test/misaligned_term_InText/modified_files/: 29
Intext_Doc_test/structural_flaws_InText/: 28
Intext_Doc_test/structural_flaws_InText/modified_files/: 27
Intext_Doc_test/inconsistencies_InText/: 27
Intext_Doc_test/inconsistencies_InText/modified_files/: 26


# Law Validation Score Counter

In [68]:
from collections import defaultdict
import os
import json

In [103]:
def count_contradiction_scores(folder_path, verbose=True):
    """Histogram the numeric ``contradiction_score`` values stored in a folder.

    Every ``*.json`` file directly inside ``folder_path`` is loaded. If the
    document is a list, its first element is used as the root object. Each
    entry of the root's ``"perturbation"`` list contributes its
    ``"contradiction_score"``, rounded to one decimal place, to the matching
    bucket.

    Values that cannot be bucketed are ignored instead of aborting the run:
    empty or non-object JSON documents, missing or non-numeric scores, and
    scores that round outside ``0.0``-``1.0``.

    Args:
        folder_path: Directory containing the JSON files.
        verbose: When true (the default), echo every file name and raw score
            while scanning. The final summary is always printed.

    Returns:
        dict[float, int]: Count per bucket for ``0.0, 0.1, ..., 1.0``.
    """
    # Fixed buckets 0.0 .. 1.0 in steps of 0.1, in ascending insertion order.
    score_buckets = {tenths / 10: 0 for tenths in range(11)}
    total_scores = 0

    for file_name in os.listdir(folder_path):
        if verbose:
            print("This is the file name:", file_name)
        if not file_name.endswith(".json"):
            continue

        file_path = os.path.join(folder_path, file_name)
        # JSON is UTF-8 by specification; do not depend on the locale default.
        with open(file_path, "r", encoding="utf-8") as f:
            data = json.load(f)

        # A list document is represented by its first element; an empty list
        # has no root to inspect.
        if isinstance(data, list):
            root = data[0] if data else None
        else:
            root = data
        if not isinstance(root, dict):
            continue

        for perturbation in root.get("perturbation") or []:
            if not isinstance(perturbation, dict):
                continue
            score = perturbation.get("contradiction_score")
            if verbose:
                print("These are the scores: ", score)
            # round() raises TypeError on strings such as "HIGH"; skip
            # anything that is not a number (this also covers None).
            if not isinstance(score, (int, float)):
                continue
            rounded_score = round(score, 1)
            if rounded_score in score_buckets:
                score_buckets[rounded_score] += 1
                total_scores += 1

    # Print the final counts
    print("\n Contradiction Score Counts:")
    for score in sorted(score_buckets):
        print(f"{score:.1f}: {score_buckets[score]}")

    print(f"\n Total Perturbations Scored: {total_scores}")

    return score_buckets

In [112]:
def count_contradiction_scores2(folder_path, labels=("NO", "YES"), verbose=True):
    """Count categorical ``contradiction_score`` labels stored in a folder.

    Every ``*.json`` file directly inside ``folder_path`` is loaded. If the
    document is a list, its first element is used as the root object. Each
    entry of the root's ``"perturbation"`` list contributes its
    ``"contradiction_score"`` when that value is one of ``labels``; any other
    value is ignored.

    Args:
        folder_path: Directory containing the JSON files.
        labels: String categories to tally, in reporting order. Defaults to
            ``("NO", "YES")``; pass e.g. ``("LOW", "MEDIUM", "HIGH")`` for
            files that use a different scale.
        verbose: When true (the default), echo every file name and raw score
            while scanning. The final summary is always printed.

    Returns:
        dict[str, int]: Count per label, in the order given by ``labels``.
    """
    score_buckets = {label: 0 for label in labels}
    total_scores = 0

    for file_name in os.listdir(folder_path):
        if verbose:
            print("This is the file name:", file_name)
        if not file_name.endswith(".json"):
            continue

        file_path = os.path.join(folder_path, file_name)
        # JSON is UTF-8 by specification; do not depend on the locale default.
        with open(file_path, "r", encoding="utf-8") as f:
            data = json.load(f)

        # A list document is represented by its first element; an empty list
        # has no root to inspect.
        if isinstance(data, list):
            root = data[0] if data else None
        else:
            root = data
        if not isinstance(root, dict):
            continue

        for perturbation in root.get("perturbation") or []:
            if not isinstance(perturbation, dict):
                continue
            score = perturbation.get("contradiction_score")
            if verbose:
                print("These are the scores: ", score)
            # Only strings can be labels; the type check also keeps unhashable
            # JSON values (lists, objects) away from the dict lookup.
            if isinstance(score, str) and score in score_buckets:
                score_buckets[score] += 1
                total_scores += 1

    # Report exactly the buckets that were tallied, so the printed labels can
    # never drift from the dictionary keys.
    print("\n📊 Contradiction Score Category Counts:")
    for label, count in score_buckets.items():
        print(f"{label}: {count}")

    print(f"\n🔢 Total Perturbations Scored: {total_scores}")

    return score_buckets

In [91]:
# Bucket the contradiction scores stored under scraped_laws/ (ambiguity perturbations).
count_contradiction_scores("scraped_laws/ambiguity_legal/law_validation/")



 Contradiction Score Counts:
0.0: 0
0.1: 1
0.2: 1
0.3: 1
0.4: 7
0.5: 0
0.6: 17
0.7: 20
0.8: 5
0.9: 0
1.0: 0

 Total Perturbations Scored: 52


{0.0: 0,
 0.1: 1,
 0.2: 1,
 0.3: 1,
 0.4: 7,
 0.5: 0,
 0.6: 17,
 0.7: 20,
 0.8: 5,
 0.9: 0,
 1.0: 0}

In [94]:
# Bucket the contradiction scores stored under scraped_laws_v7/ (ambiguity perturbations).
count_contradiction_scores("scraped_laws_v7/ambiguity_legal/law_validation/")


This is the file name: perturbed_WHITESMOKE,INC_11_08_2011-EX-10.26-PROMOTIONANDDISTRIBUTIONAGREEMENT.txt.snippet.json
These are the scores:  0.7
These are the scores:  0.6
These are the scores:  0.7
This is the file name: perturbed_DovaPharmaceuticalsInc_20181108_10-Q_EX-10.2_11414857_EX-10.2_PromotionAgreement.txt.snippet.json
These are the scores:  0.7
These are the scores:  0.6
These are the scores:  0.7
This is the file name: perturbed_PACIRAPHARMACEUTICALS,INC.-A_RSTRATEGICLICENSING,DISTRIBUTIONANDMARKETINGAGREEMENT.txt.snippet.json
These are the scores:  0.4
These are the scores:  0.6
These are the scores:  0.8
This is the file name: perturbed_PfHospitalityGroupInc_20150923_10-12G_EX-10.1_9266710_EX-10.1_FranchiseAgreement3.txt.snippet.json
These are the scores:  0.6
These are the scores:  0.7
These are the scores:  0.6
This is the file name: perturbed_LohaCompanyltd_20191209_F-1_EX-10.16_11917878_EX-10.16_SupplyAgreement.txt.snippet.json
These are the scores:  0.7
These are the

{0.0: 0,
 0.1: 1,
 0.2: 1,
 0.3: 2,
 0.4: 2,
 0.5: 0,
 0.6: 27,
 0.7: 19,
 0.8: 6,
 0.9: 0,
 1.0: 0}

In [95]:
# Bucket the contradiction scores for the omission perturbations.
# NOTE(review): the directory name is spelled "ommision_legal" here -- confirm it matches the folder on disk.
count_contradiction_scores("scraped_laws_v7/ommision_legal/law_validation/")

This is the file name: perturbed_WHITESMOKE,INC_11_08_2011-EX-10.26-PROMOTIONANDDISTRIBUTIONAGREEMENT.txt.snippet.json
These are the scores:  0.8
These are the scores:  0.8
These are the scores:  0.7
This is the file name: perturbed_DovaPharmaceuticalsInc_20181108_10-Q_EX-10.2_11414857_EX-10.2_PromotionAgreement.txt.snippet.json
These are the scores:  0.9
These are the scores:  0.6
These are the scores:  0.7
This is the file name: perturbed_PACIRAPHARMACEUTICALS,INC.-A_RSTRATEGICLICENSING,DISTRIBUTIONANDMARKETINGAGREEMENT.txt.snippet.json
These are the scores:  0.1
These are the scores:  0.6
These are the scores:  0.6
This is the file name: perturbed_PfHospitalityGroupInc_20150923_10-12G_EX-10.1_9266710_EX-10.1_FranchiseAgreement3.txt.snippet.json
These are the scores:  0.6
These are the scores:  0.8
These are the scores:  0.8
This is the file name: perturbed_LohaCompanyltd_20191209_F-1_EX-10.16_11917878_EX-10.16_SupplyAgreement.txt.snippet.json
These are the scores:  0.7
These are the

{0.0: 0,
 0.1: 1,
 0.2: 1,
 0.3: 1,
 0.4: 3,
 0.5: 1,
 0.6: 20,
 0.7: 22,
 0.8: 6,
 0.9: 2,
 1.0: 0}

In [96]:
# Bucket the contradiction scores for the misaligned-term perturbations.
count_contradiction_scores("scraped_laws_v7/misaligned_legal/law_validation/")

This is the file name: perturbed_WHITESMOKE,INC_11_08_2011-EX-10.26-PROMOTIONANDDISTRIBUTIONAGREEMENT.txt.snippet.json
These are the scores:  0.7
These are the scores:  0.7
These are the scores:  0.8
This is the file name: perturbed_DovaPharmaceuticalsInc_20181108_10-Q_EX-10.2_11414857_EX-10.2_PromotionAgreement.txt.snippet.json
These are the scores:  0.6
These are the scores:  0.4
These are the scores:  0.8
This is the file name: perturbed_PACIRAPHARMACEUTICALS,INC.-A_RSTRATEGICLICENSING,DISTRIBUTIONANDMARKETINGAGREEMENT.txt.snippet.json
These are the scores:  0.7
These are the scores:  0.7
These are the scores:  0.7
This is the file name: perturbed_PfHospitalityGroupInc_20150923_10-12G_EX-10.1_9266710_EX-10.1_FranchiseAgreement3.txt.snippet.json
These are the scores:  0.9
These are the scores:  0.8
These are the scores:  0.9
This is the file name: perturbed_LohaCompanyltd_20191209_F-1_EX-10.16_11917878_EX-10.16_SupplyAgreement.txt.snippet.json
These are the scores:  0.8
These are the

{0.0: 0,
 0.1: 0,
 0.2: 0,
 0.3: 0,
 0.4: 3,
 0.5: 0,
 0.6: 13,
 0.7: 23,
 0.8: 15,
 0.9: 5,
 1.0: 0}

In [97]:
# Bucket the contradiction scores for the structural-flaw perturbations.
count_contradiction_scores("scraped_laws_v7/structural_legal/law_validation/")

This is the file name: perturbed_WHITESMOKE,INC_11_08_2011-EX-10.26-PROMOTIONANDDISTRIBUTIONAGREEMENT.txt.snippet.json
These are the scores:  0.6
These are the scores:  0.6
These are the scores:  0.6
This is the file name: perturbed_DovaPharmaceuticalsInc_20181108_10-Q_EX-10.2_11414857_EX-10.2_PromotionAgreement.txt.snippet.json
These are the scores:  0.7
These are the scores:  0.7
These are the scores:  0.6
This is the file name: perturbed_PACIRAPHARMACEUTICALS,INC.-A_RSTRATEGICLICENSING,DISTRIBUTIONANDMARKETINGAGREEMENT.txt.snippet.json
These are the scores:  0.6
These are the scores:  0.7
These are the scores:  0.7
This is the file name: perturbed_PfHospitalityGroupInc_20150923_10-12G_EX-10.1_9266710_EX-10.1_FranchiseAgreement3.txt.snippet.json
These are the scores:  0.6
These are the scores:  0.6
These are the scores:  0.3
This is the file name: perturbed_LohaCompanyltd_20191209_F-1_EX-10.16_11917878_EX-10.16_SupplyAgreement.txt.snippet.json
These are the scores:  0.4
These are the

{0.0: 0,
 0.1: 0,
 0.2: 0,
 0.3: 3,
 0.4: 10,
 0.5: 0,
 0.6: 31,
 0.7: 15,
 0.8: 0,
 0.9: 0,
 1.0: 0}

In [98]:
# Bucket the contradiction scores for the inconsistency perturbations.
count_contradiction_scores("scraped_laws_v7/inconsistencies_legal/law_validation/")

This is the file name: perturbed_WHITESMOKE,INC_11_08_2011-EX-10.26-PROMOTIONANDDISTRIBUTIONAGREEMENT.txt.snippet.json
These are the scores:  0.6
These are the scores:  0.6
These are the scores:  0.7
This is the file name: perturbed_DovaPharmaceuticalsInc_20181108_10-Q_EX-10.2_11414857_EX-10.2_PromotionAgreement.txt.snippet.json
These are the scores:  0.2
These are the scores:  0.6
These are the scores:  0.4
This is the file name: perturbed_PACIRAPHARMACEUTICALS,INC.-A_RSTRATEGICLICENSING,DISTRIBUTIONANDMARKETINGAGREEMENT.txt.snippet.json
These are the scores:  0.2
These are the scores:  0.6
These are the scores:  0.6
This is the file name: perturbed_PfHospitalityGroupInc_20150923_10-12G_EX-10.1_9266710_EX-10.1_FranchiseAgreement3.txt.snippet.json
These are the scores:  0.8
These are the scores:  0.7
These are the scores:  0.8
This is the file name: perturbed_LohaCompanyltd_20191209_F-1_EX-10.16_11917878_EX-10.16_SupplyAgreement.txt.snippet.json
These are the scores:  0.7
These are the

{0.0: 0,
 0.1: 0,
 0.2: 2,
 0.3: 2,
 0.4: 6,
 0.5: 0,
 0.6: 26,
 0.7: 14,
 0.8: 6,
 0.9: 4,
 1.0: 0}

In [105]:
# Tally the categorical contradiction labels in the law_validation2 run.
# NOTE(review): the saved output below lists LOW/MEDIUM/HIGH, which differs from the
# NO/YES buckets count_contradiction_scores2 currently defines -- re-run to refresh.
count_contradiction_scores2("scraped_laws_v7/ambiguity_legal/law_validation2/")

This is the file name: perturbed_WHITESMOKE,INC_11_08_2011-EX-10.26-PROMOTIONANDDISTRIBUTIONAGREEMENT.txt.snippet.json
These are the scores:  MEDIUM
These are the scores:  MEDIUM
These are the scores:  MEDIUM
This is the file name: perturbed_DovaPharmaceuticalsInc_20181108_10-Q_EX-10.2_11414857_EX-10.2_PromotionAgreement.txt.snippet.json
These are the scores:  MEDIUM
These are the scores:  MEDIUM
These are the scores:  MEDIUM
This is the file name: perturbed_PACIRAPHARMACEUTICALS,INC.-A_RSTRATEGICLICENSING,DISTRIBUTIONANDMARKETINGAGREEMENT.txt.snippet.json
These are the scores:  MEDIUM
These are the scores:  MEDIUM
These are the scores:  HIGH
This is the file name: perturbed_PfHospitalityGroupInc_20150923_10-12G_EX-10.1_9266710_EX-10.1_FranchiseAgreement3.txt.snippet.json
These are the scores:  MEDIUM
These are the scores:  MEDIUM
These are the scores:  MEDIUM
This is the file name: perturbed_LohaCompanyltd_20191209_F-1_EX-10.16_11917878_EX-10.16_SupplyAgreement.txt.snippet.json
These 

{'LOW': 1, 'MEDIUM': 49, 'HIGH': 7}

In [107]:
# Tally the categorical contradiction labels in the law_validation3 run.
# NOTE(review): the saved output below lists LOW/HIGH, which differs from the
# NO/YES buckets count_contradiction_scores2 currently defines -- re-run to refresh.
count_contradiction_scores2("scraped_laws_v7/ambiguity_legal/law_validation3/")

This is the file name: perturbed_WHITESMOKE,INC_11_08_2011-EX-10.26-PROMOTIONANDDISTRIBUTIONAGREEMENT.txt.snippet.json
These are the scores:  HIGH
These are the scores:  HIGH
These are the scores:  HIGH
This is the file name: perturbed_DovaPharmaceuticalsInc_20181108_10-Q_EX-10.2_11414857_EX-10.2_PromotionAgreement.txt.snippet.json
These are the scores:  HIGH
These are the scores:  HIGH
These are the scores:  HIGH
This is the file name: perturbed_PACIRAPHARMACEUTICALS,INC.-A_RSTRATEGICLICENSING,DISTRIBUTIONANDMARKETINGAGREEMENT.txt.snippet.json
These are the scores:  LOW
These are the scores:  HIGH
These are the scores:  HIGH
This is the file name: perturbed_PfHospitalityGroupInc_20150923_10-12G_EX-10.1_9266710_EX-10.1_FranchiseAgreement3.txt.snippet.json
These are the scores:  LOW
These are the scores:  HIGH
These are the scores:  HIGH
This is the file name: perturbed_LohaCompanyltd_20191209_F-1_EX-10.16_11917878_EX-10.16_SupplyAgreement.txt.snippet.json
These are the scores:  HIGH
Th

{'LOW': 8, 'HIGH': 50}