In [1]:
import random
import json

In [6]:
# Named entities: pools the question templates draw from with random.choice.
investor_names = [
    "Alice Johnson",
    "Michael Chen",
    "Nina Patel",
    "Carlos Rivera",
    "Olivia Thompson",
]
company_names = [
    "JP Morgan Chase",
    "Goldman Sachs",
    "Wells Fargo",
    "Citibank",
    "Bank of America",
    "Morgan Stanley",
    "Capital One",
    "US Bancorp",
    "PNC Financial",
    "American Express",
]


def aml_fine_calculation():
    """Basic: Total AML fine for several violations charged at a flat rate.

    Picks a company, a violation count (2-5) and a per-violation fine
    (50000, 75000 or 100000), then asks for the product of the two.

    Returns:
        tuple[str, str]: the question text and its worked solution.

    Note: the export cell reads the text before the first colon of this
    docstring ('Basic') as the problem level, so that prefix must stay.
    """
    # The investor is never mentioned in the text, but the draw is kept so the
    # sequence of random calls (and thus seeded output) stays the same.
    random.choice(investor_names)
    firm = random.choice(company_names)
    count = random.randint(2, 5)
    rate = random.choice([50000, 75000, 100000])
    total = count * rate

    question = "".join([
        f"{firm} was found to have {count} AML violations. ",
        f"Each violation incurs a fine of ${rate}. ",
        "What is the total fine?",
    ])
    solution = "\n".join([
        "Step 1: Multiply number of violations by fine per violation:",
        f"  {count} × ${rate} = ${total}",
        f"Answer: Total fine = ${total}",
    ])
    return question, solution


def compliance_risk_score():
    """Basic: Risk Priority Number (RPN) from three factor ratings.

    Likelihood, impact and detectability are each drawn as an integer from
    1 to 5; the answer is their product.

    Returns:
        tuple[str, str]: the question text and its worked solution.

    Note: the export cell reads the text before the first colon of this
    docstring ('Basic') as the problem level, so that prefix must stay.
    """
    assessor = random.choice(investor_names)
    firm = random.choice(company_names)
    # Unpacking consumes the generator left to right, so the three draws land
    # on likelihood, impact and detectability in that order.
    likelihood, impact, detectability = (random.randint(1, 5) for _ in range(3))
    rpn = likelihood * impact * detectability

    question_lines = [
        f"{assessor} assesses {firm}'s compliance risk using a Risk Priority Number (RPN), calculated as:",
        "  RPN = Likelihood × Impact × Detectability",
        f"Given: Likelihood = {likelihood}, Impact = {impact}, Detectability = {detectability}",
        "What is the RPN?",
    ]
    solution_lines = [
        "Step 1: Multiply all risk factors:",
        f"  {likelihood} × {impact} × {detectability} = {rpn}",
        f"Answer: RPN = {rpn}",
    ]
    return "\n".join(question_lines), "\n".join(solution_lines)


def delayed_compliance_fine():
    """Intermediate: Base fine plus a per-day penalty for late AML reporting.

    The base fine (20000) and the daily penalty (5000) are fixed; only the
    company and the number of delay days (3-7) are random.

    Returns:
        tuple[str, str]: the question text and its worked solution.

    Note: the export cell reads the text before the first colon of this
    docstring ('Intermediate') as the problem level, so that prefix must stay.
    """
    # Investor draw is unused in the text; kept so seeded output is unchanged.
    random.choice(investor_names)
    firm = random.choice(company_names)
    days_late = random.randint(3, 7)

    flat_fine, daily_fine = 20000, 5000
    escalation = days_late * daily_fine
    amount_due = flat_fine + escalation

    question = (
        f"{firm} was fined ${flat_fine} for late AML reporting. "
        f"Additionally, ${daily_fine} is charged for each of the {days_late} delay days. "
        "What is the total fine?"
    )
    steps = [
        f"Step 1: Calculate additional fine = {days_late} × ${daily_fine} = ${escalation}",
        f"Step 2: Total fine = ${flat_fine} + additional = ${amount_due}",
        f"Answer: ${amount_due}",
    ]
    return question, "\n".join(steps)

# Weighted risk of regions
def regional_risk_weighted_score():
    """Intermediate: Weighted average of regional compliance risk scores.

    Three regions each receive a random weight and an integer risk score
    (3-9).  Weights are normalised to two decimals, and the last weight is
    then set to the remainder so the published weights add up to exactly 1.
    Rounding every share independently could otherwise yield e.g.
    0.33 + 0.33 + 0.33 = 0.99, which is not a proper weighted average.

    Returns:
        tuple[str, str]: the question text and its worked solution.

    Note: the export cell reads the text before the first colon of this
    docstring ('Intermediate') as the problem level, so that prefix must stay.
    """
    # Investor draw is unused in the text; kept so the order of random calls
    # (investor, company, three weights, three risks) is unchanged.
    random.choice(investor_names)
    company = random.choice(company_names)
    regions = ["North America", "Europe", "Asia"]

    raw_weights = [round(random.uniform(0.2, 0.5), 2) for _ in regions]
    raw_total = sum(raw_weights)
    weights = [round(w / raw_total, 2) for w in raw_weights]
    # Absorb the rounding drift in the last weight so the list sums to 1.
    # Raw weights lie in [0.2, 0.5], so every share is at least ~0.17 and the
    # remainder stays well above zero.
    weights[-1] = round(1 - sum(weights[:-1]), 2)

    risks = [random.randint(3, 9) for _ in regions]
    products = [w * r for w, r in zip(weights, risks)]
    weighted_risk = round(sum(products), 2)

    question = (
        f"{company} evaluates compliance risk in three regions: {regions}. "
        f"Assigned weights: {weights}, Risk scores: {risks}. "
        f"What is the weighted average risk score?"
    )
    step_lines = [
        f"  {w} × {r} = {round(p, 2)}" for w, r, p in zip(weights, risks, products)
    ]
    solution = "\n".join(
        ["Step 1: Multiply weight × risk for each region:"]
        + step_lines
        + [
            f"Step 2: Sum all weighted risks: {weighted_risk}",
            f"Answer: Weighted Risk Score = {weighted_risk}",
        ]
    )
    return question, solution


def composite_risk_rating():
    """Advanced: Weighted composite of customer, transaction and geography risk.

    Each of the three scores is an integer from 1 to 5; the fixed weights
    0.4 / 0.3 / 0.3 are applied and the weighted sum is rounded to two
    decimals.

    Returns:
        tuple[str, str]: the question text and its worked solution.

    Note: the export cell reads the text before the first colon of this
    docstring ('Advanced') as the problem level, so that prefix must stay.
    """
    assessor = random.choice(investor_names)
    firm = random.choice(company_names)
    # Draw order is customer, transaction, geography.
    customer, transaction, geography = (random.randint(1, 5) for _ in range(3))

    scores = [customer, transaction, geography]
    weights = [0.4, 0.3, 0.3]
    products = [score * weight for score, weight in zip(scores, weights)]
    composite = round(sum(products), 2)

    question = "\n".join([
        f"{assessor} assigns risk scores to {firm} as follows:",
        f"  Customer Risk = {customer}, Transaction Risk = {transaction}, Geography Risk = {geography}",
        f"With weights {weights}, what is the composite risk rating?",
    ])

    solution_lines = ["Step 1: Multiply scores by weights:"]
    for score, weight, product in zip(scores, weights, products):
        solution_lines.append(f"  {score} × {weight} = {round(product, 2)}")
    solution_lines.append(f"Step 2: Add all values = {composite}")
    solution_lines.append(f"Answer: Composite Risk Rating = {composite}")
    return question, "\n".join(solution_lines)

In [7]:
# Preview: run every selected template once and print the generated text.

templates = [
    aml_fine_calculation,
    compliance_risk_score,
    delayed_compliance_fine,
    regional_risk_weighted_score,
    composite_risk_rating,
]
for template_func in templates:
    question, solution = template_func()
    # One print with sep="\n" emits the same bytes as three separate prints:
    # the name, then the question followed by a blank line, then the solution
    # followed by two blank lines.
    print(
        template_func.__name__,
        f"Question: {question}\n",
        f"Solution: {solution}\n\n",
        sep="\n",
    )

aml_fine_calculation
Question: Bank of America was found to have 2 AML violations. Each violation incurs a fine of $50000. What is the total fine?

Solution: Step 1: Multiply number of violations by fine per violation:
  2 × $50000 = $100000
Answer: Total fine = $100000


compliance_risk_score
Question: Carlos Rivera assesses Citibank's compliance risk using a Risk Priority Number (RPN), calculated as:
  RPN = Likelihood × Impact × Detectability
Given: Likelihood = 3, Impact = 5, Detectability = 1
What is the RPN?

Solution: Step 1: Multiply all risk factors:
  3 × 5 × 1 = 15
Answer: RPN = 15


delayed_compliance_fine
Question: PNC Financial was fined $20000 for late AML reporting. Additionally, $5000 is charged for each of the 4 delay days. What is the total fine?

Solution: Step 1: Calculate additional fine = 4 × $5000 = $20000
Step 2: Total fine = $20000 + additional = $40000
Answer: $40000


regional_risk_weighted_score
Question: JP Morgan Chase evaluates compliance risk in three r

In [8]:

# ----------- Generate the problem set and export it as JSON Lines -----------

templates = [
    aml_fine_calculation,
    compliance_risk_score,
    delayed_compliance_fine,
    regional_risk_weighted_score,
    composite_risk_rating,
]

# One dict per generated problem, accumulated across all templates.
all_problems = []

for template_func in templates:
    # The level label is the docstring text before its first colon
    # (e.g. "Basic" from "Basic: ...").
    template_name = template_func.__doc__.partition(':')[0].strip()
    print(f"Generating problems for template: {template_name}")

    # Ten problems per template.
    for i in range(10):
        # Draw a seed, then reseed with it right before generating, so the
        # recorded seed is what drives this template call's random draws.
        seed = random.randint(1000000000, 4000000000)
        random.seed(seed)
        question, solution = template_func()

        problem_entry = dict(
            seed=seed,
            level=template_name,
            question=question,
            solution=solution,
        )
        all_problems.append(problem_entry)

        # Reseed without an argument so the next seed draw does not follow
        # deterministically from the seed just used.
        random.seed()

# Mix the templates together before writing.
random.shuffle(all_problems)

# One JSON object per line.
output_file = "../../testset/compliance.jsonl"
with open(output_file, "w") as file:
    for problem in all_problems:
        file.write(json.dumps(problem) + "\n")

print(f"Successfully generated {len(all_problems)} problems and saved to {output_file}")


Generating problems for template: Basic
Generating problems for template: Basic
Generating problems for template: Intermediate
Generating problems for template: Intermediate
Generating problems for template: Advanced
Successfully generated 50 problems and saved to ../../testset/compliance.jsonl
