In [4]:
import random
import csv
import json 

def generate_random_data(num_records, simple):
    random.seed(0)
    data = []
    
    first_names = ["Emma", "Liam", "Olivia", "Noah", "Ava", "Ethan", "Sophia", "Mason", "Isabella", "William",
                   "Mia", "James", "Charlotte", "Benjamin", "Amelia", "Lucas", "Harper", "Henry", "Evelyn", "Alexander"]
    last_names = ["Smith", "Johnson", "Williams", "Brown", "Jones", "Garcia", "Miller", "Davis", "Rodriguez", "Martinez",
                  "Hernandez", "Lopez", "Gonzalez", "Wilson", "Anderson", "Thomas", "Taylor", "Moore", "Jackson", "Martin"]
    
    accession_numbers = random.sample(range(100000, 999999), num_records)
    
    for i in range(num_records):
        accession_number = accession_numbers[i]
        patient_name = f"{random.choice(first_names)} {random.choice(last_names)}"
        age = random.randint(20, 80)
        sex = random.choice(["Male", "Female"])
        
        source_rb = random.choice([0, 1])
        source_lb = random.choice([0, 1])
        if source_rb == 0:
            cancer_rb = 0
        else:
            cancer_rb = random.choice([0, 1])
        
        if source_lb == 0:
            cancer_lb = 0
        else:
            cancer_lb = random.choice([0, 1])
        
        breast_density = random.choice(["a) Almost entirely fatty", "b) Scattered areas of fibroglandular density", 
                                        "c) Heterogeneously dense", "d) Extremely dense"])
        family_history = random.choice(["Positive family history of breast cancer", "No family history of breast cancer"])
        previous_biopsies = random.choice(["Previous breast biopsies", "No previous breast biopsies"])
        hormone_therapy = random.choice(["Current hormone therapy", "No current hormone therapy"])
        
        calcifications = random.choice(["No calcifications", "Benign-appearing calcifications", "Suspicious calcifications"])
        masses = random.choice(["No masses", "Benign-appearing mass", "Suspicious mass"])
        asymmetries = random.choice(["No asymmetries", "Focal asymmetry", "Global asymmetry"])
        skin_changes = random.choice(["No skin changes", "Skin thickening", "Skin retraction"])
        
        basic_info = [
            f"Patient Name: {patient_name}",
            f"Age: {age} years",
            f"Sex: {sex}"
        ]
        
        if simple:
            additional_sentences = [
                "Right breast examined. " if source_rb else "Right breast not examined. ",
                "Left breast examined. " if source_lb else "Left breast not examined. ",
                "Cancer identified in right breast." if cancer_rb else "No cancer identified in right breast.",
                "Cancer identified in left breast." if cancer_lb else "No cancer identified in left breast.",
            ]
        else:
            additional_sentences = [
                f"Breast Density: {breast_density}",
                f"Family History: {family_history}",
                f"Previous Biopsies: {previous_biopsies}",
                f"Hormone Therapy: {hormone_therapy}",
                "Right breast examined." if source_rb else "Right breast not examined.",
                "Left breast examined:" if source_lb else "Left breast not examined.",
                f"Right breast findings: {masses}. {calcifications}. {asymmetries}. {skin_changes}." if source_rb else "",
                f"Left breast findings: {masses}. {calcifications}. {asymmetries}. {skin_changes}." if source_lb else "",
                "Suspicious lesion identified in right breast, further investigation recommended." if cancer_rb else "No suspicious lesions identified in right breast.",
                "Suspicious lesion identified in left breast, biopsy advised." if cancer_lb else "No suspicious lesions identified in left breast.",
                "Follow-up mammogram recommended in 6 months." if (cancer_rb or cancer_lb) else "Routine screening mammogram recommended in 1 year.",
                "Additional ultrasound imaging may provide further clarification." if random.choice([True, False]) else "",
                "Patient reported breast pain, which is likely unrelated to imaging findings." if random.choice([True, False]) else "",
                "Dense breast tissue may lower the sensitivity of mammography." if breast_density in ["c) Heterogeneously dense", "d) Extremely dense"] else "",
            ]
        
        additional_sentences = [s for s in additional_sentences if s]
        
        random.shuffle(additional_sentences)
        
        all_sentences = basic_info + additional_sentences
        
        report_text = "\n".join(all_sentences)
        
        data.append([accession_number, report_text, source_rb, source_lb, cancer_rb, cancer_lb])
    
    return data

def save_to_csv(data, filename):
    with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["Accession Number", "Report Text", "Source_RB", "Source_LB", "Cancer_RB", "Cancer_LB"])
        writer.writerows(data)

def save_to_json(data, filename):
    datapoints = []
    for acc, text, source_rb, source_lb, cancer_rb, cancer_lb in data: 
        findings = {}
        if source_rb:
            findings["Right Breast"] = cancer_rb
        if source_lb: 
            findings["Left Breast"] = cancer_lb
        datapoints.append({
            "Accession Number": acc,
            "Report Text": text,
            "Cancer_Diagnosis": json.dumps(findings)
        })
    print(datapoints)

    with open(filename, 'w') as jsonfile:
        json.dump(datapoints, jsonfile, indent=4)

simple = False
num_records = 100
random_data = generate_random_data(num_records, simple)

save_to_csv(random_data, "random_medical_data.csv")
save_to_json(random_data, "random_medical_data.json")

print(f"{num_records} random records have been generated and saved to 'random_medical_data.csv' and 'random_medical_data.json'")

print("\nExample of the first 3 generated records:")
for record in random_data[:3]:
    print(f"\nAccession Number: {record[0]}")
    print(f"Report Text:\n{record[1]}")
    print(f"Source_RB: {record[2]}, Source_LB: {record[3]}, Cancer_RB: {record[4]}, Cancer_LB: {record[5]}")

[{'Accession Number': 985440, 'Report Text': 'Patient Name: Alexander Moore\nAge: 57 years\nSex: Female\nLeft breast not examined.\nFollow-up mammogram recommended in 6 months.\nSuspicious lesion identified in right breast, further investigation recommended.\nHormone Therapy: Current hormone therapy\nRight breast examined.\nRight breast findings: No masses. No calcifications. No asymmetries. Skin retraction.\nBreast Density: c) Heterogeneously dense\nNo suspicious lesions identified in left breast.\nPrevious Biopsies: No previous breast biopsies\nDense breast tissue may lower the sensitivity of mammography.\nFamily History: Positive family history of breast cancer', 'Cancer_Diagnosis': '{"Right Breast": 1}'}, {'Accession Number': 503958, 'Report Text': 'Patient Name: Harper Davis\nAge: 74 years\nSex: Male\nFollow-up mammogram recommended in 6 months.\nRight breast findings: Suspicious mass. Benign-appearing calcifications. Global asymmetry. Skin thickening.\nSuspicious lesion identifie