# Biorepository Validation Code - BioVal 

The code separately validates the current REDCap repository and the biorepository data intended for upload.

In [1]:
# Example: download the 'biorepository' instrument from REDCap using requests.
import os

import requests


api_url = "https://redcap.uni-heidelberg.de/api/"

# SECURITY: the API token is a credential and must not be stored in the
# notebook. It is read from the environment instead. The token that used to be
# hard-coded here should be considered leaked and be revoked/regenerated.
api_token = os.environ.get("REDCAP_API_TOKEN", "")
if not api_token:
    raise RuntimeError(
        "REDCAP_API_TOKEN is not set. Export the REDCap API token as an "
        "environment variable before running this cell."
    )

data = {
    'token': api_token,
    'content': 'record',
    'format': 'json',
    'type': 'flat',
    'forms[0]': 'biorepository',  # name of the instrument/form
    'rawOrLabel': 'raw',
    'rawOrLabelHeaders': 'raw',
    'exportSurveyFields': 'true',
    'exportDataAccessGroups': 'true'
}

# A timeout keeps the cell from hanging forever; raise_for_status() surfaces
# HTTP errors instead of trying to parse an error page as JSON.
response = requests.post(api_url, data=data, timeout=20)
response.raise_for_status()
records = response.json()

In [2]:
# Import required Python libraries
import csv
import sys
import re
from datetime import datetime
import requests
import csv


# === Define allowed values and required fields ===
# Columns that check_structure() expects in the header and that validate_row()
# expects to be non-empty in every row.
REQUIRED_FIELDS = [
    "study_id", "lab_id", "redcap_event_name", "biomaterial", "tube_pos",
    "tube_id", "box_id", "freezer", "rack", "box", "tube_status"
]

# Valid biomaterials for aliquots (lower-case spelling).
VALID_MATERIALS = [
    "csf", "csf pellet", "dna", "edta plasma", "fibroblasten",
    "paxgene", "pbmc", "serum", "urin"
]

# study_id has the form XXX-XXX-XXX, where every X is a digit.
STUDY_ID_PATTERN = re.compile(r"^\d{3}-\d{3}-\d{3}$")  # \d matches one digit 0-9

# Valid REDCap events; the REDCap event name stands for the arm of the study.
VALID_EVENTS = ["baseline_arm_1", "screening_arm_1", "follow_up_arm_1"]
# TODO: this list still has to be changed.

# NOTE(review): looks like a placeholder - the allowed tube_status codes are not
# filled in yet and nothing in this file reads this list.
VALID_tube_status = [""]

# Allowed positions inside a box (row letter + column number) per biomaterial.
# Fluids: rows A-H, columns 1-12. PAXgene: rows A-G, columns 1-7.
VALID_POS_FLUIDS = [f"{row}{col}" for row in "ABCDEFGH" for col in range(1, 13)]
VALID_POS_PAXGENE = [f"{row}{col}" for row in "ABCDEFG" for col in range(1, 8)]
# NOTE(review): the row string below has no "I" (rows A-H and J, columns 1-10),
# whereas STORAGE_RULES uses rows "ABCDEFGHIJ" for DNA and CELLS. One of the
# two is presumably a typo - confirm against the physical boxes.
VALID_POS_DNA_CELLS_PBMC =  [f"{row}{col}" for row in "ABCDEFGHJ" for col in range(1, 11)]

# Freezers
VALID_FREEZER = ["1", "2", "3", "nitrogen", "4deg"]

# Boxes (same for all, unless an exception is added later)
VALID_BOX = [str(i) for i in range(1, 43)]  # 1–42

# Racks
VALID_RACK = [str(i) for i in range(1, 101)]  # 1–100



# Storage layout per material group. For every group: the freezers it may be
# stored in, the rack labels, the number of boxes per rack, and the row/column
# labels of one box. generate_positions_for_material() expands an entry into all
# (freezer, rack, box, pos) combinations.
STORAGE_RULES = {
    "BIOFLUID": {
        "freezers": ["1", "2", "3"],
        "racks": list(map(str, range(1, 8))),   # 1–7
        "boxes_per_rack": 7,
        "rows": list("ABCDEFGH"),
        "cols": list(map(str, range(1, 13))),   # 1–12
    },

    "PAXGENE": {
        "freezers": ["1", "2", "3"],
        "racks": list(map(str, range(1, 8))),
        "boxes_per_rack": 7,
        "rows": list("ABCDEFG"),
        "cols": list(map(str, range(1, 8))),    # 1–7
    },

    "DNA": {
        "freezers": ["4deg"],
        "racks": list(map(str, range(1, 8))),
        "boxes_per_rack": 7,
        "rows": list("ABCDEFGHIJ"),
        "cols": list(map(str, range(1, 11))),   # 1–10
    },

    "CELLS": {
        "freezers": ["nitrogen"],
        "racks": list(map(str, range(1, 8))),   # assumption for now
        "boxes_per_rack": 14,                   # box numbers run consecutively
        "rows": list("ABCDEFGHIJ"),
        "cols": list(map(str, range(1, 11))),
    },
}

         

In [3]:
# === Functions === 
# Helperfunction, Validation function 

def read_csv(path):
    """
    Helper function. Reads a CSV file and returns its headers and row data.

    The file is decoded as 'utf-8-sig', so a UTF-8 byte-order mark (as written
    by some spreadsheet exports) does not end up glued to the first header name.
    Files without a BOM are read exactly as with plain 'utf-8'.

    Args:
        path (str): Path to the CSV file

    Returns:
        tuple: (List[str] headers, List[Dict] rows). For a completely empty
        file both lists are empty.
    """
    with open(path, newline='', encoding='utf-8-sig') as f:
        reader = csv.DictReader(f)
        rows = list(reader)
        # Read the header while the file is still open: for an empty file
        # DictReader has not cached the field names yet and would try to read
        # from the already closed file otherwise.
        headers = reader.fieldnames
    return (list(headers) if headers is not None else []), rows


def check_structure(headers):
    """
    Validation function. Reports which required columns the CSV header lacks.

    Args:
        headers (List[str]): Column headers from the CSV

    Returns:
        List[str]: The entries of REQUIRED_FIELDS that do not occur in
        ``headers``, in REQUIRED_FIELDS order (empty if nothing is missing)
    """
    absent = []
    for required in REQUIRED_FIELDS:
        if required in headers:
            continue
        absent.append(required)
    return absent



# Material groups used to select the storage rule for a row. The grouping
# follows the notes in this notebook: biofluids are CSF, CSF pellets, EDTA
# plasma, serum and urine; cells are fibroblasts and PBMC; DNA and PAXgene are
# handled on their own. Names are lower-case, like VALID_MATERIALS.
_BIOFLUID_MATERIALS = frozenset({"csf", "csf pellet", "edta plasma", "serum", "urin"})
_PAXGENE_MATERIALS = frozenset({"paxgene"})
_DNA_MATERIALS = frozenset({"dna"})
_CELL_MATERIALS = frozenset({"fibroblasten", "pbmc"})
_MINUS_80_FREEZERS = ("1", "2", "3")


def validate_row(row, index):
    """
    Validation function. Validates a single row for required values and for the
    material-specific storage rules. Works for any row source - the data to be
    uploaded as well as the REDCap reference data.

    Steps:
    (1) Every field in REQUIRED_FIELDS must be non-empty.
    (2) The tube position and the freezer must match the material group of the
        row (biofluids, PAXgene, DNA or cells). An unknown material is an error.
    (3) If given, rack and box must be listed in VALID_RACK / VALID_BOX.

    Nothing is raised; all problems are collected and returned.

    Args:
        row (Dict): A row from the CSV as a dictionary
        index (int): The row number (for error reporting)

    Returns:
        errors List[str]: List of validation error messages for this row
    """
    errors = []

    def cell(field):
        # csv.DictReader fills short rows with None; treat that like "".
        return (row.get(field) or "").strip()

    # (1) Required fields
    for field in REQUIRED_FIELDS:
        if cell(field) == "":
            errors.append(f"Row {index}: Missing value in '{field}'")

    # (2) Material-specific storage rules
    biomaterial = cell("biomaterial").lower()
    tube_pos = cell("tube_pos")
    freezer = cell("freezer")
    rack = cell("rack")
    box = cell("box")

    if biomaterial in _BIOFLUID_MATERIALS:
        if tube_pos not in VALID_POS_FLUIDS:
            errors.append(f"Row {index}: Invalid tube-pos '{tube_pos}' for {biomaterial} (must be A1–H12)")
        if freezer not in _MINUS_80_FREEZERS:
            errors.append(f"Row {index}: {biomaterial} must be stored in -80 freezers (1–3).")

    elif biomaterial in _PAXGENE_MATERIALS:
        if tube_pos not in VALID_POS_PAXGENE:
            errors.append(f"Row {index}: Invalid tube-pos '{tube_pos}' for PAXgene (must be A1–G7)")
        if freezer not in _MINUS_80_FREEZERS:
            errors.append(f"Row {index}: PAXgene must be stored in -80 freezers (1–3).")

    elif biomaterial in _DNA_MATERIALS:
        if tube_pos not in VALID_POS_DNA_CELLS_PBMC:
            errors.append(f"Row {index}: Invalid tube-pos '{tube_pos}' for DNA (must be A1–J10)")
        if freezer != "4deg":
            errors.append(f"Row {index}: DNA must be stored in 4-degree freezer.")

    elif biomaterial in _CELL_MATERIALS:
        if tube_pos not in VALID_POS_DNA_CELLS_PBMC:
            errors.append(f"Row {index}: Invalid tube-pos '{tube_pos}' for {biomaterial} (must be A1–J10)")
        if freezer != "nitrogen":
            errors.append(f"Row {index}: {biomaterial} must be stored in nitrogen tank.")

    else:
        errors.append(f"Row {index}: Unknown or unsupported material '{biomaterial}'")

    # TODO: add the check for tube_status here.

    # (3) General checks. TODO: racks should really be distinguished per
    # material, because e.g. cells go into the tank, which is structured
    # differently.
    if rack and rack not in VALID_RACK:
        errors.append(f"Row {index}: Invalid rack '{rack}'")

    if box and box not in VALID_BOX:
        errors.append(f"Row {index}: Invalid box '{box}'")

    return errors
        
def validate_file(path, label):
    """
    Core function from GUI. Validates an entire CSV file and raises ValueError if anything is wrong.
    The validation is done line by line; all row errors are collected first and reported together.

    Args:
        path (str): Path to the CSV file
        label (str): Descriptive label (e.g. "Import file")

    Returns:
        rows List[dict]: List of rows of the validated file. Each row is handled like a dictionary.

    Raises:
        ValueError: If a required column is missing or if any row fails validate_row().
    """
    print(f" Checking {label}: {path}")
    headers, rows = read_csv(path)

    # Check for required column headers
    structure_errors = check_structure(headers)
    if structure_errors:
        raise ValueError(f"Missing required columns in {label}: {structure_errors}")

    # validate_row() only returns its findings, so they have to be collected
    # here; row numbering starts at 2 because line 1 of the file is the header.
    row_errors = []
    for i, row in enumerate(rows, start=2):
        row_errors.extend(validate_row(row, i))

    if row_errors:
        raise ValueError(
            f"{label} – {len(row_errors)} validation error(s):\n" + "\n".join(row_errors)
        )

    print(f" {label} passed all validation checks.\n")
    return rows  # return rows if valid

def get_occupied_positions(rows, stored_status="1"):
    """
    Core function from GUI. Extracts all occupied positions from a list of data rows.

    A row only occupies its position if its tube_status equals ``stored_status``
    (default "1", the code used for "Stored" tubes). Rows with an incomplete
    location (empty freezer, rack, box or tube_pos) are ignored.

    Args:
        rows (List[Dict]): Data rows (e.g. from reference file)
        stored_status (str | None): tube_status value that marks a tube as
            occupying its position. Pass None to ignore tube_status and count
            every row with a complete location.

    Returns:
        positions Set[Tuple[str, str, str, str]]: Set of (freezer, rack, box, pos)
    """
    location_fields = ("freezer", "rack", "box", "tube_pos")
    positions = set()

    for row in rows:
        if stored_status is not None:
            # "or ''" guards against None values (e.g. short CSV rows).
            status = (row.get("tube_status") or "").strip()
            if status != stored_status:
                continue

        key = tuple((row.get(field) or "").strip() for field in location_fields)
        if all(key):
            positions.add(key)

    return positions



def check_duplicate_positions(import_rows, occupied_positions, label="Import vs Reference"):
    """
    Core function from GUI. Reports import rows whose storage location is already taken.

    Args:
        import_rows (List[Dict]): Rows from the import file
        occupied_positions (Set[Tuple]): Set of existing (freezer, rack, box, pos)
        label (str): Descriptive name for error reporting

    Returns:
        int: 0 when no import row collides with an occupied position

    Raises:
        ValueError: if at least one import row uses an occupied position; the
        message lists every conflicting row
    """
    location_fields = ("freezer", "rack", "box", "tube_pos")
    conflicts = []

    # Row numbers start at 2: line 1 of the import file is the header.
    for row_number, record in enumerate(import_rows, start=2):
        location = tuple(record.get(name, "").strip() for name in location_fields)
        if location not in occupied_positions:
            continue
        conflicts.append(
            f"Row {row_number}: Position {location} is already occupied in reference data."
        )

    if not conflicts:
        print("No duplicate positions found between import and reference data.")
        return 0

    raise ValueError(
        f"{label} – {len(conflicts)} duplicate position error(s):\n" + "\n".join(conflicts)
    )



def check_internal_duplicates(rows, label):
    """
    Core function in GUI. Looks for storage locations that occur more than once in one file.

    Rows with an incomplete location (any of freezer, rack, box, tube_pos empty)
    are not considered.

    Args:
        rows (List[Dict]): Rows to check
        label (str): Descriptive name for error output

    Returns:
        None: when every complete location is used at most once

    Raises:
        ValueError: if a location is used by more than one row
    """
    rows_by_location = {}

    # Row numbers start at 2: line 1 of the file is the header.
    for line_no, record in enumerate(rows, start=2):
        location = (
            record.get("freezer", "").strip(),
            record.get("rack", "").strip(),
            record.get("box", "").strip(),
            record.get("tube_pos", "").strip(),
        )
        if not all(location):
            continue
        if location in rows_by_location:
            rows_by_location[location].append(line_no)
        else:
            rows_by_location[location] = [line_no]

    report_lines = [
        f" - Position {location} found on rows {line_numbers}"
        for location, line_numbers in rows_by_location.items()
        if len(line_numbers) > 1
    ]

    if report_lines:
        details = "\n".join(report_lines)
        raise ValueError(f"Duplicate positions found within {label}:\n{details}")

    print(f"No duplicate positions found within {label}.")
    return

def check_internal_tube_id_duplicates(rows, label):
    """
    Validation function. Looks for tube_id values that occur more than once in one dataset
    (e.g. the reference file). Empty tube_id values are skipped.

    NOTE(review): on duplicates this calls sys.exit(1), which also terminates a
    hosting GUI or notebook kernel; the sibling checks raise ValueError instead.

    Args:
        rows (List[Dict]): Data rows to check
        label (str): Name of the file being checked (for reporting)

    Returns:
        None: when all tube_id values are unique; otherwise the process exits
        with status 1 after printing the duplicates
    """
    rows_by_tube = {}

    # Row numbers start at 2: line 1 of the file is the header.
    for line_no, record in enumerate(rows, start=2):
        tube = record.get("tube_id", "").strip()
        if not tube:
            continue
        if tube in rows_by_tube:
            rows_by_tube[tube].append(line_no)
        else:
            rows_by_tube[tube] = [line_no]

    repeated = [(tube, lines) for tube, lines in rows_by_tube.items() if len(lines) > 1]

    if not repeated:
        print(f"All tube_id values in {label} are unique.")
        return

    print(f"Duplicate tube_id(s) found in {label}:")
    for tube, lines in repeated:
        print(f" - tube_id '{tube}' appears in rows {lines}")
    sys.exit(1)

def write_report(filename, import_file, reference_file, labid_messages=None, import_rows=None,
                 errors=None, recommendation=None):
    """
    Writes a validation report to a text file.

    The report consists of the sections: input files, summary, errors/warnings,
    lab ID assignment and recommendation. Sections without content show
    " - None" (or "No recommendation provided.").

    Args:
        filename (str): Path to save the report
        import_file (str): Path to import CSV
        reference_file (str): Path to reference CSV
        labid_messages (List[str], optional): Messages describing the lab ID assignment
        import_rows (List[Dict], optional): Imported rows, for summary stats
        errors (List[str], optional): List of error messages collected
        recommendation (str, optional): Recommendation to upload or not
    """
    with open(filename, "w", encoding="utf-8") as f:
        f.write("Biorepository Data Validation Report\n")
        f.write(f"Date: {datetime.now().strftime('%Y-%m-%d %H:%M')}\n")
        f.write("=" * 50 + "\n\n")

        f.write("Input files:\n")
        f.write(f" - Import: {import_file}\n")
        f.write(f" - Reference: {reference_file}\n\n")

        # Summary
        f.write("Summary:\n")
        if import_rows:
            f.write(f" - Number of import rows processed: {len(import_rows)}\n")
        f.write("\n")

        # Errors
        f.write("Errors / Warnings:\n")
        if errors:
            for e in errors:
                f.write(f" - {e}\n")
        else:
            f.write(" - None\n")
        f.write("\n")

        # Lab ID assignment. The header ends with ":\n" like the other
        # sections, so the first entry no longer lands on the header line.
        f.write("Lab Id assignment:\n")
        if labid_messages:
            for msg in labid_messages:
                f.write(f" - {msg}\n")
        else:
            f.write(" - None\n")
        f.write("\n")

        # Recommendation
        f.write("Recommendation:\n")
        if recommendation:
            f.write(f"{recommendation}\n")
        else:
            f.write("No recommendation provided.\n")


def download_reference_from_redcap(api_url, api_token, form_name="biorepository"):
    """
    Helper function from GUI. Downloads reference data directly from REDCap via API.

    Sends one POST request (timeout 20 s) asking for a flat JSON export of the
    given instrument, with raw values and raw headers.

    Args:
        api_url (str): The REDCap API endpoint URL
        api_token (str): Your REDCap API token
        form_name (str): The name of the REDCap instrument to export

    Returns:
        records List[Dict]: A list of flat REDCap records

    Raises:
        Exception: On timeouts, connection problems, a non-200 status code, a
        response that is not valid JSON, or a JSON payload that is not a list.
        The original error is attached as ``__cause__``.
    """

    data = {
        'token': api_token,
        'content': 'record',
        'format': 'json',
        'type': 'flat',
        'forms[0]': form_name,
        'rawOrLabel': 'raw',
        'rawOrLabelHeaders': 'raw',
        'exportSurveyFields': 'true',
        'exportDataAccessGroups': 'true'
    }
    # EAFP style
    try:
        response = requests.post(api_url, data=data, timeout=20)

        # API communication errors
        if response.status_code != 200:
            raise Exception(f"REDCap API returned status code {response.status_code}: {response.text}")

        # JSON decode errors
        try:
            records = response.json()
        except Exception as exc:
            raise Exception(
                "Could not decode JSON returned from REDCap. Response was:\n" + response.text
            ) from exc

        if not isinstance(records, list):
            raise Exception("Unexpected API response format. Expected list of records.")

        print(f"Successfully downloaded {len(records)} records from REDCap.")
        return records

    # Timeout is the common base of ConnectTimeout and ReadTimeout, so a server
    # that accepts the connection but answers too slowly is reported as a
    # timeout as well. It must come before ConnectionError because
    # ConnectTimeout derives from both.
    except requests.exceptions.Timeout as exc:
        raise Exception("Connection timed out while contacting REDCap API.") from exc
    except requests.exceptions.ConnectionError as exc:
        raise Exception("Could not connect to REDCap. Check VPN, URL, or internet.") from exc
    except Exception as e:
        raise Exception(f"REDCap API error: {str(e)}") from e


def save_data_as_csv(records, out_path):
    """
    Helper function from GUI. Saves records as a CSV file at out_path.

    The header is the sorted union of the keys of all records, so a record that
    carries a key the first record lacks no longer makes the writer fail; keys a
    record does not have are written as empty cells.

    Args:
        records List[Dict]: A list of flat REDCap records
        out_path (str): Storage path

    Raises:
        Exception: If ``records`` is empty.
    """
    if not records:
        raise Exception("No records to write.")

    keys = sorted({key for record in records for key in record})

    with open(out_path, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=keys)
        writer.writeheader()
        writer.writerows(records)

    print(f"Data saved to {out_path}")
    
    
def build_patient_map(reference_rows):
    """
    Helper function. Builds the study_id <-> lab_id mappings from REDCap reference data.

    Rows in which study_id or lab_id is empty are skipped. Both values are
    stripped of surrounding whitespace before use.

    Args:
        reference_rows (List[Dict]): Records from the reference dataset, usually
            read from a CSV file.

    Returns:
        study_to_lab (Dict[str, str]): Mapping study_id -> lab_id
        lab_to_study (Dict[str, str]): Mapping lab_id -> study_id
        used_lab_ids (Set[int]): Lab IDs in use, converted to int

    Raises:
        Exception: If a study_id is linked to more than one lab_id or a lab_id
            to more than one study_id.
        ValueError: If a lab_id cannot be converted to an integer.
    """

    study_to_lab = {}
    lab_to_study = {}
    used_lab_ids = set()

    for row in reference_rows:
        # "or ''" guards against None values (e.g. short CSV rows).
        study_id = (row.get("study_id") or "").strip()
        lab_id = (row.get("lab_id") or "").strip()

        if not study_id or not lab_id:
            continue

        if study_id in study_to_lab and study_to_lab[study_id] != lab_id:
            raise Exception(f"Study ID {study_id} has multiple lab IDs.")

        if lab_id in lab_to_study and lab_to_study[lab_id] != study_id:
            raise Exception(f"Lab ID {lab_id} is linked to multiple study IDs.")

        # Convert before storing anything, and say which record is broken
        # instead of surfacing the bare int() error.
        try:
            lab_number = int(lab_id)
        except ValueError as exc:
            raise ValueError(
                f"Lab ID '{lab_id}' of study ID {study_id} is not a whole number."
            ) from exc

        lab_to_study[lab_id] = study_id
        study_to_lab[study_id] = lab_id
        used_lab_ids.add(lab_number)

    return study_to_lab, lab_to_study, used_lab_ids

def get_next_lab_patient_id(used_lab_ids):
    """
    Helper function. Determines the next free lab ID.

    Args:
        used_lab_ids: Collection of the lab IDs already in use (numbers).
    Returns:
        1 if ``used_lab_ids`` is empty (or otherwise falsy), else the largest
        used ID plus one.
    """
    # TODO: maybe work with a string here instead - the ID is meant to look
    # like 00001, not 1.
    if used_lab_ids:
        highest = max(used_lab_ids)
        return highest + 1
    return 1

def assign_lab_patient_ids(import_rows, reference_rows):
    """
    Core function from GUI. Fills in ``lab_id`` for every import row, based on its study_id.

    A study_id that already has a lab ID in the reference data gets that lab ID.
    Every other study_id gets the next free number, formatted with five digits;
    a study_id that occurs several times in the import receives the same new ID.
    The rows are modified in place.

    Args:
        import_rows (List[Dict]): Rows to be imported; their "lab_id" is overwritten
        reference_rows (List[Dict]): Reference rows passed to build_patient_map()

    Returns:
        tuple: (import_rows, labid_messages) - the same row list and a list of
        messages describing the assignment

    Raises:
        Exception: If a row has no study_id or the study_id does not match
        STUDY_ID_PATTERN.
    """
    # IMPORTANT: make absolutely sure a lab ID is never re-used, even when one
    # becomes "free" again.
    # The reverse map (lab_id -> study_id) is returned as well but not needed here.
    known_ids, _lab_to_study, taken = build_patient_map(reference_rows)
    candidate = get_next_lab_patient_id(taken)
    messages = [f"Next available lab patient ID: {candidate:05d}"]

    # Only the imported rows are checked here. Row numbers start at 2 because
    # line 1 of the import file is the header.
    for line_no, record in enumerate(import_rows, start=2):
        sid = record.get("study_id", "").strip()

        if sid == "":
            raise Exception(f"Row {line_no}: Missing study_id")

        if STUDY_ID_PATTERN.match(sid) is None:
            raise Exception(f"Row {line_no}: Invalid study_id format '{sid}'")

        if sid not in known_ids:
            fresh = f"{candidate:05d}"
            record["lab_id"] = fresh
            known_ids[sid] = fresh
            taken.add(candidate)
            candidate += 1
            messages.append(f"Assigned lab patient ID {fresh} to study ID {sid}")
            continue

        record["lab_id"] = known_ids[sid]

    return import_rows, messages






def is_position_occupied(freezer, rack, box, pos, occupied_positions):
    """
    Validation function. Validates the position and checks if it is occupied.

    All four location parts are converted to stripped strings; ``pos`` is also
    upper-cased. The check is material-independent, so it accepts every freezer
    known to this file ("1", "2", "3", "4deg", "nitrogen") and the superset of
    all box grids (rows A–J, columns 1–12). Whether a position fits a specific
    material is not checked here.

    Args:
        freezer (str): Freezer number or name (e.g. "1", "2", "4deg", "nitrogen")
        rack (str): Rack number (1–100)
        box (str): Box number (1–100)
        pos (str): Position (e.g. A1, H12, J10)
        occupied_positions (Set[Tuple[str, str, str, str]]): Known used positions

    Returns:
        bool: True if occupied, False if available

    Raises:
        ValueError: If input values are invalid
    """

    freezer = str(freezer).strip()
    rack = str(rack).strip()
    box = str(box).strip()
    pos = str(pos).strip().upper()

    # "4deg" is included so that DNA positions can be looked up too.
    valid_freezers = {"1", "2", "3", "4deg", "nitrogen"}
    if freezer not in valid_freezers:
        raise ValueError(f"Invalid freezer: '{freezer}' (must be 1, 2, 3, 4deg, or nitrogen)")

    if not rack.isdigit() or not (1 <= int(rack) <= 100):
        raise ValueError(f"Invalid rack: '{rack}' (must be integer 1–100)")

    if not box.isdigit() or not (1 <= int(box) <= 100):
        raise ValueError(f"Invalid box: '{box}' (must be integer 1–100)")

    # Rows up to J cover the 10-row boxes used for DNA and cells.
    if not re.match(r"^[A-J](?:[1-9]|1[0-2])$", pos):
        raise ValueError(f"Invalid position: '{pos}' (must be A1–J12)")

    key = (freezer, rack, box, pos)
    return key in occupied_positions


from itertools import product

def generate_positions_for_material(material_key):
    """
    Helper function. Expands the STORAGE_RULES entry of a material group into
    every storage location it describes.

    Args:
        material_key (str): Key into STORAGE_RULES (e.g. "BIOFLUID").

    Returns:
        positions (set): All (freezer, rack, box, pos) tuples, where box runs
        from "1" to the group's boxes_per_rack and pos is row label + column label.

    Raises:
        KeyError: If ``material_key`` is not a key of STORAGE_RULES.
        (A ValueError with a clearer message would be nicer - not done yet.)
    """
    layout = STORAGE_RULES[material_key]
    box_numbers = [str(number) for number in range(1, layout["boxes_per_rack"] + 1)]

    return {
        (freezer, rack, box, f"{row}{col}")
        for freezer, rack, box, row, col in product(
            layout["freezers"],
            layout["racks"],
            box_numbers,
            layout["rows"],
            layout["cols"],
        )
    }


def get_available_positions(material_key, reference_rows):
    """
    GUI core function. Lists the storage locations of a material group that are
    not occupied according to the reference data.

    Args:
        material_key (str): Key into STORAGE_RULES for the biomaterial to store.
        reference_rows (List[Dict]): Existing REDCap data

    Returns:
        positions (list): Free (freezer, rack, box, pos) tuples in the default
        tuple order, i.e. compared as strings; use position_sort_key for a
        numeric order.
    """
    candidates = generate_positions_for_material(material_key)
    taken = get_occupied_positions(reference_rows)

    free = [slot for slot in candidates if slot not in taken]
    free.sort()
    return free

# Numeric rank per freezer label. position_sort_key() uses it to order
# positions by freezer, and save_positions_to_csv() writes it as the freezer code.
FREEZER_ORDER = {
    "1": 1,
    "2": 2,
    "3": 3,
    "4deg": 4,
    "nitrogen": 5,
}

def split_pos(pos):
    """
    Helper function. Splits a tube position such as "A7" into its row and
    column part, so that positions can be sorted by row and by numeric column
    separately (otherwise "A10" would sort before "A2").

    Args:
        pos (str): Tube position, first character is the row, the rest the column

    Returns:
        row (str): First character of ``pos``
        col (int): Remaining characters of ``pos`` converted to int
    """
    row_letter = pos[0]
    column_number = int(pos[1:])
    return (row_letter, column_number)


def position_sort_key(item):
    """
    Helper function. Sort key for Python's built-in sorted() that orders storage
    locations by freezer, rack, box, row and column.

    Args:
        item (tuple): (freezer, rack, box, pos)

    Returns:
        tuple: (freezer rank, rack as int, box as int, row letter, column as int);
        a freezer that is not in FREEZER_ORDER gets rank 99 and sorts last
    """
    freezer, rack, box, pos = item
    # "A7" -> ("A", 7), so that the column is compared numerically.
    row_letter, column_number = split_pos(pos)

    freezer_rank = FREEZER_ORDER.get(freezer, 99)
    rack_number = int(rack)
    box_number = int(box)
    return (freezer_rank, rack_number, box_number, row_letter, column_number)

def pos_sort_key(pos):
    """
    Helper function. Sort key for positions inside one box: row letter first,
    then the column as a number.

    Args:
        pos (str): Tube position such as "B10"

    Returns:
        tuple: (row letter, column as int)
    """
    row_letter, column_text = pos[0], pos[1:]
    return (row_letter, int(column_text))


import csv

def save_positions_to_csv(path, positions):
    """
    Saves selected positions to CSV.

    Columns:
    freezer, rack, box, pos

    The freezer column holds the numeric code from FREEZER_ORDER, not the
    freezer label.

    Args:
        path (str): Target file
        positions (Iterable[Tuple[str, str, str, str]]): (freezer, rack, box, pos)

    Raises:
        KeyError: If a freezer label is not a key of FREEZER_ORDER.
    """
    with open(path, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(["freezer", "rack", "box", "pos"])

        for freezer, rack, box, pos in positions:
            freezer_code = FREEZER_ORDER[freezer]
            # Same column order as the header row above.
            writer.writerow([freezer_code, rack, box, pos])



In [10]:
#### Generate the available positions
# Downloads the biorepository reference data from REDCap; api_url and api_token
# must already be defined by an earlier cell.

# Offline alternative: read the reference rows from a saved CSV instead.
#ref_rows = read_csv("/home/aaron/Desktop/BioVal/data/Ref_file_test.csv")
reference_rows = download_reference_from_redcap(api_url, api_token)
# Optional: store the downloaded records as CSV.
#save_reference_as_csv(records, "/home/aaron/Desktop/BioVal/data/Ref_file_test.csv"
available_positions = get_available_positions("BIOFLUID", reference_rows)

# get_available_positions() sorts the tuples as strings; re-sort them by
# freezer, rack, box, row and numeric column.
available_positions = sorted(available_positions, key=position_sort_key)



Successfully downloaded 8 records from REDCap.


In [11]:
# Pick the positions to offer for biofluids from the free positions and
# display them as the cell result.
available_pos = select_positions_for_material("BIOFLUID", available_positions)
available_pos
# Optional: write the positions to a CSV file.
#save_positions_to_csv("/home/aaron/Desktop/BioVal/data/available_positions.csv",available_positions)

[('1', '1', '1', 'A2'),
 ('1', '1', '1', 'A3'),
 ('1', '1', '1', 'A4'),
 ('1', '1', '1', 'A5'),
 ('1', '1', '1', 'A6'),
 ('1', '1', '1', 'A7'),
 ('1', '1', '1', 'A8'),
 ('1', '1', '1', 'A9'),
 ('1', '1', '1', 'A10'),
 ('1', '1', '1', 'A11'),
 ('1', '1', '1', 'A12'),
 ('1', '1', '1', 'B1'),
 ('1', '1', '1', 'B2'),
 ('1', '1', '1', 'B3'),
 ('1', '1', '1', 'B4'),
 ('1', '1', '1', 'B5'),
 ('1', '1', '1', 'B6'),
 ('1', '1', '1', 'B7'),
 ('1', '1', '1', 'B8'),
 ('1', '1', '1', 'B9'),
 ('1', '1', '1', 'B10'),
 ('1', '1', '1', 'B11'),
 ('1', '1', '1', 'B12'),
 ('1', '1', '1', 'C1'),
 ('1', '1', '1', 'C2'),
 ('1', '1', '1', 'C3'),
 ('1', '1', '1', 'C4'),
 ('1', '1', '1', 'C5'),
 ('1', '1', '1', 'C6'),
 ('1', '1', '1', 'C7'),
 ('1', '1', '1', 'C8'),
 ('1', '1', '1', 'C9'),
 ('1', '1', '1', 'C10'),
 ('1', '1', '1', 'C11'),
 ('1', '1', '1', 'C12'),
 ('1', '1', '1', 'D1'),
 ('1', '1', '1', 'D2'),
 ('1', '1', '1', 'D3'),
 ('1', '1', '1', 'D4'),
 ('1', '1', '1', 'D5')]

In [6]:
def select_positions_for_material(material, available_positions):
    """
    GUI core function. Chooses positions to offer for a material group
    (BIOFLUID, PAXGENE, DNA or CELLS) from the free positions.

    The material name is stripped and upper-cased before the lookup.

    Args:
        material (str): Biomaterial group (must be a key of the global STORAGE_RULES)
        available_positions (list): List of available positions

    Returns:
        list: Selected positions

    Raises:
        ValueError: If material has no storage rule
    """
    material = material.strip().upper()

    if material in STORAGE_RULES:
        # Biofluids get a special selection logic (whole boxes with at least 15
        # free positions).
        if material == "BIOFLUID":
            return select_positions_biofluids(available_positions)

        # All other materials use the single-position logic, because single
        # aliquots are more likely for them.
        return select_positions_single(material, available_positions)

    raise ValueError(
        f"Material '{material}' has no defined STORAGE_RULE."
    )


    
def select_positions_single(material, available_positions, positionnumber = 20):
    """
    Helper function. Returns the first ``positionnumber`` free positions
    (default 20) in position_sort_key order. Used for PAXGENE, CELLS and DNA.

    Args:
        material (str): Biomaterial group; not used by the selection itself
        available_positions (list): List of available positions
        positionnumber (int): Maximum number of positions to return
    Returns:
        list: Selected sorted positions
    """
    ordered = list(available_positions)
    ordered.sort(key=position_sort_key)
    return ordered[:positionnumber]


def select_positions_biofluids(available_positions):
    """
    Helper function. Selects free positions box by box for BIOFLUIDS.

    Boxes are visited in the order in which they first appear in
    ``available_positions``. A box with fewer than 15 free positions is skipped.
    Selection stops after 40 positions or after the second qualifying box,
    whichever comes first.

    Args:
        available_positions (list): List of available positions
    Returns:
        list: Selected (freezer, rack, box, pos) tuples
    """
    chosen = []
    boxes_used = 0

    for box_key, free_slots in group_positions_by_box(available_positions).items():
        if len(free_slots) >= 15:
            boxes_used += 1

            for slot in free_slots:
                chosen.append((*box_key, slot))
                if len(chosen) >= 40:
                    return chosen

            if boxes_used >= 2:
                break

    return chosen

from collections import defaultdict

def group_positions_by_box(positions):
    """
    Helper function. Groups positions by (freezer, rack, box).

    Args:
        positions (list): List of (freezer, rack, box, pos) tuples
    Returns:
        box_map: defaultdict mapping (freezer, rack, box) to the list of its
        positions, each list sorted with pos_sort_key (row, then numeric column)
    """
    grouped = defaultdict(list)

    for freezer, rack, box, pos in positions:
        grouped[freezer, rack, box].append(pos)

    # Order the positions inside every box.
    for slots in grouped.values():
        slots.sort(key=pos_sort_key)

    return grouped

# Numeric rank per freezer label, used for sorting positions by freezer.
# NOTE(review): this repeats the identical FREEZER_ORDER definition that
# precedes split_pos(); one of the two can presumably be removed.
FREEZER_ORDER = {
    "1": 1,
    "2": 2,
    "3": 3,
    "4deg": 4,
    "nitrogen": 5,
}

In [5]:
### Unterschied zwischen Biofluids und Cells 
"""
Biofluids: CSF (Cerebrospinalfluid), EDTA Plasma (also nicht geronnen), Serum (geronnen, abzentrifugiert), Urin, CSF Pellets
Cells: Fibroblasten, PBMC (periphere mononukleäre Blutzellen)?, 
Other: DNA, PAXgene (RNA)

Specific Fluids/Cells go into specific locations. 
"""

### Für Positions: 
"""
Für die Biofluids gibt es pro Rack (Gestell) 7 Schubladen à 6 Boxpositionen; die Boxen haben dann wiederum ABCDEFGH 1-12. 

Variable	Wertbeispiel	Bedeutung
pos	B3	Raster-Position in der Box
tube_id	20250123-1	Eindeutige Probenkennung
box_id	BX-00017	Box-Nummer
freezer	2	Tiefkühler Nummer 2
rack	14	Rack (Gestell) Nummer 14 im Schrank
box	31	Box Nummer 31 im Rack (die Schulade wird nicht spezifisch genannt)
A1-H12	D7	Alternative Positionsangabe in 96er-Box
nitrogen	-	Lagerort im Stickstofftank (anstelle von „freezer“)

Ok box meint die position 1-42 im Rack.!

"""


'\nFür die Biofluids gibt es pro Rack (Gestell); 7 Schubladen a 6 Boxpositionen; die Boxen haben dann widerum ABCDEFGH 1-12. \n\nVariable\tWertbeispiel\tBedeutung\npos\tB3\tRaster-Position in der Box\ntube_id\t20250123-1\tEindeutige Probenkennung\nbox_id\tBX-00017\tBox-Nummer\nfreezer\t2\tTiefkühler Nummer 2\nrack\t14\tRack (Gestell) Nummer 14 im Schrank\nbox\t31\tBox Nummer 31 im Rack (die Schulade wird nicht spezifisch genannt)\nA1-H12\tD7\tAlternative Positionsangabe in 96er-Box\nnitrogen\t-\tLagerort im Stickstofftank (anstelle von „freezer“)\n\nOk box meint die position 1-42 im Rack.!\n\n'

In [5]:
# Download the biorepository reference data from REDCap and store it as CSV.
import os

api_url = "https://redcap.uni-heidelberg.de/api/"
# SECURITY: the API token is a credential and must not be stored in the
# notebook; it is read from the environment. The token that used to be
# hard-coded here should be considered leaked and be revoked/regenerated.
api_token = os.environ["REDCAP_API_TOKEN"]
records = download_reference_from_redcap(api_url, api_token)

# The CSV helper defined in this notebook is save_data_as_csv();
# save_reference_as_csv no longer exists.
save_data_as_csv(records, "/home/aaron/Desktop/BioVal/data/Ref_file_test.csv")

Successfully downloaded 7 records from REDCap.
Reference data saved to /home/aaron/Desktop/BioVal/data/Ref_file_test.csv


In [6]:
# Load the saved reference CSV (headers are discarded) and build the
# study_id/lab_id maps; the returned tuple is displayed as the cell result.
_, ref_rows = read_csv("/home/aaron/Desktop/BioVal/data/Ref_file_test.csv")
build_patient_map(ref_rows)

({}, set())

In [7]:
# Display the loaded reference rows.
ref_rows

[{'accession_nr': '',
  'biomaterial': '',
  'biorepository_complete': '0',
  'box': '2',
  'box_id': '200',
  'cell_number': '',
  'cohort': '',
  'comment': '',
  'date_received': '',
  'extern_processed': '',
  'extraction': '',
  'extraction_time': '',
  'fasting': '',
  'fibro_passage': '',
  'freeze_date': '',
  'freeze_time': '',
  'freezer': '1',
  'lab_id': '',
  'processed': '',
  'processing_time': '',
  'rack': '1',
  'redcap_event_name': 'screening_arm_1',
  'redcap_repeat_instance': '',
  'redcap_repeat_instrument': '',
  'reserved_date': '',
  'reserved_for': '',
  'sent_date': '',
  'sent_project': '',
  'study': '',
  'study_id': '1',
  'study_visit': '',
  'thaw_date': '',
  'tube_id': '2345',
  'tube_pos': '',
  'virus_diag': '',
  'volume': ''},
 {'accession_nr': '',
  'biomaterial': '',
  'biorepository_complete': '0',
  'box': '1',
  'box_id': '1',
  'cell_number': '',
  'cohort': '',
  'comment': '',
  'date_received': '',
  'extern_processed': '',
  'extraction'

# RUN IT Block

In [None]:
# The examples below no longer work because the data structure has changed.

In [69]:
# Validate the test import file; the second argument is the label used in the printed messages.
validate_file("/home/aaron/Desktop/BioVal/data/NDEGTest_import_file_double_position.csv", "Import file")

 Checking Import file: /home/aaron/Desktop/BioVal/data/NDEGTest_import_file_double_position.csv
 Import file passed all validation checks.



[{'study_id': '1',
  'redcap_event_name': 'screening_arm_1',
  'redcap_repeat_instrument': '',
  'redcap_repeat_instance': '',
  'lab_id': '124 123 4ll',
  'cohort': '',
  'study': '',
  'study_visit': '',
  'fasting': '',
  'extraction_time': '',
  'extraction': '',
  'processing_time': '',
  'processed': '',
  'extern_processed': '',
  'date_received': '',
  'biomaterial': 'CSF',
  'volume': '',
  'cell_number': '',
  'tube_pos': 'A1',
  'tube_id': '2345',
  'box_id': '',
  'box': '1',
  'rack': '1',
  'freezer': '1',
  'freeze_date': '',
  'freeze_time': '',
  'fibro_passage': '',
  'virus_diag': '',
  'thaw_date': '',
  'sent_date': '',
  'sent_project': '',
  'reserved_date': '',
  'reserved_for': '',
  'accession_nr': '',
  'comment': '',
  'biorepository_complete': '',
  '': '0'},
 {'study_id': '2',
  'redcap_event_name': 'screening_arm_1',
  'redcap_repeat_instrument': '',
  'redcap_repeat_instance': '',
  'lab_id': '111 111 111',
  'cohort': '',
  'study': '',
  'study_visit':

In [82]:
# Load the rows of the test import file; the first value returned by read_csv is discarded.
_, import_rows = read_csv("/home/aaron/Desktop/BioVal/data/NDEGTest_import_file.csv")

In [83]:
# Load the rows of the reference file; the first value returned by read_csv is discarded.
_, ref_rows = read_csv("/home/aaron/Desktop/BioVal/data/Biorepository_ref_file_2026-01-10_1511.csv")


In [84]:
# Validate the import file and the reference file; the second argument is the label
# used in the printed messages.
validate_file("/home/aaron/Desktop/BioVal/data/NDEGTest_import_file.csv", "Import file")
validate_file("/home/aaron/Desktop/BioVal/data/Biorepository_ref_file_2026-01-10_1511.csv", "data file")

 Checking Import file: /home/aaron/Desktop/BioVal/data/NDEGTest_import_file.csv
 Import file passed all validation checks.

 Checking data file: /home/aaron/Desktop/BioVal/data/Biorepository_ref_file_2026-01-10_1511.csv
 data file passed all validation checks.



[{'study_id': '222-222-222',
  'redcap_event_name': 'screening_arm_1',
  'redcap_repeat_instrument': '',
  'redcap_repeat_instance': '',
  'lab_id': '1',
  'cohort': '',
  'study': '',
  'study_visit': '',
  'fasting': '',
  'extraction_time': '',
  'extraction': '',
  'processing_time': '',
  'processed': '',
  'extern_processed': '',
  'date_received': '',
  'biomaterial': '',
  'volume': '',
  'cell_number': '',
  'tube_pos': '',
  'tube_id': '2345',
  'box_id': '200',
  'box': '2',
  'rack': '1',
  'freezer': '1',
  'freeze_date': '',
  'freeze_time': '',
  'fibro_passage': '',
  'virus_diag': '',
  'thaw_date': '',
  'sent_date': '',
  'sent_project': '',
  'reserved_date': '',
  'reserved_for': '',
  'accession_nr': '',
  'comment': '',
  'biorepository_complete': '0'},
 {'study_id': '222-222-222',
  'redcap_event_name': 'baseline_arm_1',
  'redcap_repeat_instrument': '',
  'redcap_repeat_instance': '',
  'lab_id': '1',
  'cohort': '',
  'study': '',
  'study_visit': '',
  'fasti

In [73]:
# Check the reference rows for duplicates within the file itself:
# first duplicate positions, then duplicate tube_id values.
# The second argument is the label used in the printed messages.
check_internal_duplicates(ref_rows, "data file")
check_internal_tube_id_duplicates(ref_rows, "Reference data file")


No duplicate positions found within data file.
All tube_id values in Reference data file are unique.


In [85]:
# Assign REDCap record IDs to the import rows with the help of the reference rows.
# import_rows is rebound to the first returned value; the second is kept as account_id.
import_rows, account_id = assign_redcap_ids(import_rows, ref_rows)


REDCap record_id assigned based on lab_id.


In [86]:
# Collect the positions already in use according to the reference rows, then check
# the import rows against them; the value returned by the check is shown as cell output.
occupied = get_occupied_positions(ref_rows)
check_duplicate_positions(import_rows, occupied)

No duplicate positions found between import and reference data.


0

In [87]:
# Collect the positions already in use according to the reference rows.
occupied = get_occupied_positions(ref_rows)

# Check a specific position manually
freezer = "1"
rack = "1"
box = "1"
pos = "G1"

# Print whether the position chosen above is already taken or still free.
if is_position_occupied(freezer, rack, box, pos, occupied):
    print(f"Position ({freezer}, {rack}, {box}, {pos}) is already occupied. Please try another one.")
else:
    print(f"Position ({freezer}, {rack}, {box}, {pos}) is free. ")

Position (1, 1, 1, G1) is free. 


In [77]:
# Unpack three values from build_patient_map(); used_lab_ids is needed by the next cell.
# NOTE(review): an earlier cell's output shows build_patient_map() returning a 2-tuple
# `({}, set())` - confirm which version of the function is current before relying on this.
map1,map2, used_lab_ids = build_patient_map(ref_rows)

In [88]:
# Display the next lab patient ID derived from the lab IDs already in use.
get_next_lab_patient_id(used_lab_ids)

4

In [90]:
# Assign lab patient IDs to the import rows with the help of the reference rows;
# the result is displayed as cell output.
# NOTE(review): run_validation() unpacks two values (rows, messages) from
# assign_lab_patient_ids() - confirm the current return shape.
assign_lab_patient_ids(import_rows,ref_rows)

[{'study_id': '111-111-111',
  'redcap_event_name': 'screening_arm_1',
  'redcap_repeat_instrument': '',
  'redcap_repeat_instance': '',
  'lab_id': '1',
  'cohort': '',
  'study': '',
  'study_visit': '',
  'fasting': '',
  'extraction_time': '',
  'extraction': '',
  'processing_time': '',
  'processed': '',
  'extern_processed': '',
  'date_received': '',
  'biomaterial': 'CSF',
  'volume': '',
  'cell_number': '',
  'tube_pos': 'A12',
  'tube_id': '2345',
  'box_id': '1',
  'box': '1',
  'rack': '1',
  'freezer': '1',
  'freeze_date': '',
  'freeze_time': '',
  'fibro_passage': '',
  'virus_diag': '',
  'thaw_date': '',
  'sent_date': '',
  'sent_project': '',
  'reserved_date': '',
  'reserved_for': '',
  'accession_nr': '',
  'comment': '',
  'biorepository_complete': '',
  '': '0',
  'record_id': '1',
  'lab_patient_id': '00004'}]

In [16]:
def ask_for_material(parent):
    """Show a modal dialog for choosing a biomaterial and return the choice.

    The dialog offers the keys of the module-level ``STORAGE_RULES`` mapping,
    sorted, in a read-only combobox with the first entry preselected.

    Args:
        parent: Tk widget that owns the dialog window.

    Returns:
        The selected biomaterial string once "OK" has been pressed, or
        ``None`` if the dialog was closed without confirming.
    """
    window = tk.Toplevel(parent)
    window.title("Select Biomaterial")
    window.geometry("300x150")
    # Route all input to this window while it is open.
    window.grab_set()

    prompt = tk.Label(window, text="Select biomaterial:")
    prompt.pack(pady=10)

    selection = tk.StringVar()
    choices = sorted(STORAGE_RULES.keys())
    dropdown = ttk.Combobox(
        window,
        textvariable=selection,
        values=choices,
        state="readonly",
    )
    dropdown.pack()
    dropdown.current(0)

    # Stays None unless the user confirms with the OK button.
    chosen = None

    def confirm():
        nonlocal chosen
        chosen = selection.get()
        window.destroy()

    ok_button = ttk.Button(window, text="OK", command=confirm)
    ok_button.pack(pady=10)

    # Block here until the dialog window has been destroyed.
    window.wait_window()
    return chosen


# GUI - BioVal 

In [103]:
import tkinter as tk
from tkinter import filedialog, messagebox
from tkinter import ttk
from PIL import Image, ImageTk  # Pillow muss installiert sein: pip install pillow

### The REDCap download is currently part of the validation run;
### the reference file is faulty because of incorrect data entry in REDCap,
### which is why an error message still appears

def run_validation():
    """Validate one import CSV against the current REDCap data and report the result.

    The reference records are downloaded from REDCap using the module-level
    ``api_url`` and ``api_token`` and saved to ``ref_path``. The user then
    picks the import CSV. Both files are validated, duplicate positions are
    checked, lab patient IDs are assigned, and the updated import rows are
    written back to the import file. Finally the user may save a report.

    Returns:
        None. The outcome is shown in message boxes; any exception raised
        along the way is displayed in an error dialog instead of propagating.
    """
    try:
        # Refresh the reference data and keep a CSV copy; validate_file() below
        # reads the reference data from ref_path.
        reference_rows = download_reference_from_redcap(api_url, api_token)
        ref_path = "/home/aaron/Desktop/BioVal/data/Ref_file_test.csv"
        save_data_as_csv(reference_rows, ref_path)
        # TODO: show an overview of the free freezer positions here.

        # Ask for the import file exactly once and stop cleanly if the dialog is cancelled
        # (an empty result would otherwise surface as a file-not-found error).
        import_path = filedialog.askopenfilename(title="Select Import CSV")
        if not import_path:
            messagebox.showinfo("Cancelled", "No import file selected.")
            return

        _, import_rows = read_csv(import_path)
        # Check that both files have the required fields and structure
        validate_file(import_path, "Import file")
        validate_file(ref_path, "Reference data")
        # Check for internal duplicates. The freshly downloaded rows are used here;
        # a notebook-global ref_rows may be stale or undefined.
        check_internal_duplicates(import_rows, "Import file")
        check_internal_duplicates(reference_rows, "Reference data")
        # Check occupied positions in the freezer (currently this also tracks the history)
        occupied_pos = get_occupied_positions(reference_rows)
        duplicate_positions_count = check_duplicate_positions(import_rows, occupied_pos)
        # Check lab IDs and auto-generate them if necessary
        import_rows, labid_messages = assign_lab_patient_ids(import_rows, reference_rows)
        # TODO: the validation report should also state when a new lab ID was assigned.
        # The updated rows overwrite the chosen import file.
        save_data_as_csv(import_rows, import_path)

        # Save the report
        report_path = filedialog.asksaveasfilename(
            defaultextension=".txt",
            filetypes=[("Text files", "*.txt")],
            title="Save Validation Report As"
        )
        if report_path:
            # new_ids_count is no longer passed to write_report
            write_report(report_path, import_path, ref_path, labid_messages, import_rows, duplicate_positions_count)
            messagebox.showinfo("Success", f"Validation completed!\nReport saved at:\n{report_path}")
        else:
            messagebox.showinfo("Success", "Validation completed! No report was saved.")

    except Exception as e:
        # Top-level GUI boundary: show the failure to the user rather than crashing the callback.
        messagebox.showerror("Validation Error", str(e))


# --- GUI Setup ---
# Build the main window and start the Tk event loop; root.mainloop() blocks until the window closes.
root = tk.Tk()
root.title("BioVal – Biorepository Validator")
root.geometry("600x400")

main_frame = ttk.Frame(root, padding=20)
main_frame.pack(fill="both", expand=True)

# --- Optional Image ---
# The logo is best effort: any failure while loading it is reported and the GUI starts without it.
try:
    img = Image.open("logo.jpeg")  # <-- path to your logo image
    img = img.resize((150, 150))
    photo = ImageTk.PhotoImage(img)
    logo_label = ttk.Label(main_frame, image=photo)
    # Keep a reference on the widget so the image object stays alive while it is displayed.
    logo_label.image = photo
    logo_label.pack(pady=(0, 10))
except Exception:
    print("No image found – skipping logo.")

# --- Welcome Text ---
# Each list item sits on its own line; previously the second item ran into the first
# because the newline between them was missing.
# NOTE(review): run_validation() downloads the reference data itself - confirm whether
# the second item should still say the user selects it.
welcome = ttk.Label(
    main_frame,
    text=(
        "Welcome to BioVal!\n\n"
        "This tool helps you validate biorepository REDCap import files.\n"
        "You’ll be prompted to select:\n"
        " - A new import CSV file - the data you want to upload\n"
        " - A reference dataset for comparison - the data already stored in RedCap \n\n"
        "The tool checks patient IDs, sample positions, and generates a report."
    ),
    justify="center",
    wraplength=500
)
welcome.pack(pady=10)

# --- Start Button ---
start_button = ttk.Button(main_frame, text="Run Validation", command=run_validation)
start_button.pack(pady=20)

root.mainloop()


Successfully downloaded 7 records from REDCap.
Data saved to /home/aaron/Desktop/BioVal/data/Ref_file_test.csv
 Checking Import file: /home/aaron/Desktop/BioVal/data/NDEGTest_import_file.csv
 Import file passed all validation checks.

 Checking Reference data: /home/aaron/Desktop/BioVal/data/Ref_file_test.csv
 Reference data passed all validation checks.

No duplicate positions found within Import file.
No duplicate positions found within Reference date.
No duplicate positions found between import and reference data.
Data saved to /home/aaron/Desktop/BioVal/data/NDEGTest_import_file.csv


# Für Morgen 

#### BioVal: 
- Was ist seit dem letzten Treffen passiert? 
    - BioVal hat jetzt eine tube_status-Variable, die den Status der Tubes nachverfolgt
        - Stored, shipped, discarded... and what you wish
    - Außerdem hat BioVal jetzt eine labID, die automatisch generiert wird, wenn im Importfile eine neue Treat-HSP ID mit Daten hochgeladen wird. Die ID ist unique, das bedeutet, für jede TreatHSP ID gibt es nur eine labID. Die labID kann nicht frei werden; sie wird nur ein Mal vergeben und bleibt dann beständig gleich. Auch im Fall, dass die Tubes weggeschmissen oder freigegeben werden, kann die Tube ID nicht neu assigniert werden. 
    - Gerade mache ich noch die Visualisierung, damit ihr im Labor direkt sehen könntet, welche Plätze noch frei sind. 

#### Aufgaben: 
- Integration des fertigen Instruments in die RedCap RDRegistry Base; erst wenn absolut fertig 
- Laptop von Sophie ready machen; Python, Jupyter Notebook GitHub Download (02.02- in der Woche)
- Testen, Testen, Testen 
    
    
#### Fragen: 
- Offen: 
    - Lab: Required field - an Sophie: Ich habe nicht ganz verstanden, worauf sie in der Mail hinauswollte. "Könntest du mir bitte eine Auflistung geben, welche Felder für den Upload essentiell sind, damit Redcap die Probe zuordnen kann? Dann kann ich entsprechend nicht nötige Felder, die anderweitig gepflegt sind, löschen. Davor macht es keinen Sinn, das Uploadfile usw. an die neuen Variablen anzupassen." 
    - Assignation für Red
    - Welche instances sind erlaubt; ist es ein required field? wo kann ich das einsehen? das muss ich wissen.
    - Können wir nach dem Testen auf Development Phase die eingegebenen Daten auf die RD Registry Seite übertragen? Müsste ja eigentlich gehen. 
    

In [18]:
import os
import sys
import subprocess

def open_file(path):
    """Open *path* with the operating system's default application.

    Windows uses ``os.startfile``; macOS runs the ``open`` command and every
    other platform runs ``xdg-open``.

    Args:
        path: Path of the file to open.
    """
    platform = sys.platform
    if platform.startswith("win"):
        os.startfile(path)              # Windows
        return
    # macOS ships "open"; all remaining platforms are treated as Linux.
    launcher = "open" if platform == "darwin" else "xdg-open"
    subprocess.run([launcher, path])


In [19]:
import tkinter as tk
from tkinter import filedialog, messagebox
from tkinter import ttk
from PIL import Image, ImageTk  # Pillow muss installiert sein: pip install pillow
import os


def run_validation():
    """Run the full BioVal workflow and report the result in dialogs.

    Steps:
        1. Download the reference records from REDCap (module-level ``api_url``
           and ``api_token``) and save them to ``ref_path``.
        2. Ask for a biomaterial, determine the available positions for it and
           optionally save them to a CSV, which is then opened.
        3. Ask for the import CSV, validate both files, check for duplicate
           positions, assign lab patient IDs and write the updated import rows
           back to the import file.
        4. Optionally write a validation report.

    Returns:
        None. The run ends early if no biomaterial or no import file is
        selected. Any exception raised along the way is shown in an error
        dialog instead of propagating.
    """
    try:
        # ===============================
        # 1. Download reference data
        # ===============================
        reference_rows = download_reference_from_redcap(api_url, api_token)
        ref_path = "/home/aaron/Desktop/BioVal/data/Ref_file_test.csv"
        save_data_as_csv(reference_rows, ref_path)

        # ===============================
        # 2. NEW: Show available positions
        # ===============================
        # root is the module-level main window; it serves as the dialog's parent.
        material = ask_for_material(root)
        if not material:
            messagebox.showinfo("Cancelled", "No biomaterial selected.")
            return

        available_positions = get_available_positions(material, reference_rows)

        selected_positions = select_positions_for_material(
            material,
            available_positions
        )

        csv_path = filedialog.asksaveasfilename(
            defaultextension=".csv",
            filetypes=[("CSV files", "*.csv")],
            title="Save Available Positions",
            initialfile=f"available_positions_{material}.csv"
        )

        # Saving the position list is optional; a cancelled dialog just skips it.
        if csv_path:
            save_positions_to_csv(csv_path, selected_positions)
            open_file(csv_path)

        # ===============================
        # 3. Continue with import validation
        # ===============================
        import_path = filedialog.askopenfilename(title="Select Import CSV")
        # A cancelled dialog yields an empty result; stop cleanly instead of letting
        # read_csv() fail on an empty path.
        if not import_path:
            messagebox.showinfo("Cancelled", "No import file selected.")
            return

        _, import_rows = read_csv(import_path)

        validate_file(import_path, "Import file")
        validate_file(ref_path, "Reference data")

        check_internal_duplicates(import_rows, "Import file")
        check_internal_duplicates(reference_rows, "Reference data")

        occupied_pos = get_occupied_positions(reference_rows)
        duplicate_positions_count = check_duplicate_positions(import_rows, occupied_pos)

        import_rows, labid_messages = assign_lab_patient_ids(import_rows, reference_rows)
        # The updated rows overwrite the chosen import file.
        save_data_as_csv(import_rows, import_path)

        # ===============================
        # 4. Report
        # ===============================
        report_path = filedialog.asksaveasfilename(
            defaultextension=".txt",
            filetypes=[("Text files", "*.txt")],
            title="Save Validation Report As"
        )

        if report_path:
            write_report(
                report_path,
                import_path,
                ref_path,
                labid_messages,
                import_rows,
                duplicate_positions_count
            )
            messagebox.showinfo("Success", "Validation completed!")
        else:
            messagebox.showinfo("Success", "Validation completed! No report saved.")

    except Exception as e:
        # Top-level GUI boundary: show the failure to the user rather than crashing the callback.
        messagebox.showerror("Validation Error", str(e))

# --- GUI Setup ---
# Build the main window and start the Tk event loop; root.mainloop() blocks until the window closes.
root = tk.Tk()
root.title("BioVal – Biorepository Validator")
root.geometry("600x400")

main_frame = ttk.Frame(root, padding=20)
main_frame.pack(fill="both", expand=True)

# --- Optional Image ---
# The logo is best effort: any failure while loading it is reported and the GUI starts without it.
try:
    img = Image.open("logo.jpeg")  # <-- path to your logo image
    img = img.resize((150, 150))
    photo = ImageTk.PhotoImage(img)
    logo_label = ttk.Label(main_frame, image=photo)
    # Keep a reference on the widget so the image object stays alive while it is displayed.
    logo_label.image = photo
    logo_label.pack(pady=(0, 10))
except Exception:
    print("No image found – skipping logo.")

# --- Welcome Text ---
# Each list item sits on its own line; previously the second item ran into the first
# because the newline between them was missing.
# NOTE(review): run_validation() downloads the reference data itself - confirm whether
# the second item should still say the user selects it.
welcome = ttk.Label(
    main_frame,
    text=(
        "Welcome to BioVal!\n\n"
        "This tool helps you validate biorepository REDCap import files.\n"
        "You’ll be prompted to select:\n"
        " - A new import CSV file - the data you want to upload\n"
        " - A reference dataset for comparison - the data already stored in RedCap \n\n"
        "The tool checks patient IDs, sample positions, and generates a report."
    ),
    justify="center",
    wraplength=500
)
welcome.pack(pady=10)

# --- Start Button ---
start_button = ttk.Button(main_frame, text="Run Validation", command=run_validation)
start_button.pack(pady=20)

root.mainloop()

Successfully downloaded 8 records from REDCap.
Data saved to /home/aaron/Desktop/BioVal/data/Ref_file_test.csv
