# Biorepository Validation Code - BioVal

This code separately validates the current REDCap repository export and the biorepository data that is intended to be uploaded.

In [1]:
# Global parameters
# NOTE: the same names are assigned again in the next code cell of this file.

# Columns that every CSV must contain and fill in.
REQUIRED_FIELDS = [
    # Open question: the local patient ID is wanted here, but how it is
    # handled still needs a solution.
    "pat_id",
    "redcap_event_name",
    "visit_date",
    "material",
    "pos",
    "tube_id",
    "box_id",
    "freezer",
    "rack",
    "box",
    "inf_deseas",
]

# Biorepository sample materials
VALID_MATERIALS = [
    "CSF",
    "CSF Pellet",
    "DNA",
    "EDTA Plasma",
    "Fibroblasten",
    "PAXgene",
    "PBMC",
    "Serum",
    "Urin",
]

VALID_EVENTS = [
    "baseline_arm_1",
    "screening_arm_1",
    "follow_up_arm_1",
]

# Allowed in-box positions per material class, enumerated row by row
# (e.g. "A1", "A2", ...).
VALID_POS_FLUIDS = [letter + str(number) for letter in "ABCDEFGH" for number in range(1, 13)]
VALID_POS_PAXGENE = [letter + str(number) for letter in "ABCDEFG" for number in range(1, 8)]
# NOTE(review): the row letters skip "I" - confirm this matches the physical box.
VALID_POS_DNA_CELLS_PBMC = [letter + str(number) for letter in "ABCDEFGHJ" for number in range(1, 11)]

# Freezers
VALID_FREEZER = ["1", "2", "3", "nitrogen", "4deg"]

# Boxes (same for all materials, unless an exception is added later): "1".."42"
VALID_BOX = list(map(str, range(1, 43)))

# Racks: "1".."100"
VALID_RACK = list(map(str, range(1, 101)))


In [6]:
import csv
import sys
import re

# --- Allowed values and required fields ---

# Columns that must exist in the CSV header and be non-empty in every row.
REQUIRED_FIELDS = [
    "pat_id",
    "redcap_event_name",
    "visit_date",
    "material",
    "pos",
    "tube_id",
    "box_id",
    "freezer",
    "rack",
    "box",
]

# Material names in lower case.
VALID_MATERIALS = [
    "csf",
    "csf pellet",
    "dna",
    "edta plasma",
    "fibroblasten",
    "paxgene",
    "pbmc",
    "serum",
    "urin",
]

### Unterschied zwischen Biofluids und Cells 
"""
Biofluids: CSF (Cerebrospinalfluid), EDTA Plasma (also nicht geronnen), Serum (geronnen, abzentrifugiert), Urin, CSF Pellets
Cells: Fibroblasten, PBMC (periphere mononukleäre Blutzellen)?
Other: DNA, PAXgene (RNA)

Specific Fluids/Cells go into specific locations. 
"""

### Für Positions: 
"""
Für die Biofluids gibt es pro Rack (Gestell) 7 Schubladen à 6 Boxpositionen (= 42 Boxen); die Boxen haben dann wiederum die Positionen A–H × 1–12.

Variable	Wertbeispiel	Bedeutung
pos	B3	Raster-Position in der Box
tube_id	20250123-1	Eindeutige Probenkennung
box_id	BX-00017	Box-Nummer
freezer	2	Tiefkühler Nummer 2
rack	14	Rack (Gestell) Nummer 14 im Schrank
box	31	Box Nummer 31 im Rack (die Schublade wird nicht spezifisch genannt)
A1-H12	D7	Alternative Positionsangabe in 96er-Box
nitrogen	-	Lagerort im Stickstofftank (anstelle von „freezer“)

Ok box meint die position 1-42 im Rack.!

"""


# An earlier draft used a single A1–H12 position grid (VALID_POS) for all
# materials; it was replaced by the per-material grids below, so the name
# VALID_POS is no longer defined.
VALID_EVENTS = [
    "baseline_arm_1",
    "screening_arm_1",
    "follow_up_arm_1",
]

# Allowed in-box positions per material class, enumerated row by row.
VALID_POS_FLUIDS = [letter + str(number) for letter in "ABCDEFGH" for number in range(1, 13)]
VALID_POS_PAXGENE = [letter + str(number) for letter in "ABCDEFG" for number in range(1, 8)]
# NOTE(review): the row letters skip "I" - confirm this matches the physical box.
VALID_POS_DNA_CELLS_PBMC = [letter + str(number) for letter in "ABCDEFGHJ" for number in range(1, 11)]

# Freezers
VALID_FREEZER = ["1", "2", "3", "nitrogen", "4deg"]

# Boxes (same for all materials, unless an exception is added later): "1".."42"
VALID_BOX = list(map(str, range(1, 43)))

# Racks: "1".."100"
VALID_RACK = list(map(str, range(1, 101)))

# === Functions ===

def read_csv(path):
    """
    Reads a CSV file and returns its headers and row data.

    The file is decoded as UTF-8; a leading byte-order mark (as written by
    many spreadsheet exports) is stripped so it does not end up glued to the
    first column name.

    Args:
        path (str): Path to the CSV file

    Returns:
        tuple: (List[str] headers, List[Dict] rows). For a completely empty
        file the headers are an empty list (not None) and rows is empty.
    """
    # "utf-8-sig" reads plain UTF-8 unchanged and drops a BOM if present.
    with open(path, newline='', encoding='utf-8-sig') as f:
        reader = csv.DictReader(f)
        rows = list(reader)
        # DictReader.fieldnames is None when the file has no header line.
        headers = list(reader.fieldnames or [])
    return headers, rows


def check_structure(headers):
    """
    Reports which required columns are absent from a CSV header.

    Args:
        headers (List[str]): Column names read from the CSV

    Returns:
        List[str]: The entries of REQUIRED_FIELDS that do not occur in
        ``headers``, in REQUIRED_FIELDS order (empty if none are missing)
    """
    absent = []
    for required in REQUIRED_FIELDS:
        if required in headers:
            continue
        absent.append(required)
    return absent


def validate_row_v0(row, index):
    """
    Old version of the row validator (kept for reference).

    Validates a single row for required values and correct formats. All
    materials are checked against the same A1–H12 position grid.

    Args:
        row (Dict): A row from the CSV as a dictionary
        index (int): The row number (for error reporting)

    Returns:
        List[str]: List of validation error messages for this row
    """
    errors = []

    # Check if all required fields are non-empty.
    # csv.DictReader stores None for cells missing from a short row, so
    # fall back to "" before stripping.
    for field in REQUIRED_FIELDS:
        if (row.get(field) or "").strip() == "":
            errors.append(f"Row {index}: Missing value in '{field}'")

    ######## General validation ########

    # Validate pat_id format (3 alphanumeric groups of 3, separated by spaces).
    # Marked as no longer necessary by the author: a different ID scheme is
    # planned.
    pat_id = (row.get("pat_id") or "").strip()
    if pat_id and not re.match(r"^[A-Za-z0-9]{3} [A-Za-z0-9]{3} [A-Za-z0-9]{3}$", pat_id):
        errors.append(f"Row {index}: Invalid pat_id format: '{pat_id}'")

    # Validate other fields only if they're present.
    # VALID_MATERIALS holds lower-case names, so compare case-insensitively.
    if row.get("material") and row["material"].strip().lower() not in VALID_MATERIALS:
        errors.append(f"Row {index}: Invalid material: '{row['material']}'")

    # The former VALID_POS grid (A1–H12) is no longer defined; the fluids
    # grid has the same layout.
    if row.get("pos") and row["pos"] not in VALID_POS_FLUIDS:
        errors.append(f"Row {index}: Invalid pos: '{row['pos']}'")

    if row.get("freezer") and row["freezer"] not in VALID_FREEZER:
        errors.append(f"Row {index}: Invalid freezer: '{row['freezer']}'")

    if row.get("rack") and row["rack"] not in VALID_RACK:
        errors.append(f"Row {index}: Invalid rack: '{row['rack']}'")

    if row.get("box") and row["box"] not in VALID_BOX:
        errors.append(f"Row {index}: Invalid box: '{row['box']}'")

    if row.get("redcap_event_name") and row["redcap_event_name"] not in VALID_EVENTS:
        errors.append(f"Row {index}: Invalid redcap_event_name: '{row['redcap_event_name']}'")

    return errors


# === Material → freezer mapping ===
# Important for separating the different material classes.
# Material names are normalised to lower case.
BIOFLUIDS = ["urin", "edta plasma", "serum", "csf", "csf pellet"]
PAXGENE = ["paxgene"]
DNA = ["dna"]
CELLS = ["fibroblasten", "pbmc"]

# Maps each material name to the list of freezers it may be stored in.
# Every material gets its own list object.
MATERIAL_TO_FREEZER = {}
MATERIAL_TO_FREEZER.update((name, ["1", "2", "3"]) for name in BIOFLUIDS)  # Biofluids → -80 freezers
MATERIAL_TO_FREEZER.update((name, ["nitrogen"]) for name in CELLS)         # Cells → nitrogen
MATERIAL_TO_FREEZER.update((name, ["4deg"]) for name in DNA)               # DNA → 4-degree storage
MATERIAL_TO_FREEZER.update((name, ["1", "2", "3"]) for name in PAXGENE)    # PAXgene → -80 freezers

def validate_row_v1(row, index):
    """
    Earlier version of the row validator (kept for reference).

    Validates a single row for required values and correct formats, then
    checks that the freezer is one of those allowed for the material
    (MATERIAL_TO_FREEZER). All materials are checked against the same
    A1–H12 position grid.

    Args:
        row (Dict): A row from the CSV as a dictionary
        index (int): The row number (for error reporting)

    Returns:
        List[str]: List of validation error messages for this row
    """
    errors = []

    # Check if all required fields are non-empty.
    # csv.DictReader stores None for cells missing from a short row, so
    # fall back to "" before stripping.
    for field in REQUIRED_FIELDS:
        if (row.get(field) or "").strip() == "":
            errors.append(f"Row {index}: Missing value in '{field}'")

    ######## General validation ########

    # The pat_id format check was disabled here because a different ID
    # format is expected to be used.

    # VALID_MATERIALS and MATERIAL_TO_FREEZER use lower-case names, so
    # normalise the material once and reuse it below.
    material = (row.get("material") or "").strip().lower()
    freezer = (row.get("freezer") or "").strip()

    # Validate material
    if row.get("material") and material not in VALID_MATERIALS:
        errors.append(f"Row {index}: Invalid material: '{row['material']}'")

    # Validate pos. The former VALID_POS grid (A1–H12) is no longer
    # defined; the fluids grid has the same layout.
    if row.get("pos") and row["pos"] not in VALID_POS_FLUIDS:
        errors.append(f"Row {index}: Invalid pos: '{row['pos']}'")

    # Validate freezer
    if row.get("freezer") and row["freezer"] not in VALID_FREEZER:
        errors.append(f"Row {index}: Invalid freezer: '{row['freezer']}'")

    # Validate rack
    if row.get("rack") and row["rack"] not in VALID_RACK:
        errors.append(f"Row {index}: Invalid rack: '{row['rack']}'")

    # Validate box
    if row.get("box") and row["box"] not in VALID_BOX:
        errors.append(f"Row {index}: Invalid box: '{row['box']}'")

    # Validate event name
    if row.get("redcap_event_name") and row["redcap_event_name"] not in VALID_EVENTS:
        errors.append(f"Row {index}: Invalid redcap_event_name: '{row['redcap_event_name']}'")

    ######## Material-specific freezer rule ########
    if material and freezer:
        # Unknown materials yield an empty list and are skipped here; they
        # were already reported by the material check above.
        allowed_freezers = MATERIAL_TO_FREEZER.get(material, [])
        if allowed_freezers and freezer not in allowed_freezers:
            errors.append(
                f"Row {index}: Invalid freezer '{freezer}' for material '{material}'. "
                f"Allowed freezers: {allowed_freezers}"
            )

    return errors


def validate_row(row, index):
    """
    Validates a single row against the required fields and the
    material-specific storage rules.

    The material (compared in lower case) selects which position grid and
    which freezer(s) are acceptable; rack and box are checked against the
    general VALID_RACK / VALID_BOX lists when present.

    Args:
        row (Dict): A row from the CSV as a dictionary
        index (int): The row number (for error reporting)

    Returns:
        List[str]: List of validation error messages for this row
        (empty if the row is valid). The function itself never raises
        for invalid data.
    """
    errors = []

    def _clean(field):
        # csv.DictReader stores None for cells missing from a short row;
        # treat those like empty strings instead of crashing on .strip().
        return (row.get(field) or "").strip()

    # --- Required fields ---
    for field in REQUIRED_FIELDS:
        if _clean(field) == "":
            errors.append(f"Row {index}: Missing value in '{field}'")

    # --- Normalised values used by the storage rules ---
    material = _clean("material").lower()
    pos = _clean("pos")
    freezer = _clean("freezer")
    rack = _clean("rack")
    box = _clean("box")

    # --- Material-specific storage rules ---
    if material in BIOFLUIDS:  # fluids
        if pos not in VALID_POS_FLUIDS:
            # VALID_POS_FLUIDS covers columns 1–12, so the hint must say H12.
            errors.append(f"Row {index}: Invalid pos '{pos}' for {material} (must be A1–H12)")
        if freezer not in ["1", "2", "3"]:
            errors.append(f"Row {index}: {material} must be stored in -80 freezers (1–3).")

    elif material in PAXGENE:
        if pos not in VALID_POS_PAXGENE:
            errors.append(f"Row {index}: Invalid pos '{pos}' for PAXgene (must be A1–G7)")
        if freezer not in ["1", "2", "3"]:
            errors.append(f"Row {index}: PAXgene must be stored in -80 freezers (1–3).")

    elif material in DNA:
        # NOTE(review): VALID_POS_DNA_CELLS_PBMC has no row "I"; confirm
        # whether the "A1–J10" hint should mention that.
        if pos not in VALID_POS_DNA_CELLS_PBMC:
            errors.append(f"Row {index}: Invalid pos '{pos}' for DNA (must be A1–J10)")
        if freezer != "4deg":
            errors.append(f"Row {index}: DNA must be stored in 4-degree freezer.")

    elif material in CELLS:
        if pos not in VALID_POS_DNA_CELLS_PBMC:
            errors.append(f"Row {index}: Invalid pos '{pos}' for {material} (must be A1–J10)")
        if freezer != "nitrogen":
            errors.append(f"Row {index}: {material} must be stored in nitrogen tank.")

    else:
        # Catches empty and unrecognised materials.
        errors.append(f"Row {index}: Unknown or unsupported material '{material}'")

    # --- General checks still valid ---
    # NOTE(review): unlike the older validators, redcap_event_name is not
    # checked against VALID_EVENTS here - confirm this is intentional.
    if rack and rack not in VALID_RACK:
        errors.append(f"Row {index}: Invalid rack '{rack}'")

    if box and box not in VALID_BOX:
        errors.append(f"Row {index}: Invalid box '{box}'")

    return errors



def validate_file_v1(path, label):
    """
    Checks a whole CSV file: first the header, then every data row.

    Progress and all problems are printed; on any problem the process is
    terminated via ``sys.exit(1)``.

    Args:
        path (str): Path to the CSV file
        label (str): Descriptive label used in the messages (e.g. "Import file")

    Returns:
        None
    """
    print(f" Checking {label}: {path}")
    headers, rows = read_csv(path)

    # Header check: stop right away when required columns are missing.
    structure_errors = check_structure(headers)
    if structure_errors:
        print(f" Missing required columns in {label}: {structure_errors}")
        sys.exit(1)

    # Row checks: numbering starts at 2 because line 1 is the header.
    all_errors = [
        message
        for line_no, record in enumerate(rows, start=2)
        for message in validate_row(record, line_no)
    ]

    if not all_errors:
        print(f" {label} passed all validation checks.\n")
        return

    # TODO: raise an error here instead of printing and exiting.
    print(f" {len(all_errors)} validation error(s) in {label}:")
    for e in all_errors:
        print(" -", e)
    sys.exit(1)
        
        
def validate_file(path, label):
    """
    Validates an entire CSV file and raises ValueError if anything is wrong.

    Args:
        path (str): Path to the CSV file
        label (str): Descriptive label used in messages (e.g. "Import file")

    Returns:
        List[Dict]: The parsed rows, if the file is valid

    Raises:
        ValueError: if required columns are missing, or if any row fails
            validation (the message lists every row-level error)
    """
    print(f" Checking {label}: {path}")
    headers, rows = read_csv(path)

    # Check for required column headers ("or []" covers a file without a
    # header line, for which the CSV reader reports no field names).
    structure_errors = check_structure(headers or [])
    if structure_errors:
        raise ValueError(f"Missing required columns in {label}: {structure_errors}")

    # Validate each row. validate_row only *returns* its findings, so they
    # must be collected here; otherwise invalid files would pass silently.
    all_errors = []
    for i, row in enumerate(rows, start=2):  # line 1 is the header
        all_errors.extend(validate_row(row, i))

    if all_errors:
        details = "\n".join(f" - {e}" for e in all_errors)
        raise ValueError(f"{len(all_errors)} validation error(s) in {label}:\n{details}")

    print(f" {label} passed all validation checks.\n")
    return rows  # return rows if valid



def get_occupied_positions(rows):
    """
    Collects the storage locations that are already in use.

    Each row contributes one (freezer, rack, box, pos) tuple of
    whitespace-stripped values; rows where any of the four parts is
    missing or empty are ignored.

    Args:
        rows (List[Dict]): Data rows (e.g. from the reference file)

    Returns:
        Set[Tuple[str, str, str, str]]: Set of (freezer, rack, box, pos)
    """
    location_fields = ("freezer", "rack", "box", "pos")
    candidates = (
        tuple(record.get(name, "").strip() for name in location_fields)
        for record in rows
    )
    # Keep only fully specified locations.
    return {location for location in candidates if all(location)}


def check_duplicate_positions_v1(import_rows, occupied_positions):
    """
    Counts import rows whose storage location is already taken.

    Every conflict is printed together with its row number (counting
    starts at 2), followed by a one-line summary.

    Args:
        import_rows (List[Dict]): Rows from the import file
        occupied_positions (Set[Tuple]): Existing (freezer, rack, box, pos) tuples

    Returns:
        int: Number of import rows that collide with an occupied position
    """
    location_fields = ("freezer", "rack", "box", "pos")
    duplicate_count = 0

    for line_no, record in enumerate(import_rows, start=2):
        location = tuple(record.get(name, "").strip() for name in location_fields)
        if location not in occupied_positions:
            continue
        print(f"Row {line_no}: Position {location} is already occupied in existing data.")
        duplicate_count += 1

    if duplicate_count:
        print(f"{duplicate_count} duplicate position error(s) found.")
    else:
        print("No duplicate positions found between import and reference data.")
    return duplicate_count


def check_duplicate_positions(import_rows, occupied_positions, label="Import vs Reference"):
    """
    Verifies that no import row targets a storage location already in use.

    Args:
        import_rows (List[Dict]): Rows from the import file
        occupied_positions (Set[Tuple]): Existing (freezer, rack, box, pos) tuples
        label (str): Descriptive name used as prefix of the error message

    Returns:
        int: 0 when there are no conflicts

    Raises:
        ValueError: if at least one import row collides; the message lists
            every conflicting row (row numbers start at 2)
    """
    location_fields = ("freezer", "rack", "box", "pos")

    conflicts = []
    for line_no, record in enumerate(import_rows, start=2):
        location = tuple(record.get(name, "").strip() for name in location_fields)
        if location in occupied_positions:
            conflicts.append(f"Row {line_no}: Position {location} is already occupied in reference data.")

    if not conflicts:
        print(f"No duplicate positions found between import and reference data.")
        return 0

    raise ValueError(f"{label} – {len(conflicts)} duplicate position error(s):\n" + "\n".join(conflicts))



def check_internal_duplicates(rows, label):
    """
    Looks for storage locations used by more than one row of the same file.

    Rows with an incomplete (freezer, rack, box, pos) location are ignored.

    Args:
        rows (List[Dict]): Rows to check
        label (str): Descriptive name for the messages

    Returns:
        None - a confirmation is printed when no duplicates exist

    Raises:
        ValueError: if a location occurs on several rows; the message lists
            each such location with its row numbers (starting at 2)
    """
    location_fields = ("freezer", "rack", "box", "pos")
    rows_by_location = {}

    for line_no, record in enumerate(rows, start=2):  # line 1 is the header
        location = tuple(record.get(name, "").strip() for name in location_fields)
        if not all(location):
            continue
        if location in rows_by_location:
            rows_by_location[location].append(line_no)
        else:
            rows_by_location[location] = [line_no]

    # One report line per location that is used more than once.
    report_lines = [
        f" - Position {location} found on rows {line_numbers}"
        for location, line_numbers in rows_by_location.items()
        if len(line_numbers) > 1
    ]

    if not report_lines:
        print(f"No duplicate positions found within {label}.")
        return

    details = "\n".join(report_lines)
    raise ValueError(f"Duplicate positions found within {label}:\n{details}")

def check_internal_tube_id_duplicates(rows, label):
    """
    Verifies that every non-empty tube_id occurs only once in a dataset.

    Args:
        rows (List[Dict]): Data rows to check
        label (str): Name of the file being checked (for reporting)

    Returns:
        None - when all tube_id values are unique a confirmation is
        printed; otherwise the duplicates are printed (row numbers start
        at 2) and the process is terminated via ``sys.exit(1)``
    """
    rows_by_tube = {}

    for line_no, record in enumerate(rows, start=2):
        tube = record.get("tube_id", "").strip()
        if not tube:
            continue
        if tube in rows_by_tube:
            rows_by_tube[tube].append(line_no)
        else:
            rows_by_tube[tube] = [line_no]

    repeated = [(tube, lines) for tube, lines in rows_by_tube.items() if len(lines) > 1]

    if not repeated:
        print(f"All tube_id values in {label} are unique.")
        return

    print(f"Duplicate tube_id(s) found in {label}:")
    for tube, lines in repeated:
        print(f" - tube_id '{tube}' appears in rows {lines}")
    sys.exit(1)

def assign_redcap_ids(import_rows, reference_rows):
    """
    Assigns a REDCap record_id to every import row based on its pat_id.

    A pat_id that already occurs in the reference data reuses that
    record_id; an unknown pat_id receives the next free number (highest
    existing record_id + 1, or 1 if there is none). The import rows are
    modified in place.

    Author's notes: this function is probably not needed any more because
    the study ID in REDCap is going to be dropped. It also needs rework:
    pat_id is a local ID, and it is unclear how the mapping should work
    when only pat_id is available.

    Args:
        import_rows (List[Dict]): Rows to be imported
        reference_rows (List[Dict]): Existing REDCap data

    Returns:
        Tuple[List[Dict], int]: The (same) import_rows list with
        record_id filled in, and the number of newly created record_ids

    Note:
        Prints a message and calls ``sys.exit(1)`` when an import row has
        no pat_id. Reference record_ids are converted with ``int()``.
    """
    record_for_patient = {}
    numeric_ids = set()

    # Learn the existing pat_id → record_id assignments.
    for reference in reference_rows:
        known_record = reference.get("record_id", "").strip()
        known_patient = reference.get("pat_id", "").strip()
        if not (known_record and known_patient):
            continue
        record_for_patient[known_patient] = known_record
        numeric_ids.add(int(known_record))

    upcoming_id = 1 if not numeric_ids else max(numeric_ids) + 1
    first_new_id = upcoming_id

    for row in import_rows:
        patient = row.get("pat_id", "").strip()

        if not patient:
            print("Missing pat_id in import row.")
            sys.exit(1)

        if patient not in record_for_patient:
            # First time this patient is seen: hand out a fresh id.
            record_for_patient[patient] = str(upcoming_id)
            upcoming_id += 1
        row["record_id"] = record_for_patient[patient]

    print("REDCap record_id assigned based on pat_id.")
    return import_rows, upcoming_id - first_new_id

def is_position_occupied(freezer, rack, box, pos, occupied_positions):
    """
    Validates the position and checks if it is occupied.

    All four location parts are converted to stripped strings first (pos
    is additionally upper-cased), so integers and lower-case positions
    are accepted.

    Args:
        freezer (str): Freezer number or name ("1", "2", "3", "nitrogen", "4deg")
        rack (str): Rack number (1–100)
        box (str): Box number (1–100)
        pos (str): Position (e.g., A1–H12)
        occupied_positions (Set[Tuple[str, str, str, str]]): Known used positions

    Returns:
        bool: True if occupied, False if available

    Raises:
        ValueError: If input values are invalid
    """

    freezer = str(freezer).strip()
    rack = str(rack).strip()
    box = str(box).strip()
    pos = str(pos).strip().upper()

    # Same storage units as VALID_FREEZER; "4deg" is where DNA is kept and
    # must therefore be queryable as well.
    valid_freezers = {"1", "2", "3", "nitrogen", "4deg"}
    if freezer not in valid_freezers:
        raise ValueError(f"Invalid freezer: '{freezer}' (must be 1, 2, 3, nitrogen, or 4deg)")

    if not rack.isdigit() or not (1 <= int(rack) <= 100):
        raise ValueError(f"Invalid rack: '{rack}' (must be integer 1–100)")

    # NOTE(review): VALID_BOX only goes up to 42; confirm whether the
    # upper bound here should match.
    if not box.isdigit() or not (1 <= int(box) <= 100):
        raise ValueError(f"Invalid box: '{box}' (must be integer 1–100)")

    # NOTE(review): only the A1–H12 grid is accepted; the DNA/cell grid
    # also contains a row "J" - confirm whether that should be allowed.
    if not re.match(r"^[A-H](?:[1-9]|1[0-2])$", pos):
        raise ValueError(f"Invalid position: '{pos}' (must be A1–H12)")

    key = (freezer, rack, box, pos)
    return key in occupied_positions

from datetime import datetime
'''
def write_report(filename, import_file, reference_file, import_rows, duplicate_positions_count): #new_ids_count ist aktuell nicht dabei
    with open(filename, "w") as f:
        f.write("Biorepository Data Validation Report\n")
        f.write(f"Date: {datetime.now().strftime('%Y-%m-%d %H:%M')}\n\n")
        f.write("Input files:\n")
        f.write(f" - Import: {import_file}\n")
        f.write(f" - Reference: {reference_file}\n\n")
        f.write("Summary:\n")
        f.write(f" - Number of import rows processed: {len(import_rows)}\n")
        #f.write(f" - Number of new pat_ids added: {new_ids_count}\n")
        f.write(f" - Number of duplicate positions found: {duplicate_positions_count}\n")
        record_ids = [int(row["record_id"]) for row in import_rows if "record_id" in row]
        if record_ids:
            f.write(f" - Assigned record_id range: {min(record_ids)} to {max(record_ids)}\n")
        f.write("\nWarnings / Errors:\n")
        f.write(" - None\n\n")
        f.write("Validation completed successfully!\n")
'''    
        
from datetime import datetime

def write_report(filename, import_file, reference_file, import_rows, success, error_message=None):
    """
    Writes a short pass/fail validation report to a text file.

    Note: a later definition of ``write_report`` in this file replaces this
    one once the whole file has been executed.

    Args:
        filename (str): Path to save the report
        import_file (str): Path to the import CSV (only echoed in the report)
        reference_file (str): Path to the reference CSV (only echoed in the report)
        import_rows (List[Dict]): Accepted but not used by this version
        success (bool): Whether validation passed
        error_message (str, optional): Details written when ``success`` is false
    """
    # The report contains non-ASCII symbols, so fix the encoding instead of
    # relying on the platform default.
    with open(filename, "w", encoding="utf-8") as f:
        f.write("Biorepository Data Validation Report\n")
        f.write(f"Date: {datetime.now().strftime('%Y-%m-%d %H:%M')}\n\n")
        f.write("Input files:\n")
        f.write(f" - Import: {import_file}\n")
        f.write(f" - Reference: {reference_file}\n\n")

        if success:
            f.write("✅ Validation completed successfully.\n")
            f.write("Recommendation: Safe to upload to REDCap.\n")
        else:
            f.write("❌ Validation failed.\n")
            f.write("Error details:\n")
            # error_message defaults to None; avoid None + str.
            f.write((error_message or "No error details provided.") + "\n\n")
            f.write("Recommendation: Do NOT upload to REDCap.\n")

def write_report(filename, import_file, reference_file, import_rows=None,
                 errors=None, recommendation=None):
    """
    Saves a plain-text validation report (UTF-8).

    The report consists of a dated header, the two input paths, a summary,
    the list of collected errors and a recommendation.

    Args:
        filename (str): Path to save the report
        import_file (str): Path to import CSV (echoed in the report)
        reference_file (str): Path to reference CSV (echoed in the report)
        import_rows (List[Dict], optional): Imported rows; when non-empty
            their count is listed in the summary
        errors (List[str], optional): Error messages, one bullet each;
            " - None" is written when missing or empty
        recommendation (str, optional): Closing recommendation; a
            placeholder sentence is written when missing or empty
    """
    with open(filename, "w", encoding="utf-8") as report:
        emit = report.write

        # Header
        emit("Biorepository Data Validation Report\n")
        emit(f"Date: {datetime.now().strftime('%Y-%m-%d %H:%M')}\n")
        emit("="*50 + "\n\n")

        # Inputs
        emit("Input files:\n")
        emit(f" - Import: {import_file}\n")
        emit(f" - Reference: {reference_file}\n\n")

        # Summary
        emit("Summary:\n")
        if import_rows:
            emit(f" - Number of import rows processed: {len(import_rows)}\n")
        emit("\n")

        # Errors
        emit("Errors / Warnings:\n")
        if not errors or len(errors) == 0:
            emit(" - None\n")
        else:
            for message in errors:
                emit(f" - {message}\n")
        emit("\n")

        # Recommendation
        emit("Recommendation:\n")
        emit(f"{recommendation}\n" if recommendation else "No recommendation provided.\n")




            

# RUN IT Block

In [7]:
# Validate the test import file (the "double position" variant that is also
# loaded further below). The path is a hard-coded local test fixture.
validate_file("/home/aaron/Desktop/BioVal/data/NDEGTest_import_file_double_position.csv", "Import file")

 Checking Import file: /home/aaron/Desktop/BioVal/data/NDEGTest_import_file_double_position.csv
 Import file passed all validation checks.



[{'record_id': '1',
  'redcap_event_name': 'screening_arm_1',
  'redcap_repeat_instrument': '',
  'redcap_repeat_instance': '',
  'pat_id': '124 123 4ll',
  'visit_date': '12.09.2023',
  'study': '',
  'study_visit': '',
  'material': 'CSF',
  'pos': 'A1',
  'posval': '',
  'tube_id': '2345',
  'box_id': '1',
  'freezer': '1',
  'rack': '1',
  'box': '1',
  'fibro_passage': '',
  'einfrierdatum_minus80': '',
  'aufgetaut': '',
  'volumen': '',
  'kommentar': '',
  'zellzahl_ampulle': '',
  'verschickt_am': '',
  'verschickt_empfaenger': '',
  'verschickt_projekt': '',
  'reserviert_am': '',
  'reserviert_fuer': '',
  'extern_prozessiert': '',
  'visit_time': '',
  'freeze_time': '',
  'nuechtern': '',
  'abholort': '',
  'probenabnahme': '',
  'aufarbeitung': '',
  'visit_nr': '',
  'neurolabor_nummer': '',
  'pat_nr_study': '',
  'accession_nr': '',
  'dna_versandtyp': '',
  'cohort': '',
  'beschriftung': '',
  'bereits_erkrankt': '',
  'fibro_kontrolle': '',
  'extern_verwendbar': '

In [8]:
# Load the import rows; the header list returned by read_csv is not needed here.
_, import_rows = read_csv("/home/aaron/Desktop/BioVal/data/NDEGTest_import_file_double_position.csv")

In [9]:
# Load the reference rows (presumably the data already stored in REDCap -
# verify against the file); the header list is not needed here.
_, ref_rows = read_csv("/home/aaron/Desktop/BioVal/data/NDEGTest_ref_file.csv")


In [10]:
# Validate structure and rows of both test files; validate_file raises
# ValueError when required columns are missing.
validate_file("/home/aaron/Desktop/BioVal/data/NDEGTest_import_file.csv", "Import file")
validate_file("/home/aaron/Desktop/BioVal/data/NDEGTest_ref_file.csv", "data file")

 Checking Import file: /home/aaron/Desktop/BioVal/data/NDEGTest_import_file.csv
 Import file passed all validation checks.

 Checking data file: /home/aaron/Desktop/BioVal/data/NDEGTest_ref_file.csv
 data file passed all validation checks.



[{'record_id': '1',
  'redcap_event_name': 'screening_arm_1',
  'redcap_repeat_instrument': '',
  'redcap_repeat_instance': '',
  'pat_id': '123 456 789',
  'visit_date': '2023-01-11',
  'study': '',
  'study_visit': '',
  'material': 'Urin',
  'pos': 'A1',
  'posval': '',
  'tube_id': '2345',
  'box_id': '1',
  'freezer': '1',
  'rack': '1',
  'box': '1',
  'fibro_passage': '',
  'einfrierdatum_minus80': '',
  'aufgetaut': '',
  'volumen': '',
  'kommentar': '',
  'zellzahl_ampulle': '',
  'verschickt_am': '',
  'verschickt_empfaenger': '',
  'verschickt_projekt': '',
  'reserviert_am': '',
  'reserviert_fuer': '',
  'extern_prozessiert': '',
  'visit_time': '',
  'freeze_time': '',
  'nuechtern': '',
  'abholort': '',
  'probenabnahme': '',
  'aufarbeitung': '',
  'visit_nr': '',
  'neurolabor_nummer': '',
  'pat_nr_study': '',
  'accession_nr': '',
  'dna_versandtyp': '',
  'cohort': '',
  'beschriftung': '',
  'bereits_erkrankt': '',
  'fibro_kontrolle': '',
  'extern_verwendbar': 

In [11]:
# Check the reference data against itself: no storage position and no
# tube_id may occur twice. The first check raises ValueError on duplicates,
# the second one calls sys.exit(1).
check_internal_duplicates(ref_rows, "data file")
check_internal_tube_id_duplicates(ref_rows, "Reference data file")


No duplicate positions found within data file.
All tube_id values in Reference data file are unique.


In [12]:
# Fill in record_id for every import row. Despite its name, account_id
# receives the second return value of assign_redcap_ids, i.e. the number of
# newly assigned record_ids.
import_rows, account_id = assign_redcap_ids(import_rows, ref_rows)


REDCap record_id assigned based on pat_id.


In [13]:
# Collect the positions used by the reference data and make sure no import
# row targets one of them (raises ValueError listing every conflict).
occupied = get_occupied_positions(ref_rows)
check_duplicate_positions(import_rows, occupied)

ValueError: Import vs Reference – 2 duplicate position error(s):
Row 2: Position ('1', '1', '1', 'A1') is already occupied in reference data.
Row 3: Position ('1', '1', '1', 'A2') is already occupied in reference data.

In [14]:
# Positions currently used by the reference data.
occupied = get_occupied_positions(ref_rows)

# Manual look-up of one specific position: edit the four values below.
freezer, rack, box, pos = "1", "1", "1", "G1"

position_taken = is_position_occupied(freezer, rack, box, pos, occupied)
if not position_taken:
    print(f"Position ({freezer}, {rack}, {box}, {pos}) is free. ")
else:
    print(f"Position ({freezer}, {rack}, {box}, {pos}) is already occupied. Please try another one.")

Position (1, 1, 1, G1) is free. 


# GUI - BioVal 

In [15]:
import tkinter as tk
from tkinter import filedialog, messagebox
from tkinter import ttk
from PIL import Image, ImageTk  # Pillow muss installiert sein: pip install pillow

def run_validation():
    """
    Button callback: lets the user pick both CSV files, runs all checks and
    offers to save a report.

    Every check runs even if an earlier one failed; the ValueError messages
    are collected and end up in the report together with a recommendation.
    """
    import_path = filedialog.askopenfilename(title="Select Import CSV")
    ref_path = filedialog.askopenfilename(title="Select Reference CSV")

    if not import_path or not ref_path:
        messagebox.showerror("Error", "Both files must be selected.")
        return

    errors = []

    def _collect(check, *args):
        # Run one check and keep its ValueError message instead of aborting.
        try:
            check(*args)
        except ValueError as e:
            errors.append(str(e))

    _collect(validate_file, import_path, "Import file")
    _collect(validate_file, ref_path, "Reference file")

    # Parse each file once and reuse the rows for all remaining checks and
    # for the report.
    import_rows = read_csv(import_path)[1]
    ref_rows = read_csv(ref_path)[1]

    _collect(check_internal_duplicates, import_rows, "Import file")
    _collect(check_internal_duplicates, ref_rows, "Reference file")
    _collect(check_duplicate_positions, import_rows, get_occupied_positions(ref_rows))

    print(errors)
    # === Decide what to do based on errors ===
    if errors:
        recommendation = "❌ No upload. Errors must be fixed first."
    else:
        recommendation = "✅ Upload safe."

    report_path = filedialog.asksaveasfilename(
        defaultextension=".txt",
        filetypes=[("Text files", "*.txt")],
        title="Save Validation Report As"
    )
    if report_path:
        write_report(report_path, import_path, ref_path, import_rows, errors, recommendation)
        messagebox.showinfo("Report", f"Report saved at {report_path}")


        
        
# --- GUI Setup ---
# Builds the main window and blocks in the Tk event loop until it is closed.
root = tk.Tk()
root.title("BioVal – Biorepository Validator")
root.geometry("600x400")

main_frame = ttk.Frame(root, padding=20)
main_frame.pack(fill="both", expand=True)

# --- Optional Image ---
# Best effort: any problem while loading the logo just skips it.
try:
    img = Image.open("logo.jpeg")  # <-- path to the logo image
    img = img.resize((150, 150))
    photo = ImageTk.PhotoImage(img)
    logo_label = ttk.Label(main_frame, image=photo)
    logo_label.image = photo  # keep a reference on the widget
    logo_label.pack(pady=(0, 10))
except Exception:
    print("No image found – skipping logo.")

# --- Welcome Text ---
# Each bullet is "<file> - <what it is>"; the line break belongs between
# the two bullets, not inside the first one.
welcome = ttk.Label(
    main_frame,
    text=(
        "Welcome to BioVal!\n\n"
        "This tool helps you validate biorepository REDCap import files.\n"
        "You’ll be prompted to select:\n"
        " - A new import CSV file - the data you want to upload\n"
        " - A reference dataset for comparison - the data already stored in RedCap \n\n"
        "The tool checks patient IDs, sample positions, and generates a report."
    ),
    justify="center",
    wraplength=500
)
welcome.pack(pady=10)

# --- Start Button ---
start_button = ttk.Button(main_frame, text="Run Validation", command=run_validation)
start_button.pack(pady=20)

root.mainloop()

 Checking Import file: /home/aaron/Desktop/BioVal/data/NDEGTest_import_file_double_position.csv
 Import file passed all validation checks.

 Checking Reference file: /home/aaron/Desktop/BioVal/data/NDEGTest_ref_file.csv
 Reference file passed all validation checks.

No duplicate positions found within Import file.
No duplicate positions found within Reference file.
["Import vs Reference – 2 duplicate position error(s):\nRow 2: Position ('1', '1', '1', 'A1') is already occupied in reference data.\nRow 3: Position ('1', '1', '1', 'A2') is already occupied in reference data."]


In [49]:
import tkinter as tk
from tkinter import filedialog, messagebox
from tkinter import ttk
from PIL import Image, ImageTk  # Pillow muss installiert sein: pip install pillow

def run_validation():
    """
    Button callback: lets the user pick both CSV files, runs all checks and
    offers to save a report.

    The checks run in sequence inside one try block; the first exception
    stops the run and is shown in an error dialog, so a report is only
    offered when every check passed.
    """
    import_path = filedialog.askopenfilename(title="Select Import CSV")
    ref_path = filedialog.askopenfilename(title="Select Reference CSV")

    if not import_path or not ref_path:
        messagebox.showerror("Error", "Both files must be selected.")
        return

    try:
        # Read both CSV files (the helper functions must already be defined).
        _, import_rows = read_csv(import_path)
        _, ref_rows = read_csv(ref_path)
        # Check that both files have the required fields and structure.
        validate_file(import_path, "Import file")
        validate_file(ref_path, "Reference data")
        # Check for duplicates inside each file.
        check_internal_duplicates(import_rows, "Import file")
        check_internal_duplicates(ref_rows,"Reference date")
        # assign_redcap_ids is intentionally not called: it does not do what
        # it should, and the pat_id handling has to be clarified first.
        occupied_pos = get_occupied_positions(ref_rows)
        check_duplicate_positions(import_rows, occupied_pos)

        # Save the report.
        report_path = filedialog.asksaveasfilename(
            defaultextension=".txt",
            filetypes=[("Text files", "*.txt")],
            title="Save Validation Report As"
        )
        if report_path:
            # write_report takes a list of error messages and a
            # recommendation, not a duplicate count. This point is only
            # reached when no check raised, so there are no errors to list.
            write_report(report_path, import_path, ref_path, import_rows,
                         errors=[], recommendation="✅ Upload safe.")
            messagebox.showinfo("Success", f"Validation completed!\nReport saved at:\n{report_path}")
        else:
            messagebox.showinfo("Success", "Validation completed! No report was saved.")

    except Exception as e:
        messagebox.showerror("Validation Error", str(e))


# --- GUI Setup ---
# Builds the main window and blocks in the Tk event loop until it is closed.
root = tk.Tk()
root.title("BioVal – Biorepository Validator")
root.geometry("600x400")

main_frame = ttk.Frame(root, padding=20)
main_frame.pack(fill="both", expand=True)

# --- Optional Image ---
# Best effort: any problem while loading the logo just skips it.
try:
    img = Image.open("logo.jpeg")  # <-- path to the logo image
    img = img.resize((150, 150))
    photo = ImageTk.PhotoImage(img)
    logo_label = ttk.Label(main_frame, image=photo)
    logo_label.image = photo  # keep a reference on the widget
    logo_label.pack(pady=(0, 10))
except Exception:
    print("No image found – skipping logo.")

# --- Welcome Text ---
# Each bullet is "<file> - <what it is>"; the line break belongs between
# the two bullets, not inside the first one.
welcome = ttk.Label(
    main_frame,
    text=(
        "Welcome to BioVal!\n\n"
        "This tool helps you validate biorepository REDCap import files.\n"
        "You’ll be prompted to select:\n"
        " - A new import CSV file - the data you want to upload\n"
        " - A reference dataset for comparison - the data already stored in RedCap \n\n"
        "The tool checks patient IDs, sample positions, and generates a report."
    ),
    justify="center",
    wraplength=500
)
welcome.pack(pady=10)

# --- Start Button ---
start_button = ttk.Button(main_frame, text="Run Validation", command=run_validation)
start_button.pack(pady=20)

root.mainloop()


 Checking Import file: /home/aaron/Desktop/BioVal/data/NDEGTest_import_file_double_position.csv
 Import file passed all validation checks.

 Checking Reference data: /home/aaron/Desktop/BioVal/data/NDEGTest_ref_file.csv
 Reference data passed all validation checks.

No duplicate positions found within Import file.
No duplicate positions found within Reference date.
Row 2: Position ('1', '1', '1', 'A1') is already occupied in existing data.
Row 3: Position ('1', '1', '1', 'A2') is already occupied in existing data.
2 duplicate position error(s) found.
2
