# Send and Retrieve vitals

This script facilitates remote processing of validation datasets on a GPU server via SSH/SCP.
It handles input file transfer, inference result retrieval, emissions information retrieval, and vital sign parsing 
for multiple prompts and batch sizes.

Main Functions:
------------------
1. Remote SSH Connection (Paramiko + SCP):
   - `connect_with_gpu()` / `close_gpu_connection()`: Secure GPU session management.
   - Private key-based auth to access GPU cluster.

2. Section Extraction (MIMIC-IV discharge notes):
   - `extract_sections(note)`: Extracts Chief Complaint, HPI, and Physical Exam from full note text.

3. Note Transfer:
   - `transfer_notes()`: 
     • Build note fragment
     • Upload input files to the GPU server.
     • Save local chief complaint JSON.

4. Result Retrieval:
   - `get_extracted_vitals(inference_id)`: Downloads individual output files and attempts to parse JSON outputs.
   - `get_batch_size(inference_id)`: Repeats `get_extracted_vitals` across multiple batch sizes (1, 2, 4, 8) and aggregates emissions data.
   - `get_emissions()`: Downloads the `emissions.csv` file from the GPU server to the local machine.

In [None]:
from sqlalchemy import create_engine
from sqlalchemy import text
from scp import SCPClient
import pandas as pd
import regex as re
import paramiko
import json
import os

# Transfer notes

In [None]:
# === GPU create/close connection ===

def connect_with_gpu(
    gpu_host="bristol.hh.se",  # alternative host: "denver.hh.se"
    gpu_port=20022,
    gpu_username="asirui24",
    private_key_path="C:/Users/asirui/.ssh/id_rsa",
):
    """Open a private-key authenticated SSH session to the GPU server.

    All arguments default to the values that used to be hard-coded, so
    calling ``connect_with_gpu()`` with no arguments behaves as before.

    Args:
        gpu_host: Host name of the GPU server.
        gpu_port: TCP port of the SSH service.
        gpu_username: Remote account name.
        private_key_path: Path to the private key file used for authentication.

    Returns:
        A connected ``paramiko.SSHClient``. The caller is responsible for
        closing it (see ``close_gpu_connection``).

    Raises:
        Any exception raised by ``SSHClient.connect`` is propagated after the
        partially opened client has been closed.
    """
    ssh = paramiko.SSHClient()
    # NOTE(review): AutoAddPolicy accepts unknown host keys without
    # verification; consider loading known hosts and rejecting unknown ones.
    ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
    try:
        ssh.connect(
            hostname=gpu_host,
            port=gpu_port,
            username=gpu_username,
            key_filename=private_key_path,
        )
    except Exception:
        # Do not leak the half-open client when the connection attempt fails.
        ssh.close()
        raise
    return ssh

def close_gpu_connection(ssh):
    """Close an SSH session, reporting (instead of raising) any failure.

    Args:
        ssh: Object exposing ``close()``, e.g. the client returned by
            ``connect_with_gpu()``.
    """
    try:
        ssh.close()
    except Exception as close_error:
        # Best-effort shutdown: report the problem and carry on.
        print(f"Failed to close SSH connection: {close_error}")

In [3]:
# === Extract Chief Complaint, History of Present Illness and Physical Exam sections ===

def extract_sections(note):
    """Pull the Chief Complaint, History of Present Illness and Physical Exam text out of a note.

    For each section the (start marker, end marker) pairs are tried in order;
    the first pair that matches supplies the stripped text found between the
    two markers.

    Args:
        note: Full note text.

    Returns:
        Dict keyed by section name. A section for which no marker pair
        matches maps to the string "Section not found.".
    """
    # === Candidate (start, end) markers per section, in priority order ===
    marker_table = {
        "Chief Complaint": [
            ("Chief Complaint:", "Major Surgical or Invasive Procedure:"),
            ("___ Complaint:", "Major Surgical or Invasive Procedure:"),
        ],
        "History of Present Illness": [
            ("History of Present Illness:", "Past Medical History:"),
            ("___ Present Illness:", "Past Medical History:"),
        ],
        "Physical Exam": [
            ("Physical Exam:", "Pertinent Results:"),
            ("Physical ___:", "Pertinent Results:"),
            ("Physical Exam:", "Brief Hospital Course:"),
        ],
    }

    results = {}
    for title, candidates in marker_table.items():
        # Start from the fallback value; overwritten by the first matching pair.
        results[title] = "Section not found."
        for opening, closing in candidates:
            if opening not in note or closing not in note:
                continue
            # Both markers are present, but the end marker must also follow the start marker.
            between = re.escape(opening) + r"\s*(.*?)\s*" + re.escape(closing)
            hit = re.search(between, note, re.DOTALL)
            if hit is None:
                continue
            results[title] = hit.group(1).strip()
            break
    return results

In [None]:
def transfer_notes():
    """Upload one input file per validation note to the GPU server and save the chief complaints locally.

    For every entry of ``datasets/validation_dataset_GroundTruth.json`` the
    note text is read from ``mimiciv_note.discharge``, the History of Present
    Illness and Physical Exam sections are combined into a note fragment, and
    that fragment is copied via SCP to
    ``/data/home/asirui24/validation/notes/input_<note_number>.txt``.
    The chief complaint of every processed note is written to
    ``datasets/chief_complaint.json``.

    A failed upload is reported and the note is still recorded; a note that
    is not present in the database is reported and skipped. The SSH session,
    the database engine and the local temp file are released even when an
    unexpected error interrupts the loop.
    """
    extracted_data = []
    temp_path = "temp_input.txt"
    remote_path = "/data/home/asirui24/validation/notes"

    # === Read the contents of the Ground Truth file ===
    with open("datasets/validation_dataset_GroundTruth.json", 'r') as file:
        data = json.load(file)

    # === Connect to the database ===
    # NOTE(review): credentials are hard-coded in the source; consider moving
    # them to environment variables or a local, untracked config file.
    DB_NAME = "mimic"
    DB_USER = "postgres"
    DB_PASSWORD = "sd98hS&GD3F4"
    DB_HOST = "localhost"
    DB_PORT = "5432"
    engine = create_engine(f"postgresql://{DB_USER}:{DB_PASSWORD}@{DB_HOST}:{DB_PORT}/{DB_NAME}")

    # The statement is identical for every note; only the bound parameter changes.
    query = text('''
        SELECT text
        FROM mimiciv_note.discharge
        WHERE note_id = :note_id
        ''')

    ssh = connect_with_gpu()
    try:
        for row in data:
            note_id = row["note_id"]
            note_num = row["note_number"]

            result = pd.read_sql(query, engine, params={"note_id": note_id})
            if result.empty:
                # Without this guard `.values[0]` raises IndexError and aborts the whole run.
                print(f"Note {note_num} not found in database (note_id={note_id})")
                continue
            note = result["text"].values[0]

            sections = extract_sections(note)
            cc = sections["Chief Complaint"]
            hpi = sections["History of Present Illness"]
            pe = sections["Physical Exam"]
            note_fragment = f"History of Present Illness:\n{hpi}\n\n---------------------------\n\nPhysical Exam:\n{pe}"

            remote_input = f"{remote_path}/input_{note_num}.txt"

            try:
                # === Create temp input file locally ===
                # Explicit UTF-8: the platform default encoding may be unable
                # to represent every character of the note.
                with open(temp_path, "w", encoding="utf-8") as temp_file:
                    temp_file.write(note_fragment)

                # === Transfer file to remote ===
                with SCPClient(ssh.get_transport()) as scp:
                    scp.put(temp_path, remote_input)

            except Exception as e:
                print(f"Note {note_num} transfer failed: {e}")

            extracted_data.append({
                "note_number": note_num,
                "note_id": note_id,
                "chief_complaint": cc,
            })
    finally:
        close_gpu_connection(ssh)
        engine.dispose()
        if os.path.exists(temp_path):
            try:
                os.remove(temp_path)
            except Exception as e:
                print(f"Local cleanup failed: {e}")

    # ensure_ascii=False emits raw non-ASCII characters, so the file encoding must be explicit.
    with open("datasets/chief_complaint.json", "w", encoding="utf-8") as f:
        json.dump(extracted_data, f, ensure_ascii=False, indent=4)

# Get Extracted Information

In [5]:
def try_parse_json(output_str):
    """Decode a JSON string without raising on malformed input.

    Args:
        output_str: Text to decode.

    Returns:
        ``(decoded_value, None)`` when decoding succeeds, otherwise
        ``({}, message)`` where ``message`` is the text of the
        ``json.JSONDecodeError``.
    """
    try:
        decoded = json.loads(output_str)
    except json.JSONDecodeError as decode_error:
        return {}, str(decode_error)
    else:
        return decoded, None

In [None]:
# === Get emissions ===
def get_emissions(remote_path="/data/home/asirui24/validation/results", local_path="emissions.csv"):
    """Download ``emissions.csv`` from the GPU server to a local file.

    The remote file is read by running ``cat`` over SSH and its content is
    written to ``local_path`` as UTF-8.

    Args:
        remote_path: Remote directory that contains ``emissions.csv``.
        local_path: Destination path on the local machine.

    Returns:
        None. When the download fails or yields no content, a message is
        printed and ``local_path`` is left untouched.
    """
    emissions_path = f"{remote_path}/emissions.csv"

    ssh = connect_with_gpu()
    try:
        _, stdout, _ = ssh.exec_command(f"cat {emissions_path}")
        emissions = stdout.read().decode()
    except Exception as e:
        print(f"Emissions transfering failed: {e}")
        # Nothing was downloaded; returning here avoids writing an unbound variable.
        return
    finally:
        # Single close on every path (success, failure, early return).
        close_gpu_connection(ssh)

    if not emissions.strip():
        # `cat` on a missing file produces no stdout; keep any existing local copy.
        print(f"Emissions file is empty or missing: {emissions_path}")
        return

    with open(local_path, "w", encoding="utf-8") as f:
        f.write(emissions)

In [None]:
# === Get extracted vital signs ===
def get_extracted_vitals(inference_id):
    """Download the inference outputs of one run and store them as JSON Lines.

    For the i-th entry (1-based) of ``datasets/ground_truth.json`` the remote
    file ``/data/home/asirui24/validation/results/<inference_id>/output_<i>.txt``
    is read via ``cat`` over SSH. Each non-empty output becomes one line of
    ``datasets/<inference_id>.jsonl`` holding the note identifiers, the raw
    output, the parsed JSON and the parse error (if any).

    Outputs that cannot be read, or that are empty or missing, are reported
    and skipped.

    Args:
        inference_id: Name of the results sub-directory on the GPU server;
            also used as the stem of the local ``.jsonl`` file.
    """
    remote_path = f"/data/home/asirui24/validation/results/{inference_id}"

    # Load the local input before connecting so a missing file cannot leak an SSH session.
    with open("datasets/ground_truth.json", 'r') as file:
        data = json.load(file)

    ssh = connect_with_gpu()
    try:
        with open(f"datasets/{inference_id}.jsonl", 'w') as f:

            for idx, row in enumerate(data):

                remote_output = f"{remote_path}/output_{idx+1}.txt"

                try:
                    _, stdout, _ = ssh.exec_command(f"cat {remote_output}")
                    raw_output = stdout.read().decode()
                except Exception as e:
                    print(f"Note_{idx+1} failed: {e}")
                    # Skip this note; otherwise the previous note's output
                    # (or an unbound variable) would be recorded for it.
                    continue

                if not raw_output.strip():
                    print(f"Note_{row['note_number']} - Output file is empty or missing")
                    continue

                parsed_json, parse_error = try_parse_json(raw_output)

                record = {
                    "note_number": row["note_number"],
                    "note_id": row["note_id"],
                    "extracted_data": raw_output,
                    "parsed_json": parsed_json,
                    "parse_error": parse_error
                }

                f.write(json.dumps(record) + '\n')
    finally:
        close_gpu_connection(ssh)

In [None]:
# === Get extracted vital signs for different batch sizes ===
def get_batch_size(inference_id):
    """Download the outputs and emissions of one run for batch sizes 1, 2, 4 and 8.

    Batch size 1 is read from ``/data/home/asirui24/validation/results``; the
    other batch sizes from ``/data/home/asirui24/validation/results_batch<N>``.
    For every batch size:

    * the per-note outputs are written to
      ``datasets/batch_size/<inference_id>_batch_<N>.jsonl``;
    * the rows of the remote ``emissions.csv`` whose ``project_name`` equals
      ``inference_id`` are added, with an extra ``batch_size`` column, to
      ``emissions_batch.csv``. The first non-empty batch (re)creates that
      file with a header; later batches append without one.

    Args:
        inference_id: Name of the results sub-directory on the GPU server and
            value matched against the ``project_name`` column.
    """
    import io  # local import: used only to parse the downloaded CSV text in memory

    # Load the local input before connecting so a missing file cannot leak an SSH session.
    with open("datasets/ground_truth.json", 'r') as file:
        data = json.load(file)

    emissions_out = "emissions_batch.csv"
    header_written = False

    local_dir = "datasets/batch_size"
    os.makedirs(local_dir, exist_ok=True)

    ssh = connect_with_gpu()
    try:
        for batch in [1, 2, 4, 8]:

            if batch == 1:
                remote_path = "/data/home/asirui24/validation/results"
            else:
                remote_path = f"/data/home/asirui24/validation/results_batch{batch}"

            _save_batch_outputs(
                ssh,
                data,
                f"{remote_path}/{inference_id}",
                f"{local_dir}/{inference_id}_batch_{batch}.jsonl",
            )

            # === Batch size emissions.csv ===
            try:
                _, stdout, _ = ssh.exec_command(f"cat {remote_path}/emissions.csv")
                emissions = stdout.read().decode()
            except Exception as e:
                print(f"Emissions transfering failed: {e}")
                continue

            try:
                # Parse straight from memory; no temp file to create and clean up.
                df = pd.read_csv(io.StringIO(emissions))
                filtered = df[df["project_name"] == inference_id].copy()
                filtered["batch_size"] = batch  # Add batch size column

                if filtered.empty:
                    # Nothing to add for this batch size; leave the output file alone.
                    continue

                filtered.to_csv(
                    emissions_out,
                    mode='a' if header_written else 'w',
                    index=False,
                    header=not header_written,
                )
                header_written = True
            except Exception as e:
                print(f"Error filtering emissions: {e}")
    finally:
        close_gpu_connection(ssh)


def _save_batch_outputs(ssh, data, remote_dir, local_path):
    """Read ``output_<i>.txt`` for every note from ``remote_dir`` and write one JSONL record per non-empty output."""
    with open(local_path, 'w') as f:

        for idx, row in enumerate(data):

            remote_output = f"{remote_dir}/output_{idx+1}.txt"

            try:
                _, stdout, _ = ssh.exec_command(f"cat {remote_output}")
                raw_output = stdout.read().decode()
            except Exception as e:
                print(f"Note_{idx+1} failed: {e}")
                # Skip this note; otherwise the previous note's output
                # (or an unbound variable) would be recorded for it.
                continue

            if not raw_output.strip():
                print(f"Note_{row['note_number']} - Output file is empty or missing")
                continue

            parsed_json, parse_error = try_parse_json(raw_output)

            record = {
                "note_number": row["note_number"],
                "note_id": row["note_id"],
                "extracted_data": raw_output,
                "parsed_json": parsed_json,
                "parse_error": parse_error
            }

            f.write(json.dumps(record) + '\n')

# Call functions

In [10]:
# transfer_notes()    

In [11]:
# get_emissions()

In [None]:
# prompts1 = ["llama3_10", "llama3_11", "llama3_13", "llama3_15", "llama3_20", "llama3_21", "llama3_23", "llama3_25"]
# prompts2 = ["meditron3_10", "meditron3_11", "meditron3_13", "meditron3_15", "meditron3_20", "meditron3_21", "meditron3_23", "meditron3_25"]
# prompts3 = ["deepseek-llm_10", "deepseek-llm_11", "deepseek-llm_13", "deepseek-llm_15", "deepseek-llm_20", "deepseek-llm_21", "deepseek-llm_23", "deepseek-llm_25"]
# prompts = prompts1 + prompts2 + prompts3

# for prompt in prompts:   
#     get_extracted_vitals(prompt)

In [None]:
# prompt = "llama3_13"
# get_batch_size(prompt)