<Input 벡터>

0. patient_number(int)
1. Bruise: streamlit을 이용한 manual input
=> bruise_vector = [head_count, head_length, arms_count, arms_length, legs_count, legs_length, torso_count, torso_length, buttocks_count, buttocks_length, specific_shape]*_count(int), _length(float), specific_shape(0 or 1)
2. Video/Audio: Video(.mp4) input
=> emotion_vector = [Happiness, Sadness, Anger, Surprise, Fear] *모두 float(0-1), 의식 없을 시 제외
3. History taking: streamlit을 이용한 manual input
=> response_vector = [consciousness_state, guardian_status, abuse_likely, match_explanation, developemental_stage, treatment_delayed, consistent_history, poor_condition, inappropriate_relationship] *0(아니오) or 1(예) or None(유보)
4. Basic information: csv input
=> info_vector = [age_months, sex, height_cm, weight_kg] *앞에서부터 각각 int, 0(남자), 1(여자), Float, Float
5. X-ray assessment: txt input(여러 부위의 .txt형식 판독문을 합쳐서 하나의 .txt 파일로 input)
=> X-ray_vector = [Skull, Rib, Humerus, Radius_Ulna, Femur, Tibia_Fibula, Spiral_fx, Metaphyseal_fx]
*앞 6개는 0 ~ 10(int), 11(N/A), 뒤 2개는 0(아니요) or 1(예) or 2(N/A)
6. Lab: csv input
=> lab_vector = [CBC_RBC, CBC_WBC, CBC_Platelet, Hb, PT_INR, aPTT, AST, ALT, ALP, Na, K, C', Calcium, Phosphorus, 25hydroxyvitaminD, Serum_albumin, Pre_albumin, Transferrin, Glucose] *모두 float

<Output 벡터> 

=> abuse_risk_score(int), abuse_cause = [원인1(str), 관여율1(int), 원인2(str), 관여율2(int), 원인3(str), 관여율3(int)]
*현재 임의의 데이터 x_train, y_train을 이용해 training한 XGBoost 사용

In [1]:
import xgboost as xgb
import pandas as pd
import numpy as np

# Define the input vectors
def preprocess_inputs(patient_data):
    # patient_data is assumed to be a dictionary with keys corresponding to the input vectors
    # Extract each input vector from the dictionary
    bruise_vector = patient_data['bruise_vector']
    emotion_vector = patient_data['emotion_vector']
    response_vector = patient_data['response_vector']
    info_vector = patient_data['info_vector']
    xray_vector = patient_data['xray_vector']
    lab_vector = patient_data['lab_vector']    
    # Concatenate all vectors into a single input vector
    input_vector = np.concatenate([
        bruise_vector,
        emotion_vector,
        response_vector,
        info_vector,
        xray_vector,
        lab_vector
    ])
    return input_vector

# Simulate patient data for demonstration
patient_data = {
    'bruise_vector': [2, 3.0, 1, 4.5, 0, 0, 1, 2.5, 0, 0.0, 1],
    'emotion_vector': [0.5, 0.3, 0.1, 0.0, 0.2],
    'response_vector': [1, 1, 1, 0, 1, 1, 0],
    'info_vector': [24, 0, 80.0, 10.5],
    'xray_vector': [2, 3, 1, 0, 1, 1, 0, 1],
    'lab_vector': [4.5, 10.0, 150.0, 12.0, 1.1, 34.0, 24.0, 150.0, 35.0, 140.0, 3.5, 0.9, 2.0, 3.2, 45.0, 4.0, 6.5, 200.0, 90.0]
}

input_vector = preprocess_inputs(patient_data)

# Load a dataset and train the XGBoost model (for demonstration, dummy data is used here)
# Normally, you would have a labeled dataset for training
X_train = np.random.rand(100, len(input_vector))  # Dummy features
y_train = np.random.randint(0, 10, 100)  # Dummy abuse risk scores (target)

# Train an XGBoost model
dtrain = xgb.DMatrix(X_train, label=y_train)
params = {
    'objective': 'reg:squarederror',
    'max_depth': 5,
    'eta': 0.1,
    'verbosity': 0
}
model = xgb.train(params, dtrain, num_boost_round=50)

# Assuming model has been trained and predictions have been made as before
# Normalize abuse risk score to 0-100% scale (if it's a regression output)
abuse_risk_score_raw = model.predict(dtest)[0]
abuse_risk_score = min(max(int(abuse_risk_score_raw * 10), 0), 100)  # Adjust scale and clip to [0, 100]

# Extract the top 3 causes from the feature importances
importance = model.get_score(importance_type='weight')

# Normalize the importance values to sum to 1 for involvement rate calculation
total_importance = sum(importance.values())
normalized_importance = {k: v / total_importance for k, v in importance.items()}

# Sort and get the top 3 important features
sorted_importance = sorted(normalized_importance.items(), key=lambda x: x[1], reverse=True)
top_3_causes = sorted_importance[:3]

# Map feature indices to actual input vector names for interpretation
feature_names = ['head_count', 'head_length', 'arms_count', 'arms_length', 'legs_count', 'legs_length', 
                 'torso_count', 'torso_length', 'buttocks_count', 'buttocks_length', 'specific_shape',
                 'Happiness', 'Sadness', 'Anger', 'Surprise', 'Fear',
                 'abuse_likely', 'match_explanation', 'developemental_stage', 'treatment_delayed', 'consistent_history', 'poor_condition', 'inappropriate_relationship',
                 'age_months', 'sex', 'height_cm', 'weight_kg',
                 'Skull', 'Rib', 'Humerus', 'Radius_Ulna', 'Femur', 'Tibia_Fibula', 'Spiral_fx', 'Metaphyseal_fx',
                 'CBC_RBC', 'CBC_WBC', 'CBC_Platelet', 'Hb', 'PT_INR', 'aPTT', 'AST', 'ALT', 'ALP', 'Na', 'K', 'C', 
                 'Calcium', 'Phosphorus', '25hydroxyvitaminD', 'Serum_albumin', 'Pre_albumin', 'Transferrin', 'Glucose']

# Create abuse_cause vector with top 3 features and normalize involvement rates
abuse_cause = []
for feature, importance in top_3_causes:
    # Extract the index from feature (e.g., 'f0' -> 0)
    feature_index = int(feature[1:])  # Remove 'f' and convert to int
    feature_name = feature_names[feature_index]  # Map to human-readable feature name
    involvement_rate = round(importance, 3)  # Normalize to 0-1 range and round for better readability
    abuse_cause.append((feature_name, involvement_rate))

# Output the normalized abuse risk score and top 3 causes
print("Abuse Risk Score:", abuse_risk_score, "%")
for i, cause in enumerate(abuse_cause, 1):
    print(f"Cause {i}: {cause[0]}, Involvement Rate: {cause[1]}")

NameError: name 'dtest' is not defined