<a href="https://colab.research.google.com/github/alaaatefbediwi-ds/Data-Analysis-Projects/blob/main/Copy_of_modules.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Creating folder structure
import os

base_path = "/content/mediscan_ai"
folders = [
    f"{base_path}/data",
    f"{base_path}/models",
    f"{base_path}/app"
]

for folder in folders:
    os.makedirs(folder, exist_ok=True)

print("✅ Folder structure created.")

✅ Folder structure created.


### **preprocessing.py**

In [3]:
# 📄 Step 2: Create preprocessing.py
preprocessing_code = '''
# preprocessing.py

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

# Feature lists
numeric_features = [
    'Age', 'Height', 'Weight', 'Body Mass Index (BMI)', 'Total Body Water (TBW)',
    'Extracellular Water (ECW)', 'Intracellular Water (ICW)',
    'Extracellular Fluid/Total Body Water (ECF/TBW)',
    'Total Body Fat Ratio (TBFR) (%)', 'Lean Mass (LM) (%)',
    'Body Protein Content (Protein) (%)', 'Visceral Fat Rating (VFR)',
    'Bone Mass (BM)', 'Muscle Mass (MM)', 'Obesity (%)',
    'Total Fat Content (TFC)', 'Visceral Fat Area (VFA)',
    'Visceral Muscle Area (VMA) (Kg)', 'Hepatic Fat Accumulation (HFA)',
    'Glucose', 'Total Cholesterol (TC)', 'Low Density Lipoprotein (LDL)',
    'High Density Lipoprotein (HDL)', 'Triglyceride',
    'Aspartat Aminotransferaz (AST)', 'Alanin Aminotransferaz (ALT)',
    'Alkaline Phosphatase (ALP)', 'Creatinine', 'Glomerular Filtration Rate (GFR)',
    'C-Reactive Protein (CRP)', 'Hemoglobin (HGB)', 'Vitamin D'
]

# Split categorical features based on fill strategy
fill_zero_features = ['Comorbidity', 'Diabetes Mellitus (DM)']
fill_most_frequent_features = ['Gender']
categorical_features = fill_zero_features + fill_most_frequent_features

# Pipelines
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler())
])

# Gender or other categorical features filled with most frequent and then one-hot encoded
cat_most_freq_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown='ignore')) #, sparse=False))
])

# Comorbidity & DM: filled with zero and then one-hot encoded
cat_zero_fill_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="constant", fill_value=0)),
    ("encoder", OneHotEncoder(handle_unknown='ignore')) #, sparse=False))
])

# Combined preprocessor
preprocessor = ColumnTransformer(transformers=[
    ("num", numeric_transformer, numeric_features),
    ("cat_mostfreq", cat_most_freq_transformer, fill_most_frequent_features),
    ("cat_zerofill", cat_zero_fill_transformer, fill_zero_features)
])

#categorical_features = fill_zero_features + fill_most_frequent_features
'''

with open("/content/mediscan_ai/app/preprocessing.py", "w") as f:
    f.write(preprocessing_code)

print("✅ preprocessing.py created.")

✅ preprocessing.py created.


### **train_model.py**

In [4]:
# 📄 Step 2: Create train_model.py
train_model_code = '''
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from catboost import CatBoostClassifier
import joblib
import os

# Load the dataset
data = pd.read_excel('/content/Gallstone.xlsx')

# Features and target
X = data.drop(columns=['Gallstone Status', 'Coronary Artery Disease (CAD)', 'Hypothyroidism', 'Hyperlipidemia'])
y = data['Gallstone Status']

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Preprocessing
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train model
cat_model = CatBoostClassifier(verbose=0, random_state=42)
cat_model.fit(X_train_scaled, y_train)

# Save model and scaler
os.makedirs('/content/mediscan_ai/models', exist_ok=True)
joblib.dump(cat_model, '/content/mediscan_ai/models/gallstone_model.pkl')
joblib.dump(scaler, '/content/mediscan_ai/models/scaler.pkl')

print("✅ Model and scaler saved successfully.")
'''

with open("/content/mediscan_ai/app/train_model.py", "w") as f:
    f.write(train_model_code)

print("✅ train_model.py created.")

✅ train_model.py created.


### **train_model.py 2**

In [5]:
# 📄 Step 3: Create train_model.py
train_model_code = '''
# train_model.py

import pandas as pd
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier
from sklearn.pipeline import Pipeline
import joblib
import os

from preprocessing import preprocessor, numeric_features, categorical_features

# Load data
data = pd.read_excel('/content/Gallstone.xlsx')

# ✅ Reverse the label so that:
# 1 → Has Gallstone, 0 → No Gallstone (more intuitive)
#data['Gallstone Status'] = data['Gallstone Status'].map({0: 1, 1: 0})

# Features and target
X = data[numeric_features + categorical_features]
y = data['Gallstone Status']

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Full pipeline with model
full_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", CatBoostClassifier(verbose=0, random_state=42))
])

# Train
full_pipeline.fit(X_train, y_train)

# Save the pipeline
os.makedirs('/content/mediscan_ai/models', exist_ok=True)
joblib.dump(full_pipeline, '/content/mediscan_ai/models/gallstone_model.pkl')

print("✅ Model pipeline saved successfully.")
'''

with open("/content/mediscan_ai/app/train_model.py", "w") as f:
    f.write(train_model_code)

print("✅ train_model.py created.")

✅ train_model.py created.


In [26]:
!pip install catboost



### **predictor.py**

In [None]:
# 📄 Step 3: Create predictor.py
predictor_code = '''
import joblib
import numpy as np

# Load model and scaler
model = joblib.load('/content/mediscan_ai/models/gallstone_model.pkl')
scaler = joblib.load('/content/mediscan_ai/models/scaler.pkl')

def predict_gallstone(lab_results: dict):
    # Convert lab_results dictionary to a 2D array
    features = np.array([list(lab_results.values())])
    scaled_features = scaler.transform(features)

    prediction = model.predict(scaled_features)[0]
    probability = model.predict_proba(scaled_features)[0][1]

    return ("Gallstone Detected" if prediction else "No Gallstone"), probability
'''

with open("/content/mediscan_ai/app/predictor.py", "w") as f:
    f.write(predictor_code)

print("✅ predictor.py created.")

✅ predictor.py created.


### **predictor.py 2**

In [24]:
# 📄 Step 4: Create predictor.py
predictor_code = '''
# predictor.py

import joblib
import pandas as pd

# Load the full pipeline (preprocessing + model)
model = joblib.load('/content/mediscan_ai/models/gallstone_model.pkl')

def predict_gallstone(lab_results: dict):
    """
    Predict gallstone status based on input lab results.

    Parameters:
        lab_results (dict): Dictionary of feature_name: value

    Returns:
        prediction_label (str), probability (float)
    """
    # Convert input dict to DataFrame
    input_df = pd.DataFrame([lab_results])  # Single row

    # Predict using the full pipeline
    #prediction = model.predict(input_df)[0]
    #probability = model.predict_proba(input_df)[0][0]  # Probability of "gallstone" class

    #label = "Gallstone Detected" if prediction == 0 else "No Gallstone Detected"
    prediction = model.predict(input_df)[0]
    probas = model.predict_proba(input_df)[0]

    if prediction == 0:
        label = "Gallstone Detected"
        probability = probas[0]  # Probability of class 0 (gallstone)
    else:
        label = "No Gallstone Detected"
        probability = probas[1]  # Probability of class 1 (no gallstone)

    return label, round(probability * 100, 2)  # Return percentage
    #return label, probability
'''

with open("/content/mediscan_ai/app/predictor.py", "w") as f:
    f.write(predictor_code)

print("predictor.py created.")

predictor.py created.


In [25]:
# Step 5: Run Training Script
%run /content/mediscan_ai/app/train_model.py

✅ Model pipeline saved successfully.


In [26]:
import sys
sys.path.append("/content/mediscan_ai")  # Adjust to your actual path

In [30]:
# Test Example_1 (Prediction)
from app.predictor import predict_gallstone

sample_input = {
    'Age': 38,
    'Height': 171,
    'Weight': 68.6,
    'Body Mass Index (BMI)': 23.5,
    'Total Body Water (TBW)': 39.5,
    'Extracellular Water (ECW)': 16.6,
    'Intracellular Water (ICW)': 22.9,
    'Extracellular Fluid/Total Body Water (ECF/TBW)': 42,
    'Total Body Fat Ratio (TBFR) (%)': 19.2,
    'Lean Mass (LM) (%)': 80.76,
    'Body Protein Content (Protein) (%)': 17.28,
    'Visceral Fat Rating (VFR)': 6,
    'Bone Mass (BM)': 2.8,
    'Muscle Mass (MM)': 52.6,
    'Obesity (%)': 6.7,
    'Total Fat Content (TFC)': 13.2,
    'Visceral Fat Area (VFA)': 8.2,
    'Visceral Muscle Area (VMA) (Kg)': 28.8,
    'Hepatic Fat Accumulation (HFA)': 0,
    'Glucose': 93,
    'Total Cholesterol (TC)': 239,
    'Low Density Lipoprotein (LDL)': 169,
    'High Density Lipoprotein (HDL)': 43,
    'Triglyceride': 129,
    'Aspartat Aminotransferaz (AST)': 19,
    'Alanin Aminotransferaz (ALT)': 34,
    'Alkaline Phosphatase (ALP)': 75,
    'Creatinine': 0.91,
    'Glomerular Filtration Rate (GFR)': 110.63,
    'C-Reactive Protein (CRP)': 0.0,
    'Hemoglobin (HGB)': 16.6,
    'Vitamin D': 15.6,
    'Gender': '0',
    'Comorbidity': 0,
    'Diabetes Mellitus (DM)': 0
}

label, prob = predict_gallstone(sample_input)
print(f"{label} with probability {prob:.4f}")

Gallstone Detected with probability 0.7677


In [31]:
# Test Example_2 (Prediction)
from app.predictor import predict_gallstone

sample_input = {
    'Gallstone Status': 1.00,
    'Age': 31.00,
    'Gender': 1.00,
    'Comorbidity': 0.00,
    'Coronary Artery Disease (CAD)': 0.00,
    'Hypothyroidism': 0.00,
    'Hyperlipidemia': 0.00,
    'Diabetes Mellitus (DM)': 0.00,
    'Height': 168.00,
    'Weight': 93.60,
    'Body Mass Index (BMI)': 33.20,
    'Total Body Water (TBW)': 37.60,
    'Extracellular Water (ECW)': 14.00,
    'Intracellular Water (ICW)': 24.00,
    'Extracellular Fluid/Total Body Water (ECF/TBW)': 37.23,
    'Total Body Fat Ratio (TBFR) (%)': 38.03,
    'Lean Mass (LM) (%)': 61.97,
    'Body Protein Content (Protein) (%)': 17.28,
    'Visceral Fat Rating (VFR)': 7.00,
    'Bone Mass (BM)': 2.40,
    'Muscle Mass (MM)': 55.10,
    'Obesity (%)': 47.38,
    'Total Fat Content (TFC)': 35.60,
    'Visceral Fat Area (VFA)': 17.20,
    'Visceral Muscle Area (VMA) (Kg)': 32.30,
    'Hepatic Fat Accumulation (HFA)': 2.00,
    'Glucose': 96.00,
    'Total Cholesterol (TC)': 173.00,
    'Low Density Lipoprotein (LDL)': 96.00,
    'High Density Lipoprotein (HDL)': 45.00,
    'Triglyceride': 192.00,
    'Aspartat Aminotransferaz (AST)': 9.00,
    'Alanin Aminotransferaz (ALT)': 12.00,
    'Alkaline Phosphatase (ALP)': 55.00,
    'Creatinine': 0.64,
    'Glomerular Filtration Rate (GFR)': 121.09,
    'C-Reactive Protein (CRP)': 3.30,
    'Hemoglobin (HGB)': 14.10,
    'Vitamin D': 4.90
}

label, prob = predict_gallstone(sample_input)
print(f"{label} with probability {prob:.4f}")

No Gallstone Detected with probability 0.0227


In [None]:
!pip install -q catboost openpyxl

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# Step 5: Run Training Script
#%run /content/mediscan_ai/app/train_model.py

✅ Model and scaler saved successfully.


### **pdf_extractor.py**

In [None]:
# 📄 Step: Create pdf_extractor.py
pdf_extractor_code = '''
import re
import pytesseract
from pdf2image import convert_from_path
from PIL import Image

# === Step 1: Convert PDF to images ===
def pdf_to_text_via_ocr(pdf_path, dpi=300):
    images = convert_from_path(pdf_path, dpi=dpi)
    full_text = ""
    for img in images:
        text = pytesseract.image_to_string(img)
        text = re.sub(r"[^\x00-\x7F]+", " ", text)  # remove non-ASCII
        full_text += text + "\\n"
    return full_text

# === Step 2: Regex patterns for lab values ===
lab_patterns = {
    "Hospital Name": [r"Hospital\\s*[:\\-]?\\s*(.+)"],
    "Lab Name": [r"Lab\\s*Name\\s*[:\\-]?\\s*(.+)"],
    "Patient Name": [r"Name\\s*[:\\-]?\\s*([A-Z][a-z]+\\s[A-Z][a-z]+)"],
    "Gender": [r"Gender\\s*[:\\-]?\\s*(Male|Female)"],
    "Age": [r"Age\\s*[:\\-]?\\s*(\\d{1,3})"],
    "Lab Date": [r"Lab\\s*Date\\s*[:\\-]?\\s*([A-Za-z]+\\s\\d{1,2},\\s\\d{4})"],
    "Comorbidity": [r"Comorbidity\\s*[:\\-]?\\s*(Yes|No)"],
    "Diabetes Mellitus (DM)": [r"Diabetes Mellitus.*[:\\-]?\\s*(Yes|No)"],
    "Body Mass Index (BMI)": [r"BMI.*?[:\\-]?\\s*([\\d.]+)"],
    "Height (cm)": [r"Height.*?[:\\-]?\\s*([\\d.]+)"],
    "Weight (kg)": [r"Weight.*?[:\\-]?\\s*([\\d.]+)"],
    "Total Body Water (TBW)": [r"Total Body Water.*?[:\\-]?\\s*([\\d.]+)"],
    "Extracellular Water (ECW)": [r"Extracellular Water.*?[:\\-]?\\s*([\\d.]+)"],
    "Intracellular Water (ICW)": [r"Intracellular Water.*?[:\\-]?\\s*([\\d.]+)"],
    "Extracellular Fluid/Total Body Water (ECF/TBW)": [r"ECF.*?/.*?TBW.*?[:\\-]?\\s*([\\d.]+)"],
    "Total Body Fat Ratio (TBFR) (%)": [r"Total Body Fat Ratio.*?[:\\-]?\\s*([\\d.]+)%?"],
    "Lean Mass (%)": [r"Lean Mass.*?[:\\-]?\\s*([\\d.]+)%?"],
    "Body Protein Content (%)": [r"Protein.*?Content.*?[:\\-]?\\s*([\\d.]+)%?"],
    "Visceral Fat Rating (VFR)": [r"Visceral Fat Rating.*?[:\\-]?\\s*([\\d.]+)"],
    "Bone Mass (BM)": [r"Bone Mass.*?[:\\-]?\\s*([\\d.]+)"],
    "Muscle Mass (MM)": [r"Muscle Mass.*?[:\\-]?\\s*([\\d.]+)"],
    "Obesity (%)": [r"Obesity.*?[:\\-]?\\s*([\\d.]+)%?"],
    "Total Fat Content (TFC)": [r"Total Fat Content.*?[:\\-]?\\s*([\\d.]+)"],
    "Visceral Fat Area (VFA)": [r"Visceral Fat Area.*?[:\\-]?\\s*([\\d.]+)"],
    "Visceral Muscle Area (VMA)": [r"Visceral Muscle Area.*?[:\\-]?\\s*([\\d.]+)"],
    "Hepatic Fat Accumulation (HFA)": [r"Hepatic Fat Accumulation.*?[:\\-]?\\s*([\\d.]+)"],
    "Glucose": [r"Glucose.*?[:\\-]?\\s*([\\d.]+)"],
    "Total Cholesterol (TC)": [r"Total Cholesterol.*?[:\\-]?\\s*([\\d.]+)"],
    "Low Density Lipoprotein (LDL)": [r"LDL.*?[:\\-]?\\s*([\\d.]+)"],
    "High Density Lipoprotein (HDL)": [r"HDL.*?[:\\-]?\\s*([\\d.]+)"],
    "Triglyceride": [r"Triglyceride.*?[:\\-]?\\s*([\\d.]+)"],
    "Aspartat Aminotransferaz (AST)": [r"AST.*?[:\\-]?\\s*([\\d.]+)"],
    "Alanin Aminotransferaz (ALT)": [r"ALT.*?[:\\-]?\\s*([\\d.]+)"],
    "Alkaline Phosphatase (ALP)": [r"ALP.*?[:\\-]?\\s*([\\d.]+)"],
    "Creatinine": [r"Creatinine.*?[:\\-]?\\s*([\\d.]+)"],
    "Glomerular Filtration Rate (GFR)": [r"GFR.*?[:\\-]?\\s*([\\d.]+)"],
    "C-Reactive Protein (CRP)": [r"CRP.*?[:\\-]?\\s*([\\d.]+)"],
    "Hemoglobin (HGB)": [r"Hemoglobin.*?[:\\-]?\\s*([\\d.]+)"],
    "Vitamin D": [r"Vitamin\\s*D.*?[:\\-]?\\s*([\\d.]+)"]
}

# === Step 3: Extract values using regex
def extract_lab_values(text, patterns):
    extracted = {}
    for label, regex_list in patterns.items():
        for pattern in regex_list:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                val = match.group(1).strip()
                if val and val.lower() != label.lower():
                    extracted[label] = val
                    break
        if label not in extracted:
            extracted[label] = "Not found"
    return extracted

# === Step 4: Main pipeline
def extract_from_scanned_lab_pdf(pdf_path):
    text = pdf_to_text_via_ocr(pdf_path)
    extracted_data = extract_lab_values(text, lab_patterns)
    return extracted_data
'''

# 📝 Write code to file
with open("/content/mediscan_ai/app/pdf_extractor.py", "w") as f:
    f.write(pdf_extractor_code)

print("pdf_extractor.py created successfully.")

pdf_extractor.py created successfully.


### Test Example

In [None]:
!rm /content/mediscan_ai/app/pdf_extractor.py

In [None]:
%%writefile /content/mediscan_ai/app/pdf_extractor.py
import re
import pytesseract
from pdf2image import convert_from_path
from PIL import Image

def pdf_to_text_via_ocr(pdf_path, dpi=300):
    images = convert_from_path(pdf_path, dpi=dpi)
    full_text = ""
    for img in images:
        text = pytesseract.image_to_string(img)
        text = re.sub(r"[^\x00-\x7F]+", " ", text)
        full_text += text + "\n"
    return full_text

lab_patterns = {
    "Hospital Name": [r"Hospital\\s*[:\\-]?\\s*(.+)"],
    "Lab Name": [r"Lab\\s*Name\\s*[:\\-]?\\s*(.+)"],
    "Patient Name": [r"Name\\s*[:\\-]?\\s*([A-Z][a-z]+\\s[A-Z][a-z]+)"],
    "Gender": [r"Gender\\s*[:\\-]?\\s*(Male|Female)"],
    "Age": [r"Age\\s*[:\\-]?\\s*(\\d{1,3})"],
    "Lab Date": [r"Lab\\s*Date\\s*[:\\-]?\\s*([A-Za-z]+\\s\\d{1,2},\\s\\d{4})"],
    "Comorbidity": [r"Comorbidity\\s*[:\\-]?\\s*(Yes|No)"],
    "Diabetes Mellitus (DM)": [r"Diabetes Mellitus.*[:\\-]?\\s*(Yes|No)"],
    "Body Mass Index (BMI)": [r"BMI.*?[:\\-]?\\s*([\\d.]+)"],
    "Height (cm)": [r"Height.*?[:\\-]?\\s*([\\d.]+)"],
    "Weight (kg)": [r"Weight.*?[:\\-]?\\s*([\\d.]+)"],
    "Total Body Water (TBW)": [r"Total Body Water.*?[:\\-]?\\s*([\\d.]+)"],
    "Extracellular Water (ECW)": [r"Extracellular Water.*?[:\\-]?\\s*([\\d.]+)"],
    "Intracellular Water (ICW)": [r"Intracellular Water.*?[:\\-]?\\s*([\\d.]+)"],
    "Extracellular Fluid/Total Body Water (ECF/TBW)": [r"ECF.*?/.*?TBW.*?[:\\-]?\\s*([\\d.]+)"],
    "Total Body Fat Ratio (TBFR) (%)": [r"Total Body Fat Ratio.*?[:\\-]?\\s*([\\d.]+)%?"],
    "Lean Mass (%)": [r"Lean Mass.*?[:\\-]?\\s*([\\d.]+)%?"],
    "Body Protein Content (%)": [r"Protein.*?Content.*?[:\\-]?\\s*([\\d.]+)%?"],
    "Visceral Fat Rating (VFR)": [r"Visceral Fat Rating.*?[:\\-]?\\s*([\\d.]+)"],
    "Bone Mass (BM)": [r"Bone Mass.*?[:\\-]?\\s*([\\d.]+)"],
    "Muscle Mass (MM)": [r"Muscle Mass.*?[:\\-]?\\s*([\\d.]+)"],
    "Obesity (%)": [r"Obesity.*?[:\\-]?\\s*([\\d.]+)%?"],
    "Total Fat Content (TFC)": [r"Total Fat Content.*?[:\\-]?\\s*([\\d.]+)"],
    "Visceral Fat Area (VFA)": [r"Visceral Fat Area.*?[:\\-]?\\s*([\\d.]+)"],
    "Visceral Muscle Area (VMA)": [r"Visceral Muscle Area.*?[:\\-]?\\s*([\\d.]+)"],
    "Hepatic Fat Accumulation (HFA)": [r"Hepatic Fat Accumulation.*?[:\\-]?\\s*([\\d.]+)"],
    "Glucose": [r"Glucose.*?[:\\-]?\\s*([\\d.]+)"],
    "Total Cholesterol (TC)": [r"Total Cholesterol.*?[:\\-]?\\s*([\\d.]+)"],
    "Low Density Lipoprotein (LDL)": [r"LDL.*?[:\\-]?\\s*([\\d.]+)"],
    "High Density Lipoprotein (HDL)": [r"HDL.*?[:\\-]?\\s*([\\d.]+)"],
    "Triglyceride": [r"Triglyceride.*?[:\\-]?\\s*([\\d.]+)"],
    "Aspartat Aminotransferaz (AST)": [r"AST.*?[:\\-]?\\s*([\\d.]+)"],
    "Alanin Aminotransferaz (ALT)": [r"ALT.*?[:\\-]?\\s*([\\d.]+)"],
    "Alkaline Phosphatase (ALP)": [r"ALP.*?[:\\-]?\\s*([\\d.]+)"],
    "Creatinine": [r"Creatinine.*?[:\\-]?\\s*([\\d.]+)"],
    "Glomerular Filtration Rate (GFR)": [r"GFR.*?[:\\-]?\\s*([\\d.]+)"],
    "C-Reactive Protein (CRP)": [r"CRP.*?[:\\-]?\\s*([\\d.]+)"],
    "Hemoglobin (HGB)": [r"Hemoglobin.*?[:\\-]?\\s*([\\d.]+)"],
    "Vitamin D": [r"Vitamin\\s*D.*?[:\\-]?\\s*([\\d.]+)"]
}

def extract_lab_values(text, patterns):
    extracted = {}
    for label, regex_list in patterns.items():
        for pattern in regex_list:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                val = match.group(1).strip()
                if val and val.lower() != label.lower():
                    extracted[label] = val
                    break
        if label not in extracted:
            extracted[label] = "Not found"
    return extracted

def extract_from_scanned_lab_pdf(pdf_path):
    text = pdf_to_text_via_ocr(pdf_path)
    return extract_lab_values(text, lab_patterns)

Writing /content/mediscan_ai/app/pdf_extractor.py


In [None]:
!apt-get install poppler-utils -y
!apt-get install tesseract-ocr -y
!pip install pytesseract pdf2image pillow

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following NEW packages will be installed:
  poppler-utils
0 upgraded, 1 newly installed, 0 to remove and 35 not upgraded.
Need to get 186 kB of archives.
After this operation, 697 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 poppler-utils amd64 22.02.0-2ubuntu0.8 [186 kB]
Fetched 186 kB in 1s (312 kB/s)
Selecting previously unselected package poppler-utils.
(Reading database ... 126111 files and directories currently installed.)
Preparing to unpack .../poppler-utils_22.02.0-2ubuntu0.8_amd64.deb ...
Unpacking poppler-utils (22.02.0-2ubuntu0.8) ...
Setting up poppler-utils (22.02.0-2ubuntu0.8) ...
Processing triggers for man-db (2.10.2-1) ...
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
0 upgraded, 0 newly installed

In [None]:
from google.colab import files
uploaded = files.upload()

pdf_path = next(iter(uploaded))  # Get the uploaded file name

Saving Lab Test_1.pdf to Lab Test_1.pdf


In [None]:
from pdf_extractor import pdf_to_text_via_ocr

# Run OCR only and print the raw text
ocr_text = pdf_to_text_via_ocr(pdf_path)
print(ocr_text)

Hospital: Alexandria General Hospital
Name: Mohamed Ali

Age: 38

Comorbidity: No

Diabetes Mellitus (DM): No
Body Mass Index (BMI): 23.5
Height: 171

Weight: 68.6

Total Body Water (TBW): 39.5
Extracellular Water (ECW): 16.6

Intracellular Water (ICW): 22.9

Lab Test Results
Lab Name: Gallstone Diagnosis
Gender: Male

Lab Date: June 09, 2025

Extracellular Fluid/Total Body Water (ECF/TBW): 42

Total Body Fat Ratio (TBFR): 19.2%
Lean Mass (LM): 80.76%

Body Protein Content: 17.28%
Visceral Fat Rating (VFR): 6

Bone Mass: 2.8 kg

Muscle Mass (MM): 52.6 kg
Obesity: 6.7%

Total Fat Content (TFC): 13.2 kg
Visceral Fat Area (VFA): 8.2 cm?
Visceral Muscle Area (VMA): 28.8 kg
Hepatic Fat Accumulation (HFA): 0
Glucose: 93 mg/dL

Total Cholesterol (TC): 239 mg/dL
Low Density Lipoprotein (LDL): 169 mg/dL
High Density Lipoprotein (HDL): 43 mg/dL
Triglyceride: 129 mg/dL

AST: 19 U/L

ALT: 34 U/L

ALP: 75 U/L

Creatinine: 0.91 mg/dL

GFR: 110.63 mL/min

C-Reactive Protein (CRP): 0.0 mg/L
Hemoglobin

In [None]:
ocr_text = pdf_to_text_via_ocr(pdf_path, dpi=400)
print(ocr_text)

Hospital: Alexandria General Hospital
Name: Mohamed Ali

Age: 38

Comorbidity: No

Diabetes Mellitus (DM): No
Body Mass Index (BMI): 23.5
Height: 171

Weight: 68.6

Total Body Water (TBW): 39.5
Extracellular Water (ECW): 16.6

Intracellular Water (ICW): 22.9

Lab Test Results
Lab Name: Gallstone Diagnosis
Gender: Male

Lab Date: June 09, 2025

Extracellular Fluid/Total Body Water (ECF/TBW): 42

Total Body Fat Ratio (TBFR): 19.2%
Lean Mass (LM): 80.76%

Body Protein Content: 17.28%
Visceral Fat Rating (VFR): 6

Bone Mass: 2.8 kg

Muscle Mass (MM): 52.6 kg
Obesity: 6.7%

Total Fat Content (TFC): 13.2 kg
Visceral Fat Area (VFA): 8.2 cm?
Visceral Muscle Area (VMA): 28.8 kg
Hepatic Fat Accumulation (HFA): 0
Glucose: 93 mg/dL

Total Cholesterol (TC): 239 mg/dL
Low Density Lipoprotein (LDL): 169 mg/dL
High Density Lipoprotein (HDL): 43 mg/dL
Triglyceride: 129 mg/dL

AST: 19 U/L

ALT: 34 U/L

ALP: 75 U/L

Creatinine: 0.91 mg/dL

GFR: 110.63 mL/min

C-Reactive Protein (CRP): 0.0 mg/L
Hemoglobin

In [None]:
### Adjusting Regular Expressions
def extract_from_scanned_lab_pdf(pdf_path: str):
    text = pdf_to_text_via_ocr(pdf_path)
    data = {}

    patterns = {
    "Hospital Name": r"Hospital[:\-]?\s*(.+)",
    "Lab Name": r"Lab Name[:\-]?\s*(.+)",
    "Patient Name": r"Name[:\-]?\s*(.+)",
    "Gender": r"Gender[:\-]?\s*(Male|Female)",
    "Age": r"Age[:\-]?\s*(\d+)",
    "Lab Date": r"Lab Date[:\-]?\s*(.+)",
    "Comorbidity": r"Comorbidity[:\-]?\s*(Yes|No)",
    "Diabetes Mellitus \(DM\)": r"Diabetes Mellitus \(DM\)[:\-]?\s*(Yes|No)",
    "Body Mass Index \(BMI\)": r"BMI[:\-]?\s*([\d.]+)",
    "Height \(cm\)": r"Height[:\-]?\s*([\d.]+)",
    "Weight \(kg\)": r"Weight[:\-]?\s*([\d.]+)",
    "Total Body Water \(TBW\)": r"Total Body Water \(TBW\)[:\-]?\s*([\d.]+)",
    "Extracellular Water \(ECW\)": r"ECW[:\-]?\s*([\d.]+)",
    "Intracellular Water \(ICW\)": r"ICW[:\-]?\s*([\d.]+)",
    "ECF/TBW": r"ECF/TBW[:\-]?\s*([\d.]+)",
    "Total Body Fat Ratio \(TBFR\)": r"TBFR[:\-]?\s*([\d.]+)%",
    "Lean Mass \(%\)": r"Lean Mass \(LM\)[:\-]?\s*([\d.]+)%",
    "Body Protein Content \(%\)": r"Body Protein Content[:\-]?\s*([\d.]+)%",
    "Visceral Fat Rating \(VFR\)": r"VFR[:\-]?\s*([\d.]+)",
    "Bone Mass \(BM\)": r"Bone Mass[:\-]?\s*([\d.]+)",
    "Muscle Mass \(MM\)": r"Muscle Mass \(MM\)[:\-]?\s*([\d.]+)",
    "Obesity \(%\)": r"Obesity[:\-]?\s*([\d.]+)%",
    "Total Fat Content \(TFC\)": r"TFC[:\-]?\s*([\d.]+)",
    "Visceral Fat Area \(VFA\)": r"VFA[:\-]?\s*([\d.]+)",
    "Visceral Muscle Area \(VMA\)": r"VMA[:\-]?\s*([\d.]+)",
    "Hepatic Fat Accumulation \(HFA\)": r"HFA[:\-]?\s*([\d.]+)",
    "Glucose": r"Glucose[:\-]?\s*([\d.]+)",
    "Total Cholesterol \(TC\)": r"TC[:\-]?\s*([\d.]+)",
    "Low Density Lipoprotein \(LDL\)": r"LDL[:\-]?\s*([\d.]+)",
    "High Density Lipoprotein \(HDL\)": r"HDL[:\-]?\s*([\d.]+)",
    "Triglyceride": r"Triglyceride[:\-]?\s*([\d.]+)",
    "AST": r"AST[:\-]?\s*([\d.]+)",
    "ALT": r"ALT[:\-]?\s*([\d.]+)",
    "ALP": r"ALP[:\-]?\s*([\d.]+)",
    "Creatinine": r"Creatinine[:\-]?\s*([\d.]+)",
    "GFR": r"GFR[:\-]?\s*([\d.]+)",
    "C-Reactive Protein \(CRP\)": r"CRP[:\-]?\s*([\d.]+)",
    "Hemoglobin \(HGB\)": r"HGB[:\-]?\s*([\d.]+)",
    "Vitamin D": r"Vitamin D[:\-]?\s*([\d.]+)",
}
    for key, pattern in patterns.items():
        match = re.search(pattern, text, re.IGNORECASE)
        data[key] = match.group(1).strip() if match else "Not found"

    return data

In [None]:
%%writefile /content/mediscan_ai/app/pdf_extractor.py
# pdf_extractor.py

import pytesseract
from pdf2image import convert_from_path
import re

def pdf_to_text_via_ocr(pdf_path: str) -> str:
    images = convert_from_path(pdf_path)
    text = ""
    for img in images:
        text += pytesseract.image_to_string(img)
    return text

# continue in pdf_extractor.py

def extract_from_scanned_lab_pdf(pdf_path: str):
    text = pdf_to_text_via_ocr(pdf_path)
    data = {}

    patterns = {
    "Hospital Name": r"Hospital[\s:\-]*\n*\s*(.+)",
    "Lab Name": r"Lab\s*Name[\s:\-]*\n*\s*(.+)",
    "Patient Name": r"(?:Patient\s*Name|Name)[\s:\-]*\n*\s*(.+)",
    "Gender": r"Gender[\s:\-]*\n*\s*(Male|Female)",
    "Age": r"Age[\s:\-]*\n*\s*(\d+)",
    "Lab Date": r"Lab\s*Date[\s:\-]*\n*\s*(.+)",
    "Comorbidity": r"Comorbidity[\s:\-]*\n*\s*(Yes|No)",
    "Diabetes Mellitus (DM)": r"Diabetes\s*Mellitus\s*\(DM\)[\s:\-]*\n*\s*(Yes|No)",
    "Body Mass Index (BMI)": r"Body\s*Mass\s*Index\s*\(BMI\)[\s:\-]*\n*\s*([\d.]+)",
    "Height (cm)": r"Height[\s:\-]*\n*\s*([\d.]+)",
    "Weight (kg)": r"Weight[\s:\-]*\n*\s*([\d.]+)",
    "Total Body Water (TBW)": r"Total\s*Body\s*Water\s*\(TBW\)[\s:\-]*\n*\s*([\d.]+)",
    "Extracellular Water (ECW)": r"Extracellular\s*Water\s*\(ECW\)[\s:\-]*\n*\s*([\d.]+)",
    "Intracellular Water (ICW)": r"Intracellular\s*Water\s*\(ICW\)[\s:\-]*\n*\s*([\d.]+)",
    "Extracellular Fluid/Total Body Water (ECF/TBW)": r"Extracellular\s*Fluid/Total\s*Body\s*Water\s*\(ECF/TBW\)[\s:\-]*\n*\s*([\d.]+)",
    "Total Body Fat Ratio (TBFR) (%)": r"Total\s*Body\s*Fat\s*Ratio\s*\(TBFR\)\s*\(%\)[\s:\-]*\n*\s*([\d.]+)",
    "Lean Mass (%)": r"Lean\s*Mass\s*\(%\)[\s:\-]*\n*\s*([\d.]+)",
    "Body Protein Content (%)": r"Body\s*Protein\s*Content\s*\(%\)[\s:\-]*\n*\s*([\d.]+)",
    "Visceral Fat Rating (VFR)": r"Visceral\s*Fat\s*Rating\s*\(VFR\)[\s:\-]*\n*\s*([\d.]+)",
    "Bone Mass (BM)": r"Bone\s*Mass\s*\(BM\)[\s:\-]*\n*\s*([\d.]+)",
    "Muscle Mass (MM)": r"Muscle\s*Mass\s*\(MM\)[\s:\-]*\n*\s*([\d.]+)",
    "Obesity (%)": r"Obesity\s*\(%\)[\s:\-]*\n*\s*([\d.]+)",
    "Total Fat Content (TFC)": r"Total\s*Fat\s*Content\s*\(TFC\)[\s:\-]*\n*\s*([\d.]+)",
    "Visceral Fat Area (VFA)": r"Visceral\s*Fat\s*Area\s*\(VFA\)[\s:\-]*\n*\s*([\d.]+)",
    "Visceral Muscle Area (VMA)": r"Visceral\s*Muscle\s*Area\s*\(VMA\)[\s:\-]*\n*\s*([\d.]+)",
    "Hepatic Fat Accumulation (HFA)": r"Hepatic\s*Fat\s*Accumulation\s*\(HFA\)[\s:\-]*\n*\s*([\d.]+)",
    "Glucose": r"Glucose[\s:\-]*\n*\s*([\d.]+)",
    "Total Cholesterol (TC)": r"Total\s*Cholesterol\s*\(TC\)[\s:\-]*\n*\s*([\d.]+)",
    "Low Density Lipoprotein (LDL)": r"Low\s*Density\s*Lipoprotein\s*\(LDL\)[\s:\-]*\n*\s*([\d.]+)",
    "High Density Lipoprotein (HDL)": r"High\s*Density\s*Lipoprotein\s*\(HDL\)[\s:\-]*\n*\s*([\d.]+)",
    "Triglyceride": r"Triglyceride[\s:\-]*\n*\s*([\d.]+)",
    "Aspartat Aminotransferaz (AST)": r"Aspartat\s*Aminotransferaz\s*\(AST\)[\s:\-]*\n*\s*([\d.]+)",
    "Alanin Aminotransferaz (ALT)": r"Alanin\s*Aminotransferaz\s*\(ALT\)[\s:\-]*\n*\s*([\d.]+)",
    "Alkaline Phosphatase (ALP)": r"Alkaline\s*Phosphatase\s*\(ALP\)[\s:\-]*\n*\s*([\d.]+)",
    "Creatinine": r"Creatinine[\s:\-]*\n*\s*([\d.]+)",
    "Glomerular Filtration Rate (GFR)": r"Glomerular\s*Filtration\s*Rate\s*\(GFR\)[\s:\-]*\n*\s*([\d.]+)",
    "C-Reactive Protein (CRP)": r"C[\s\-]*Reactive\s*Protein\s*\(CRP\)[\s:\-]*\n*\s*([\d.]+)",
    "Hemoglobin (HGB)": r"Hemoglobin\s*\(HGB\)[\s:\-]*\n*\s*([\d.]+)",
    "Vitamin D": r"Vitamin\s*D[\s:\-]*\n*\s*([\d.]+)",
}


    for key, pattern in patterns.items():
        match = re.search(pattern, text, re.IGNORECASE)
        data[key] = match.group(1).strip() if match else "Not found"

    return data

Writing /content/mediscan_ai/app/pdf_extractor.py


In [None]:
import re
pdf_path = "/content/Lab Test_1.pdf"

# OCR text cleaning
text = pdf_to_text_via_ocr(pdf_path)
text = re.sub(r'\s{2,}', ' ', text)
text = re.sub(r'\n+', '\n', text)

# 💡 Insert newlines between inline fields
text = re.sub(r'(?<=\w): ([^:\n]+?)(?= \w+:)', r': \1\n', text)

print("Cleaned and Separated Text:\n", text)

patterns = {
    "Hospital Name": r"Hospital[\s:\-]*\n*\s*(.+)",
    "Lab Name": r"Lab\s*Name[\s:\-]*\n*\s*(.+)",
    "Patient Name": r"(?:Patient\s*Name|Name)[\s:\-]*\n*\s*(.+)",
    "Gender": r"Gender[\s:\-]*\n*\s*(Male|Female)",
    "Age": r"Age[\s:\-]*\n*\s*(\d+)",
    "Lab Date": r"Lab\s*Date[\s:\-]*\n*\s*(.+)",
    "Comorbidity": r"Comorbidity[\s:\-]*\n*\s*(Yes|No)",
    "Diabetes Mellitus (DM)": r"Diabetes\s*Mellitus\s*\(DM\)[\s:\-]*\n*\s*(Yes|No)",
    "Body Mass Index (BMI)": r"Body\s*Mass\s*Index\s*\(BMI\)[\s:\-]*\n*\s*([\d.]+)",
    "Height (cm)": r"Height[\s:\-]*\n*\s*([\d.]+)",
    "Weight (kg)": r"Weight[\s:\-]*\n*\s*([\d.]+)",
    "Total Body Water (TBW)": r"Total\s*Body\s*Water\s*\(TBW\)[\s:\-]*\n*\s*([\d.]+)",
    "Extracellular Water (ECW)": r"Extracellular\s*Water\s*\(ECW\)[\s:\-]*\n*\s*([\d.]+)",
    "Intracellular Water (ICW)": r"Intracellular\s*Water\s*\(ICW\)[\s:\-]*\n*\s*([\d.]+)",
    "Extracellular Fluid/Total Body Water (ECF/TBW)": r"Extracellular\s*Fluid/Total\s*Body\s*Water\s*\(ECF/TBW\)[\s:\-]*\n*\s*([\d.]+)",
    "Total Body Fat Ratio (TBFR) (%)": r"Total\s*Body\s*Fat\s*Ratio\s*\(TBFR\)\s*\(%\)[\s:\-]*\n*\s*([\d.]+)",
    "Lean Mass (%)": r"Lean\s*Mass\s*\(%\)[\s:\-]*\n*\s*([\d.]+)",
    "Body Protein Content (%)": r"Body\s*Protein\s*Content\s*\(%\)[\s:\-]*\n*\s*([\d.]+)",
    "Visceral Fat Rating (VFR)": r"Visceral\s*Fat\s*Rating\s*\(VFR\)[\s:\-]*\n*\s*([\d.]+)",
    "Bone Mass (BM)": r"Bone\s*Mass\s*\(BM\)[\s:\-]*\n*\s*([\d.]+)",
    "Muscle Mass (MM)": r"Muscle\s*Mass\s*\(MM\)[\s:\-]*\n*\s*([\d.]+)",
    "Obesity (%)": r"Obesity\s*\(%\)[\s:\-]*\n*\s*([\d.]+)",
    "Total Fat Content (TFC)": r"Total\s*Fat\s*Content\s*\(TFC\)[\s:\-]*\n*\s*([\d.]+)",
    "Visceral Fat Area (VFA)": r"Visceral\s*Fat\s*Area\s*\(VFA\)[\s:\-]*\n*\s*([\d.]+)",
    "Visceral Muscle Area (VMA)": r"Visceral\s*Muscle\s*Area\s*\(VMA\)[\s:\-]*\n*\s*([\d.]+)",
    "Hepatic Fat Accumulation (HFA)": r"Hepatic\s*Fat\s*Accumulation\s*\(HFA\)[\s:\-]*\n*\s*([\d.]+)",
    "Glucose": r"Glucose[\s:\-]*\n*\s*([\d.]+)",
    "Total Cholesterol (TC)": r"Total\s*Cholesterol\s*\(TC\)[\s:\-]*\n*\s*([\d.]+)",
    "Low Density Lipoprotein (LDL)": r"Low\s*Density\s*Lipoprotein\s*\(LDL\)[\s:\-]*\n*\s*([\d.]+)",
    "High Density Lipoprotein (HDL)": r"High\s*Density\s*Lipoprotein\s*\(HDL\)[\s:\-]*\n*\s*([\d.]+)",
    "Triglyceride": r"Triglyceride[\s:\-]*\n*\s*([\d.]+)",
    "Aspartat Aminotransferaz (AST)": r"Aspartat\s*Aminotransferaz\s*\(AST\)[\s:\-]*\n*\s*([\d.]+)",
    "Alanin Aminotransferaz (ALT)": r"Alanin\s*Aminotransferaz\s*\(ALT\)[\s:\-]*\n*\s*([\d.]+)",
    "Alkaline Phosphatase (ALP)": r"Alkaline\s*Phosphatase\s*\(ALP\)[\s:\-]*\n*\s*([\d.]+)",
    "Creatinine": r"Creatinine[\s:\-]*\n*\s*([\d.]+)",
    "Glomerular Filtration Rate (GFR)": r"Glomerular\s*Filtration\s*Rate\s*\(GFR\)[\s:\-]*\n*\s*([\d.]+)",
    "C-Reactive Protein (CRP)": r"C[\s\-]*Reactive\s*Protein\s*\(CRP\)[\s:\-]*\n*\s*([\d.]+)",
    "Hemoglobin (HGB)": r"Hemoglobin\s*\(HGB\)[\s:\-]*\n*\s*([\d.]+)",
    "Vitamin D": r"Vitamin\s*D[\s:\-]*\n*\s*([\d.]+)",
}
# Then apply your pattern matching here.
# Apply pattern matching
results = {}
for field, pattern in patterns.items():
    match = re.search(pattern, text)
    results[field] = match.group(1).strip() if match else 'Not found'

print("Extracted Results:\n", results)

Cleaned and Separated Text:
 Hospital: Alexandria General Hospital
Name: Mohamed Ali
 Age: 38
 Comorbidity: No Diabetes Mellitus (DM): No
Body Mass Index (BMI): 23.5
Height: 171
 Weight: 68.6 Total Body Water (TBW): 39.5
Extracellular Water (ECW): 16.6 Intracellular Water (ICW): 22.9 Lab Test Results
Lab Name: Gallstone Diagnosis
Gender: Male Lab
 Date: June 09, 2025 Extracellular Fluid/Total Body Water (ECF/TBW): 42 Total Body Fat Ratio (TBFR): 19.2%
Lean Mass (LM): 80.76% Body Protein Content: 17.28%
Visceral Fat Rating (VFR): 6 Bone Mass: 2.8 kg Muscle Mass (MM): 52.6 kg
Obesity: 6.7% Total Fat Content (TFC): 13.2 kg
Visceral Fat Area (VFA): 8.2 cm?
Visceral Muscle Area (VMA): 28.8 kg
Hepatic Fat Accumulation (HFA): 0
Glucose: 93 mg/dL Total Cholesterol (TC): 239 mg/dL
Low Density Lipoprotein (LDL): 169 mg/dL
High Density Lipoprotein (HDL): 43 mg/dL
Triglyceride: 129 mg/dL
 AST: 19 U/L
 ALT: 34 U/L
 ALP: 75 U/L
 Creatinine: 0.91 mg/dL
 GFR: 110.63 mL/min C-Reactive Protein (CRP): 0.

In [None]:
### Final Corrected Code

import re
pdf_path = "/content/Lab Test_1.pdf"

# OCR text cleaning
text = pdf_to_text_via_ocr(pdf_path)
text = re.sub(r'\s{2,}', ' ', text)
text = re.sub(r'\n+', '\n', text)

# 💡 Insert newlines between inline fields
text = re.sub(r'(?<=\w): ([^:\n]+?)(?= \w+:)', r': \1\n', text)

print("Cleaned and Separated Text:\n", text)

# Regex patterns for extracting fields
patterns = {
    'Hospital Name': r'Hospital\s*:\s*(.+)',
    'Lab Name': r'Lab Name\s*:\s*(.+)',
    'Patient Name': r'Name\s*:\s*(.+)',
    'Gender': r'Gender\s*:\s*(Male|Female)',
    'Age': r'Age\s*:\s*(\d+)',
    'Lab Date': r'Date\s*:\s*(\w+ \d{2}, \d{4})',
    'Comorbidity': r'Comorbidity\s*:\s*(\w+)',
    'Diabetes Mellitus (DM)': r'Diabetes Mellitus \(DM\)\s*:\s*(\w+)',
    'Body Mass Index (BMI)': r'Body Mass Index \(BMI\)\s*:\s*([\d.]+)',
    'Height (cm)': r'Height\s*:\s*(\d+)',
    'Weight (kg)': r'Weight\s*:\s*([\d.]+)',
    'Total Body Water (TBW)': r'Total Body Water \(TBW\)\s*:\s*([\d.]+)',
    'Extracellular Water (ECW)': r'Extracellular Water \(ECW\)\s*:\s*([\d.]+)',
    'Intracellular Water (ICW)': r'Intracellular Water \(ICW\)\s*:\s*([\d.]+)',
    'Extracellular Fluid/Total Body Water (ECF/TBW)': r'Extracellular Fluid/Total Body Water \(ECF/TBW\)\s*:\s*([\d.]+)',
    'Total Body Fat Ratio (TBFR) (%)': r'Total Body Fat Ratio \(TBFR\)\s*:\s*([\d.]+)%',
    'Lean Mass (%)': r'Lean Mass \(LM\)\s*:\s*([\d.]+)%',
    'Body Protein Content (%)': r'Body Protein Content\s*:\s*([\d.]+)%',
    'Visceral Fat Rating (VFR)': r'Visceral Fat Rating \(VFR\)\s*:\s*([\d.]+)',
    'Bone Mass (BM)': r'Bone Mass\s*:\s*([\d.]+)\s*kg',
    'Muscle Mass (MM)': r'Muscle Mass \(MM\)\s*:\s*([\d.]+)\s*kg',
    'Obesity (%)': r'Obesity\s*:\s*([\d.]+)%',
    'Total Fat Content (TFC)': r'Total Fat Content \(TFC\)\s*:\s*([\d.]+)',
    'Visceral Fat Area (VFA)': r'Visceral Fat Area \(VFA\)\s*:\s*([\d.]+)',
    'Visceral Muscle Area (VMA)': r'Visceral Muscle Area \(VMA\)\s*:\s*([\d.]+)',
    'Hepatic Fat Accumulation (HFA)': r'Hepatic Fat Accumulation \(HFA\)\s*:\s*([\d.]+)',
    'Glucose': r'Glucose\s*:\s*([\d.]+)',
    'Total Cholesterol (TC)': r'Total Cholesterol \(TC\)\s*:\s*([\d.]+)',
    'Low Density Lipoprotein (LDL)': r'Low Density Lipoprotein \(LDL\)\s*:\s*([\d.]+)',
    'High Density Lipoprotein (HDL)': r'High Density Lipoprotein \(HDL\)\s*:\s*([\d.]+)',
    'Triglyceride': r'Triglyceride\s*:\s*([\d.]+)',
    'Aspartat Aminotransferaz (AST)': r'AST\s*:\s*([\d.]+)',
    'Alanin Aminotransferaz (ALT)': r'ALT\s*:\s*([\d.]+)',
    'Alkaline Phosphatase (ALP)': r'ALP\s*:\s*([\d.]+)',
    'Creatinine': r'Creatinine\s*:\s*([\d.]+)',
    'Glomerular Filtration Rate (GFR)': r'GFR\s*:\s*([\d.]+)',
    'C-Reactive Protein (CRP)': r'C-Reactive Protein \(CRP\)\s*:\s*([\d.]+)',
    'Hemoglobin (HGB)': r'Hemoglobin \(HGB\)\s*:\s*([\d.]+)',
    'Vitamin D': r'Vitamin D\s*:\s*([\d.]+)',
}
# Extraction logic
results = {}
for field, pattern in patterns.items():
    match = re.search(r'\s*' + pattern, text)
    results[field] = match.group(1).strip() if match else 'Not found'

print("Extracted Results:\n", results)

Cleaned and Separated Text:
 Hospital: Alexandria General Hospital
Name: Mohamed Ali
 Age: 38
 Comorbidity: No Diabetes Mellitus (DM): No
Body Mass Index (BMI): 23.5
Height: 171
 Weight: 68.6 Total Body Water (TBW): 39.5
Extracellular Water (ECW): 16.6 Intracellular Water (ICW): 22.9 Lab Test Results
Lab Name: Gallstone Diagnosis
Gender: Male Lab
 Date: June 09, 2025 Extracellular Fluid/Total Body Water (ECF/TBW): 42 Total Body Fat Ratio (TBFR): 19.2%
Lean Mass (LM): 80.76% Body Protein Content: 17.28%
Visceral Fat Rating (VFR): 6 Bone Mass: 2.8 kg Muscle Mass (MM): 52.6 kg
Obesity: 6.7% Total Fat Content (TFC): 13.2 kg
Visceral Fat Area (VFA): 8.2 cm?
Visceral Muscle Area (VMA): 28.8 kg
Hepatic Fat Accumulation (HFA): 0
Glucose: 93 mg/dL Total Cholesterol (TC): 239 mg/dL
Low Density Lipoprotein (LDL): 169 mg/dL
High Density Lipoprotein (HDL): 43 mg/dL
Triglyceride: 129 mg/dL
 AST: 19 U/L
 ALT: 34 U/L
 ALP: 75 U/L
 Creatinine: 0.91 mg/dL
 GFR: 110.63 mL/min C-Reactive Protein (CRP): 0.

### **Final Adjusted Code**

In [None]:
%%writefile /content/mediscan_ai/app/pdf_extractor.py
import re
import pytesseract
from pdf2image import convert_from_path
from PIL import Image

def pdf_to_text_via_ocr(pdf_path, dpi=300):
    images = convert_from_path(pdf_path, dpi=dpi)
    full_text = ""
    for img in images:
        text = pytesseract.image_to_string(img)
        text = re.sub(r"[^\x00-\x7F]+", " ", text)  # Remove non-ASCII
        full_text += text + "\n"
    return full_text

def extract_lab_report_data(pdf_path):
    # Step 1: OCR + Cleaning
    text = pdf_to_text_via_ocr(pdf_path)
    text = re.sub(r'\s{2,}', ' ', text)
    text = re.sub(r'\n+', '\n', text)
    text = re.sub(r'(?<=\w): ([^:\n]+?)(?= \w+:)', r': \1\n', text)

    # Step 2: Regex patterns
    patterns = {
        'Hospital Name': r'Hospital\s*:\s*(.+)',
        'Lab Name': r'Lab Name\s*:\s*(.+)',
        'Patient Name': r'Name\s*:\s*(.+)',
        'Gender': r'Gender\s*:\s*(Male|Female)',
        'Age': r'Age\s*:\s*(\d+)',
        'Lab Date': r'Date\s*:\s*(\w+ \d{2}, \d{4})',
        'Comorbidity': r'Comorbidity\s*:\s*(\w+)',
        'Diabetes Mellitus (DM)': r'Diabetes Mellitus \(DM\)\s*:\s*(\w+)',
        'Body Mass Index (BMI)': r'Body Mass Index \(BMI\)\s*:\s*([\d.]+)',
        'Height (cm)': r'Height\s*:\s*(\d+)',
        'Weight (kg)': r'Weight\s*:\s*([\d.]+)',
        'Total Body Water (TBW)': r'Total Body Water \(TBW\)\s*:\s*([\d.]+)',
        'Extracellular Water (ECW)': r'Extracellular Water \(ECW\)\s*:\s*([\d.]+)',
        'Intracellular Water (ICW)': r'Intracellular Water \(ICW\)\s*:\s*([\d.]+)',
        'Extracellular Fluid/Total Body Water (ECF/TBW)': r'Extracellular Fluid/Total Body Water \(ECF/TBW\)\s*:\s*([\d.]+)',
        'Total Body Fat Ratio (TBFR) (%)': r'Total Body Fat Ratio \(TBFR\)\s*:\s*([\d.]+)%',
        'Lean Mass (%)': r'Lean Mass \(LM\)\s*:\s*([\d.]+)%',
        'Body Protein Content (%)': r'Body Protein Content\s*:\s*([\d.]+)%',
        'Visceral Fat Rating (VFR)': r'Visceral Fat Rating \(VFR\)\s*:\s*([\d.]+)',
        'Bone Mass (BM)': r'Bone Mass\s*:\s*([\d.]+)\s*kg',
        'Muscle Mass (MM)': r'Muscle Mass \(MM\)\s*:\s*([\d.]+)\s*kg',
        'Obesity (%)': r'Obesity\s*:\s*([\d.]+)%',
        'Total Fat Content (TFC)': r'Total Fat Content \(TFC\)\s*:\s*([\d.]+)',
        'Visceral Fat Area (VFA)': r'Visceral Fat Area \(VFA\)\s*:\s*([\d.]+)',
        'Visceral Muscle Area (VMA)': r'Visceral Muscle Area \(VMA\)\s*:\s*([\d.]+)',
        'Hepatic Fat Accumulation (HFA)': r'Hepatic Fat Accumulation \(HFA\)\s*:\s*([\d.]+)',
        'Glucose': r'Glucose\s*:\s*([\d.]+)',
        'Total Cholesterol (TC)': r'Total Cholesterol \(TC\)\s*:\s*([\d.]+)',
        'Low Density Lipoprotein (LDL)': r'Low Density Lipoprotein \(LDL\)\s*:\s*([\d.]+)',
        'High Density Lipoprotein (HDL)': r'High Density Lipoprotein \(HDL\)\s*:\s*([\d.]+)',
        'Triglyceride': r'Triglyceride\s*:\s*([\d.]+)',
        'Aspartat Aminotransferaz (AST)': r'AST\s*:\s*([\d.]+)',
        'Alanin Aminotransferaz (ALT)': r'ALT\s*:\s*([\d.]+)',
        'Alkaline Phosphatase (ALP)': r'ALP\s*:\s*([\d.]+)',
        'Creatinine': r'Creatinine\s*:\s*([\d.]+)',
        'Glomerular Filtration Rate (GFR)': r'GFR\s*:\s*([\d.]+)',
        'C-Reactive Protein (CRP)': r'C-Reactive Protein \(CRP\)\s*:\s*([\d.]+)',
        'Hemoglobin (HGB)': r'Hemoglobin \(HGB\)\s*:\s*([\d.]+)',
        'Vitamin D': r'Vitamin D\s*:\s*([\d.]+)',
    }

    # Step 3: Extract values
    results = {}
    for field, pattern in patterns.items():
        match = re.search(r'\s*' + pattern, text)
        results[field] = match.group(1).strip() if match else 'Not found'

    return results

Writing /content/mediscan_ai/app/pdf_extractor.py


### **pdf_extractor.py**

In [32]:
!apt-get install poppler-utils -y
!apt-get install tesseract-ocr -y
!pip install pytesseract pdf2image pillow

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following NEW packages will be installed:
  poppler-utils
0 upgraded, 1 newly installed, 0 to remove and 35 not upgraded.
Need to get 186 kB of archives.
After this operation, 697 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 poppler-utils amd64 22.02.0-2ubuntu0.8 [186 kB]
Fetched 186 kB in 0s (1,715 kB/s)
Selecting previously unselected package poppler-utils.
(Reading database ... 126111 files and directories currently installed.)
Preparing to unpack .../poppler-utils_22.02.0-2ubuntu0.8_amd64.deb ...
Unpacking poppler-utils (22.02.0-2ubuntu0.8) ...
Setting up poppler-utils (22.02.0-2ubuntu0.8) ...
Processing triggers for man-db (2.10.2-1) ...
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
0 upgraded, 0 newly install

In [33]:
# 📄 Step: Create pdf_extractor.py
pdf_extractor_code = '''
import re
import pytesseract
from pdf2image import convert_from_path
from PIL import Image

def pdf_to_text_via_ocr(pdf_path, dpi=300):
    images = convert_from_path(pdf_path, dpi=dpi)
    full_text = ""
    for img in images:
        text = pytesseract.image_to_string(img)
        text = re.sub(r"[^\x00-\x7F]+", " ", text)  # Remove non-ASCII
        full_text += text + "\n"
    return full_text

def extract_lab_report_data(pdf_path):
    # Step 1: OCR + Cleaning
    text = pdf_to_text_via_ocr(pdf_path)
    text = re.sub(r'\s{2,}', ' ', text)
    text = re.sub(r'\n+', '\n', text)
    text = re.sub(r'(?<=\w): ([^:\n]+?)(?= \w+:)', r': \1\n', text)

    # Step 2: Regex patterns
    patterns = {
        'Hospital Name': r'Hospital\s*:\s*(.+)',
        'Lab Name': r'Lab Name\s*:\s*(.+)',
        'Patient Name': r'Name\s*:\s*(.+)',
        'Gender': r'Gender\s*:\s*(Male|Female)',
        'Age': r'Age\s*:\s*(\d+)',
        'Lab Date': r'Date\s*:\s*(\w+ \d{2}, \d{4})',
        'Comorbidity': r'Comorbidity\s*:\s*(\w+)',
        'Diabetes Mellitus (DM)': r'Diabetes Mellitus \(DM\)\s*:\s*(\w+)',
        'Body Mass Index (BMI)': r'Body Mass Index \(BMI\)\s*:\s*([\d.]+)',
        'Height (cm)': r'Height\s*:\s*(\d+)',
        'Weight (kg)': r'Weight\s*:\s*([\d.]+)',
        'Total Body Water (TBW)': r'Total Body Water \(TBW\)\s*:\s*([\d.]+)',
        'Extracellular Water (ECW)': r'Extracellular Water \(ECW\)\s*:\s*([\d.]+)',
        'Intracellular Water (ICW)': r'Intracellular Water \(ICW\)\s*:\s*([\d.]+)',
        'Extracellular Fluid/Total Body Water (ECF/TBW)': r'Extracellular Fluid/Total Body Water \(ECF/TBW\)\s*:\s*([\d.]+)',
        'Total Body Fat Ratio (TBFR) (%)': r'Total Body Fat Ratio \(TBFR\)\s*:\s*([\d.]+)%',
        'Lean Mass (%)': r'Lean Mass \(LM\)\s*:\s*([\d.]+)%',
        'Body Protein Content (%)': r'Body Protein Content\s*:\s*([\d.]+)%',
        'Visceral Fat Rating (VFR)': r'Visceral Fat Rating \(VFR\)\s*:\s*([\d.]+)',
        'Bone Mass (BM)': r'Bone Mass\s*:\s*([\d.]+)\s*kg',
        'Muscle Mass (MM)': r'Muscle Mass \(MM\)\s*:\s*([\d.]+)\s*kg',
        'Obesity (%)': r'Obesity\s*:\s*([\d.]+)%',
        'Total Fat Content (TFC)': r'Total Fat Content \(TFC\)\s*:\s*([\d.]+)',
        'Visceral Fat Area (VFA)': r'Visceral Fat Area \(VFA\)\s*:\s*([\d.]+)',
        'Visceral Muscle Area (VMA)': r'Visceral Muscle Area \(VMA\)\s*:\s*([\d.]+)',
        'Hepatic Fat Accumulation (HFA)': r'Hepatic Fat Accumulation \(HFA\)\s*:\s*([\d.]+)',
        'Glucose': r'Glucose\s*:\s*([\d.]+)',
        'Total Cholesterol (TC)': r'Total Cholesterol \(TC\)\s*:\s*([\d.]+)',
        'Low Density Lipoprotein (LDL)': r'Low Density Lipoprotein \(LDL\)\s*:\s*([\d.]+)',
        'High Density Lipoprotein (HDL)': r'High Density Lipoprotein \(HDL\)\s*:\s*([\d.]+)',
        'Triglyceride': r'Triglyceride\s*:\s*([\d.]+)',
        'Aspartat Aminotransferaz (AST)': r'AST\s*:\s*([\d.]+)',
        'Alanin Aminotransferaz (ALT)': r'ALT\s*:\s*([\d.]+)',
        'Alkaline Phosphatase (ALP)': r'ALP\s*:\s*([\d.]+)',
        'Creatinine': r'Creatinine\s*:\s*([\d.]+)',
        'Glomerular Filtration Rate (GFR)': r'GFR\s*:\s*([\d.]+)',
        'C-Reactive Protein (CRP)': r'C-Reactive Protein \(CRP\)\s*:\s*([\d.]+)',
        'Hemoglobin (HGB)': r'Hemoglobin \(HGB\)\s*:\s*([\d.]+)',
        'Vitamin D': r'Vitamin D\s*:\s*([\d.]+)',
    }

    # Step 3: Extract values
    results = {}
    for field, pattern in patterns.items():
        match = re.search(r'\s*' + pattern, text)
        results[field] = match.group(1).strip() if match else 'Not found'

    return results
'''

# 📝 Write code to file
with open("/content/mediscan_ai/app/pdf_extractor.py", "w") as f:
    f.write(pdf_extractor_code)

print("pdf_extractor.py created successfully.")

pdf_extractor.py created successfully.


In [36]:
### Test Example 1
import sys
sys.path.append('/content/mediscan_ai')

from app.pdf_extractor import extract_lab_report_data

pdf_path = "/content/Lab Test_1.pdf"
results = extract_lab_report_data(pdf_path)

for key, value in results.items():
    print(f"{key}: {value}")

Hospital Name: Alexandria
Lab Name: Gallstone
Patient Name: Mohamed
Gender: Male
Age: 38
Lab Date: Not found
Comorbidity: No
Diabetes Mellitus (DM): No
Body Mass Index (BMI): 23.5
Height (cm): 171
Weight (kg): 68.6
Total Body Water (TBW): 39.5
Extracellular Water (ECW): 16.6
Intracellular Water (ICW): 22.9
Extracellular Fluid/Total Body Water (ECF/TBW): 42
Total Body Fat Ratio (TBFR) (%): 19.2
Lean Mass (%): 80.76
Body Protein Content (%): 17.28
Visceral Fat Rating (VFR): 6
Bone Mass (BM): 2.8
Muscle Mass (MM): 52.6
Obesity (%): 6.7
Total Fat Content (TFC): 13.2
Visceral Fat Area (VFA): 8.2
Visceral Muscle Area (VMA): 28.8
Hepatic Fat Accumulation (HFA): 0
Glucose: 93
Total Cholesterol (TC): 239
Low Density Lipoprotein (LDL): 169
High Density Lipoprotein (HDL): 43
Triglyceride: 129
Aspartat Aminotransferaz (AST): 19
Alanin Aminotransferaz (ALT): 34
Alkaline Phosphatase (ALP): 75
Creatinine: 0.91
Glomerular Filtration Rate (GFR): 110.63
C-Reactive Protein (CRP): 0.0
Hemoglobin (HGB): 16

In [37]:
print(results)

{'Hospital Name': 'Alexandria', 'Lab Name': 'Gallstone', 'Patient Name': 'Mohamed', 'Gender': 'Male', 'Age': '38', 'Lab Date': 'Not found', 'Comorbidity': 'No', 'Diabetes Mellitus (DM)': 'No', 'Body Mass Index (BMI)': '23.5', 'Height (cm)': '171', 'Weight (kg)': '68.6', 'Total Body Water (TBW)': '39.5', 'Extracellular Water (ECW)': '16.6', 'Intracellular Water (ICW)': '22.9', 'Extracellular Fluid/Total Body Water (ECF/TBW)': '42', 'Total Body Fat Ratio (TBFR) (%)': '19.2', 'Lean Mass (%)': '80.76', 'Body Protein Content (%)': '17.28', 'Visceral Fat Rating (VFR)': '6', 'Bone Mass (BM)': '2.8', 'Muscle Mass (MM)': '52.6', 'Obesity (%)': '6.7', 'Total Fat Content (TFC)': '13.2', 'Visceral Fat Area (VFA)': '8.2', 'Visceral Muscle Area (VMA)': '28.8', 'Hepatic Fat Accumulation (HFA)': '0', 'Glucose': '93', 'Total Cholesterol (TC)': '239', 'Low Density Lipoprotein (LDL)': '169', 'High Density Lipoprotein (HDL)': '43', 'Triglyceride': '129', 'Aspartat Aminotransferaz (AST)': '19', 'Alanin Ami

In [None]:
### Test Example 2
import sys
sys.path.append('/content/mediscan_ai')

from app.pdf_extractor import extract_lab_report_data

pdf_path = "/content/lab_test_report_with_patient_info.pdf"
results = extract_lab_report_data(pdf_path)

for key, value in results.items():
    print(f"{key}: {value}")

Hospital Name: Alexandria General Hospital Lab
Lab Name: Not found
Patient Name: Alaa Atef
Gender: Female
Age: 33
Lab Date: June 09, 2025
Comorbidity: Not found
Diabetes Mellitus (DM): Not found
Body Mass Index (BMI): 25.5
Height (cm): Not found
Weight (kg): Not found
Total Body Water (TBW): 44.0
Extracellular Water (ECW): 18.1
Intracellular Water (ICW): 25.9
Extracellular Fluid/Total Body Water (ECF/TBW): Not found
Total Body Fat Ratio (TBFR) (%): 41.0
Lean Mass (%): 19.9
Body Protein Content (%): 80.13
Visceral Fat Rating (VFR): 16.01
Bone Mass (BM): Not found
Muscle Mass (MM): 57.5
Obesity (%): Not found
Total Fat Content (TFC): 16.0
Visceral Fat Area (VFA): 15.0
Visceral Muscle Area (VMA): 10.1
Hepatic Fat Accumulation (HFA): 29.9
Glucose: 575
Total Cholesterol (TC): 230
Low Density Lipoprotein (LDL): 157
High Density Lipoprotein (HDL): 48
Triglyceride: 191
Aspartat Aminotransferaz (AST): 12
Alanin Aminotransferaz (ALT): 12
Alkaline Phosphatase (ALP): 135
Creatinine: 0.96
Glomerula

In [None]:
print(results)

{'Hospital Name': 'Alexandria General Hospital Lab', 'Lab Name': 'Not found', 'Patient Name': 'Alaa Atef', 'Gender': 'Female', 'Age': '33', 'Lab Date': 'June 09, 2025', 'Comorbidity': 'Not found', 'Diabetes Mellitus (DM)': 'Not found', 'Body Mass Index (BMI)': '25.5', 'Height (cm)': 'Not found', 'Weight (kg)': 'Not found', 'Total Body Water (TBW)': '44.0', 'Extracellular Water (ECW)': '18.1', 'Intracellular Water (ICW)': '25.9', 'Extracellular Fluid/Total Body Water (ECF/TBW)': 'Not found', 'Total Body Fat Ratio (TBFR) (%)': '41.0', 'Lean Mass (%)': '19.9', 'Body Protein Content (%)': '80.13', 'Visceral Fat Rating (VFR)': '16.01', 'Bone Mass (BM)': 'Not found', 'Muscle Mass (MM)': '57.5', 'Obesity (%)': 'Not found', 'Total Fat Content (TFC)': '16.0', 'Visceral Fat Area (VFA)': '15.0', 'Visceral Muscle Area (VMA)': '10.1', 'Hepatic Fat Accumulation (HFA)': '29.9', 'Glucose': '575', 'Total Cholesterol (TC)': '230', 'Low Density Lipoprotein (LDL)': '157', 'High Density Lipoprotein (HDL)':

### **Combining pdf_extractor.py with passing data to the model for prediction**

In [40]:
# 📄 pdf_extractor.py
# 📄 Step: Create pdf_extractor.py
pdf_extractor_code = '''
import re
import pytesseract
import pandas as pd
from pdf2image import convert_from_path
from PIL import Image

# ----------------------------
# Constants (put at the top)
# ----------------------------
MODEL_FEATURES = [
    'Age', 'Gender', 'Comorbidity', 'Diabetes Mellitus (DM)', 'Height (cm)', 'Weight (kg)',
    'Body Mass Index (BMI)', 'Total Body Water (TBW)', 'Extracellular Water (ECW)',
    'Intracellular Water (ICW)', 'Extracellular Fluid/Total Body Water (ECF/TBW)',
    'Total Body Fat Ratio (TBFR) (%)', 'Lean Mass (%)', 'Body Protein Content (%)',
    'Visceral Fat Rating (VFR)', 'Bone Mass (BM)', 'Muscle Mass (MM)', 'Obesity (%)',
    'Total Fat Content (TFC)', 'Visceral Fat Area (VFA)', 'Visceral Muscle Area (VMA)',
    'Hepatic Fat Accumulation (HFA)', 'Glucose', 'Total Cholesterol (TC)',
    'Low Density Lipoprotein (LDL)', 'High Density Lipoprotein (HDL)', 'Triglyceride',
    'Aspartat Aminotransferaz (AST)', 'Alanin Aminotransferaz (ALT)',
    'Alkaline Phosphatase (ALP)', 'Creatinine', 'Glomerular Filtration Rate (GFR)',
    'C-Reactive Protein (CRP)', 'Hemoglobin (HGB)', 'Vitamin D'
]

gender_map = {"Male": 1, "Female": 0}
yes_no_map = {"Yes": 1, "No": 0}

# -------------------------------------
# Function to prepare input for model
# -------------------------------------
def prepare_input_for_model(extracted_data: dict):
    gender = gender_map.get(extracted_data.get("Gender", ""), 0)
    comorbidity = yes_no_map.get(extracted_data.get("Comorbidity", ""), 0)
    diabetes = yes_no_map.get(extracted_data.get("Diabetes Mellitus (DM)", ""), 0)

    model_input = {
        'Age': float(extracted_data.get('Age', 0)),
        'Gender': gender,
        'Comorbidity': comorbidity,
        'Diabetes Mellitus (DM)': diabetes,
        'Height (cm)': float(extracted_data.get('Height (cm)', 0)),
        'Weight (kg)': float(extracted_data.get('Weight (kg)', 0)),
        'Body Mass Index (BMI)': float(extracted_data.get('Body Mass Index (BMI)', 0)),
        'Total Body Water (TBW)': float(extracted_data.get('Total Body Water (TBW)', 0)),
        'Extracellular Water (ECW)': float(extracted_data.get('Extracellular Water (ECW)', 0)),
        'Intracellular Water (ICW)': float(extracted_data.get('Intracellular Water (ICW)', 0)),
        'Extracellular Fluid/Total Body Water (ECF/TBW)': float(extracted_data.get('Extracellular Fluid/Total Body Water (ECF/TBW)', 0)),
        'Total Body Fat Ratio (TBFR) (%)': float(extracted_data.get('Total Body Fat Ratio (TBFR) (%)', 0)),
        'Lean Mass (%)': float(extracted_data.get('Lean Mass (%)', 0)),
        'Body Protein Content (%)': float(extracted_data.get('Body Protein Content (%)', 0)),
        'Visceral Fat Rating (VFR)': float(extracted_data.get('Visceral Fat Rating (VFR)', 0)),
        'Bone Mass (BM)': float(extracted_data.get('Bone Mass (BM)', 0)),
        'Muscle Mass (MM)': float(extracted_data.get('Muscle Mass (MM)', 0)),
        'Obesity (%)': float(extracted_data.get('Obesity (%)', 0)),
        'Total Fat Content (TFC)': float(extracted_data.get('Total Fat Content (TFC)', 0)),
        'Visceral Fat Area (VFA)': float(extracted_data.get('Visceral Fat Area (VFA)', 0)),
        'Visceral Muscle Area (VMA)': float(extracted_data.get('Visceral Muscle Area (VMA)', 0)),
        'Hepatic Fat Accumulation (HFA)': float(extracted_data.get('Hepatic Fat Accumulation (HFA)', 0)),
        'Glucose': float(extracted_data.get('Glucose', 0)),
        'Total Cholesterol (TC)': float(extracted_data.get('Total Cholesterol (TC)', 0)),
        'Low Density Lipoprotein (LDL)': float(extracted_data.get('Low Density Lipoprotein (LDL)', 0)),
        'High Density Lipoprotein (HDL)': float(extracted_data.get('High Density Lipoprotein (HDL)', 0)),
        'Triglyceride': float(extracted_data.get('Triglyceride', 0)),
        'Aspartat Aminotransferaz (AST)': float(extracted_data.get('Aspartat Aminotransferaz (AST)', 0)),
        'Alanin Aminotransferaz (ALT)': float(extracted_data.get('Alanin Aminotransferaz (ALT)', 0)),
        'Alkaline Phosphatase (ALP)': float(extracted_data.get('Alkaline Phosphatase (ALP)', 0)),
        'Creatinine': float(extracted_data.get('Creatinine', 0)),
        'Glomerular Filtration Rate (GFR)': float(extracted_data.get('Glomerular Filtration Rate (GFR)', 0)),
        'C-Reactive Protein (CRP)': float(extracted_data.get('C-Reactive Protein (CRP)', 0)),
        'Hemoglobin (HGB)': float(extracted_data.get('Hemoglobin (HGB)', 0)),
        'Vitamin D': float(extracted_data.get('Vitamin D', 0)),
    }

    return pd.DataFrame([model_input])

# --------------------------
# OCR and Extraction Logic
# --------------------------
def pdf_to_text_via_ocr(pdf_path, dpi=300):
    images = convert_from_path(pdf_path, dpi=dpi)
    full_text = ""
    for img in images:
        text = pytesseract.image_to_string(img)
        text = re.sub(r"[^\x00-\x7F]+", " ", text)  # Remove non-ASCII
        full_text += text + "\n"
    return full_text

def extract_lab_report_data(pdf_path):
    # Step 1: OCR + Cleaning
    text = pdf_to_text_via_ocr(pdf_path)
    text = re.sub(r'\s{2,}', ' ', text)
    text = re.sub(r'\n+', '\n', text)
    text = re.sub(r'(?<=\w): ([^:\n]+?)(?= \w+:)', r': \1\n', text)

    # Step 2: Regex patterns (truncated for brevity; insert full dictionary)
    patterns = {
        'Hospital Name': r'Hospital\s*:\s*(.+)',
        'Lab Name': r'Lab Name\s*:\s*(.+)',
        'Patient Name': r'Name\s*:\s*(.+)',
        'Gender': r'Gender\s*:\s*(Male|Female)',
        'Age': r'Age\s*:\s*(\d+)',
        'Lab Date': r'Date\s*:\s*(\w+ \d{2}, \d{4})',
        'Comorbidity': r'Comorbidity\s*:\s*(\w+)',
        'Diabetes Mellitus (DM)': r'Diabetes Mellitus \(DM\)\s*:\s*(\w+)',
        'Body Mass Index (BMI)': r'Body Mass Index \(BMI\)\s*:\s*([\d.]+)',
        'Height (cm)': r'Height\s*:\s*(\d+)',
        'Weight (kg)': r'Weight\s*:\s*([\d.]+)',
        'Total Body Water (TBW)': r'Total Body Water \(TBW\)\s*:\s*([\d.]+)',
        'Extracellular Water (ECW)': r'Extracellular Water \(ECW\)\s*:\s*([\d.]+)',
        'Intracellular Water (ICW)': r'Intracellular Water \(ICW\)\s*:\s*([\d.]+)',
        'Extracellular Fluid/Total Body Water (ECF/TBW)': r'Extracellular Fluid/Total Body Water \(ECF/TBW\)\s*:\s*([\d.]+)',
        'Total Body Fat Ratio (TBFR) (%)': r'Total Body Fat Ratio \(TBFR\)\s*:\s*([\d.]+)%',
        'Lean Mass (%)': r'Lean Mass \(LM\)\s*:\s*([\d.]+)%',
        'Body Protein Content (%)': r'Body Protein Content\s*:\s*([\d.]+)%',
        'Visceral Fat Rating (VFR)': r'Visceral Fat Rating \(VFR\)\s*:\s*([\d.]+)',
        'Bone Mass (BM)': r'Bone Mass\s*:\s*([\d.]+)\s*kg',
        'Muscle Mass (MM)': r'Muscle Mass \(MM\)\s*:\s*([\d.]+)\s*kg',
        'Obesity (%)': r'Obesity\s*:\s*([\d.]+)%',
        'Total Fat Content (TFC)': r'Total Fat Content \(TFC\)\s*:\s*([\d.]+)',
        'Visceral Fat Area (VFA)': r'Visceral Fat Area \(VFA\)\s*:\s*([\d.]+)',
        'Visceral Muscle Area (VMA)': r'Visceral Muscle Area \(VMA\)\s*:\s*([\d.]+)',
        'Hepatic Fat Accumulation (HFA)': r'Hepatic Fat Accumulation \(HFA\)\s*:\s*([\d.]+)',
        'Glucose': r'Glucose\s*:\s*([\d.]+)',
        'Total Cholesterol (TC)': r'Total Cholesterol \(TC\)\s*:\s*([\d.]+)',
        'Low Density Lipoprotein (LDL)': r'Low Density Lipoprotein \(LDL\)\s*:\s*([\d.]+)',
        'High Density Lipoprotein (HDL)': r'High Density Lipoprotein \(HDL\)\s*:\s*([\d.]+)',
        'Triglyceride': r'Triglyceride\s*:\s*([\d.]+)',
        'Aspartat Aminotransferaz (AST)': r'AST\s*:\s*([\d.]+)',
        'Alanin Aminotransferaz (ALT)': r'ALT\s*:\s*([\d.]+)',
        'Alkaline Phosphatase (ALP)': r'ALP\s*:\s*([\d.]+)',
        'Creatinine': r'Creatinine\s*:\s*([\d.]+)',
        'Glomerular Filtration Rate (GFR)': r'GFR\s*:\s*([\d.]+)',
        'C-Reactive Protein (CRP)': r'C-Reactive Protein \(CRP\)\s*:\s*([\d.]+)',
        'Hemoglobin (HGB)': r'Hemoglobin \(HGB\)\s*:\s*([\d.]+)',
        'Vitamin D': r'Vitamin D\s*:\s*([\d.]+)',
    }

    extracted_data = {}
    for key, pattern in patterns.items():
        match = re.search(pattern, text)
        if match:
            extracted_data[key] = match.group(1).strip()

    return extracted_data
'''

# 📝 Write code to file
with open("/content/mediscan_ai/app/pdf_extractor.py", "w") as f:
    f.write(pdf_extractor_code)

print("pdf_extractor.py created successfully.")

pdf_extractor.py created successfully.


### **Testing**

In [89]:
### Complete Code
import sys
sys.path.append('/content/mediscan_ai/app')
from pdf_extractor import extract_lab_report_data
#from preprocessing import preprocess_text
from predictor import predict_gallstone
from pdf_extractor import prepare_input_for_model

# Define PDF path
pdf_path = '/content/Lab Test_1.pdf'

# Step 1: Extract text
raw_text = extract_lab_report_data(pdf_path)

# Step 2: Preprocess
#cleaned_data = preprocess_text(raw_text)

cleaned_data = prepare_input_for_model(raw_text)

# If the result is a nested DataFrame or Series, extract it
if isinstance(cleaned_data.iloc[0, 0], pd.DataFrame):
    cleaned_data = cleaned_data.iloc[0, 0]

if isinstance(cleaned_data, pd.DataFrame) and cleaned_data.shape == (1, 1):
    cleaned_data = cleaned_data.iloc[0, 0]  # Unpack inner dict
    cleaned_data = pd.DataFrame([cleaned_data])  # Convert to proper 2D

# Original DataFrame: cleaned_data
import pandas as pd

# Define column mapping
column_mapping = {
    'Height (cm)': 'Height',
    'Weight (kg)': 'Weight',
    'Lean Mass (%)': 'Lean Mass (LM) (%)',
    'Body Protein Content (%)': 'Body Protein Content (Protein) (%)',
    'Visceral Muscle Area (VMA)': 'Visceral Muscle Area (VMA) (Kg)'
}

# Rename the columns
cleaned_data_renamed = cleaned_data.rename(columns=column_mapping)

# Define expected columns
expected_columns = [
    'Age', 'Gender', 'Comorbidity',
    'Coronary Artery Disease (CAD)', 'Hypothyroidism', 'Hyperlipidemia',
    'Diabetes Mellitus (DM)', 'Height', 'Weight', 'Body Mass Index (BMI)',
    'Total Body Water (TBW)', 'Extracellular Water (ECW)',
    'Intracellular Water (ICW)',
    'Extracellular Fluid/Total Body Water (ECF/TBW)',
    'Total Body Fat Ratio (TBFR) (%)', 'Lean Mass (LM) (%)',
    'Body Protein Content (Protein) (%)', 'Visceral Fat Rating (VFR)',
    'Bone Mass (BM)', 'Muscle Mass (MM)', 'Obesity (%)',
    'Total Fat Content (TFC)', 'Visceral Fat Area (VFA)',
    'Visceral Muscle Area (VMA) (Kg)', 'Hepatic Fat Accumulation (HFA)',
    'Glucose', 'Total Cholesterol (TC)', 'Low Density Lipoprotein (LDL)',
    'High Density Lipoprotein (HDL)', 'Triglyceride',
    'Aspartat Aminotransferaz (AST)', 'Alanin Aminotransferaz (ALT)',
    'Alkaline Phosphatase (ALP)', 'Creatinine',
    'Glomerular Filtration Rate (GFR)', 'C-Reactive Protein (CRP)',
    'Hemoglobin (HGB)', 'Vitamin D'
]

# Add any missing expected columns with default value (e.g., 0 or NaN)
for col in expected_columns:
    if col not in cleaned_data_renamed.columns:
        cleaned_data_renamed[col] = 0  # or np.nan if preferable

# Reorder columns to match expected order
cleaned_data_ready = cleaned_data_renamed[expected_columns]

# Convert the first row to a dictionary
input_dict = cleaned_data_ready.iloc[0].to_dict()

# Pass it to the prediction function
label, probability = predict_gallstone(input_dict)

# Print results
print(f"Prediction: {label}")
print(f"Probability: {probability:.2f}%")

Prediction: Gallstone Detected
Probability: 76.30%


In [41]:
import sys
sys.path.append('/content/mediscan_ai/app')

In [80]:
from pdf_extractor import extract_lab_report_data
#from preprocessing import preprocess_text
from predictor import predict_gallstone

In [82]:
# Define PDF path
pdf_path = '/content/Lab Test_1.pdf'

# Step 1: Extract text
raw_text = extract_lab_report_data(pdf_path)

for key, value in raw_text.items():
    print(f"{key}: {value}")

Hospital Name: Alexandria General Hospital
Lab Name: Gallstone Diagnosis
Patient Name: Mohamed Ali
Gender: Male
Age: 38
Lab Date: June 09, 2025
Comorbidity: No
Diabetes Mellitus (DM): No
Body Mass Index (BMI): 23.5
Height (cm): 171
Weight (kg): 68.6
Total Body Water (TBW): 39.5
Extracellular Water (ECW): 16.6
Intracellular Water (ICW): 22.9
Extracellular Fluid/Total Body Water (ECF/TBW): 42
Total Body Fat Ratio (TBFR) (%): 19.2
Lean Mass (%): 80.76
Body Protein Content (%): 17.28
Visceral Fat Rating (VFR): 6
Bone Mass (BM): 2.8
Muscle Mass (MM): 52.6
Obesity (%): 6.7
Total Fat Content (TFC): 13.2
Visceral Fat Area (VFA): 8.2
Visceral Muscle Area (VMA): 28.8
Hepatic Fat Accumulation (HFA): 0
Glucose: 93
Total Cholesterol (TC): 239
Low Density Lipoprotein (LDL): 169
High Density Lipoprotein (HDL): 43
Triglyceride: 129
Aspartat Aminotransferaz (AST): 19
Alanin Aminotransferaz (ALT): 34
Alkaline Phosphatase (ALP): 75
Creatinine: 0.91
Glomerular Filtration Rate (GFR): 110.63
C-Reactive Prot

In [83]:
# Step 2: Preprocess
#cleaned_data = preprocess_text(raw_text)
from pdf_extractor import prepare_input_for_model

cleaned_data = prepare_input_for_model(raw_text)
cleaned_data

Unnamed: 0,Age,Gender,Comorbidity,Diabetes Mellitus (DM),Height (cm),Weight (kg),Body Mass Index (BMI),Total Body Water (TBW),Extracellular Water (ECW),Intracellular Water (ICW),...,High Density Lipoprotein (HDL),Triglyceride,Aspartat Aminotransferaz (AST),Alanin Aminotransferaz (ALT),Alkaline Phosphatase (ALP),Creatinine,Glomerular Filtration Rate (GFR),C-Reactive Protein (CRP),Hemoglobin (HGB),Vitamin D
0,38.0,1,0,0,171.0,68.6,23.5,39.5,16.6,22.9,...,43.0,129.0,19.0,34.0,75.0,0.91,110.63,0.0,16.6,15.6


In [84]:
cleaned_data.columns

Index(['Age', 'Gender', 'Comorbidity', 'Diabetes Mellitus (DM)', 'Height (cm)',
       'Weight (kg)', 'Body Mass Index (BMI)', 'Total Body Water (TBW)',
       'Extracellular Water (ECW)', 'Intracellular Water (ICW)',
       'Extracellular Fluid/Total Body Water (ECF/TBW)',
       'Total Body Fat Ratio (TBFR) (%)', 'Lean Mass (%)',
       'Body Protein Content (%)', 'Visceral Fat Rating (VFR)',
       'Bone Mass (BM)', 'Muscle Mass (MM)', 'Obesity (%)',
       'Total Fat Content (TFC)', 'Visceral Fat Area (VFA)',
       'Visceral Muscle Area (VMA)', 'Hepatic Fat Accumulation (HFA)',
       'Glucose', 'Total Cholesterol (TC)', 'Low Density Lipoprotein (LDL)',
       'High Density Lipoprotein (HDL)', 'Triglyceride',
       'Aspartat Aminotransferaz (AST)', 'Alanin Aminotransferaz (ALT)',
       'Alkaline Phosphatase (ALP)', 'Creatinine',
       'Glomerular Filtration Rate (GFR)', 'C-Reactive Protein (CRP)',
       'Hemoglobin (HGB)', 'Vitamin D'],
      dtype='object')

In [74]:
print(type(cleaned_data))
print(cleaned_data.shape)
print(cleaned_data.head())

<class 'pandas.core.frame.DataFrame'>
(1, 35)
    Age  Gender  Comorbidity  Diabetes Mellitus (DM)  Height (cm)  \
0  38.0       1            0                       0        171.0   

   Weight (kg)  Body Mass Index (BMI)  Total Body Water (TBW)  \
0         68.6                   23.5                    39.5   

   Extracellular Water (ECW)  Intracellular Water (ICW)  ...  \
0                       16.6                       22.9  ...   

   High Density Lipoprotein (HDL)  Triglyceride  \
0                            43.0         129.0   

   Aspartat Aminotransferaz (AST)  Alanin Aminotransferaz (ALT)  \
0                            19.0                          34.0   

   Alkaline Phosphatase (ALP)  Creatinine  Glomerular Filtration Rate (GFR)  \
0                        75.0        0.91                            110.63   

   C-Reactive Protein (CRP)  Hemoglobin (HGB)  Vitamin D  
0                       0.0              16.6       15.6  

[1 rows x 35 columns]


In [75]:
cleaned_data = prepare_input_for_model(raw_text)

# If the result is a nested DataFrame or Series, extract it
if isinstance(cleaned_data.iloc[0, 0], pd.DataFrame):
    cleaned_data = cleaned_data.iloc[0, 0]

In [76]:
if isinstance(cleaned_data, pd.DataFrame) and cleaned_data.shape == (1, 1):
    cleaned_data = cleaned_data.iloc[0, 0]  # Unpack inner dict
    cleaned_data = pd.DataFrame([cleaned_data])  # Convert to proper 2D

In [77]:
cleaned_data

Unnamed: 0,Age,Gender,Comorbidity,Diabetes Mellitus (DM),Height (cm),Weight (kg),Body Mass Index (BMI),Total Body Water (TBW),Extracellular Water (ECW),Intracellular Water (ICW),...,High Density Lipoprotein (HDL),Triglyceride,Aspartat Aminotransferaz (AST),Alanin Aminotransferaz (ALT),Alkaline Phosphatase (ALP),Creatinine,Glomerular Filtration Rate (GFR),C-Reactive Protein (CRP),Hemoglobin (HGB),Vitamin D
0,38.0,1,0,0,171.0,68.6,23.5,39.5,16.6,22.9,...,43.0,129.0,19.0,34.0,75.0,0.91,110.63,0.0,16.6,15.6


In [85]:
# Original DataFrame: cleaned_data
import pandas as pd

# Define column mapping
column_mapping = {
    'Height (cm)': 'Height',
    'Weight (kg)': 'Weight',
    'Lean Mass (%)': 'Lean Mass (LM) (%)',
    'Body Protein Content (%)': 'Body Protein Content (Protein) (%)',
    'Visceral Muscle Area (VMA)': 'Visceral Muscle Area (VMA) (Kg)'
}

# Rename the columns
cleaned_data_renamed = cleaned_data.rename(columns=column_mapping)

# Define expected columns
expected_columns = [
    'Age', 'Gender', 'Comorbidity',
    'Coronary Artery Disease (CAD)', 'Hypothyroidism', 'Hyperlipidemia',
    'Diabetes Mellitus (DM)', 'Height', 'Weight', 'Body Mass Index (BMI)',
    'Total Body Water (TBW)', 'Extracellular Water (ECW)',
    'Intracellular Water (ICW)',
    'Extracellular Fluid/Total Body Water (ECF/TBW)',
    'Total Body Fat Ratio (TBFR) (%)', 'Lean Mass (LM) (%)',
    'Body Protein Content (Protein) (%)', 'Visceral Fat Rating (VFR)',
    'Bone Mass (BM)', 'Muscle Mass (MM)', 'Obesity (%)',
    'Total Fat Content (TFC)', 'Visceral Fat Area (VFA)',
    'Visceral Muscle Area (VMA) (Kg)', 'Hepatic Fat Accumulation (HFA)',
    'Glucose', 'Total Cholesterol (TC)', 'Low Density Lipoprotein (LDL)',
    'High Density Lipoprotein (HDL)', 'Triglyceride',
    'Aspartat Aminotransferaz (AST)', 'Alanin Aminotransferaz (ALT)',
    'Alkaline Phosphatase (ALP)', 'Creatinine',
    'Glomerular Filtration Rate (GFR)', 'C-Reactive Protein (CRP)',
    'Hemoglobin (HGB)', 'Vitamin D'
]

# Add any missing expected columns with default value (e.g., 0 or NaN)
for col in expected_columns:
    if col not in cleaned_data_renamed.columns:
        cleaned_data_renamed[col] = 0  # or np.nan if preferable

# Reorder columns to match expected order
cleaned_data_ready = cleaned_data_renamed[expected_columns]

In [86]:
cleaned_data_ready

Unnamed: 0,Age,Gender,Comorbidity,Coronary Artery Disease (CAD),Hypothyroidism,Hyperlipidemia,Diabetes Mellitus (DM),Height,Weight,Body Mass Index (BMI),...,High Density Lipoprotein (HDL),Triglyceride,Aspartat Aminotransferaz (AST),Alanin Aminotransferaz (ALT),Alkaline Phosphatase (ALP),Creatinine,Glomerular Filtration Rate (GFR),C-Reactive Protein (CRP),Hemoglobin (HGB),Vitamin D
0,38.0,1,0,0,0,0,0,171.0,68.6,23.5,...,43.0,129.0,19.0,34.0,75.0,0.91,110.63,0.0,16.6,15.6


In [88]:
# Convert the first row to a dictionary
input_dict = cleaned_data_ready.iloc[0].to_dict()

# Pass it to the prediction function
label, probability = predict_gallstone(input_dict)

# Print results
print(f"Prediction: {label}")
print(f"Probability: {probability:.2f}%")

Prediction: Gallstone Detected
Probability: 76.30%


### **Complete Flow**

In [97]:
from google.colab import files
import pandas as pd
import sys
import os

# Let user upload a PDF
uploaded = files.upload()

# Use the uploaded file path
for filename in uploaded.keys():
    pdf_path = os.path.join('/content', filename)
    extract_lab_report_data(pdf_path)

# Import dependencies
sys.path.append('/content/mediscan_ai/app')
from pdf_extractor import extract_lab_report_data, prepare_input_for_model
from predictor import predict_gallstone

def predict_from_pdf(pdf_path):
    # Step 1: Extract raw text from the PDF
    raw_text = extract_lab_report_data(pdf_path)

    # Step 2: Prepare input for the model
    cleaned_data = prepare_input_for_model(raw_text)

    # Step 3: Handle nested structure if any
    if isinstance(cleaned_data.iloc[0, 0], pd.DataFrame):
        cleaned_data = cleaned_data.iloc[0, 0]

    if isinstance(cleaned_data, pd.DataFrame) and cleaned_data.shape == (1, 1):
        cleaned_data = cleaned_data.iloc[0, 0]  # Unpack inner dict
        cleaned_data = pd.DataFrame([cleaned_data])  # Convert to proper 2D

    # Step 4: Rename columns if necessary
    column_mapping = {
        'Height (cm)': 'Height',
        'Weight (kg)': 'Weight',
        'Lean Mass (%)': 'Lean Mass (LM) (%)',
        'Body Protein Content (%)': 'Body Protein Content (Protein) (%)',
        'Visceral Muscle Area (VMA)': 'Visceral Muscle Area (VMA) (Kg)'
    }
    cleaned_data_renamed = cleaned_data.rename(columns=column_mapping)

    # Step 5: Add any missing columns
    expected_columns = [
        'Age', 'Gender', 'Comorbidity',
        'Coronary Artery Disease (CAD)', 'Hypothyroidism', 'Hyperlipidemia',
        'Diabetes Mellitus (DM)', 'Height', 'Weight', 'Body Mass Index (BMI)',
        'Total Body Water (TBW)', 'Extracellular Water (ECW)',
        'Intracellular Water (ICW)',
        'Extracellular Fluid/Total Body Water (ECF/TBW)',
        'Total Body Fat Ratio (TBFR) (%)', 'Lean Mass (LM) (%)',
        'Body Protein Content (Protein) (%)', 'Visceral Fat Rating (VFR)',
        'Bone Mass (BM)', 'Muscle Mass (MM)', 'Obesity (%)',
        'Total Fat Content (TFC)', 'Visceral Fat Area (VFA)',
        'Visceral Muscle Area (VMA) (Kg)', 'Hepatic Fat Accumulation (HFA)',
        'Glucose', 'Total Cholesterol (TC)', 'Low Density Lipoprotein (LDL)',
        'High Density Lipoprotein (HDL)', 'Triglyceride',
        'Aspartat Aminotransferaz (AST)', 'Alanin Aminotransferaz (ALT)',
        'Alkaline Phosphatase (ALP)', 'Creatinine',
        'Glomerular Filtration Rate (GFR)', 'C-Reactive Protein (CRP)',
        'Hemoglobin (HGB)', 'Vitamin D'
    ]

    for col in expected_columns:
        if col not in cleaned_data_renamed.columns:
            cleaned_data_renamed[col] = 0  # or np.nan if preferable

    # Step 6: Reorder columns
    cleaned_data_ready = cleaned_data_renamed[expected_columns]

    # Step 7: Predict
    input_dict = cleaned_data_ready.iloc[0].to_dict()
    label, probability = predict_gallstone(input_dict)

    # Step 8: Output
    print(f"Prediction: {label}")
    print(f"Probability: {probability:.2f}%")
    #return label, probability
predict_from_pdf(pdf_path)

Saving Lab Test_1.pdf to Lab Test_1 (4).pdf
Prediction: Gallstone Detected
Probability: 76.30%


In [96]:
# Once the file is uploaded
predict_from_pdf(pdf_path)

Prediction: Gallstone Detected
Probability: 76.30%


### **Report Generator.py**

In [100]:
!pip install fpdf



In [104]:
# 📄 Step: Create pdf_extractor.py
report_generator_code = '''
from fpdf import FPDF
import os
from datetime import datetime

class PDFReport(FPDF):
    def header(self):
        self.set_font("Arial", "B", 16)
        self.cell(0, 10, "Gallstone Prediction Report", ln=True, align="C")
        self.ln(10)

    def footer(self):
        self.set_y(-15)
        self.set_font("Arial", "I", 8)
        self.cell(0, 10, f"Generated on {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}", align="C")

def generate_pdf_report(prediction_label, probability, output_path="gallstone_report.pdf", patient_name=None):
    pdf = PDFReport()
    pdf.add_page()

    pdf.set_font("Arial", "", 12)

    if patient_name:
        pdf.cell(0, 10, f"Patient Name: {patient_name}", ln=True)

    pdf.cell(0, 10, f"Prediction: {prediction_label}", ln=True)
    pdf.cell(0, 10, f"Probability: {probability:.2f}%", ln=True)

    pdf.output(output_path)

    print(f"✅ Report saved as: {os.path.abspath(output_path)}")
    return os.path.abspath(output_path)
'''

# 📝 Write code to file
with open("/content/mediscan_ai/app/report_generator.py", "w") as f:
    f.write(report_generator_code)

print("report_generator.py created successfully.")

report_generator.py created successfully.


In [106]:
from report_generator import generate_pdf_report

# After prediction
label, probability = predict_gallstone(input_dict)

# Generate report
generate_pdf_report(label, probability, patient_name="John Doe")

✅ Report saved as: /content/gallstone_report.pdf


'/content/gallstone_report.pdf'