<a href="https://colab.research.google.com/github/alaaatefbediwi-ds/graduation_project/blob/main/Another_copy_of_modules.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### **mediscan_ai folder**

In [1]:
# Creating folder structure
import os

base_path = "/content/mediscan_ai"
folders = [
    f"{base_path}/data",
    f"{base_path}/models",
    f"{base_path}/app"
]

for folder in folders:
    os.makedirs(folder, exist_ok=True)

print(" Folder structure created.")

 Folder structure created.


### **preprocessing.py**

In [2]:
#  Step 2: Create preprocessing.py
preprocessing_code = '''
# preprocessing.py

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

# Feature lists
numeric_features = [
    'Age', 'Height', 'Weight', 'Body Mass Index (BMI)', 'Total Body Water (TBW)',
    'Extracellular Water (ECW)', 'Intracellular Water (ICW)',
    'Extracellular Fluid/Total Body Water (ECF/TBW)',
    'Total Body Fat Ratio (TBFR) (%)', 'Lean Mass (LM) (%)',
    'Body Protein Content (Protein) (%)', 'Visceral Fat Rating (VFR)',
    'Bone Mass (BM)', 'Muscle Mass (MM)', 'Obesity (%)',
    'Total Fat Content (TFC)', 'Visceral Fat Area (VFA)',
    'Visceral Muscle Area (VMA) (Kg)', 'Hepatic Fat Accumulation (HFA)',
    'Glucose', 'Total Cholesterol (TC)', 'Low Density Lipoprotein (LDL)',
    'High Density Lipoprotein (HDL)', 'Triglyceride',
    'Aspartat Aminotransferaz (AST)', 'Alanin Aminotransferaz (ALT)',
    'Alkaline Phosphatase (ALP)', 'Creatinine', 'Glomerular Filtration Rate (GFR)',
    'C-Reactive Protein (CRP)', 'Hemoglobin (HGB)', 'Vitamin D'
]

# Split categorical features based on fill strategy
fill_zero_features = ['Comorbidity', 'Diabetes Mellitus (DM)']
fill_most_frequent_features = ['Gender']
categorical_features = fill_zero_features + fill_most_frequent_features

# Pipelines
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler())
])

# Gender or other categorical features filled with most frequent and then one-hot encoded
cat_most_freq_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown='ignore')) #, sparse=False))
])

# Comorbidity & DM: filled with zero and then one-hot encoded
cat_zero_fill_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="constant", fill_value=0)),
    ("encoder", OneHotEncoder(handle_unknown='ignore')) #, sparse=False))
])

# Combined preprocessor
preprocessor = ColumnTransformer(transformers=[
    ("num", numeric_transformer, numeric_features),
    ("cat_mostfreq", cat_most_freq_transformer, fill_most_frequent_features),
    ("cat_zerofill", cat_zero_fill_transformer, fill_zero_features)
])

#categorical_features = fill_zero_features + fill_most_frequent_features
'''

with open("/content/mediscan_ai/app/preprocessing.py", "w") as f:
    f.write(preprocessing_code)

print(" preprocessing.py created.")

 preprocessing.py created.


### **train_model.py**

In [3]:
#  Step 3: Create train_model.py
train_model_code = '''
# train_model.py

import pandas as pd
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier
from sklearn.pipeline import Pipeline
import joblib
import os

from preprocessing import preprocessor, numeric_features, categorical_features

# Load data
data = pd.read_excel('/content/Gallstone.xlsx')

#  Reverse the label so that:
# 1 → Has Gallstone, 0 → No Gallstone (more intuitive)
#data['Gallstone Status'] = data['Gallstone Status'].map({0: 1, 1: 0})

# Features and target
X = data[numeric_features + categorical_features]
y = data['Gallstone Status']

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Full pipeline with model
full_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", CatBoostClassifier(verbose=0, random_state=42))
])

# Train
full_pipeline.fit(X_train, y_train)

# Save the pipeline
os.makedirs('/content/mediscan_ai/models', exist_ok=True)
joblib.dump(full_pipeline, '/content/mediscan_ai/models/gallstone_model.pkl')

print("Model pipeline saved successfully.")
'''

with open("/content/mediscan_ai/app/train_model.py", "w") as f:
    f.write(train_model_code)

print("train_model.py created.")

train_model.py created.


In [4]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


### **predictor.py**

In [5]:
#  Step 4: Create predictor.py
predictor_code = '''
# predictor.py

import joblib
import pandas as pd

# Load the full pipeline (preprocessing + model)
model = joblib.load('/content/mediscan_ai/models/gallstone_model.pkl')

def predict_gallstone(lab_results: dict):
    """
    Predict gallstone status based on input lab results.

    Parameters:
        lab_results (dict): Dictionary of feature_name: value

    Returns:
        prediction_label (str), probability (float)
    """
    # Convert input dict to DataFrame
    input_df = pd.DataFrame([lab_results])  # Single row

    # Predict using the full pipeline
    #prediction = model.predict(input_df)[0]
    #probability = model.predict_proba(input_df)[0][0]  # Probability of "gallstone" class

    #label = "Gallstone Detected" if prediction == 0 else "No Gallstone Detected"
    prediction = model.predict(input_df)[0]
    probas = model.predict_proba(input_df)[0]

    if prediction == 0:
        label = "Gallstone Detected"
        probability = probas[0]  # Probability of class 0 (gallstone)
    else:
        label = "No Gallstone Detected"
        probability = probas[1]  # Probability of class 1 (no gallstone)

    return label, round(probability * 100, 2)  # Return percentage
    #return label, probability
'''

with open("/content/mediscan_ai/app/predictor.py", "w") as f:
    f.write(predictor_code)

print("predictor.py created.")

predictor.py created.


In [6]:
# Step 5: Run Training Script
%run /content/mediscan_ai/app/train_model.py

Model pipeline saved successfully.


In [7]:
import sys
sys.path.append("/content/mediscan_ai")  # Adjust to your actual path

In [9]:
!pip install -q catboost openpyxl

### **pdf_extractor.py**

In [10]:
!apt-get install poppler-utils -y
!apt-get install tesseract-ocr -y
!pip install pytesseract pdf2image pillow

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following NEW packages will be installed:
  poppler-utils
0 upgraded, 1 newly installed, 0 to remove and 35 not upgraded.
Need to get 186 kB of archives.
After this operation, 697 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 poppler-utils amd64 22.02.0-2ubuntu0.8 [186 kB]
Fetched 186 kB in 0s (374 kB/s)
Selecting previously unselected package poppler-utils.
(Reading database ... 126111 files and directories currently installed.)
Preparing to unpack .../poppler-utils_22.02.0-2ubuntu0.8_amd64.deb ...
Unpacking poppler-utils (22.02.0-2ubuntu0.8) ...
Setting up poppler-utils (22.02.0-2ubuntu0.8) ...
Processing triggers for man-db (2.10.2-1) ...
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
0 upgraded, 0 newly installed

In [11]:
# Step: Create pdf_extractor.py
pdf_extractor_code = '''
import re
import pytesseract
from pdf2image import convert_from_path
from PIL import Image

def pdf_to_text_via_ocr(pdf_path, dpi=300):
    images = convert_from_path(pdf_path, dpi=dpi)
    full_text = ""
    for img in images:
        text = pytesseract.image_to_string(img)
        text = re.sub(r"[^\x00-\x7F]+", " ", text)  # Remove non-ASCII
        full_text += text + "\n"
    return full_text

def extract_lab_report_data(pdf_path):
    # Step 1: OCR + Cleaning
    text = pdf_to_text_via_ocr(pdf_path)
    text = re.sub(r'\s{2,}', ' ', text)
    text = re.sub(r'\n+', '\n', text)
    text = re.sub(r'(?<=\w): ([^:\n]+?)(?= \w+:)', r': \1\n', text)

    # Step 2: Regex patterns
    patterns = {
        'Hospital Name': r'Hospital\s*:\s*(.+)',
        'Lab Name': r'Lab Name\s*:\s*(.+)',
        'Patient Name': r'Name\s*:\s*(.+)',
        'Gender': r'Gender\s*:\s*(Male|Female)',
        'Age': r'Age\s*:\s*(\d+)',
        'Lab Date': r'Date\s*:\s*(\w+ \d{2}, \d{4})',
        'Comorbidity': r'Comorbidity\s*:\s*(\w+)',
        'Diabetes Mellitus (DM)': r'Diabetes Mellitus \(DM\)\s*:\s*(\w+)',
        'Body Mass Index (BMI)': r'Body Mass Index \(BMI\)\s*:\s*([\d.]+)',
        'Height (cm)': r'Height\s*:\s*(\d+)',
        'Weight (kg)': r'Weight\s*:\s*([\d.]+)',
        'Total Body Water (TBW)': r'Total Body Water \(TBW\)\s*:\s*([\d.]+)',
        'Extracellular Water (ECW)': r'Extracellular Water \(ECW\)\s*:\s*([\d.]+)',
        'Intracellular Water (ICW)': r'Intracellular Water \(ICW\)\s*:\s*([\d.]+)',
        'Extracellular Fluid/Total Body Water (ECF/TBW)': r'Extracellular Fluid/Total Body Water \(ECF/TBW\)\s*:\s*([\d.]+)',
        'Total Body Fat Ratio (TBFR) (%)': r'Total Body Fat Ratio \(TBFR\)\s*:\s*([\d.]+)%',
        'Lean Mass (%)': r'Lean Mass \(LM\)\s*:\s*([\d.]+)%',
        'Body Protein Content (%)': r'Body Protein Content\s*:\s*([\d.]+)%',
        'Visceral Fat Rating (VFR)': r'Visceral Fat Rating \(VFR\)\s*:\s*([\d.]+)',
        'Bone Mass (BM)': r'Bone Mass\s*:\s*([\d.]+)\s*kg',
        'Muscle Mass (MM)': r'Muscle Mass \(MM\)\s*:\s*([\d.]+)\s*kg',
        'Obesity (%)': r'Obesity\s*:\s*([\d.]+)%',
        'Total Fat Content (TFC)': r'Total Fat Content \(TFC\)\s*:\s*([\d.]+)',
        'Visceral Fat Area (VFA)': r'Visceral Fat Area \(VFA\)\s*:\s*([\d.]+)',
        'Visceral Muscle Area (VMA)': r'Visceral Muscle Area \(VMA\)\s*:\s*([\d.]+)',
        'Hepatic Fat Accumulation (HFA)': r'Hepatic Fat Accumulation \(HFA\)\s*:\s*([\d.]+)',
        'Glucose': r'Glucose\s*:\s*([\d.]+)',
        'Total Cholesterol (TC)': r'Total Cholesterol \(TC\)\s*:\s*([\d.]+)',
        'Low Density Lipoprotein (LDL)': r'Low Density Lipoprotein \(LDL\)\s*:\s*([\d.]+)',
        'High Density Lipoprotein (HDL)': r'High Density Lipoprotein \(HDL\)\s*:\s*([\d.]+)',
        'Triglyceride': r'Triglyceride\s*:\s*([\d.]+)',
        'Aspartat Aminotransferaz (AST)': r'AST\s*:\s*([\d.]+)',
        'Alanin Aminotransferaz (ALT)': r'ALT\s*:\s*([\d.]+)',
        'Alkaline Phosphatase (ALP)': r'ALP\s*:\s*([\d.]+)',
        'Creatinine': r'Creatinine\s*:\s*([\d.]+)',
        'Glomerular Filtration Rate (GFR)': r'GFR\s*:\s*([\d.]+)',
        'C-Reactive Protein (CRP)': r'C-Reactive Protein \(CRP\)\s*:\s*([\d.]+)',
        'Hemoglobin (HGB)': r'Hemoglobin \(HGB\)\s*:\s*([\d.]+)',
        'Vitamin D': r'Vitamin D\s*:\s*([\d.]+)',
    }

    # Step 3: Extract values
    results = {}
    for field, pattern in patterns.items():
        match = re.search(r'\s*' + pattern, text)
        results[field] = match.group(1).strip() if match else 'Not found'

    return results
'''

# Write code to file
with open("/content/mediscan_ai/app/pdf_extractor.py", "w") as f:
    f.write(pdf_extractor_code)

print("pdf_extractor.py created successfully.")

pdf_extractor.py created successfully.


In [14]:
### Test Example 1
import sys
sys.path.append('/content/mediscan_ai')

from app.pdf_extractor import extract_lab_report_data

pdf_path = "/content/Lab Test_1.pdf"
results = extract_lab_report_data(pdf_path)

for key, value in results.items():
    print(f"{key}: {value}")

Hospital Name: Alexandria General Hospital
Lab Name: Gallstone Diagnosis
Patient Name: Mohamed Ali
Gender: Male
Age: 38
Lab Date: June 09, 2025
Comorbidity: No
Diabetes Mellitus (DM): No
Body Mass Index (BMI): 23.5
Height (cm): 171
Weight (kg): 68.6
Total Body Water (TBW): 39.5
Extracellular Water (ECW): 16.6
Intracellular Water (ICW): 22.9
Extracellular Fluid/Total Body Water (ECF/TBW): 42
Total Body Fat Ratio (TBFR) (%): 19.2
Lean Mass (%): 80.76
Body Protein Content (%): 17.28
Visceral Fat Rating (VFR): 6
Bone Mass (BM): 2.8
Muscle Mass (MM): 52.6
Obesity (%): 6.7
Total Fat Content (TFC): 13.2
Visceral Fat Area (VFA): 8.2
Visceral Muscle Area (VMA): 28.8
Hepatic Fat Accumulation (HFA): 0
Glucose: 93
Total Cholesterol (TC): 239
Low Density Lipoprotein (LDL): 169
High Density Lipoprotein (HDL): 43
Triglyceride: 129
Aspartat Aminotransferaz (AST): 19
Alanin Aminotransferaz (ALT): 34
Alkaline Phosphatase (ALP): 75
Creatinine: 0.91
Glomerular Filtration Rate (GFR): 110.63
C-Reactive Prot

In [15]:
print(results)

{'Hospital Name': 'Alexandria General Hospital', 'Lab Name': 'Gallstone Diagnosis', 'Patient Name': 'Mohamed Ali', 'Gender': 'Male', 'Age': '38', 'Lab Date': 'June 09, 2025', 'Comorbidity': 'No', 'Diabetes Mellitus (DM)': 'No', 'Body Mass Index (BMI)': '23.5', 'Height (cm)': '171', 'Weight (kg)': '68.6', 'Total Body Water (TBW)': '39.5', 'Extracellular Water (ECW)': '16.6', 'Intracellular Water (ICW)': '22.9', 'Extracellular Fluid/Total Body Water (ECF/TBW)': '42', 'Total Body Fat Ratio (TBFR) (%)': '19.2', 'Lean Mass (%)': '80.76', 'Body Protein Content (%)': '17.28', 'Visceral Fat Rating (VFR)': '6', 'Bone Mass (BM)': '2.8', 'Muscle Mass (MM)': '52.6', 'Obesity (%)': '6.7', 'Total Fat Content (TFC)': '13.2', 'Visceral Fat Area (VFA)': '8.2', 'Visceral Muscle Area (VMA)': '28.8', 'Hepatic Fat Accumulation (HFA)': '0', 'Glucose': '93', 'Total Cholesterol (TC)': '239', 'Low Density Lipoprotein (LDL)': '169', 'High Density Lipoprotein (HDL)': '43', 'Triglyceride': '129', 'Aspartat Aminot

### **Combining pdf_extractor.py with passing data to the model for prediction**

In [18]:
# pdf_extractor.py
# Step: Create pdf_extractor.py
pdf_extractor_code = '''
import re
import pytesseract
import pandas as pd
from pdf2image import convert_from_path
from PIL import Image

# ----------------------------
# Constants (put at the top)
# ----------------------------
MODEL_FEATURES = [
    'Age', 'Gender', 'Comorbidity', 'Diabetes Mellitus (DM)', 'Height (cm)', 'Weight (kg)',
    'Body Mass Index (BMI)', 'Total Body Water (TBW)', 'Extracellular Water (ECW)',
    'Intracellular Water (ICW)', 'Extracellular Fluid/Total Body Water (ECF/TBW)',
    'Total Body Fat Ratio (TBFR) (%)', 'Lean Mass (%)', 'Body Protein Content (%)',
    'Visceral Fat Rating (VFR)', 'Bone Mass (BM)', 'Muscle Mass (MM)', 'Obesity (%)',
    'Total Fat Content (TFC)', 'Visceral Fat Area (VFA)', 'Visceral Muscle Area (VMA)',
    'Hepatic Fat Accumulation (HFA)', 'Glucose', 'Total Cholesterol (TC)',
    'Low Density Lipoprotein (LDL)', 'High Density Lipoprotein (HDL)', 'Triglyceride',
    'Aspartat Aminotransferaz (AST)', 'Alanin Aminotransferaz (ALT)',
    'Alkaline Phosphatase (ALP)', 'Creatinine', 'Glomerular Filtration Rate (GFR)',
    'C-Reactive Protein (CRP)', 'Hemoglobin (HGB)', 'Vitamin D'
]

gender_map = {"Male": 1, "Female": 0}
yes_no_map = {"Yes": 1, "No": 0}

# -------------------------------------
# Function to prepare input for model
# -------------------------------------
def prepare_input_for_model(extracted_data: dict):
    gender = gender_map.get(extracted_data.get("Gender", ""), 0)
    comorbidity = yes_no_map.get(extracted_data.get("Comorbidity", ""), 0)
    diabetes = yes_no_map.get(extracted_data.get("Diabetes Mellitus (DM)", ""), 0)

    model_input = {
        'Age': float(extracted_data.get('Age', 0)),
        'Gender': gender,
        'Comorbidity': comorbidity,
        'Diabetes Mellitus (DM)': diabetes,
        'Height (cm)': float(extracted_data.get('Height (cm)', 0)),
        'Weight (kg)': float(extracted_data.get('Weight (kg)', 0)),
        'Body Mass Index (BMI)': float(extracted_data.get('Body Mass Index (BMI)', 0)),
        'Total Body Water (TBW)': float(extracted_data.get('Total Body Water (TBW)', 0)),
        'Extracellular Water (ECW)': float(extracted_data.get('Extracellular Water (ECW)', 0)),
        'Intracellular Water (ICW)': float(extracted_data.get('Intracellular Water (ICW)', 0)),
        'Extracellular Fluid/Total Body Water (ECF/TBW)': float(extracted_data.get('Extracellular Fluid/Total Body Water (ECF/TBW)', 0)),
        'Total Body Fat Ratio (TBFR) (%)': float(extracted_data.get('Total Body Fat Ratio (TBFR) (%)', 0)),
        'Lean Mass (%)': float(extracted_data.get('Lean Mass (%)', 0)),
        'Body Protein Content (%)': float(extracted_data.get('Body Protein Content (%)', 0)),
        'Visceral Fat Rating (VFR)': float(extracted_data.get('Visceral Fat Rating (VFR)', 0)),
        'Bone Mass (BM)': float(extracted_data.get('Bone Mass (BM)', 0)),
        'Muscle Mass (MM)': float(extracted_data.get('Muscle Mass (MM)', 0)),
        'Obesity (%)': float(extracted_data.get('Obesity (%)', 0)),
        'Total Fat Content (TFC)': float(extracted_data.get('Total Fat Content (TFC)', 0)),
        'Visceral Fat Area (VFA)': float(extracted_data.get('Visceral Fat Area (VFA)', 0)),
        'Visceral Muscle Area (VMA)': float(extracted_data.get('Visceral Muscle Area (VMA)', 0)),
        'Hepatic Fat Accumulation (HFA)': float(extracted_data.get('Hepatic Fat Accumulation (HFA)', 0)),
        'Glucose': float(extracted_data.get('Glucose', 0)),
        'Total Cholesterol (TC)': float(extracted_data.get('Total Cholesterol (TC)', 0)),
        'Low Density Lipoprotein (LDL)': float(extracted_data.get('Low Density Lipoprotein (LDL)', 0)),
        'High Density Lipoprotein (HDL)': float(extracted_data.get('High Density Lipoprotein (HDL)', 0)),
        'Triglyceride': float(extracted_data.get('Triglyceride', 0)),
        'Aspartat Aminotransferaz (AST)': float(extracted_data.get('Aspartat Aminotransferaz (AST)', 0)),
        'Alanin Aminotransferaz (ALT)': float(extracted_data.get('Alanin Aminotransferaz (ALT)', 0)),
        'Alkaline Phosphatase (ALP)': float(extracted_data.get('Alkaline Phosphatase (ALP)', 0)),
        'Creatinine': float(extracted_data.get('Creatinine', 0)),
        'Glomerular Filtration Rate (GFR)': float(extracted_data.get('Glomerular Filtration Rate (GFR)', 0)),
        'C-Reactive Protein (CRP)': float(extracted_data.get('C-Reactive Protein (CRP)', 0)),
        'Hemoglobin (HGB)': float(extracted_data.get('Hemoglobin (HGB)', 0)),
        'Vitamin D': float(extracted_data.get('Vitamin D', 0)),
    }

    return pd.DataFrame([model_input])

# --------------------------
# OCR and Extraction Logic
# --------------------------
def pdf_to_text_via_ocr(pdf_path, dpi=300):
    images = convert_from_path(pdf_path, dpi=dpi)
    full_text = ""
    for img in images:
        text = pytesseract.image_to_string(img)
        text = re.sub(r"[^\x00-\x7F]+", " ", text)  # Remove non-ASCII
        full_text += text + "\n"
    return full_text

def extract_lab_report_data(pdf_path):
    # Step 1: OCR + Cleaning
    text = pdf_to_text_via_ocr(pdf_path)
    text = re.sub(r'\s{2,}', ' ', text)
    text = re.sub(r'\n+', '\n', text)
    text = re.sub(r'(?<=\w): ([^:\n]+?)(?= \w+:)', r': \1\n', text)

    # Step 2: Regex patterns (truncated for brevity; insert full dictionary)
    patterns = {
        'Hospital Name': r'Hospital\s*:\s*(.+)',
        'Lab Name': r'Lab Name\s*:\s*(.+)',
        'Patient Name': r'Name\s*:\s*(.+)',
        'Gender': r'Gender\s*:\s*(Male|Female)',
        'Age': r'Age\s*:\s*(\d+)',
        'Lab Date': r'Date\s*:\s*(\w+ \d{2}, \d{4})',
        'Comorbidity': r'Comorbidity\s*:\s*(\w+)',
        'Diabetes Mellitus (DM)': r'Diabetes Mellitus \(DM\)\s*:\s*(\w+)',
        'Body Mass Index (BMI)': r'Body Mass Index \(BMI\)\s*:\s*([\d.]+)',
        'Height (cm)': r'Height\s*:\s*(\d+)',
        'Weight (kg)': r'Weight\s*:\s*([\d.]+)',
        'Total Body Water (TBW)': r'Total Body Water \(TBW\)\s*:\s*([\d.]+)',
        'Extracellular Water (ECW)': r'Extracellular Water \(ECW\)\s*:\s*([\d.]+)',
        'Intracellular Water (ICW)': r'Intracellular Water \(ICW\)\s*:\s*([\d.]+)',
        'Extracellular Fluid/Total Body Water (ECF/TBW)': r'Extracellular Fluid/Total Body Water \(ECF/TBW\)\s*:\s*([\d.]+)',
        'Total Body Fat Ratio (TBFR) (%)': r'Total Body Fat Ratio \(TBFR\)\s*:\s*([\d.]+)%',
        'Lean Mass (%)': r'Lean Mass \(LM\)\s*:\s*([\d.]+)%',
        'Body Protein Content (%)': r'Body Protein Content\s*:\s*([\d.]+)%',
        'Visceral Fat Rating (VFR)': r'Visceral Fat Rating \(VFR\)\s*:\s*([\d.]+)',
        'Bone Mass (BM)': r'Bone Mass\s*:\s*([\d.]+)\s*kg',
        'Muscle Mass (MM)': r'Muscle Mass \(MM\)\s*:\s*([\d.]+)\s*kg',
        'Obesity (%)': r'Obesity\s*:\s*([\d.]+)%',
        'Total Fat Content (TFC)': r'Total Fat Content \(TFC\)\s*:\s*([\d.]+)',
        'Visceral Fat Area (VFA)': r'Visceral Fat Area \(VFA\)\s*:\s*([\d.]+)',
        'Visceral Muscle Area (VMA)': r'Visceral Muscle Area \(VMA\)\s*:\s*([\d.]+)',
        'Hepatic Fat Accumulation (HFA)': r'Hepatic Fat Accumulation \(HFA\)\s*:\s*([\d.]+)',
        'Glucose': r'Glucose\s*:\s*([\d.]+)',
        'Total Cholesterol (TC)': r'Total Cholesterol \(TC\)\s*:\s*([\d.]+)',
        'Low Density Lipoprotein (LDL)': r'Low Density Lipoprotein \(LDL\)\s*:\s*([\d.]+)',
        'High Density Lipoprotein (HDL)': r'High Density Lipoprotein \(HDL\)\s*:\s*([\d.]+)',
        'Triglyceride': r'Triglyceride\s*:\s*([\d.]+)',
        'Aspartat Aminotransferaz (AST)': r'AST\s*:\s*([\d.]+)',
        'Alanin Aminotransferaz (ALT)': r'ALT\s*:\s*([\d.]+)',
        'Alkaline Phosphatase (ALP)': r'ALP\s*:\s*([\d.]+)',
        'Creatinine': r'Creatinine\s*:\s*([\d.]+)',
        'Glomerular Filtration Rate (GFR)': r'GFR\s*:\s*([\d.]+)',
        'C-Reactive Protein (CRP)': r'C-Reactive Protein \(CRP\)\s*:\s*([\d.]+)',
        'Hemoglobin (HGB)': r'Hemoglobin \(HGB\)\s*:\s*([\d.]+)',
        'Vitamin D': r'Vitamin D\s*:\s*([\d.]+)',
    }

    extracted_data = {}
    for key, pattern in patterns.items():
        match = re.search(pattern, text)
        if match:
            extracted_data[key] = match.group(1).strip()

    return extracted_data
'''

# Write code to file
with open("/content/mediscan_ai/app/pdf_extractor.py", "w") as f:
    f.write(pdf_extractor_code)

print("pdf_extractor.py created successfully.")

pdf_extractor.py created successfully.


### **report_generator.py**

In [28]:
report_generator_code = '''
from fpdf import FPDF

def generate_pdf_report(label, probability, patient_info, output_path):
    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Arial", size=12)

    pdf.cell(200, 10, txt="Gallstone Disease Prediction Report", ln=True, align="C")
    pdf.ln(10)

    pdf.set_font("Arial", size=10)
    for key, value in patient_info.items():
        pdf.cell(200, 10, txt=f"{key}: {value}", ln=True)

    pdf.ln(10)
    pdf.set_font("Arial", size=12)
    pdf.cell(200, 10, txt=f"Prediction: {label}", ln=True)
    pdf.cell(200, 10, txt=f"Probability: {probability:.2f}%", ln=True)

    pdf.output(output_path)
'''
#  Write code to file
with open("/content/mediscan_ai/app/report_generator.py", "w") as f:
    f.write(report_generator_code)

print("report_generator.py created successfully.")

report_generator.py created successfully.


### **Final Code with Uploading PDF**

In [34]:
!pip install fpdf



In [35]:
import pandas as pd
from pdf_extractor import extract_lab_report_data, prepare_input_for_model
from predictor import predict_gallstone
from report_generator import generate_pdf_report

#  For file upload (Google Colab or Jupyter Notebook)
from google.colab import files
uploaded = files.upload()

# Get the uploaded file path
pdf_path = list(uploaded.keys())[0]

def process_pdf_and_generate_report(pdf_path, output_report_path="diagnosis_report.pdf"):
    raw_data = extract_lab_report_data(pdf_path)
    cleaned_data = prepare_input_for_model(raw_data)

    if isinstance(cleaned_data.iloc[0, 0], pd.DataFrame):
        cleaned_data = cleaned_data.iloc[0, 0]

    if isinstance(cleaned_data, pd.DataFrame) and cleaned_data.shape == (1, 1):
        cleaned_data = cleaned_data.iloc[0, 0]
        cleaned_data = pd.DataFrame([cleaned_data])

    column_mapping = {
        'Height (cm)': 'Height',
        'Weight (kg)': 'Weight',
        'Lean Mass (%)': 'Lean Mass (LM) (%)',
        'Body Protein Content (%)': 'Body Protein Content (Protein) (%)',
        'Visceral Muscle Area (VMA)': 'Visceral Muscle Area (VMA) (Kg)'
    }
    cleaned_data = cleaned_data.rename(columns=column_mapping)

    expected_columns = [
        'Age', 'Gender', 'Comorbidity', 'Coronary Artery Disease (CAD)', 'Hypothyroidism',
        'Hyperlipidemia', 'Diabetes Mellitus (DM)', 'Height', 'Weight', 'Body Mass Index (BMI)',
        'Total Body Water (TBW)', 'Extracellular Water (ECW)', 'Intracellular Water (ICW)',
        'Extracellular Fluid/Total Body Water (ECF/TBW)', 'Total Body Fat Ratio (TBFR) (%)',
        'Lean Mass (LM) (%)', 'Body Protein Content (Protein) (%)', 'Visceral Fat Rating (VFR)',
        'Bone Mass (BM)', 'Muscle Mass (MM)', 'Obesity (%)', 'Total Fat Content (TFC)',
        'Visceral Fat Area (VFA)', 'Visceral Muscle Area (VMA) (Kg)', 'Hepatic Fat Accumulation (HFA)',
        'Glucose', 'Total Cholesterol (TC)', 'Low Density Lipoprotein (LDL)',
        'High Density Lipoprotein (HDL)', 'Triglyceride', 'Aspartat Aminotransferaz (AST)',
        'Alanin Aminotransferaz (ALT)', 'Alkaline Phosphatase (ALP)', 'Creatinine',
        'Glomerular Filtration Rate (GFR)', 'C-Reactive Protein (CRP)', 'Hemoglobin (HGB)',
        'Vitamin D'
    ]
    for col in expected_columns:
        if col not in cleaned_data.columns:
            cleaned_data[col] = 0

    cleaned_data = cleaned_data[expected_columns]

    input_dict = cleaned_data.iloc[0].to_dict()
    label, probability = predict_gallstone(input_dict)

    patient_info = {
        "Patient Name": raw_data.get("Patient Name", "Unknown"),
        "Gender": raw_data.get("Gender", "Unknown"),
        "Age": raw_data.get("Age", "Unknown"),
        "Hospital Name": raw_data.get("Hospital Name", "Unknown"),
        "Lab Name": raw_data.get("Lab Name", "Unknown"),
        "Lab Date": raw_data.get("Lab Date", "Unknown")
    }

    generate_pdf_report(label, probability, patient_info, output_report_path)

    print(f"Prediction: {label}")
    print(f"Probability: {probability:.2f}%")
    print(f"Report generated at: {output_report_path}")


# Call the function with uploaded file
process_pdf_and_generate_report(pdf_path)

Saving Lab Test_2.pdf to Lab Test_2 (3).pdf
Prediction: Gallstone Detected
Probability: 64.96%
Report generated at: diagnosis_report.pdf
