<a href="https://colab.research.google.com/github/b-ukusi/cystpredict2/blob/main/modelF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install CatBoost into the notebook runtime; -q keeps pip's output quiet.
!pip install catboost -q


In [None]:
# Upgrade PyCaret, ReportLab and scikit-learn while pinning NumPy to 1.26.4
# and holding pandas below 2.2.0.
!pip install --upgrade \
  "numpy==1.26.4" \
  "pandas<2.2.0" \
  pycaret \
  reportlab \
  scikit-learn


In [None]:
import numpy
import pandas

# Show which NumPy and pandas versions are active in this runtime.
print(f"NumPy: {numpy.__version__}")
print(f"Pandas: {pandas.__version__}")


In [None]:
# ✅ 1. Install essentials (run once per session)
# Upgrades PyCaret and scikit-learn with pandas pinned to 2.1.4 and NumPy to 1.26.4,
# then installs the CatBoost and LightGBM libraries; -q keeps pip's output quiet.
!pip install -U pycaret pandas==2.1.4 numpy==1.26.4 scikit-learn -q
!pip install catboost lightgbm -q  # Add strong regressors

# ✅ 2. Imports
import pandas as pd, numpy as np, re
from pycaret.regression import (
    setup as setup_reg, compare_models, tune_model,
    finalize_model, save_model, evaluate_model
)

# ✅ 3. Load the tracking data and sanitise it
df = pd.read_csv('https://raw.githubusercontent.com/b-ukusi/datasets/main/Ovarian%20Cyst%20Track%20Data.csv')

# Trim each header, then collapse every run of characters outside
# [A-Za-z0-9_] into a single underscore.
sanitised_headers = [re.sub(r'[^A-Za-z0-9_]+', '_', c.strip()) for c in df.columns]
df.columns = sanitised_headers

# Discard columns whose name starts with "Unnamed".
keep_mask = ~df.columns.str.contains('^Unnamed')
df = df.loc[:, keep_mask]

# Target: the monthly growth rate divided by 30 to give a per-day figure.
df['growth_rate_cm_day'] = df['Cyst_Growth_Rate_cm_month'].div(30)

# ✅ 4. Keep only rows where every required field is present
required_columns = [
    'Age', 'Menopause_Status', 'CA_125_Level', 'Cyst_Size_cm',
    'Ultrasound_Features', 'Reported_Symptoms',
    'Recommended_Management', 'Region', 'growth_rate_cm_day',
]
df = df.dropna(subset=required_columns)

# ✅ 5. Normalise categorical text: drop characters other than letters, digits,
#       underscores and spaces, then turn spaces into underscores
categorical_columns = [
    'Ultrasound_Features', 'Reported_Symptoms', 'Region',
    'Recommended_Management', 'Menopause_Status',
]
for cat_col in categorical_columns:
    as_text = df[cat_col].fillna('Unknown').astype(str)
    stripped = as_text.str.replace(r'[^A-Za-z0-9_ ]+', '', regex=True)
    df[cat_col] = stripped.str.replace(' ', '_')

# ✅ 6. Bucket the daily growth rate into a category label (optional)
growth_bins = [-np.inf, 0, 0.01, 0.05, np.inf]
growth_labels = ['Shrinking', 'Stable', 'Moderate-growing', 'Fast-growing']
df['growth_category'] = pd.cut(
    df['growth_rate_cm_day'],
    bins=growth_bins,
    labels=growth_labels,
)

# ✅ 7. Switch to the short feature names used for modelling
modelling_names = {
    'Age': 'age',
    'CA_125_Level': 'ca125',
    'Cyst_Size_cm': 'size',
    'Menopause_Status': 'menopause',
    'Ultrasound_Features': 'ultrasound',
    'Reported_Symptoms': 'symptoms',
    'Recommended_Management': 'management',
    'Region': 'region',
}
df = df.rename(columns=modelling_names)

# ✅ 8. Setup regression model (optimized)
reg = setup_reg(
    data=df,
    target='growth_rate_cm_day',
    numeric_features=['age','ca125','size'],
    categorical_features=['menopause','ultrasound','management','symptoms','region'],
    ignore_features=['Patient_ID', 'Cyst_Growth_Rate_cm_month', 'Date_of_Exam', 'growth_category'],
    feature_selection=False,                # 🛑 Avoid pruning important features
    remove_multicollinearity=False,        # 🛑 Keep correlated features
    normalize=True,
    transform_target=True,
    session_id=123,
    fold=5,
    verbose=False
)

# ✅ 9. Train using strong models
# Compare the three candidates by RMSE, then tune the winner on the same metric.
best_model = compare_models(include=['lightgbm', 'catboost', 'rf'], sort='RMSE')
tuned_model = tune_model(best_model, optimize='RMSE')

# ✅ 10. Optional: Evaluate visually in notebook
# Evaluate the tuned model BEFORE finalizing: finalize_model refits on the whole
# dataset (hold-out included), so hold-out diagnostics drawn from the finalized
# model would be over-optimistic.
evaluate_model(tuned_model)

# ✅ 11. Finalize and save
final_model = finalize_model(tuned_model)
save_model(final_model, 'growth_rate_model')


In [None]:
# ▶️ 1. Install essentials (run once)
# Upgrades PyCaret and ReportLab with pandas pinned to 2.1.4 and NumPy to 1.26.4;
# -q keeps pip's output quiet.
!pip install -U pycaret pandas==2.1.4 numpy==1.26.4 reportlab -q

# ▶️ 2. Imports
import pandas as pd
import numpy as np
import re
from pycaret.regression import load_model, predict_model
from reportlab.lib import colors
from reportlab.lib.pagesizes import A4, landscape
from reportlab.lib.styles import getSampleStyleSheet
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle
from reportlab.lib.units import inch
import datetime
from google.colab import files

# ▶️ 3. Load trained regression model
loaded_model = load_model('growth_rate_model')

# ▶️ 4. Upload CSV File
uploaded = files.upload()
# presumably an empty mapping comes back when the upload dialog is cancelled —
# fail with a clear message instead of a bare StopIteration from next().
if not uploaded:
    raise ValueError("No file was uploaded — please select a CSV file to run predictions on.")
# Only the first uploaded file is used; any additional files are ignored.
file_path = next(iter(uploaded))

# ▶️ 5. Read the uploaded CSV and sanitise its headers
df_predict = pd.read_csv(file_path)

# Trim each header, then collapse every run of characters outside
# [A-Za-z0-9_] into a single underscore.
sanitised_headers = [re.sub(r'[^A-Za-z0-9_]+', '_', c.strip()) for c in df_predict.columns]
df_predict.columns = sanitised_headers

# Discard columns whose name starts with "Unnamed".
keep_mask = ~df_predict.columns.str.contains('^Unnamed')
df_predict = df_predict.loc[:, keep_mask]

# ▶️ 6. Normalise categorical text for the columns that are present
categorical_columns = [
    'Ultrasound_Features', 'Reported_Symptoms', 'Region',
    'Recommended_Management', 'Menopause_Status',
]
for col in categorical_columns:
    if col not in df_predict.columns:
        continue
    as_text = df_predict[col].fillna('Unknown').astype(str)
    stripped = as_text.str.replace(r'[^A-Za-z0-9_ ]+', '', regex=True)
    df_predict[col] = stripped.str.replace(' ', '_')

# ▶️ 7. Switch to the short feature names the model was trained with
modelling_names = {
    'Age': 'age',
    'CA_125_Level': 'ca125',
    'Cyst_Size_cm': 'size',
    'Menopause_Status': 'menopause',
    'Ultrasound_Features': 'ultrasound',
    'Reported_Symptoms': 'symptoms',
    'Recommended_Management': 'management',
    'Region': 'region',
}
df_predict = df_predict.rename(columns=modelling_names)

# Feature columns handed to the model; columns absent from the upload are left out.
model_features = ['age', 'ca125', 'size', 'menopause', 'ultrasound', 'symptoms', 'management', 'region']
available_features = [col for col in model_features if col in df_predict.columns]
missing_features = [col for col in model_features if col not in df_predict.columns]
if missing_features:
    # Surface the gap up front rather than letting it show up later as an
    # opaque failure (or a silently degraded prediction) inside predict_model.
    print(f"⚠️ Uploaded CSV is missing model feature column(s): {', '.join(missing_features)}. Prediction may fail.")

# Carry Patient_ID alongside the features when available so rows stay identifiable.
if 'Patient_ID' in df_predict.columns:
    data_for_prediction = df_predict[['Patient_ID'] + available_features].copy()
else:
    data_for_prediction = df_predict[available_features].copy()
    print("⚠️ 'Patient_ID' column not found in the uploaded CSV. It will not be included in the report.")

# ▶️ 8. Predict growth rate
result = predict_model(loaded_model, data=data_for_prediction)

# ▶️ 9. Classify growth rate
def classify_growth(rate, stable_below=0.012, moderate_below=0.020):
    """Map a predicted daily growth rate to a growth-category label.

    Args:
        rate: Predicted growth rate (the caller passes cm/day predictions).
        stable_below: Rates in ``(0, stable_below)`` are labelled 'Stable'.
        moderate_below: Rates in ``[stable_below, moderate_below)`` are
            labelled 'Moderate-growing'; anything at or above is
            'Fast-growing'.

    Returns:
        One of 'Shrinking', 'Stable', 'Moderate-growing', 'Fast-growing',
        or 'Unknown' when ``rate`` is missing (``None`` or NaN).
    """
    # NOTE(review): the training cell buckets growth_category with cut points
    # 0.01 and 0.05, which differ from these defaults — confirm which is intended.

    # NaN fails every ordered comparison, so without this guard a missing
    # prediction would fall through to 'Fast-growing'. NaN is the only value
    # that is not equal to itself.
    if rate is None or rate != rate:
        return 'Unknown'
    if rate <= 0:
        return 'Shrinking'
    if rate < stable_below:
        return 'Stable'
    if rate < moderate_below:
        return 'Moderate-growing'
    return 'Fast-growing'

# Label every prediction, provided predict_model produced its output column.
if 'prediction_label' not in result.columns:
    print("⚠️ 'prediction_label' missing — prediction may have failed.")
else:
    predicted_rates = result['prediction_label']
    result['growth_category'] = predicted_rates.apply(classify_growth)

# ▶️ 10. Export to landscape PDF (no symptoms, rounded to 3 decimals)
def export_custom_pdf(df, filename='cyst_growth_report.pdf', top_n=10):
    """Write the first ``top_n`` rows of ``df`` as a table in a landscape A4 PDF.

    Args:
        df: DataFrame of predictions. The columns read are ``age``, ``ca125``,
            ``size``, ``menopause``, ``ultrasound``, ``management``,
            ``region``, ``prediction_label`` and ``growth_category``.
            ``Patient_ID`` becomes a leading column when present; ``symptoms``
            is not included in the report.
        filename: Path of the PDF file to create.
        top_n: Number of leading rows to include. Rows are taken in the order
            they appear in ``df``; sort beforehand to control which are shown.

    Returns:
        None. The PDF is written to ``filename`` and a confirmation is printed.
    """
    # Checked once here instead of on every row and again for the widths.
    has_patient_id = 'Patient_ID' in df.columns

    doc = SimpleDocTemplate(filename, pagesize=landscape(A4))
    styles = getSampleStyleSheet()

    # NOTE(review): the sample stylesheet's default font may lack the emoji
    # glyphs used in these headings — confirm how they render in the PDF.
    story = [
        Paragraph("📊 Ovarian Cyst Growth Rate Report", styles['Heading1']),
        Paragraph(f"🕒 Generated: {datetime.datetime.now().strftime('%Y-%m-%d %H:%M')}", styles['Normal']),
        Spacer(1, 16),
    ]

    header = []
    if has_patient_id:
        header.append('Patient ID')
    header += ['Age', 'CA‑125 (U/mL)', 'Size (cm)', 'Menopause', 'Ultrasound',
               'Management', 'Region', 'Predicted Growth (cm/day)', 'Category']

    data = [header]

    for _, row in df.head(top_n).iterrows():
        row_data = []
        if has_patient_id:
            row_data.append(row.get('Patient_ID', ''))

        # int(NaN) raises ValueError, and the prediction input is never
        # filtered for missing values, so a missing age gets a blank cell.
        age = row.get('age', 0)
        age_cell = '' if pd.isna(age) else int(age)

        row_data += [
            age_cell,
            f"{row.get('ca125', 0):.3f}",
            f"{row.get('size', 0):.3f}",
            row.get('menopause', ''),
            row.get('ultrasound', ''),
            row.get('management', ''),
            row.get('region', ''),
            f"{row.get('prediction_label', 0):.3f}",
            row.get('growth_category', '')
        ]
        data.append(row_data)

    # Adjust column widths to fit landscape and dropped 'symptoms'
    # NOTE(review): the widths sum to 10.2in (11.5in with Patient ID) against an
    # A4 landscape page about 11.7in wide — confirm the table fits the margins.
    col_widths = [1.3*inch, 0.7*inch, 1*inch, 1*inch, 1.3*inch,
                  1.3*inch, 1*inch, 1.3*inch, 1.3*inch]
    if has_patient_id:
        col_widths.insert(0, 1.3*inch)

    table = Table(data, colWidths=col_widths)
    table.setStyle(TableStyle([
        ('BACKGROUND', (0, 0), (-1, 0), colors.lightblue),
        ('TEXTCOLOR', (0, 0), (-1, 0), colors.darkblue),
        ('GRID', (0, 0), (-1, -1), 0.4, colors.grey),
        ('ALIGN', (0, 0), (-1, -1), 'CENTER'),
        ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
        ('FONTSIZE', (0, 0), (-1, -1), 8),
        ('BOTTOMPADDING', (0, 0), (-1, 0), 10),
    ]))

    story.append(table)
    doc.build(story)
    print(f"📄 Landscape PDF report saved: {filename}")


# ▶️ 11. Generate Landscape PDF
# Rank by predicted growth first so the PDF (which takes the leading rows) and
# the console listing below report the same fastest-growing cases. If the
# prediction column is absent, keep the original order instead of raising
# KeyError in sort_values.
if 'prediction_label' in result.columns:
    ranked = result.sort_values(by='prediction_label', ascending=False)
else:
    ranked = result
export_custom_pdf(ranked)

# ▶️ 12. Show Console Output
columns_to_show = [col for col in ['Patient_ID', 'age', 'ca125', 'size', 'management', 'region', 'prediction_label', 'growth_category'] if col in result.columns]
top10 = ranked.head(10)

print("\n📋 Top 10 Fastest-Growing Cyst Cases:")
print(top10[columns_to_show].to_string(index=False))
