In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [11]:
import os
from pathlib import Path

# Location of the raw patient CSV. The default is the original author's local
# path; set the PROSTATE_DATA_CSV environment variable to run the notebook on
# another machine without editing this cell.
DATA_PATH = Path(os.environ.get(
    "PROSTATE_DATA_CSV",
    "/Users/amulyajayanti/Downloads/AI in oncology/AI-in-Oncology/data/prostate_patient_data.csv",
))

df_model = pd.read_csv(DATA_PATH)


In [12]:
# Preview the first rows of the raw frame to check which columns were loaded.
df_model.head()

Unnamed: 0,PatientID,Clinical Notes
0,1,2020-03-13: DIAGNOSIS - Initial PSA=14.7 ng/mL...
1,2,2022-04-11: DIAGNOSIS - Initial PSA=15.4 ng/mL...
2,3,2021-09-14: DIAGNOSIS - Initial PSA=20.0 ng/mL...
3,4,"2020-10-14: DIAGNOSIS - Initial PSA=4.0 ng/mL,..."
4,5,2022-12-07: DIAGNOSIS - Initial PSA=20.0 ng/mL...


In [13]:
# Show the free-text notes column on its own; each entry is split on '|' and
# parsed into one row per visit further down.
df_model["Clinical Notes"]

0      2020-03-13: DIAGNOSIS - Initial PSA=14.7 ng/mL...
1      2022-04-11: DIAGNOSIS - Initial PSA=15.4 ng/mL...
2      2021-09-14: DIAGNOSIS - Initial PSA=20.0 ng/mL...
3      2020-10-14: DIAGNOSIS - Initial PSA=4.0 ng/mL,...
4      2022-12-07: DIAGNOSIS - Initial PSA=20.0 ng/mL...
                             ...                        
495    2022-11-20: DIAGNOSIS - Initial PSA=12.0 ng/mL...
496    2021-09-10: DIAGNOSIS - Initial PSA=18.6 ng/mL...
497    2022-10-11: DIAGNOSIS - Initial PSA=18.0 ng/mL...
498    2022-05-17: DIAGNOSIS - Initial PSA=10.8 ng/mL...
499    2022-02-07: DIAGNOSIS - Initial PSA=14.5 ng/mL...
Name: Clinical Notes, Length: 500, dtype: object

In [14]:
import pandas as pd
import re
from datetime import datetime

# Pattern for a numeric value. `\d*\.?\d+` accepts "14", "14.7" and ".5" but
# stops before a trailing sentence period ("14.7."), which the looser `[\d.]+`
# would capture and then fail to convert with float().
_NUMBER = r'(\d*\.?\d+)'


def _parse_visit(patient_id, visit):
    """Extract the structured fields from one visit entry.

    Parameters
    ----------
    patient_id : identifier copied into the output record as "PatientID".
    visit : str
        One '|'-separated fragment of a patient's clinical notes.

    Returns
    -------
    dict
        Always contains "PatientID", "Treatment" and "BonePain" (the latter two
        are None when absent from the text). "VisitDate", "PSA", "PIRADS" and
        "Weight" are only present when found, so missing values become NaN/NaT
        once the records are turned into a DataFrame.
    """
    visit_data = {"PatientID": patient_id}

    # Extract Visit Date (first YYYY-MM-DD token in the entry)
    date_match = re.search(r"(\d{4}-\d{2}-\d{2})", visit)
    if date_match:
        visit_data["VisitDate"] = pd.to_datetime(date_match.group(1))

    # Extract treatment
    treatment_match = re.search(r'Treatment\s*=\s*([A-Za-z0-9 +]+)', visit)
    visit_data["Treatment"] = (
        treatment_match.group(1).strip() if treatment_match else None
    )

    # Extract PSA value
    psa_match = re.search(r'PSA\s*=\s*' + _NUMBER, visit)
    if psa_match:
        visit_data["PSA"] = float(psa_match.group(1))

    # Extract PIRADS score
    pirads_match = re.search(r'PI-?RADS\s*=\s*(\d+)', visit)
    if pirads_match:
        visit_data["PIRADS"] = int(pirads_match.group(1))

    # Extract Weight
    weight_match = re.search(r'Weight\s*=\s*' + _NUMBER, visit)
    if weight_match:
        visit_data["Weight"] = float(weight_match.group(1))

    # Extract Bone Pain status
    bone_pain_match = re.search(r'Bone Pain\s*=\s*(\w+)', visit, re.IGNORECASE)
    visit_data["BonePain"] = (
        bone_pain_match.group(1).capitalize() if bone_pain_match else None
    )

    return visit_data


flattened_rows = []

# Iterate the frame loaded above (`df_model`); the previous `df` was never
# defined in this notebook and only worked through leftover kernel state.
for patient_id, notes in zip(df_model["PatientID"], df_model["Clinical Notes"]):
    # A missing note is read by pandas as NaN (a float), which has no .split().
    if not isinstance(notes, str):
        continue

    # Split into individual visit entries
    for visit in notes.split('|'):
        visit = visit.strip()
        # A leading/trailing/double '|' yields an empty fragment: nothing to parse.
        if not visit:
            continue
        flattened_rows.append(_parse_visit(patient_id, visit))

# Chronological order within each patient; later cells rely on it.
df_visits = pd.DataFrame(flattened_rows).sort_values(by=["PatientID", "VisitDate"])

print(df_visits.head(30))

    PatientID  VisitDate                  Treatment   PSA  PIRADS  Weight  \
0           1 2020-03-13                       None  14.7       5    73.7   
1           1 2020-03-13                        ADT  14.7       5    73.7   
2           1 2020-06-11                        ADT  14.5       5    74.5   
3           1 2020-09-09                        ADT  14.4       5    75.3   
4           1 2020-12-08                        ADT  14.1       5    76.2   
5           1 2021-03-08                        ADT  13.9       5    76.9   
6           1 2021-06-06                        ADT  13.6       5    77.7   
7           2 2022-04-11                       None  15.4       3    87.8   
8           2 2022-04-11                    Surgery  15.4       3    87.8   
9           2 2022-07-10                    Surgery  15.0       3    87.8   
10          2 2022-10-08                    Surgery  15.0       3    87.8   
11          2 2023-01-06                    Surgery  14.7       3    87.8   

In [15]:
# Count missing values in each parsed numeric column.
df_visits.loc[:, ["PSA", "PIRADS", "Weight"]].isna().sum()

PSA       0
PIRADS    0
Weight    0
dtype: int64

In [16]:
def compute_lifespan_with_noise(row, noise_scale=0.3, rng=None):
    """Simulate a noisy lifespan score for a single visit row.

    The score starts at 10.0, is lowered by PSA (0.1 per unit), PIRADS (0.5 per
    point) and bone pain (Mild 0.5 / Moderate 1.0 / Severe 1.5), raised for each
    treatment keyword found in the treatment text (surgery +1.0, radiation +0.5,
    adt +0.3), perturbed with Gaussian noise and finally clamped to [1.0, 15.0].

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide "PSA", "PIRADS", "BonePain" and "Treatment".
    noise_scale : float, default 0.3
        Standard deviation of the zero-mean Gaussian noise.
    rng : numpy.random.Generator or None, default None
        Source of randomness. None keeps the original behaviour of drawing from
        NumPy's global RNG, so ``np.random.seed`` still controls the result.

    Returns
    -------
    float
        Simulated lifespan score in the closed interval [1.0, 15.0].
    """
    bone_pain_penalty = {"Mild": 0.5, "Moderate": 1.0, "Severe": 1.5}
    treatment_bonus = {"surgery": 1.0, "radiation": 0.5, "adt": 0.3}

    lifespan = 10.0

    # PSA penalty
    lifespan -= 0.1 * row["PSA"]

    # PIRADS
    lifespan -= 0.5 * row["PIRADS"]

    # Bone Pain: any other value (including missing) carries no penalty
    lifespan -= bone_pain_penalty.get(row["BonePain"], 0.0)

    # Treatments: str() turns a missing treatment into text that matches no
    # keyword; combined treatments collect every matching bonus.
    treatment = str(row["Treatment"]).lower()
    for keyword, bonus in treatment_bonus.items():
        if keyword in treatment:
            lifespan += bonus

    # Add Gaussian noise ~ N(0, noise_scale^2); sd is 0.3 by default
    sampler = np.random if rng is None else rng
    lifespan += sampler.normal(loc=0, scale=noise_scale)

    return max(1.0, min(lifespan, 15.0))

# Apply to DataFrame
# Seed NumPy's global RNG right before the row-wise apply: the function draws
# its noise from np.random, so this keeps the generated Lifespan column
# repeatable from one run of this cell to the next.
np.random.seed(42)  
df_visits["Lifespan"] = df_visits.apply(compute_lifespan_with_noise, axis=1)

In [18]:
df_visits.head(30)

Unnamed: 0,PatientID,VisitDate,Treatment,PSA,PIRADS,Weight,BonePain,Lifespan
0,1,2020-03-13,,14.7,5,73.7,,6.179014
1,1,2020-03-13,ADT,14.7,5,73.7,Mild,5.788521
2,1,2020-06-11,ADT,14.5,5,74.5,,6.544307
3,1,2020-09-09,ADT,14.4,5,75.3,Mild,6.316909
4,1,2020-12-08,ADT,14.1,5,76.2,Mild,5.819754
5,1,2021-03-08,ADT,13.9,5,76.9,Mild,5.839759
6,1,2021-06-06,ADT,13.6,5,77.7,Mild,6.413764
7,2,2022-04-11,,15.4,3,87.8,,7.19023
8,2,2022-04-11,Surgery,15.4,3,87.8,,7.819158
9,2,2022-07-10,Surgery,15.0,3,87.8,,8.162768


In [19]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Feature groups. These two lists are the single source of truth for both the
# column selection and the preprocessing below, so they cannot drift apart.
categorical_features = ["Treatment", "BonePain"]
numerical_features = ["PSA", "PIRADS", "Weight"]

# Select features and target
X = df_visits[categorical_features + numerical_features]
y = df_visits["Lifespan"]

# Preprocessing: one-hot encode the categorical columns and pass the numerical
# ones through unchanged (encoded columns come first, then the numerical ones).
preprocessor = ColumnTransformer([
    ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
    ("num", "passthrough", numerical_features),
])

# NOTE(review): the split is per visit row, so visits of one patient can land in
# both train and test. If patient-level generalisation is the goal, consider a
# grouped split on PatientID instead.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [20]:
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Candidate regressors compared on the visit-level features.
models = {
    "Linear Regression": LinearRegression(),
    "Ridge Regression": Ridge(),
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(n_estimators=100, random_state=42),
    "XGBoost": XGBRegressor(n_estimators=100, random_state=42, verbosity=0),
    "SVR (RBF Kernel)": SVR(kernel="rbf")
}

# Evaluate all models: fit each one behind the shared preprocessor on the
# training split and score it on the held-out split.
results = {}

for name, model in models.items():
    pipeline = Pipeline([
        ("preprocessor", preprocessor),
        ("regressor", model)
    ])
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    results[name] = {
        # RMSE as sqrt(MSE): the `squared=False` argument was deprecated in
        # scikit-learn 1.4 and removed in 1.6, so this form works on every version.
        "RMSE": float(np.sqrt(mean_squared_error(y_test, y_pred))),
        "MAE": mean_absolute_error(y_test, y_pred),
        "R2": r2_score(y_test, y_pred)
    }

# One row per model, best (lowest RMSE) first.
results_df = pd.DataFrame(results).T.sort_values("RMSE")
print(results_df)



                       RMSE       MAE        R2
Linear Regression  0.284286  0.226657  0.928321
Ridge Regression   0.284638  0.227045  0.928144
Gradient Boosting  0.305258  0.245416  0.917356
XGBoost            0.344517  0.269989  0.894731
Random Forest      0.351542  0.278613  0.890395
Decision Tree      0.440153  0.349485  0.828176
SVR (RBF Kernel)   0.675319  0.535039  0.595521




**Retrying the model with feature engineering:**

In [21]:
# Ordinal encoding of the BonePain label
def pain_severity_score(pain_level):
    """Return the severity rank of a bone-pain label.

    "None" -> 0, "Mild" -> 1, "Moderate" -> 2, "Severe" -> 3. Any other value
    (different casing, unknown label, missing value) is scored 0.
    """
    severity_by_label = dict(zip(("None", "Mild", "Moderate", "Severe"), range(4)))
    if pain_level in severity_by_label:
        return severity_by_label[pain_level]
    return 0

# Collapse the visit-level rows into one row of summary features per patient.
# NOTE(review): dropna() below discards every visit that lacks a Treatment or a
# BonePain value, so "first"/"last" describe the first/last *remaining* visit of
# each patient - confirm that this is the intended definition.
df_visits_agg = (
    df_visits
    # "first" and "last" are positional, so make the chronological order explicit
    # here instead of relying on a sort performed in another cell.
    .sort_values(by=["PatientID", "VisitDate"])
    .dropna(subset=["PSA", "PIRADS", "Weight", "Treatment", "BonePain"])
    .assign(
        BonePainScore=lambda d: d["BonePain"].map(pain_severity_score),
        TreatmentType=lambda d: d["Treatment"].str.lower()
    )
    .groupby("PatientID")
    .agg({
        "PSA": ["first", "last", "mean"],
        "PIRADS": "max",
        "Weight": ["first", "last", "mean"],
        "BonePainScore": "max",
        # Distinct treatments seen for the patient; sorted so the list does not
        # change order between kernel sessions (set order is hash-dependent).
        "TreatmentType": lambda x: sorted(set(x))
    })
)

# Flatten multi-index columns, e.g. ("PSA", "first") -> "PSA_first"
df_visits_agg.columns = ['_'.join(col).strip() for col in df_visits_agg.columns.values]
df_visits_agg = df_visits_agg.reset_index()

In [22]:
# One boolean flag per treatment keyword: True when any of the patient's
# recorded treatment strings contains that keyword.
treatment_keywords = {
    "Has_Surgery": "surgery",
    "Has_Radiation": "radiation",
    "Has_ADT": "adt",
}
for flag_column, keyword in treatment_keywords.items():
    df_visits_agg[flag_column] = df_visits_agg["TreatmentType_<lambda>"].apply(
        # bind the keyword per iteration so each flag tests its own keyword
        lambda treatments, keyword=keyword: any(keyword in name for name in treatments)
    )

# Friendlier names for the aggregated columns
column_aliases = {
    "PSA_first": "PSA_Initial",
    "PSA_last": "PSA_Latest",
    "PIRADS_max": "Max_PIRADS",
    "Weight_first": "Weight_Initial",
    "Weight_last": "Weight_Latest",
    "BonePainScore_max": "Max_BonePainScore"
}
df_agg = df_visits_agg.rename(columns=column_aliases)

# Latest-minus-initial deltas for PSA and weight
df_agg = df_agg.assign(
    PSA_Change=df_agg["PSA_Latest"] - df_agg["PSA_Initial"],
    Weight_Change=df_agg["Weight_Latest"] - df_agg["Weight_Initial"],
)

# Synthetic patient-level target: weighted linear score plus Gaussian noise
def compute_patient_lifespan(row):
    """Simulate a lifespan score for one aggregated patient row.

    Starts from 10.0, applies a signed weight to each feature in turn, adds
    zero-mean Gaussian noise (sd 0.5) drawn from NumPy's global RNG and clamps
    the result to the interval [1.0, 15.0].
    """
    signed_weights = (
        ("PSA_Initial", -0.1),
        ("Max_PIRADS", -0.5),
        ("Max_BonePainScore", -0.6),
        ("Has_Surgery", 1.0),
        ("Has_Radiation", 0.5),
        ("Has_ADT", 0.3),
    )

    score = 10.0
    for column, weight in signed_weights:
        score += weight * row[column]

    score += np.random.normal(0, 0.5)  # small noise

    return max(1.0, min(score, 15.0))

# Re-seed NumPy's global RNG immediately before the row-wise apply:
# compute_patient_lifespan draws its noise from np.random, so this keeps the
# generated Lifespan column repeatable from one run of this cell to the next.
np.random.seed(42)
df_agg["Lifespan"] = df_agg.apply(compute_patient_lifespan, axis=1)

In [23]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

# Model inputs, grouped by source measurement, and the column to predict
features = (
    ["PSA_Initial", "PSA_Latest", "PSA_Change"]
    + ["Max_PIRADS"]
    + ["Weight_Initial", "Weight_Latest", "Weight_Change"]
    + ["Max_BonePainScore"]
    + ["Has_Surgery", "Has_Radiation", "Has_ADT"]
)
target = "Lifespan"

X, y = df_agg[features], df_agg[target]

# Hold out 20% of the patients for evaluation (fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [24]:
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Define models: the same candidate set as before, now fitted on the
# patient-level features.
models = {
    "Linear Regression": LinearRegression(),
    "Ridge Regression": Ridge(),
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(n_estimators=100, random_state=42),
    "XGBoost": XGBRegressor(n_estimators=100, random_state=42, verbosity=0),
    "SVR (RBF Kernel)": SVR(kernel="rbf")
}

# Store results
results = {}

# Use scaling for all models for fair comparison
scaler = StandardScaler()

for name, model in models.items():
    pipeline = Pipeline([
        ("scaler", scaler),
        ("regressor", model)
    ])

    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    results[name] = {
        # RMSE as sqrt(MSE): the `squared=False` argument was deprecated in
        # scikit-learn 1.4 and removed in 1.6, so this form works on every version.
        "RMSE": float(np.sqrt(mean_squared_error(y_test, y_pred))),
        "MAE": mean_absolute_error(y_test, y_pred),
        "R2": r2_score(y_test, y_pred)
    }

# Display results, best (lowest RMSE) first
results_df = pd.DataFrame(results).T.sort_values("RMSE")
print(results_df)



                       RMSE       MAE        R2
Ridge Regression   0.498895  0.370463  0.824642
Linear Regression  0.499169  0.370634  0.824449
SVR (RBF Kernel)   0.521329  0.385109  0.808517
Random Forest      0.531122  0.386723  0.801255
Gradient Boosting  0.536413  0.399095  0.797275
XGBoost            0.595066  0.443428  0.750519
Decision Tree      0.824608  0.652833  0.520927




In [27]:
!pip install joblib
import joblib

# Save best model (lowest RMSE)
best_model_name = results_df.index[0]
best_model = Pipeline([
    ("scaler", scaler),
    ("regressor", models[best_model_name])
])
best_model.fit(X, y)  # Train on full dataset
joblib.dump(best_model, "lifespan_model.pkl")
print(f"Saved model: {best_model_name}")

Saved model: Ridge Regression
