In [1]:
import csv

# Path to the morphological-features CSV that the later cells load.
# NOTE(review): this is an absolute, machine-specific Windows path; anyone else
# running the notebook has to edit it. Consider a path relative to the notebook.
rootDir = r"C:\Users\adity\Documents\Python projects\Rembrandt\morphological.csv"

# Fail fast if the file is missing and peek at its header row.
# The csv module expects files opened with newline='' so that quoted fields
# containing line breaks are parsed correctly; 'utf-8-sig' reads UTF-8 with or
# without a byte-order mark.
with open(rootDir, 'r', newline='', encoding='utf-8-sig') as csv_file:
    reader = csv.reader(csv_file)
    # Consume the first row while the file is still open; the reader cannot be
    # used once the `with` block has closed the file.
    csv_header = next(reader, None)  # None when the file is empty

csv_header

In [2]:
import pandas as pd

# Load the CSV located at rootDir into a DataFrame, then report its
# (rows, columns) shape as the cell output.
df = pd.read_csv(filepath_or_buffer=rootDir)
df.shape

(64, 10)

In [3]:
# Remove pandas' cap on the number of displayed columns so that wide frames
# render every column, then show the frame as the cell output.
pd.options.display.max_columns = None
df

Unnamed: 0,volume,mean_intensity,std_intensity,skewness,kurtosis,surface_area,bounding_box_x,bounding_box_y,bounding_box_z,patient_id
0,16272,1335.285972,250.526688,0.211696,-0.841184,5691.9746,32,30,36,900-00-5380_2005.07.03
1,222504,368.962041,41.300978,0.285648,1.506200,62052.9500,74,93,86,900-00-5381_2005.07.14
2,26188,302.278563,18.792551,-0.810038,0.964888,13556.4160,84,88,53,900-00-5385_2005.08.07
3,181559,439.149775,56.510183,-0.028129,-0.159938,44585.2700,78,99,73,900-00-5396_2005.08.13
4,47542,343.150078,24.413536,0.075483,-0.311200,16671.3420,46,71,63,900-00-5404_2005.09.03
...,...,...,...,...,...,...,...,...,...,...
59,36158,215.038636,26.084232,-0.443242,-0.002712,18956.1910,72,82,55,HF1606_1995.10.06
60,39973,214.066520,25.631973,-0.120549,-0.488124,12678.9200,35,54,50,HF1613_1995.10.20
61,142947,165.467803,153.994744,0.677228,-0.903831,45091.1330,80,87,71,HF1652_1995.12.22
62,20008,213.953069,29.490466,0.556394,1.244998,8474.1670,30,39,44,HF1677_1996.01.27


In [4]:
# Prepare the modelling inputs.
#   1. Separate the feature columns from the two regression targets
#      (tumor volume and surface area); patient_id is an identifier, not a feature.
#   2. Split features and BOTH targets in a single train_test_split call, so the
#      train/test rows are guaranteed to line up across X and the two targets.
#      (Splitting each target in its own call only matches while test_size and
#      random_state are kept identical by hand.)
#   3. Standardize the features with StandardScaler. The scaler is fitted on the
#      training rows only and then applied to the test rows, so no test-set
#      statistics leak into preprocessing.

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

X = df.drop(columns=['volume', 'surface_area', 'patient_id'])  # Features
y_volume = df['volume']
y_surface_area = df['surface_area']

# train_test_split returns a (train, test) pair for every array passed in,
# in the same order as the inputs.
(
    X_train, X_test,
    y_volume_train, y_volume_test,
    y_surface_area_train, y_surface_area_test,
) = train_test_split(
    X, y_volume, y_surface_area, test_size=0.2, random_state=42
)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)  # learn mean/std from training rows
X_test_scaled = scaler.transform(X_test)        # reuse the training statistics

In [5]:
# Baseline tumor-volume model.
# A 100-tree Random Forest Regressor (seeded for reproducibility) is fitted on
# the scaled training features, then used to predict volume for the scaled
# test features. Predictions are scored with Mean Squared Error (average
# squared prediction error) and R-squared (share of target variance explained),
# and both numbers are printed.

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# fit() returns the estimator itself, so construction and training can be chained.
model_volume = RandomForestRegressor(random_state=42, n_estimators=100).fit(
    X_train_scaled, y_volume_train
)
y_volume_pred = model_volume.predict(X_test_scaled)

r2_volume = r2_score(y_volume_test, y_volume_pred)
mse_volume = mean_squared_error(y_volume_test, y_volume_pred)
print(f"Volume Prediction - MSE: {mse_volume}, R2: {r2_volume}")

Volume Prediction - MSE: 323971302.88860774, R2: 0.8691117494420104


In [6]:
# Tumor surface-area model.
# A Random Forest Regressor with 250 trees (random_state=75) is fitted on the
# scaled training features against the surface-area target, then used to
# predict surface area for the scaled test features. Predictions are scored
# with Mean Squared Error (prediction error) and R-squared (variance
# explained), and both numbers are printed.

model_surface_area = RandomForestRegressor(random_state=75, n_estimators=250)
model_surface_area.fit(X=X_train_scaled, y=y_surface_area_train)
y_surface_area_pred = model_surface_area.predict(X_test_scaled)

mse_surface_area = mean_squared_error(
    y_true=y_surface_area_test, y_pred=y_surface_area_pred
)
r2_surface_area = r2_score(
    y_true=y_surface_area_test, y_pred=y_surface_area_pred
)
print(f"Surface Area Prediction - MSE: {mse_surface_area}, R2: {r2_surface_area}")

Surface Area Prediction - MSE: 46044603.215401724, R2: 0.7356175929297263


In [7]:
# Hyperparameter search for the tumor-volume Random Forest.
# Every combination of the values below (3 x 3 x 3 = 27 candidates) is scored
# with 3-fold cross-validation on the scaled training data.
from sklearn.model_selection import GridSearchCV

param_grid = dict(
    n_estimators=[50, 100, 200],      # number of trees
    max_depth=[None, 10, 20],         # None leaves tree depth unrestricted
    min_samples_split=[2, 5, 10],     # minimum samples needed to split a node
)

grid_search = GridSearchCV(
    estimator=RandomForestRegressor(random_state=42),
    param_grid=param_grid,
    cv=3,
)
grid_search.fit(X_train_scaled, y_volume_train)

In [8]:
# Evaluate the tuned tumor-volume model.
# The best estimator selected by the grid search (best_estimator_) predicts
# volume for the scaled test features; the predictions are scored with Mean
# Squared Error and R-squared and printed, so the tuned model can be compared
# against the initial, untuned one.

best_model_volume = grid_search.best_estimator_
y_volume_pred_best = best_model_volume.predict(X_test_scaled)

mse_volume_best, r2_volume_best = (
    mean_squared_error(y_volume_test, y_volume_pred_best),
    r2_score(y_volume_test, y_volume_pred_best),
)
print(f"Volume Prediction (Tuned) - MSE: {mse_volume_best}, R2: {r2_volume_best}")

Volume Prediction (Tuned) - MSE: 331750016.004273, R2: 0.8659690570423321
