In [9]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
import joblib
import os

# Load the Excel file
file_path = r"C:\Users\spack\OneDrive - Worcester Polytechnic Institute (wpi.edu)\Documents\Data\COMSTAT\Keyence\Keyence_Data_Statistics_with_Biomass.xlsx"
df = pd.read_excel(file_path)

# Define the feature columns explicitly
feature_cols = [
    "Max Diameter Approx (Âµm) - Mean",
    "Max Diameter Approx (Âµm) - Standard Deviation",
    "Max Diameter Approx (Âµm) - Min",
    "Max Diameter Approx (Âµm) - Max",
    "Max Diameter Approx (Âµm) - Skewness",
    "Asphericity - Mean",
    "Asphericity - Standard Deviation",
    "Asphericity - Min",
    "Asphericity - Max",
    "Asphericity - Skewness",
    "Anisotropy (k^2) - Mean",
    "Anisotropy (k^2) - Standard Deviation",
    "Anisotropy (k^2) - Min",
    "Anisotropy (k^2) - Max",
    "Anisotropy (k^2) - Skewness"
]

target_col = "Biomass"

# Drop rows with NaNs in any of the feature or target columns
df = df.dropna(subset=feature_cols + [target_col])

# Shuffle and split into training and CV arrays (80% train, 20% CV)
df_shuffled = shuffle(df, random_state=42)
train_df, cv_df = train_test_split(df_shuffled, test_size=0.2, random_state=42)

# Store final arrays
Final_CV_Array = cv_df.copy()
Training_Array = train_df.copy()

# Create directory to store models
model_dir = "RandomForestModels"
os.makedirs(model_dir, exist_ok=True)

# Train models iteratively with 40-sample increments
step = 40
max_size = len(Training_Array)

for i, n_samples in enumerate(range(step, max_size + 1, step), start=1):
    subset = Training_Array.iloc[:n_samples]

    X_train = subset[feature_cols].values
    y_train = subset[target_col].values

    # Train Random Forest Regressor
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)
    
    X_cv = Final_CV_Array[feature_cols].values
    y_cv = Final_CV_Array[target_col].values

    y_pred = model.predict(X_cv)
    print(f"RÂ²: {r2_score(y_cv, y_pred):.3f}")
    # Calculate RMSE
    
    rmse = np.sqrt(mean_squared_error(y_cv, y_pred))
    print(f"RMSE: {rmse:.3f}")
    
    # Save the model
    model_filename = os.path.join(model_dir, f"rf_model_{n_samples}_samples.joblib")
    joblib.dump(model, model_filename)
    print(f"âœ… Saved model trained on {n_samples} samples to {model_filename}")

print("ðŸŽ‰ All models successfully trained and saved.")


RÂ²: -0.366
RMSE: 15.394
âœ… Saved model trained on 40 samples to RandomForestModels\rf_model_40_samples.joblib
RÂ²: -0.539
RMSE: 16.342
âœ… Saved model trained on 80 samples to RandomForestModels\rf_model_80_samples.joblib
RÂ²: -0.626
RMSE: 16.797
âœ… Saved model trained on 120 samples to RandomForestModels\rf_model_120_samples.joblib
RÂ²: -0.733
RMSE: 17.340
âœ… Saved model trained on 160 samples to RandomForestModels\rf_model_160_samples.joblib
RÂ²: -0.818
RMSE: 17.761
âœ… Saved model trained on 200 samples to RandomForestModels\rf_model_200_samples.joblib
ðŸŽ‰ All models successfully trained and saved.


In [5]:
from sklearn.metrics import r2_score, mean_squared_error

for i, n_samples in enumerate(range(step, max_size + 1, step), start=1):
    X_cv = Final_CV_Array[feature_cols].values
    y_cv = Final_CV_Array[target_col].values
    
    y_pred = model.predict(X_cv)
print(f"RÂ²: {r2_score(y_cv, y_pred):.3f}")
print(f"MSE: {mean_squared_error(y_cv, y_pred):.3f}")


RÂ²: -0.818
MSE: 315.458


In [12]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score
import joblib

# Path to the test Excel file
test_file = r"C:\Users\spack\OneDrive - Worcester Polytechnic Institute (wpi.edu)\Documents\Grad\Stewart_Lab\PAPERS\Biofilms and Machine Learning\FlowCell_Data_Statistics.xlsx"

# Load test dataset
test_df = pd.read_excel(test_file)

# Define features and target
feature_cols = [
    "Max Diameter Approx (Âµm) - Mean",
    "Max Diameter Approx (Âµm) - Standard Deviation",
    "Max Diameter Approx (Âµm) - Min",
    "Max Diameter Approx (Âµm) - Max",
    "Max Diameter Approx (Âµm) - Skewness",
    "Asphericity - Mean",
    "Asphericity - Standard Deviation",
    "Asphericity - Min",
    "Asphericity - Max",
    "Asphericity - Skewness",
    "Anisotropy (k^2) - Mean",
    "Anisotropy (k^2) - Standard Deviation",
    "Anisotropy (k^2) - Min",
    "Anisotropy (k^2) - Max",
    "Anisotropy (k^2) - Skewness"
]
target_col = "Biomass"

# Extract features and true values
X_test = test_df[feature_cols].values
y_true = test_df[target_col].values

# Load a model (e.g., trained with 200 samples)
model_path = r"RandomForestModels\rf_model_200_samples.joblib"
model = joblib.load(model_path)

# Predict
y_pred = model.predict(X_test)

# Evaluate
rmse = np.sqrt(mean_squared_error(y_true, y_pred))
r2 = r2_score(y_true, y_pred)

print(f"ðŸ“Š Evaluation on FlowCell test data:")
print(f"   RMSE: {rmse:.3f}")
print(f"   RÂ² Score: {r2:.3f}")


ðŸ“Š Evaluation on FlowCell test data:
   RMSE: 24.807
   RÂ² Score: -1893.298


In [13]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score
import joblib

# === Load test data ===
test_file = r"C:\Users\spack\OneDrive - Worcester Polytechnic Institute (wpi.edu)\Documents\Grad\Stewart_Lab\PAPERS\Biofilms and Machine Learning\FlowCell_Data_Statistics.xlsx"
test_df = pd.read_excel(test_file)

# === Feature and target columns ===
feature_cols = [
    "Max Diameter Approx (Âµm) - Mean",
    "Max Diameter Approx (Âµm) - Standard Deviation",
    "Max Diameter Approx (Âµm) - Min",
    "Max Diameter Approx (Âµm) - Max",
    "Max Diameter Approx (Âµm) - Skewness",
    "Asphericity - Mean",
    "Asphericity - Standard Deviation",
    "Asphericity - Min",
    "Asphericity - Max",
    "Asphericity - Skewness",
    "Anisotropy (k^2) - Mean",
    "Anisotropy (k^2) - Standard Deviation",
    "Anisotropy (k^2) - Min",
    "Anisotropy (k^2) - Max",
    "Anisotropy (k^2) - Skewness"
]
target_col = "Biomass"

# === Extract test inputs ===
X_test = test_df[feature_cols].values
y_true = test_df[target_col].values

# === Load trained model ===
model_path = r"RandomForestModels\rf_model_200_samples.joblib"
model = joblib.load(model_path)

# === Predict ===
y_pred = model.predict(X_test)

# === Evaluate ===
rmse = np.sqrt(mean_squared_error(y_true, y_pred))
r2 = r2_score(y_true, y_pred)

print(f"ðŸ“Š Evaluation on FlowCell test data:")
print(f"   RMSE: {rmse:.3f}")
print(f"   RÂ² Score: {r2:.3f}")

# === Add predicted values to DataFrame ===
test_df["Predicted Biomass"] = y_pred

# === Save to Excel ===
output_path = r"C:\Users\spack\OneDrive - Worcester Polytechnic Institute (wpi.edu)\Documents\Grad\Stewart_Lab\PAPERS\Biofilms and Machine Learning\FlowCell_Data_Predictions.xlsx"
test_df.to_excel(output_path, index=False)

print(f"âœ… Predictions saved to: {output_path}")


ðŸ“Š Evaluation on FlowCell test data:
   RMSE: 24.807
   RÂ² Score: -1893.298
âœ… Predictions saved to: C:\Users\spack\OneDrive - Worcester Polytechnic Institute (wpi.edu)\Documents\Grad\Stewart_Lab\PAPERS\Biofilms and Machine Learning\FlowCell_Data_Predictions.xlsx
