In [1]:
# Installs the packages used by this notebook into the active kernel's environment.
# The %pip lines below are active: comment them out if the packages are already installed.

%pip install -q scikit-learn pandas joblib
# Optional (better boosting model) - not imported by the cells shown in this notebook
%pip install -q xgboost
# Optional (for imbalance handling) - not imported by the cells shown in this notebook
%pip install -q imbalanced-learn


Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import joblib

# Data directory. The default reproduces the original E:\ layout; set the
# RADIOMICS_DATA_DIR environment variable to run the notebook on another
# machine without editing this cell.
DATA_DIR = os.environ.get("RADIOMICS_DATA_DIR", "E:\\")

# Input files: labelled training table and the unlabelled table to predict on.
TRAIN_CSV = os.path.join(DATA_DIR, "radiomics_features_fixed.csv")
PRED_CSV = os.path.join(DATA_DIR, "ALL_radiomics_features.csv")

# The 11 features you reported as common
# (both CSVs are checked for these columns right after loading).
COMMON_FEATURES = [
    'diagnostics_Image-original_Maximum',
    'diagnostics_Image-original_Mean',
    'diagnostics_Image-original_Minimum',
    'diagnostics_Mask-original_VolumeNum',
    'diagnostics_Mask-original_VoxelNum',
    'original_shape_Elongation',
    'original_shape_Flatness',
    'original_shape_LeastAxisLength',
    'original_shape_MajorAxisLength',
    'original_shape_MinorAxisLength',
    'original_shape_VoxelVolume'
]

TARGET_COL = 'Origin'   # name of target column in your training CSV

# Output files: the serialized model artifact and the predictions table.
MODEL_SAVE_PATH = os.path.join(DATA_DIR, "radiomics_origin_model.joblib")
PRED_SAVE_PATH = os.path.join(DATA_DIR, "ALL_radiomics_features_with_predictions.csv")


In [2]:
# Load the labelled training table and the table to predict on.
# low_memory=False makes pandas read each file in one pass, which reduces
# mixed-type warnings on wide CSVs.
train_df = pd.read_csv(TRAIN_CSV, low_memory=False)
pred_df = pd.read_csv(PRED_CSV, low_memory=False)

print("Train shape:", train_df.shape)
print("Pred shape:", pred_df.shape)

# Fail fast if a required column is absent. The training table must also
# contain the target column; checking it here gives a clear message instead of
# a bare KeyError when the target is accessed below.
required_in_train = list(COMMON_FEATURES) + [TARGET_COL]
missing_in_train = [c for c in required_in_train if c not in train_df.columns]
missing_in_pred = [c for c in COMMON_FEATURES if c not in pred_df.columns]
if missing_in_train:
    raise ValueError(f"Missing columns in TRAIN CSV: {missing_in_train}")
if missing_in_pred:
    raise ValueError(f"Missing columns in PRED CSV: {missing_in_pred}")

# Examine target distribution (dropna=False so unlabelled rows are visible too)
print("Target value counts:")
print(train_df[TARGET_COL].value_counts(dropna=False))


Train shape: (199, 13)
Pred shape: (2044, 1433)
Target value counts:
Origin
Non small cell lung cancer    82
Melanoma                      44
Breast cancer                 26
Small cell lung cancer        17
Renal cell carcinoma          16
Gastrointestinal cancers      14
Name: count, dtype: int64


In [3]:
# Select features and target
X = train_df[COMMON_FEATURES].copy()
y = train_df[TARGET_COL].copy()

# Convert possible strings to numeric if some columns are mixed type
for col in COMMON_FEATURES:
    # coerce to numeric; bad parse will become NaN
    X[col] = pd.to_numeric(X[col], errors='coerce')

# Drop rows with missing target BEFORE imputing, so rows that are discarded
# anyway do not influence the medians used to fill the remaining rows.
mask_target_notnull = y.notnull()
if mask_target_notnull.sum() != len(y):
    print(f"Dropping {len(y) - mask_target_notnull.sum()} rows with null target")
    X = X.loc[mask_target_notnull]
    y = y.loc[mask_target_notnull]

# Handle missing values in X: simple approach = median imputation.
# NOTE(review): the medians are computed over every labelled row, i.e. before
# any train/test split, so held-out rows contribute to the fill values.
X = X.fillna(X.median())

# Encode the string class labels as integers; `le` is needed later to map
# predictions back to class names.
le = LabelEncoder()
y_encoded = le.fit_transform(y)

print("Classes:", list(le.classes_))


Classes: ['Breast cancer', 'Gastrointestinal cancers', 'Melanoma', 'Non small cell lung cancer', 'Renal cell carcinoma', 'Small cell lung cancer']


In [4]:
# Stratified 80/20 split so every class keeps its proportion in the test set
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)

# Build pipeline: scaler -> classifier
# class_weight='balanced' reweights classes inversely to their frequency.
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', RandomForestClassifier(n_estimators=200, class_weight='balanced', random_state=42, n_jobs=-1))
])

# Optional: small grid search (uncomment to run)
# param_grid = {
#     'clf__n_estimators': [100, 200],
#     'clf__max_depth': [None, 10, 20],
# }
# gs = GridSearchCV(pipeline, param_grid, cv=StratifiedKFold(5), scoring='f1_weighted', n_jobs=-1)
# gs.fit(X_train, y_train)
# print("Best params:", gs.best_params_)
# model = gs.best_estimator_

# Direct fit (faster)
pipeline.fit(X_train, y_train)
model = pipeline

# Evaluate on test set
y_pred = model.predict(X_test)
print("Test accuracy:", accuracy_score(y_test, y_pred))
print("Classification report (test):")
print(classification_report(
    y_test,
    y_pred,
    # Pin the label set so the rows always line up with target_names, even if
    # a class is absent from both y_test and y_pred.
    labels=np.arange(len(le.classes_)),
    target_names=le.classes_,
    # Classes that are never predicted have undefined precision; report 0
    # explicitly instead of emitting an UndefinedMetricWarning per metric.
    zero_division=0,
))


Test accuracy: 0.375
Classification report (test):
                            precision    recall  f1-score   support

             Breast cancer       0.00      0.00      0.00         5
  Gastrointestinal cancers       0.00      0.00      0.00         3
                  Melanoma       0.33      0.11      0.17         9
Non small cell lung cancer       0.50      0.82      0.62        17
      Renal cell carcinoma       0.00      0.00      0.00         3
    Small cell lung cancer       0.00      0.00      0.00         3

                  accuracy                           0.38        40
                 macro avg       0.14      0.16      0.13        40
              weighted avg       0.29      0.38      0.30        40



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [5]:
# Estimate generalisation with shuffled, stratified 5-fold cross-validation on
# the full labelled set. cross_val_score fits clones of `model`, so the
# estimator fitted in the previous cell is left untouched.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(
    estimator=model,
    X=X,
    y=y_encoded,
    scoring='f1_weighted',
    cv=cv,
    n_jobs=-1,
)
print("5-fold CV f1_weighted scores:", cv_scores)
print("Mean CV f1_weighted:", cv_scores.mean())

# Bundle everything needed at prediction time into one file: the fitted
# pipeline, the label encoder to decode its output, and the feature order.
artifact = dict(
    pipeline=model,
    label_encoder=le,
    features=COMMON_FEATURES,
)
joblib.dump(artifact, MODEL_SAVE_PATH)
print("Saved model to:", MODEL_SAVE_PATH)


5-fold CV f1_weighted scores: [0.23282675 0.28900135 0.3251267  0.25625    0.28957637]
Mean CV f1_weighted: 0.2785562330954158
Saved model to: E:\radiomics_origin_model.joblib


In [6]:
# Restore the saved artifact and unpack its three parts.
# NOTE: joblib files are pickle-based - only load artifacts you created yourself.
loaded = joblib.load(MODEL_SAVE_PATH)
model, le, features = loaded['pipeline'], loaded['label_encoder'], loaded['features']

# Build the prediction matrix: keep the model's features in order, coerce every
# column to numeric (unparseable values become NaN), then fill the gaps with
# each column's median from the prediction table itself.
pred_features = pred_df[features].apply(pd.to_numeric, errors='coerce')
pred_features = pred_features.fillna(pred_features.median())

# The pipeline outputs integer class codes; decode them back to class names
pred_encoded = model.predict(pred_features)
pred_labels = le.inverse_transform(pred_encoded)

# Attach the predictions to a copy of the input table and write it out
pred_df_with_preds = pred_df.assign(Predicted_Origin=pred_labels)
pred_df_with_preds.to_csv(PRED_SAVE_PATH, index=False)
print("Saved predictions to:", PRED_SAVE_PATH)

# Show counts per predicted class
print("Predicted class counts:")
print(pred_df_with_preds['Predicted_Origin'].value_counts())


Saved predictions to: E:\ALL_radiomics_features_with_predictions.csv
Predicted class counts:
Predicted_Origin
Non small cell lung cancer    1147
Melanoma                       566
Breast cancer                  260
Gastrointestinal cancers        36
Small cell lung cancer          20
Renal cell carcinoma            15
Name: count, dtype: int64
