In [None]:
#RandomForest

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import numpy as np

# --- Configuration ---
# 1. PATH TO YOUR TRAINING DATA
TRAIN_FILE_PATH = r"E:\ALL_radiomics_features_with_predictions.csv"
# 2. TARGET COLUMN NAME (The column to predict)
TARGET_COLUMN = "Predicted_Origin"
# 3. IDENTIFIER COLUMNS TO EXCLUDE (Columns that are strings/IDs, like 'ID', 'Case', etc.)
# If your ID column is named 'CaseID', update the list:
EXCLUDE_COLUMNS = [TARGET_COLUMN, 'Patient_ID', 'ID'] # <--- Adjust this list based on your file's non-feature columns!
# ---------------------

## 1. Data Loading and Cleaning

print(f"Loading data from: {TRAIN_FILE_PATH}")
df = pd.read_csv(TRAIN_FILE_PATH)

# Identify feature (X) and target (y) columns
y = df[TARGET_COLUMN]

# Exclude target and any non-numeric identifier/metadata columns from features (X)
X = df.drop(columns=EXCLUDE_COLUMNS, errors='ignore')

# Ensure all remaining feature columns are strictly numeric (Crucial for Scikit-learn)
X_numeric = X.select_dtypes(include=np.number)

# Handle potential missing values (NaN) by imputing with the mean of the column
X_clean = X_numeric.fillna(X_numeric.mean())

# Check for features that were dropped due to being non-numeric
dropped_cols = set(X.columns) - set(X_numeric.columns)
if dropped_cols:
    print(f"\nNOTE: Dropped {len(dropped_cols)} non-numeric column(s) from features (X): {', '.join(dropped_cols)}")

# Get the list of actual features used for training
feature_columns_used = X_clean.columns.tolist()
print(f"Features used for training: {len(feature_columns_used)} columns.")

## 2. Split Data for Training and Testing

# Use 80% for training and 20% for testing
X_train, X_test, y_train, y_test = train_test_split(
    X_clean, y, test_size=0.2, random_state=42, stratify=y
)
print(f"Training samples: {len(X_train)} | Testing samples: {len(X_test)}")

## 3. Train the Random Forest Classifier

# Initialize the model
model = RandomForestClassifier(
    n_estimators=300,        # Number of trees
    max_depth=None,          # Allow trees to expand fully
    class_weight="balanced", # Handles potential class imbalance
    random_state=42,
    n_jobs=-1                # Use all available CPU cores
)

# Train the model
print("\nStarting model training...")
model.fit(X_train, y_train)
print("Model training complete.")

## 4. Model Evaluation

# Predict on the test set
y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)

print("\n" + "="*40)
print("          ✨ MODEL EVALUATION RESULTS ✨")
print("="*40)

# Classification Report (Precision, Recall, F1-score, Support)
print("\n==== Classification Report (Test Set) ====")
print(classification_report(y_test, y_pred))

# Confusion Matrix
print("\n==== Confusion Matrix (Test Set) ====")
# Classes:
class_labels = model.classes_
cm = confusion_matrix(y_test, y_pred, labels=class_labels)
print(pd.DataFrame(cm, index=class_labels, columns=class_labels))

# Overall Accuracy
test_accuracy = accuracy_score(y_test, y_pred)
print(f"\nOverall Test Accuracy: {test_accuracy:.4f}")

# Cross-Validation (Robustness Check)
cv_scores = cross_val_score(model, X_clean, y, cv=5)
print("\n==== 5-Fold Cross-Validation Accuracy ====")
print(f"Scores: {cv_scores}")
print(f"Mean CV Accuracy: {cv_scores.mean():.4f} (Standard Deviation: {cv_scores.std():.4f})")

print("="*40)

Loading data from: E:\ALL_radiomics_features_with_predictions.csv

NOTE: Dropped 19 non-numeric column(s) from features (X): diagnostics_Mask-original_Size, diagnostics_Mask-original_CenterOfMassIndex, diagnostics_Versions_PyRadiomics, diagnostics_Versions_PyWavelet, diagnostics_Mask-original_BoundingBox, diagnostics_Mask-original_CenterOfMass, diagnostics_Mask-original_Spacing, diagnostics_Image-original_Size, diagnostics_Configuration_EnabledImageTypes, case, diagnostics_Versions_Python, diagnostics_Versions_Numpy, diagnostics_Image-original_Spacing, diagnostics_Image-original_Dimensionality, diagnostics_Mask-original_Hash, error, diagnostics_Configuration_Settings, diagnostics_Versions_SimpleITK, diagnostics_Image-original_Hash
Features used for training: 1414 columns.
Training samples: 1635 | Testing samples: 409

Starting model training...
Model training complete.

          ✨ MODEL EVALUATION RESULTS ✨

==== Classification Report (Test Set) ====
                            precis

In [None]:
#XGBOOST

In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import numpy as np
from xgboost import XGBClassifier

# --- Configuration ---
# 1. PATH TO YOUR TRAINING DATA
TRAIN_FILE_PATH = r"E:\ALL_radiomics_features_with_predictions.csv"
# 2. TARGET COLUMN NAME (The column to predict)
TARGET_COLUMN = "Predicted_Origin"
# 3. IDENTIFIER COLUMNS TO EXCLUDE (Adjust this list based on your file's non-feature columns!)
EXCLUDE_COLUMNS = [TARGET_COLUMN, 'Patient_ID', 'ID']
# ---------------------

## 1. Data Loading and Cleaning

print(f"Loading data from: {TRAIN_FILE_PATH}")
df = pd.read_csv(TRAIN_FILE_PATH)

# Identify feature (X) and target (y) columns
y = df[TARGET_COLUMN]

# --- CORRECTED DATA ENCODING ---
# Convert target to numerical labels (XGBoost often prefers numerical targets)
# y_encoded is the numeric target (0, 1, 2, ...)
# class_names_index is a Pandas Index containing the original class names in order
y_encoded, class_names_index = pd.factorize(y) 

# Exclude target and any non-numeric identifier/metadata columns from features (X)
X = df.drop(columns=EXCLUDE_COLUMNS, errors='ignore')

# Ensure all remaining feature columns are strictly numeric
X_numeric = X.select_dtypes(include=np.number)

# Handle potential missing values (NaN) by imputing with the mean of the column
X_clean = X_numeric.fillna(X_numeric.mean())

# Check for features that were dropped due to being non-numeric
dropped_cols = set(X.columns) - set(X_numeric.columns)
if dropped_cols:
    print(f"\nNOTE: Dropped {len(dropped_cols)} non-numeric column(s) from features (X): {', '.join(dropped_cols)}")

# Get the list of actual features used for training
feature_columns_used = X_clean.columns.tolist()
print(f"Features used for training: {len(feature_columns_used)} columns.")

## 2. Split Data for Training and Testing

# Use 80% for training and 20% for testing
X_train, X_test, y_train, y_test = train_test_split(
    X_clean, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)
print(f"Training samples: {len(X_train)} | Testing samples: {len(X_test)}")

## 3. Train the XGBoost Classifier

# Initialize the model
model = XGBClassifier(
    objective='multi:softmax',  # Multi-class classification objective
    n_estimators=300,           # Number of boosting rounds (trees)
    learning_rate=0.1,          # Step size shrinkage to prevent overfitting
    use_label_encoder=False,    # Suppress scikit-learn warning
    eval_metric='mlogloss',     # Evaluation metric for multi-class
    random_state=42,
    n_jobs=-1
)

# Train the model
print("\nStarting XGBoost training...")
model.fit(X_train, y_train)
print("Model training complete.")

## 4. Model Evaluation

# Predict on the test set
y_pred = model.predict(X_test)

# Map numeric predictions back to original class names for the report
# Use the Index directly converted to a list
target_names = class_names_index.tolist()

y_test_decoded = [target_names[i] for i in y_test]
y_pred_decoded = [target_names[i] for i in y_pred]

print("\n" + "="*40)
print("          ✨ XGBOOST EVALUATION RESULTS ✨")
print("="*40)

# Classification Report (Precision, Recall, F1-score, Support)
print("\n==== Classification Report (Test Set) ====")
print(classification_report(y_test_decoded, y_pred_decoded))

# Confusion Matrix
print("\n==== Confusion Matrix (Test Set) ====")
cm = confusion_matrix(y_test_decoded, y_pred_decoded, labels=target_names)
print(pd.DataFrame(cm, index=target_names, columns=target_names))

# Overall Accuracy
test_accuracy = accuracy_score(y_test_decoded, y_pred_decoded)
print(f"\nOverall Test Accuracy: {test_accuracy:.4f}")

# Cross-Validation (Robustness Check)
cv_scores = cross_val_score(model, X_clean, y_encoded, cv=5)
print("\n==== 5-Fold Cross-Validation Accuracy ====")
print(f"Scores: {cv_scores}")
print(f"Mean CV Accuracy: {cv_scores.mean():.4f} (Standard Deviation: {cv_scores.std():.4f})")

print("="*40)

Loading data from: E:\ALL_radiomics_features_with_predictions.csv

NOTE: Dropped 19 non-numeric column(s) from features (X): diagnostics_Mask-original_Size, diagnostics_Mask-original_CenterOfMassIndex, diagnostics_Versions_PyRadiomics, diagnostics_Versions_PyWavelet, diagnostics_Mask-original_BoundingBox, diagnostics_Mask-original_CenterOfMass, diagnostics_Mask-original_Spacing, diagnostics_Image-original_Size, diagnostics_Configuration_EnabledImageTypes, case, diagnostics_Versions_Python, diagnostics_Versions_Numpy, diagnostics_Image-original_Spacing, diagnostics_Image-original_Dimensionality, diagnostics_Mask-original_Hash, error, diagnostics_Configuration_Settings, diagnostics_Versions_SimpleITK, diagnostics_Image-original_Hash
Features used for training: 1414 columns.
Training samples: 1635 | Testing samples: 409

Starting XGBoost training...
Model training complete.

          ✨ XGBOOST EVALUATION RESULTS ✨

==== Classification Report (Test Set) ====
                            pr

In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.svm import SVC # Import the SVM Classifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import StandardScaler # Recommended for SVM
import numpy as np

# --- Configuration ---
# 1. PATH TO YOUR TRAINING DATA
TRAIN_FILE_PATH = r"E:\ALL_radiomics_features_with_predictions.csv"
# 2. TARGET COLUMN NAME (The column to predict)
TARGET_COLUMN = "Predicted_Origin"
# 3. IDENTIFIER COLUMNS TO EXCLUDE (Adjust this list based on your file's non-feature columns!)
EXCLUDE_COLUMNS = [TARGET_COLUMN, 'Patient_ID', 'ID']
# ---------------------

## 1. Data Loading and Cleaning

print(f"Loading data from: {TRAIN_FILE_PATH}")
df = pd.read_csv(TRAIN_FILE_PATH)

# Identify feature (X) and target (y) columns
y = df[TARGET_COLUMN]

# Exclude target and any non-numeric identifier/metadata columns from features (X)
X = df.drop(columns=EXCLUDE_COLUMNS, errors='ignore')

# Ensure all remaining feature columns are strictly numeric
X_numeric = X.select_dtypes(include=np.number)

# Handle potential missing values (NaN) by imputing with the mean of the column
X_clean = X_numeric.fillna(X_numeric.mean())

# Check for features that were dropped due to being non-numeric
dropped_cols = set(X.columns) - set(X_numeric.columns)
if dropped_cols:
    print(f"\nNOTE: Dropped {len(dropped_cols)} non-numeric column(s) from features (X): {', '.join(dropped_cols)}")

# Get the list of actual features used for training
feature_columns_used = X_clean.columns.tolist()
print(f"Features used for training: {len(feature_columns_used)} columns.")

## 2. Split Data and Feature Scaling (Crucial for SVM)

# Use 80% for training and 20% for testing
X_train, X_test, y_train, y_test = train_test_split(
    X_clean, y, test_size=0.2, random_state=42, stratify=y
)
print(f"Training samples: {len(X_train)} | Testing samples: {len(X_test)}")

# Instantiate the Scaler
scaler = StandardScaler()

# Fit the scaler ONLY on the training data and transform both sets
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

## 3. Train the Support Vector Machine Classifier

# Initialize the model:
# - kernel='rbf' (Radial Basis Function) is a common choice.
# - class_weight='balanced' handles class imbalance.
# - probability=True is included to allow cross_val_score to work consistently, but may slow training.
model = SVC(
    kernel='rbf', 
    class_weight='balanced', 
    random_state=42, 
    probability=False # Set to True if you need probabilities (optional, but slower)
)
# Note: SVM is sensitive to feature scaling, which is why StandardScaler was used.

# Train the model using the scaled training data
print("\nStarting SVM training...")
model.fit(X_train_scaled, y_train)
print("Model training complete.")

## 4. Model Evaluation

# Predict on the scaled test set
y_pred = model.predict(X_test_scaled)

print("\n" + "="*40)
print("          ✨ SVM EVALUATION RESULTS (RBF Kernel) ✨")
print("="*40)

# Classification Report (Precision, Recall, F1-score, Support)
print("\n==== Classification Report (Test Set) ====")
print(classification_report(y_test, y_pred))

# Confusion Matrix
print("\n==== Confusion Matrix (Test Set) ====")
class_labels = model.classes_
cm = confusion_matrix(y_test, y_pred, labels=class_labels)
print(pd.DataFrame(cm, index=class_labels, columns=class_labels))

# Overall Accuracy
test_accuracy = accuracy_score(y_test, y_pred)
print(f"\nOverall Test Accuracy: {test_accuracy:.4f}")

# Cross-Validation (Robustness Check)
# Must scale the entire dataset for cross-validation before passing it to the function
X_scaled_full = scaler.fit_transform(X_clean) 
cv_scores = cross_val_score(model, X_scaled_full, y, cv=5)
print("\n==== 5-Fold Cross-Validation Accuracy ====")
print(f"Scores: {cv_scores}")
print(f"Mean CV Accuracy: {cv_scores.mean():.4f} (Standard Deviation: {cv_scores.std():.4f})")

print("="*40)

Loading data from: E:\ALL_radiomics_features_with_predictions.csv

NOTE: Dropped 19 non-numeric column(s) from features (X): diagnostics_Mask-original_Size, diagnostics_Mask-original_CenterOfMassIndex, diagnostics_Versions_PyRadiomics, diagnostics_Versions_PyWavelet, diagnostics_Mask-original_BoundingBox, diagnostics_Mask-original_CenterOfMass, diagnostics_Mask-original_Spacing, diagnostics_Image-original_Size, diagnostics_Configuration_EnabledImageTypes, case, diagnostics_Versions_Python, diagnostics_Versions_Numpy, diagnostics_Image-original_Spacing, diagnostics_Image-original_Dimensionality, diagnostics_Mask-original_Hash, error, diagnostics_Configuration_Settings, diagnostics_Versions_SimpleITK, diagnostics_Image-original_Hash
Features used for training: 1414 columns.
Training samples: 1635 | Testing samples: 409

Starting SVM training...
Model training complete.

          ✨ SVM EVALUATION RESULTS (RBF Kernel) ✨

==== Classification Report (Test Set) ====
                         

In [9]:
%pip install tensorflow

Collecting tensorflow
  Downloading tensorflow-2.20.0-cp311-cp311-win_amd64.whl.metadata (4.6 kB)
Collecting astunparse>=1.6.0 (from tensorflow)
  Using cached astunparse-1.6.3-py2.py3-none-any.whl.metadata (4.4 kB)
Collecting flatbuffers>=24.3.25 (from tensorflow)
  Using cached flatbuffers-25.9.23-py2.py3-none-any.whl.metadata (875 bytes)
Collecting gast!=0.5.0,!=0.5.1,!=0.5.2,>=0.2.1 (from tensorflow)
  Using cached gast-0.6.0-py3-none-any.whl.metadata (1.3 kB)
Collecting google_pasta>=0.1.1 (from tensorflow)
  Using cached google_pasta-0.2.0-py3-none-any.whl.metadata (814 bytes)
Collecting libclang>=13.0.0 (from tensorflow)
  Using cached libclang-18.1.1-py2.py3-none-win_amd64.whl.metadata (5.3 kB)
Collecting opt_einsum>=2.3.2 (from tensorflow)
  Using cached opt_einsum-3.4.0-py3-none-any.whl.metadata (6.3 kB)
Collecting termcolor>=1.1.0 (from tensorflow)
  Using cached termcolor-3.2.0-py3-none-any.whl.metadata (6.4 kB)
Collecting wrapt>=1.11.0 (from tensorflow)
  Downloading wrapt

In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import StandardScaler, LabelBinarizer # New imports for DL
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D, Flatten, Dropout, MaxPooling1D
from tensorflow.keras.utils import to_categorical

# --- Configuration ---
# 1. PATH TO YOUR TRAINING DATA
TRAIN_FILE_PATH = r"E:\ALL_radiomics_features_with_predictions.csv"
# 2. TARGET COLUMN NAME (The column to predict)
TARGET_COLUMN = "Predicted_Origin"
# 3. IDENTIFIER COLUMNS TO EXCLUDE 
EXCLUDE_COLUMNS = [TARGET_COLUMN, 'Patient_ID', 'ID']
# ---------------------

## 1. Data Loading and Cleaning

print(f"Loading data from: {TRAIN_FILE_PATH}")
df = pd.read_csv(TRAIN_FILE_PATH)

# Identify feature (X) and target (y) columns
y = df[TARGET_COLUMN]

# Exclude identifier/metadata columns from features (X)
X = df.drop(columns=EXCLUDE_COLUMNS, errors='ignore')

# Ensure all remaining feature columns are strictly numeric
X_numeric = X.select_dtypes(include=np.number)
X_clean = X_numeric.fillna(X_numeric.mean())

# Get the list of actual features used for training
FEATURE_COUNT = X_clean.shape[1]
print(f"Features used for training: {FEATURE_COUNT} columns.")

## 2. Target Encoding (One-Hot Encoding)

# Convert target variable (y) to numerical codes
lb = LabelBinarizer()
y_encoded = lb.fit_transform(y)

# Get the original class names and the number of classes
class_names = lb.classes_
NUM_CLASSES = len(class_names)
print(f"Number of classes: {NUM_CLASSES}")

## 3. Split Data and Feature Scaling

# Use 80% for training and 20% for testing
X_train, X_test, y_train, y_test = train_test_split(
    X_clean, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)

# Scaling (Crucial for Deep Learning)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Reshape data for 1D CNN: [samples, features, 1 channel]
# We treat the feature vector as a 1D sequence.
X_train_reshaped = X_train_scaled.reshape(X_train_scaled.shape[0], FEATURE_COUNT, 1)
X_test_reshaped = X_test_scaled.reshape(X_test_scaled.shape[0], FEATURE_COUNT, 1)

print(f"Training samples: {len(X_train)} | Testing samples: {len(X_test)}")
print(f"Input shape for CNN: {X_train_reshaped.shape[1:]}")


## 4. Define and Train the 1D CNN Model

def create_cnn_model(input_shape, num_classes):
    model = Sequential()
    
    # 1D Convolutional Layer (Learns local patterns in the feature sequence)
    model.add(Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=input_shape))
    model.add(MaxPooling1D(pool_size=2))
    
    model.add(Conv1D(filters=32, kernel_size=3, activation='relu'))
    model.add(Dropout(0.3))
    
    # Flatten the output of the convolutional layers
    model.add(Flatten())
    
    # Standard Dense (Fully Connected) Layers
    model.add(Dense(100, activation='relu'))
    model.add(Dense(num_classes, activation='softmax')) # Softmax for multi-class classification
    
    model.compile(
        optimizer='adam',
        loss='categorical_crossentropy', # Standard loss for one-hot encoded multi-class
        metrics=['accuracy']
    )
    return model

# Create and compile the model
model = create_cnn_model((FEATURE_COUNT, 1), NUM_CLASSES)
model.summary()

# Train the model
print("\nStarting 1D CNN training (fitting)...")
history = model.fit(
    X_train_reshaped, y_train, 
    epochs=50, # Number of training iterations (can be adjusted)
    batch_size=32, 
    validation_data=(X_test_reshaped, y_test),
    verbose=0 # Set to 1 for progress bar
)
print("Model training complete.")

## 5. Model Evaluation

# Evaluate the model on the test set
loss, accuracy = model.evaluate(X_test_reshaped, y_test, verbose=0)
print("\n" + "="*50)
print("          ✨ 1D CNN EVALUATION RESULTS (Keras) ✨")
print("="*50)
print(f"Overall Test Accuracy: {accuracy:.4f}")

# Generate predictions
y_proba = model.predict(X_test_reshaped)
y_pred_encoded = np.argmax(y_proba, axis=1)

# Decode predictions and true values back to original class names
y_test_decoded = lb.inverse_transform(y_test)
y_pred_decoded = lb.inverse_transform(to_categorical(y_pred_encoded, num_classes=NUM_CLASSES))

# Classification Report
print("\n==== Classification Report (Test Set) ====")
print(classification_report(y_test_decoded, y_pred_decoded, target_names=class_names))

# Confusion Matrix
print("\n==== Confusion Matrix (Test Set) ====")
cm = confusion_matrix(y_test_decoded, y_pred_decoded, labels=class_names)
print(pd.DataFrame(cm, index=class_names, columns=class_names))

print("="*50)

Loading data from: E:\ALL_radiomics_features_with_predictions.csv
Features used for training: 1414 columns.
Number of classes: 6
Training samples: 1635 | Testing samples: 409
Input shape for CNN: (1414, 1)



Starting 1D CNN training (fitting)...
Model training complete.

          ✨ 1D CNN EVALUATION RESULTS (Keras) ✨
Overall Test Accuracy: 0.7506
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step

==== Classification Report (Test Set) ====
                            precision    recall  f1-score   support

             Breast cancer       0.60      0.54      0.57        52
  Gastrointestinal cancers       0.33      0.14      0.20         7
                  Melanoma       0.81      0.65      0.72       113
Non small cell lung cancer       0.77      0.87      0.81       230
      Renal cell carcinoma       0.50      0.33      0.40         3
    Small cell lung cancer       0.67      1.00      0.80         4

                  accuracy                           0.75       409
                 macro avg       0.61      0.59      0.58       409
              weighted avg       0.75      0.75      0.74       409


==== Confusion Matrix (Test Set) ====
                  