In [2]:
pip install pyradiomics





In [3]:
import os
import nibabel as nib
import pandas as pd
import pyradiomics
# from pyradiomics import featureextractor
import logging

# Optional: suppress verbose logging
logging.getLogger('radiomics').setLevel(logging.ERROR)

# Path to your segmented tumor masks
base_dir = r"C:\PKG - UPENN-GBM-NIfTI\UPENN-GBM\NIfTI-files\automated_segm"
output_csv = "tumor_features.csv"

# Configure feature extractor
extractor = featureextractor.RadiomicsFeatureExtractor()
extractor.enableAllFeatures()

data = []

for file in os.listdir(base_dir):
    if file.endswith('.nii.gz'):
        patient_id = file.split('_')[2] if '_' in file else file.split('.')[0]
        path = os.path.join(base_dir, file)

        # Since it's a binary tumor mask, use the same file as both image and mask
        try:
            result = extractor.execute(imageFilepath=path, maskFilepath=path)
            filtered_result = {k: v for k, v in result.items() if "diagnostics" not in k}
            filtered_result['PatientID'] = patient_id
            data.append(filtered_result)
        except Exception as e:
            print(f"Failed for {file}: {e}")

# Create DataFrame and save to CSV
df = pd.DataFrame(data)
df.set_index("PatientID", inplace=True)
df.to_csv(output_csv)

print(f"Saved {len(df)} feature vectors to {output_csv}")

ModuleNotFoundError: No module named 'pyradiomics'

In [8]:
import os
import torch
import torch.nn as nn
import nibabel as nib
import numpy as np
import pandas as pd
from scipy.ndimage import zoom

# ======= 1. CNN Model (Simple 3D CNN as feature extractor) ============
class TumorFeatureCNN(nn.Module):
    def __init__(self):
        super(TumorFeatureCNN, self).__init__()
        self.conv_layers = nn.Sequential(
            nn.Conv3d(1, 8, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool3d(2),
            nn.Conv3d(8, 16, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.AdaptiveAvgPool3d((4, 4, 4))  # Output shape: (16, 4, 4, 4)
        )
    def forward(self, x):
        x = self.conv_layers(x)
        return x.view(x.size(0), -1)  # Flatten features (batch_size, 1024)

# ========= 2. NIfTI loader and preprocessing ===============
def load_nifti(path, target_shape=(64, 64, 64)):
    data = nib.load(path).get_fdata()
    data = (data - data.min()) / (data.max() - data.min() + 1e-6)  # Normalize to [0, 1]
    zoom_factors = [t / s for t, s in zip(target_shape, data.shape)]
    data = zoom(data, zoom_factors, order=1)  # Resize
    data = np.expand_dims(data, axis=0)  # Add channel dim (1, D, H, W)
    return torch.tensor(data, dtype=torch.float32)

# ========== 3. Feature Extraction Pipeline ================
def extract_features_from_folder(folder_path):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = TumorFeatureCNN().to(device).eval()

    features = []
    patient_ids = []

    for fname in os.listdir(folder_path):
        if fname.endswith('.nii.gz'):
            path = os.path.join(folder_path, fname)
            # Extract everything before "_automated_approx_segm.nii.gz"
            patient_id = fname.replace('_automated_approx_segm.nii.gz', '')

            try:
                tensor = load_nifti(path).unsqueeze(0).to(device)
                with torch.no_grad():
                    feat = model(tensor).cpu().numpy().flatten()
                features.append(feat)
                patient_ids.append(patient_id)
            except Exception as e:
                print(f"[ERROR] {fname}: {e}")


    df = pd.DataFrame(features)
    df.insert(0, "PatientID", patient_ids)
    return df

# ========== 4. Run and Save to CSV ========================
if __name__ == "__main__":
    input_folder = r"C:\PKG - UPENN-GBM-NIfTI\UPENN-GBM\NIfTI-files\automated_segm"
    output_csv = "cnn_tumor_features.csv"

    df = extract_features_from_folder(input_folder)
    df.to_csv(output_csv, index=False)
    print(f"✅ Features extracted and saved to {output_csv}")

✅ Features extracted and saved to cnn_tumor_features.csv


In [13]:
import pandas as pd

# File paths
cnn_features_path = r'D:\mlpr data\Glioblastoma-ML-model\cnn_tumor_features.csv'
clinical_info_path = r'D:\mlpr data\Glioblastoma-ML-model\UPENN-GBM_clinical_info_v2.1.csv'
output_path = r'D:\mlpr data\Glioblastoma-ML-model\cnn_stacked.csv'

# Load CSVs
cnn_df = pd.read_csv(cnn_features_path)
clinical_df = pd.read_csv(clinical_info_path)

# Merge: match cnn_df.PatientID with clinical_df.ID
merged_df = cnn_df.merge(
    clinical_df[['ID', 'Survival_from_surgery_days_UPDATED']],
    left_on='PatientID',
    right_on='ID',
    how='left'
)

# Drop redundant 'ID' column (optional)
merged_df.drop(columns=['ID'], inplace=True)

# Save result
merged_df.to_csv(output_path, index=False)

print("✅ Merged CSV saved as cnn_stacked.csv at:", output_path)


✅ Merged CSV saved as cnn_stacked.csv at: D:\mlpr data\Glioblastoma-ML-model\cnn_stacked.csv


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from imblearn.over_sampling import SMOTE
import xgboost as xgb
from sklearn.ensemble import StackingClassifier

# Load the dataset
file_path = r"D:\mlpr data\Glioblastoma-ML-model\cnn_stacked.csv"
df = pd.read_csv(file_path)

# Convert target column to numeric
df["Survival_from_surgery_days_UPDATED"] = pd.to_numeric(df["Survival_from_surgery_days_UPDATED"], errors="coerce")

# Drop rows where target variable is NaN
df = df.dropna(subset=["Survival_from_surgery_days_UPDATED"])

# Percentile-Based Binning for classification
percentiles = np.percentile(df["Survival_from_surgery_days_UPDATED"], [25, 50, 75])
bins = [0, percentiles[0], percentiles[1], percentiles[2], np.inf]
labels = [0, 1, 2, 3]
df["Survival_Category"] = pd.cut(df["Survival_from_surgery_days_UPDATED"], bins=bins, labels=labels)

# Separate features and target
X = df.drop(columns=[df.columns[0], "Survival_from_surgery_days_UPDATED", "Survival_Category"])  # Drop ID and targets
y = df["Survival_Category"]

# Fill missing values with median
X = X.apply(pd.to_numeric, errors="coerce").fillna(X.median())

# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Dimensionality reduction
lda = LinearDiscriminantAnalysis(n_components=3)
X_lda = lda.fit_transform(X_scaled, y)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_lda, y, test_size=0.2, random_state=42, stratify=y)

# Balance classes with SMOTE
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)

# Define base models
rf_clf = RandomForestClassifier(n_estimators=150, random_state=42)
xgb_clf = xgb.XGBClassifier(n_estimators=200, learning_rate=0.05, max_depth=6, random_state=42)
log_reg = LogisticRegression(max_iter=1000)
lda_clf = LinearDiscriminantAnalysis()

# Voting Classifier
ensemble_model = VotingClassifier(
    estimators=[("RandomForest", rf_clf), ("XGBoost", xgb_clf), ("LogReg", log_reg), ("LDA", lda_clf)],
    voting="hard"
)

# Stacking Classifier
stacking_clf = StackingClassifier(
    estimators=[("RandomForest", rf_clf), ("XGBoost", xgb_clf), ("LogReg", log_reg), ("LDA", lda_clf)],
    final_estimator=RandomForestClassifier()
)

# Train models
stacking_clf.fit(X_train, y_train)
ensemble_model.fit(X_train, y_train)

# Predictions
y_pred_ensemble = ensemble_model.predict(X_test)
y_pred_stacking = stacking_clf.predict(X_test)

# Evaluate
accuracy_ensemble = accuracy_score(y_test, y_pred_ensemble)
accuracy_stacking = accuracy_score(y_test, y_pred_stacking)
report_ensemble = classification_report(y_test, y_pred_ensemble)
report_stacking = classification_report(y_test, y_pred_stacking)

print(f"Ensemble Model Accuracy with LDA: {accuracy_ensemble:.2f}")
print("Classification Report:\n", report_ensemble)
print(f"Stacking Model Accuracy with LDA: {accuracy_stacking:.2f}")
print("Classification Report:\n", report_stacking)


Ensemble Model Accuracy with LDA: 0.35
Classification Report:
               precision    recall  f1-score   support

           0       0.39      0.50      0.44        30
           1       0.27      0.24      0.25        29
           2       0.35      0.38      0.37        29
           3       0.36      0.28      0.31        29

    accuracy                           0.35       117
   macro avg       0.35      0.35      0.34       117
weighted avg       0.35      0.35      0.34       117

Stacking Model Accuracy with LDA: 0.31
Classification Report:
               precision    recall  f1-score   support

           0       0.28      0.30      0.29        30
           1       0.30      0.31      0.31        29
           2       0.27      0.28      0.27        29
           3       0.40      0.34      0.37        29

    accuracy                           0.31       117
   macro avg       0.31      0.31      0.31       117
weighted avg       0.31      0.31      0.31       117

