In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report
import xgboost as xgb
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# ---- STEP 1: Load Dataset ----
file_path = "/NSCLC ML RESEARCH.csv"  # Update with correct file path
df = pd.read_csv(file_path)

# Drop irrelevant columns
irrelevant_columns = ["Study ID", "Patient ID", "Sample ID", "Form completion date", "Other Patient ID"]
df_cleaned = df.drop(columns=irrelevant_columns, errors='ignore')

# Handle missing values: Fill numerical with mean, categorical with mode
for col in df_cleaned.columns:
    if df_cleaned[col].dtype == "object":
        df_cleaned[col] = df_cleaned[col].fillna(df_cleaned[col].mode()[0])  # Fill categorical with mode
    else:
        df_cleaned[col] = df_cleaned[col].fillna(df_cleaned[col].mean())  # Fill numerical with mean

# Encode categorical variables
label_encoders = {}
for col in df_cleaned.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    df_cleaned[col] = le.fit_transform(df_cleaned[col])
    label_encoders[col] = le  # Store label encoders for inverse transformation if needed

# Select target variable (Overall Survival Status)
target_column = "Overall Survival Status"
X = df_cleaned.drop(columns=[target_column])  # Features
y = df_cleaned[target_column]  # Target

# Handle remaining missing values using Imputer
imputer = SimpleImputer(strategy="mean")  # Replace NaNs with column mean
X_imputed = imputer.fit_transform(X)

# Normalize numerical features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_imputed)

# Split data into Training (70%), Validation (15%), and Test (15%)
X_train, X_temp, y_train, y_temp = train_test_split(X_scaled, y, test_size=0.30, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.50, random_state=42)

# ---- STEP 2: Train & Evaluate Models ----
model_results = {}

# ---- Train Random Forest ----
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)
model_results["Random Forest"] = accuracy_score(y_test, y_pred_rf)

# ---- Train XGBoost ----
xgb_model = xgb.XGBClassifier(n_estimators=100, learning_rate=0.05, eval_metric='logloss')
xgb_model.fit(X_train, y_train)
y_pred_xgb = xgb_model.predict(X_test)
model_results["XGBoost"] = accuracy_score(y_test, y_pred_xgb)

# ---- Train SVM ----
svm_model = SVC(kernel='rbf', probability=True)
svm_model.fit(X_train, y_train)
y_pred_svm = svm_model.predict(X_test)
model_results["SVM"] = accuracy_score(y_test, y_pred_svm)

# ---- Train Decision Tree ----
dt_model = DecisionTreeClassifier(max_depth=4)
dt_model.fit(X_train, y_train)
y_pred_dt = dt_model.predict(X_test)
model_results["Decision Tree"] = accuracy_score(y_test, y_pred_dt)

# ---- Train Neural Network ----
nn_model = Sequential([
    Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')  # Binary classification (Survival: Yes/No)
])

nn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
nn_model.fit(X_train, y_train, epochs=20, batch_size=16, validation_data=(X_val, y_val), verbose=0)
test_loss, test_acc_nn = nn_model.evaluate(X_test, y_test, verbose=0)
model_results["Neural Network"] = test_acc_nn

# ---- STEP 3: Compare Model Performances ----
print("\nModel Performance Comparison (Accuracy Scores):")
for model, acc in model_results.items():
    print(f"{model}: {acc:.4f}")


 'Neoplasm Histologic Grade' 'Primary Lymph Node Presentation Assessment'
 'Number of Samples Per Patient' 'Patient Weight']. At least one non-missing value is needed for imputation with strategy='mean'.
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)



Model Performance Comparison (Accuracy Scores):
Random Forest: 0.9051
XGBoost: 0.9430
SVM: 0.8671
Decision Tree: 0.8671
Neural Network: 0.8734
