In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Load dataset
file_path = "/NSCLC ML RESEARCH.csv"
df = pd.read_csv(file_path)

# Drop irrelevant columns
irrelevant_columns = ["Study ID", "Patient ID", "Sample ID", "Form completion date", "Other Patient ID"]
df_cleaned = df.drop(columns=irrelevant_columns, errors='ignore')

# Handle missing values: Fill numerical with mean, categorical with mode
for col in df_cleaned.columns:
    if df_cleaned[col].dtype == "object":
        df_cleaned[col].fillna(df_cleaned[col].mode()[0], inplace=True)  # Fill categorical with mode
    else:
        df_cleaned[col].fillna(df_cleaned[col].mean(), inplace=True)  # Fill numerical with mean

# Encode categorical variables
label_encoders = {}
for col in df_cleaned.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    df_cleaned[col] = le.fit_transform(df_cleaned[col])
    label_encoders[col] = le  # Store label encoders for inverse transformation if needed

# Select target variable (Overall Survival Status)
target_column = "Overall Survival Status"
X = df_cleaned.drop(columns=[target_column], errors='ignore')  # Features
y = df_cleaned[target_column]  # Target

# Normalize numerical features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split data into Training (70%), Validation (15%), and Test (15%)
X_train, X_temp, y_train, y_temp = train_test_split(X_scaled, y, test_size=0.30, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.50, random_state=42)

# Display dataset shapes
print(f"Training Set: {X_train.shape}, Validation Set: {X_val.shape}, Test Set: {X_test.shape}")


Training Set: (737, 57), Validation Set: (158, 57), Test Set: (158, 57)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_cleaned[col].fillna(df_cleaned[col].mean(), inplace=True)  # Fill numerical with mean
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_cleaned[col].fillna(df_cleaned[col].mode()[0], inplace=True)  # Fill categorical with mode
  updated_mean = (last_sum + new_sum) / updated_s