In [74]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder



In [75]:
# Load the raw table; the path is relative to the notebook's working directory.
data = pd.read_csv("out/data.csv")

# Step 3: Initial exploration
print("Initial data shape:", data.shape)
print(data.head())
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line to the output.
data.info()
# dropna=False also counts rows whose 'OS' label is missing, so gaps in the
# target are visible here rather than being silently left out of the tally.
print(data['OS'].value_counts(dropna=False))


Initial data shape: (253, 61)
  project.project_id       cases.disease_type cases.index_date  \
0          TCGA-LUSC  Squamous Cell Neoplasms        Diagnosis   
1          TCGA-LUSC  Squamous Cell Neoplasms        Diagnosis   
2          TCGA-LUSC  Squamous Cell Neoplasms        Diagnosis   
3          TCGA-LUSC  Squamous Cell Neoplasms        Diagnosis   
4          TCGA-LUSC  Squamous Cell Neoplasms        Diagnosis   

  cases.primary_site  demographic.age_at_index  demographic.age_is_obfuscated  \
0  Bronchus and lung                      67.0                          False   
1  Bronchus and lung                      72.0                          False   
2  Bronchus and lung                      74.0                          False   
3  Bronchus and lung                      81.0                          False   
4  Bronchus and lung                      63.0                          False   

  demographic.country_of_residence_at_enrollment  demographic.days_to_birth  \
0      

In [76]:
# Normalise the observed values of every object-dtype column to str so each
# column holds a single Python type.  Missing entries are deliberately left as
# NaN: a blanket astype(str) turns them into the literal text 'nan', which the
# most-frequent imputer applied to these columns afterwards would treat as an
# ordinary category instead of a gap to fill.
for col in data.columns:
    if data[col].dtype == 'object':
        is_missing = data[col].isna()
        # where() keeps the original (NaN) wherever is_missing is True and
        # takes the stringified value everywhere else.
        data[col] = data[col].where(is_missing, data[col].astype(str))



In [77]:
# Partition the column labels by dtype: object columns are handled as
# categorical, every numeric dtype as numerical.
cat_cols = data.select_dtypes(include='object').columns
num_cols = data.select_dtypes(include='number').columns


In [78]:
# Rows without an outcome cannot be used for supervised learning.  Whenever
# 'OS' is numeric it is part of num_cols, so a blanket median fill would invent
# labels for those rows (the median of a 0/1 column can even be 0.5).  Drop
# them instead of imputing the target.
missing_target = data['OS'].isna()
if missing_target.any():
    print(f"Dropping {int(missing_target.sum())} rows with a missing 'OS' label")
    data = data.loc[~missing_target].copy()

# Every numeric column is stored as float, matching the dtype the imputer emits.
if len(num_cols) > 0:
    data[num_cols] = data[num_cols].astype(float)

# Median-impute the numeric *feature* columns in one pass.  Columns with no
# observed value at all are skipped: SimpleImputer discards such columns from
# its output, which would break the assignment back into the frame.
feature_num_cols = [col for col in num_cols if col != 'OS']
empty_num_cols = [col for col in feature_num_cols if not data[col].notna().any()]
if empty_num_cols:
    print("Numeric columns with no observed values left unimputed:", empty_num_cols)
imputable_num_cols = [col for col in feature_num_cols if col not in empty_num_cols]

num_imputer = SimpleImputer(strategy='median')
if imputable_num_cols:
    data[imputable_num_cols] = num_imputer.fit_transform(data[imputable_num_cols])




In [79]:
# Fill gaps in the categorical columns with each column's most frequent value.
# Nothing to do when the frame has no object-dtype columns.
if len(cat_cols) > 0:
    cat_imputer = SimpleImputer(strategy='most_frequent')
    filled_values = cat_imputer.fit_transform(data[cat_cols])
    # Wrap the imputer's output in a frame that carries the original row index
    # and column labels so the assignment lines up with `data`.
    filled_frame = pd.DataFrame(filled_values, index=data.index, columns=cat_cols)
    data[cat_cols] = filled_frame


In [80]:
# One LabelEncoder per categorical column, kept in le_dict (keyed by column
# name) so the fitted encoders remain available after the columns are
# overwritten with their encoded values.
le_dict = {col: LabelEncoder() for col in cat_cols}
for col, le in le_dict.items():
    data[col] = le.fit_transform(data[col])


In [81]:
# Target vector: the 'OS' column.
# NOTE(review): the original comment asks for OS to be categorical (0/1);
# nothing in this cell verifies that — confirm upstream.
y = data['OS']
# Feature matrix: every remaining column.
X = data.drop(columns='OS')

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [82]:
# Learn the scaling statistics from the training rows only, then apply that
# same fitted transform to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)


In [83]:
# Random forest with 100 trees and a fixed seed.
clf = RandomForestClassifier(
    random_state=42,
    n_estimators=100,
)
# The fit call is left as the cell's final expression so the notebook also
# renders the fitted estimator's parameters.
clf.fit(X_train, y_train)


0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [84]:
# Predict on the held-out rows, compute each metric once, then print them.
y_pred = clf.predict(X_test)

test_accuracy = accuracy_score(y_test, y_pred)
report_text = classification_report(y_test, y_pred)
conf_mat = confusion_matrix(y_test, y_pred)

print("Accuracy:", test_accuracy)
print("Classification Report:\n", report_text)
print("Confusion Matrix:\n", conf_mat)


Accuracy: 0.7058823529411765
Classification Report:
               precision    recall  f1-score   support

         0.0       0.70      0.67      0.68        24
         1.0       0.71      0.74      0.73        27

    accuracy                           0.71        51
   macro avg       0.70      0.70      0.70        51
weighted avg       0.71      0.71      0.71        51

Confusion Matrix:
 [[16  8]
 [ 7 20]]


In [85]:
# Pair every feature name with the forest's importance score and list the ten
# highest-scoring features.
importances = clf.feature_importances_
feature_importance_df = (
    pd.DataFrame({'feature': X.columns, 'importance': importances})
    .sort_values('importance', ascending=False)
)
print(feature_importance_df.head(10))


                                           feature  importance
12                      diagnoses.age_at_diagnosis    0.076465
7                        demographic.days_to_birth    0.071368
4                         demographic.age_at_index    0.063579
36                     diagnoses.year_of_diagnosis    0.059654
16                     diagnoses.ajcc_pathologic_t    0.054359
15                 diagnoses.ajcc_pathologic_stage    0.043690
6   demographic.country_of_residence_at_enrollment    0.039570
17           diagnoses.ajcc_staging_system_edition    0.035174
54                       treatments.treatment_type    0.034580
31                  diagnoses.sites_of_involvement    0.029642
