In [13]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder



In [14]:

embeddings_df = pd.read_csv("out/embeddings.csv")
clinical_df = pd.read_csv("out/data.csv")[["cases.submitter_id", "OS"]]

embeddings_df = embeddings_df['embedding'].str.split(',', expand=True)

# Optional: convert all columns to float
embeddings_df = embeddings_df.astype(float)

# Optional: rename columns like 'emb_0', 'emb_1', ...
embeddings_df.columns = [f'emb_{i}' for i in range(embeddings_df.shape[1])]

# Combine with the patient ID
embeddings_df = pd.concat([pd.read_csv("out/embeddings.csv")['patient'], embeddings_df], axis=1)

data = pd.concat([embeddings_df, clinical_df], axis=1)

data = data.drop(columns=["cases.submitter_id"])

# Step 3: Initial exploration
print("Initial data shape:", data.shape)
print(data.head())
print(data.info())
print(data['OS'].value_counts())


Initial data shape: (9523, 770)
                                             patient     emb_0     emb_1  \
0  TCGA-BP-5195.25c0b433-5557-4165-922e-2c1eac9c26f0 -0.791706  0.917843   
1  TCGA-D7-8573.b7306a47-697d-4ed3-bbe1-81d49674a8f8 -0.092357  1.072981   
2  TCGA-EI-7004.13591eed-30e5-47a3-91be-7a370663d2d4  0.046483  0.670229   
3  TCGA-EB-A82B.23E186C6-739C-4EF1-8788-79AA89C6E87A -0.119469  1.369852   
4  TCGA-A6-3808.e1505f65-72ef-438d-a5e1-93ed8bf6635d -0.134870  1.056534   

      emb_2     emb_3     emb_4     emb_5     emb_6     emb_7     emb_8  ...  \
0 -0.209761  0.121982 -0.600781  0.184892  0.030984  0.852848  1.510978  ...   
1 -0.159447  0.515891 -0.367665  0.330049 -0.035655  0.727242  0.846827  ...   
2 -0.156231  0.588667 -0.083936  0.204165  0.078950  1.097972  0.582440  ...   
3 -0.230224  0.212354 -0.324304  0.094714 -0.402293  0.979625  0.758534  ...   
4 -0.171030  0.619202 -0.385450  0.396186  0.126516  0.904694  1.248202  ...   

    emb_759   emb_760   emb_76

In [15]:
for col in data.columns:
    if data[col].dtype == 'object':
        data[col] = data[col].astype(str)



In [16]:
num_cols = data.select_dtypes(include=[np.number]).columns
cat_cols = data.select_dtypes(include=['object']).columns


In [17]:
num_imputer = SimpleImputer(strategy='median')
for col in num_cols:
    if pd.api.types.is_numeric_dtype(data[col]):
        data[col] = data[col].astype(float)
        data[col] = num_imputer.fit_transform(data[[col]])


In [18]:
if len(cat_cols) > 0:
    cat_imputer = SimpleImputer(strategy='most_frequent')
    data[cat_cols] = pd.DataFrame(
        cat_imputer.fit_transform(data[cat_cols]),
        columns=cat_cols,
        index=data.index
    )


In [19]:
le_dict = {}
for col in cat_cols:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    le_dict[col] = le


In [20]:
X = data.drop('OS', axis=1)
y = data['OS']  # make sure OS is categorical (0/1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [21]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [22]:
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)


0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [23]:
y_pred = clf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


Accuracy: 0.979002624671916
Classification Report:
               precision    recall  f1-score   support

         0.0       0.00      0.00      0.00        40
         1.0       0.98      1.00      0.99      1865

    accuracy                           0.98      1905
   macro avg       0.49      0.50      0.49      1905
weighted avg       0.96      0.98      0.97      1905

Confusion Matrix:
 [[   0   40]
 [   0 1865]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [24]:
importances = clf.feature_importances_
feature_importance_df = pd.DataFrame({'feature': X.columns, 'importance': importances})
feature_importance_df = feature_importance_df.sort_values(by='importance', ascending=False)
print(feature_importance_df.head(10))


     feature  importance
0    patient    0.010966
400  emb_399    0.008620
414  emb_413    0.006934
256  emb_255    0.005810
589  emb_588    0.005218
363  emb_362    0.004979
462  emb_461    0.004741
501  emb_500    0.004074
409  emb_408    0.004037
651  emb_650    0.003935
