In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score

In [2]:
# Load the data from the Excel file
data = pd.read_csv('clinical_data_with_img_features.csv')

# Preprocess the data
data = data.fillna(-999) # replace missing values with -999

# remove the '%' symbol from the 'total_response_percent' column
data['total_response_percent'] = data['total_response_percent'].str.rstrip('%')
data['necrosis_percent'] = data['necrosis_percent'].str.rstrip('%')
data['fibrosis_percent'] = data['fibrosis_percent'].str.rstrip('%')
data['mucin_percent'] = data['mucin_percent'].str.rstrip('%')

# convert the 'total_response_percent' column to float
data['total_response_percent'] = data['total_response_percent'].astype(float)
data['necrosis_percent'] = data['necrosis_percent'].astype(float)
data['fibrosis_percent'] = data['fibrosis_percent'].astype(float)
data['mucin_percent'] = data['mucin_percent'].astype(float)

# Drop the irrelevant columns
X = data.drop(['Patient-ID', 'De-identify Scout Name', 'progression_or_recurrence_liveronly', 'relevant_notes'], axis=1)
y = data['progression_or_recurrence_liveronly']

# Create a random forest classifier and fit the data
rfc = RandomForestClassifier()
rfc.fit(X, y)

# Get the feature importances and sort them in descending order
importances = list(rfc.feature_importances_)
feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(X.columns, importances)]
feature_importances = sorted(feature_importances, key=lambda x: x[1], reverse=True)

# Print the top 5 most influential features
for feature, importance in feature_importances[:5]:
    print('{}: {}'.format(feature, importance))

months_to_liver_DFS_progression: 0.18
vital_status_liver_DFS: 0.11
months_to_DFS_progression: 0.09
overall_survival_months: 0.06
progression_or_recurrence: 0.06


In [4]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a random forest classifier and fit the training data
rfc = RandomForestClassifier()
# rfc.fit(X_train, y_train)

# Predict the test data and calculate the accuracy and precision
# y_pred = rfc.predict(X_test)
# accuracy = accuracy_score(y_test, y_pred)
# precision = precision_score(y_test, y_pred)

# # Print the accuracy and precision
# print('Accuracy: {:.2f}%'.format(accuracy * 100))
# print('Precision: {:.2f}%'.format(precision * 100))

In [6]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(rfc, X, y, cv=5)
scores.mean()

0.8574358974358974