In [44]:
import data_analysis.data_processing as dp
import data_analysis.model_training as mt 


In [45]:
# Load the sample diabetes dataset from a path relative to the working
# directory and preview its first rows.
# NOTE(review): the preview shows 'Unnamed: 0.1' and 'Unnamed: 0' columns,
# which look like index columns written into the CSV by earlier saves —
# confirm whether they should be dropped at load time.

filename = 'data/sample_diabetes_mellitus_data.csv'
df = dp.load_data(filename)
df.head()

Unnamed: 0.1,Unnamed: 0,encounter_id,hospital_id,age,bmi,elective_surgery,ethnicity,gender,height,hospital_admit_source,...,ventilated_apache,wbc_apache,aids,cirrhosis,hepatic_failure,immunosuppression,leukemia,lymphoma,solid_tumor_with_metastasis,diabetes_mellitus
0,0,214826,118,68.0,22.732803,0,Caucasian,M,180.3,Floor,...,0,14.1,0,0,0,0,0,0,0,1
1,1,246060,81,77.0,27.421875,0,Caucasian,F,160.0,Floor,...,1,12.7,0,0,0,0,0,0,0,1
2,2,276985,118,25.0,31.952749,0,Caucasian,F,172.7,Emergency Department,...,0,,0,0,0,0,0,0,0,0
3,3,262220,118,81.0,22.635548,1,Caucasian,F,165.1,Operating Room,...,1,8.0,0,0,0,0,0,0,0,0
4,4,201746,33,19.0,,0,Caucasian,M,188.0,,...,0,,0,0,0,0,0,0,0,0


In [46]:
# Split the loaded data into a training frame and a test frame.
# NOTE(review): the split proportion and any shuffling/seed are decided inside
# dp.split_data and are not visible here — confirm the split is reproducible.

df_train, df_test = dp.split_data(df)

In [47]:
# Drop the rows that have NaN in age, gender or ethnicity.
# The same cleaning step is applied to the training split and then the test split.

df_train, df_test = (
    dp.remove_nan_rows(split) for split in (df_train, df_test)
)

In [48]:
# Replace NaN in the height and weight columns with the column mean,
# for the training split and then the test split.
# NOTE(review): each split is passed on its own, so the test split is
# presumably imputed with its own mean rather than the training mean — confirm
# this is intended.

df_train, df_test = (
    dp.fill_nan_with_mean(split) for split in (df_train, df_test)
)


In [49]:
# One-hot encode the ethnicity column in the training split and then the test split.
# NOTE(review): the dummies are generated independently per split; if a
# category is missing from one split the resulting columns could differ —
# confirm against dp.generate_dummies_for_ethnicity.

df_train, df_test = (
    dp.generate_dummies_for_ethnicity(split) for split in (df_train, df_test)
)

In [50]:
# Encode gender (M/F) as a binary variable in the training split and then the test split.

df_train, df_test = (
    dp.create_binary_gender_variable(split) for split in (df_train, df_test)
)

In [51]:
# Choose the label column and the model inputs, then separate each split into
# its features (X_*) and its target (y_*).
# NOTE(review): the gender and ethnicity columns encoded in the earlier cells
# are not part of this feature list — confirm that is intentional.

target = 'diabetes_mellitus'
features = [
    'age',
    'height',
    'weight',
    'aids',
    'cirrhosis',
    'hepatic_failure',
    'immunosuppression',
    'leukemia',
    'lymphoma',
    'solid_tumor_with_metastasis',
]
X_train, y_train = mt.features_and_target_split(df_train, features, target)
X_test, y_test = mt.features_and_target_split(df_test, features, target)


In [52]:
# Train the model on the training split and obtain its predictions for both
# the training and the test features, then print them.
# NOTE(review): the variable names and the printed values (all between 0 and 1)
# suggest these are predicted probabilities rather than class labels — confirm
# against mt.train_model.

train_probs, test_probs = mt.train_model(X_train, y_train, X_test)

print(f"The prediction for the training dataset is: {train_probs}")
print(f"The prediction for the test dataset is: {test_probs}")

The prediction for the training dataset is: [0.32983713 0.3462329  0.14629259 ... 0.14701363 0.30247299 0.11497016]
The prediction for the test dataset is: [0.30706795 0.29686519 0.12054503 ... 0.26055694 0.23011625 0.20078105]


In [53]:
# Compute the ROC AUC metric for the training and the test predictions.
# The printed labels say "ROC AUC" rather than "accuracy": the value reported
# here is the roc_auc metric, which is a different measure from classification
# accuracy and should not be read as a fraction of correct predictions.

roc_auc_train, roc_auc_test = mt.evaluation(y_train, y_test, train_probs, test_probs)

print(f"The ROC AUC of the training dataset is: {roc_auc_train}")
print(f"The ROC AUC of the test dataset is: {roc_auc_test}")

The accuracy of the training dataset is: 0.6736538115694989
The accuracy of the test dataset is: 0.6496783570195794
