# Importing libraries 

In [38]:
# Detect whether the notebook is running on Google Colaboratory or on Kaggle,
# as that makes some difference for the code below (e.g. the optional
# gradio install). We'll do this in every notebook of the course.
try:
    import google.colab  # noqa: F401 -- imported only to probe availability
    colab = True
except ImportError:
    # Only a failed import means "not on Colab". A bare `except:` would also
    # swallow KeyboardInterrupt/SystemExit and mask unrelated errors.
    colab = False

import os
# Non-empty when KAGGLE_KERNEL_RUN_TYPE is set (presumably by Kaggle kernels);
# the empty-string default is falsy, so `kaggle` can be used in boolean tests.
kaggle = os.environ.get('KAGGLE_KERNEL_RUN_TYPE', '')

In [39]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Loading data

In [40]:
# Read the training set from its CSV file and preview the first four rows.
training_data_file_path = "data/training_data.csv"
training_data = pd.read_csv(filepath_or_buffer=training_data_file_path)
training_data.head(n=4)

Unnamed: 0,id,vdate,rcount,gender,dialysisrenalendstage,asthma,irondef,pneum,substancedependence,psychologicaldisordermajor,...,sodium,glucose,bloodureanitro,creatinine,bmi,pulse,respiration,secondarydiagnosisnonicd9,facid,lengthofstay
0,76513,2012-07-05,4,M,False,False,False,False,False,True,...,137.150067,148.420532,23.0,0.955051,28.450268,63,6.7,1,E,7
1,60406,2012-02-26,1,M,False,False,False,False,False,False,...,140.643655,166.718526,9.0,1.385747,29.233048,81,6.5,0,B,3
2,27322,2012-11-19,1,F,False,False,False,False,False,False,...,138.459305,106.468142,12.0,0.816984,34.295535,75,6.5,1,B,5
3,53699,2012-01-30,0,F,False,False,False,False,False,False,...,138.667613,144.311236,12.0,1.082725,31.207914,75,6.5,1,B,2


In [41]:
# List the training columns that contain at least one missing value.
nan_mask = training_data.isna().any()
columns_with_nan = training_data.columns[nan_mask]
print(columns_with_nan)

Index(['gender', 'hemo', 'bmi'], dtype='object')


In [42]:
# (rows, columns) of the raw training frame.
training_data.shape

(70000, 27)

In [43]:
# Numeric features: every int64/float64 column except the last one, which is
# left out of the feature matrix.
numeric_columns = training_data.select_dtypes(include=['int64', 'float64'])
X = numeric_columns[numeric_columns.columns[0:-1]]
# Boolean columns are collected separately so they can be cast to integers.
boolean_columns = training_data.select_dtypes(include=['bool'])

In [44]:
# Cast the boolean columns to integers (True -> 1, False -> 0).
int_columns = boolean_columns.astype(int)

In [45]:
# Encode gender numerically: 'M' -> 1, 'F' -> 0.
# Series.map is used rather than replace(): swapping strings for ints inside
# an object column via replace() relies on silent downcasting, which pandas
# has deprecated. With map(), any value other than 'M'/'F' becomes NaN.
gender_colums = training_data['gender'].map({'M': 1, 'F': 0})
# Missing (or unmapped) gender values are imputed as 1, i.e. 'M'.
gender_colums = gender_colums.fillna(1)
gender_colums.head(2)

0    1.0
1    1.0
Name: gender, dtype: float64

In [46]:
# Count the missing values left in the encoded gender column after the fill.
print(gender_colums.isnull().sum())

0


In [47]:
# Encode the facility id letters as integer codes ('A' -> 0 ... 'E' -> 4).
# Series.map avoids the deprecated silent downcasting that replace() performs
# when strings in an object column are swapped for ints. A code outside
# A-E becomes NaN instead of being left as a string.
facid_colums = training_data['facid'].map({'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4})

In [48]:
# Encode the hemo flag numerically (False -> 0, True -> 1). Series.map is used
# instead of replace() so the result does not depend on the deprecated silent
# downcasting of object columns; missing entries stay NaN at this point.
hemo_column = training_data['hemo'].map({False: 0, True: 1})
# Missing hemo values are imputed as 1.
hemo_column = hemo_column.fillna(1)

In [49]:
# Count the missing values left in the encoded hemo column after the fill.
print(hemo_column.isnull().sum())

0


In [50]:
# Assemble the feature matrix: the numeric columns followed by the encoded
# boolean, gender, facility and hemo columns, placed side by side.
X = pd.concat(
    [X, int_columns, gender_colums, facid_colums, hemo_column],
    axis=1,
)
# Fill any remaining gaps with the mean of the respective column.
X = X.fillna(X.mean())
# NOTE(review): the 'id' column is kept as a feature here -- confirm that this is intended.
X.shape

(70000, 25)

In [51]:
# Re-check for missing values, this time on the assembled feature matrix.
remaining_nan_mask = X.isna().any()
columns_with_nan = X.columns[remaining_nan_mask]
print(columns_with_nan)

Index([], dtype='object')


In [52]:
# Load the test set, then select and order the training features by the test
# set's columns so both frames share the same layout.
test_data_last_path = "data/test_data.csv"
test_data = pd.read_csv(test_data_last_path)
test_data_last_columns = test_data.columns
test_data_last_columns_array = [*test_data_last_columns]
X = X[test_data_last_columns_array]


In [53]:
# Target vector: the 'lengthofstay' column of the training data.
y = training_data['lengthofstay']

In [54]:
from sklearn.model_selection import train_test_split

In [55]:
# Split into a training part and a held-out part. No test_size is given, so
# the library default applies; random_state pins the shuffle for repeatability.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [56]:
# Show the (rows, columns) shape of each side of the split.
print(f'Training data: {X_train.shape}\nTest data: {X_test.shape}')

Training data: (52500, 25)
Test data: (17500, 25)


In [57]:
from sklearn.ensemble import RandomForestClassifier

In [58]:
# Random forest with 100 trees; the fixed seed makes training repeatable.
rf = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
)

In [59]:
# Fit the forest on the training part of the split.
rf.fit(X_train, y_train)

In [60]:
# Predict the length of stay for the held-out part of the split.
y_pred = rf.predict(X_test)

In [61]:
from sklearn.metrics import accuracy_score

In [62]:
# Accuracy of the held-out predictions against the true lengths of stay.
accuracy_score(y_test, y_pred)

0.6320571428571429

In [63]:
importances = rf.feature_importances_
cols = X.columns

# Pair each importance with its column name and rank the pairs from most to
# least important (ties fall back to the column name, also descending).
importance_list = sorted(zip(importances, cols), reverse=True)

# One aligned line per feature: name padded to 27 characters, score to 4 decimals.
for score, feature in importance_list:
    print(f'{feature: <27}: {score:.4f}')

rcount                     : 0.2098
bmi                        : 0.0777
glucose                    : 0.0772
creatinine                 : 0.0766
sodium                     : 0.0764
pulse                      : 0.0695
hematocrit                 : 0.0670
neutrophils                : 0.0577
id                         : 0.0522
facid                      : 0.0460
respiration                : 0.0424
bloodureanitro             : 0.0395
secondarydiagnosisnonicd9  : 0.0272
psychologicaldisordermajor : 0.0177
hemo                       : 0.0128
gender                     : 0.0099
irondef                    : 0.0073
substancedependence        : 0.0067
dialysisrenalendstage      : 0.0053
depress                    : 0.0051
asthma                     : 0.0044
pneum                      : 0.0040
malnutrition               : 0.0036
psychother                 : 0.0034
fibrosisandother           : 0.0007


In [64]:
def predict_lengthofstay(gender, bmi, rcount, glucose, creatinine):
    """Predict the length of stay for a single hypothetical patient.

    Only the five arguments vary. Every other feature is set to its mean over
    the feature matrix ``X``.

    Parameters
    ----------
    gender : value for the 'gender' column (1 encodes 'M', 0 encodes 'F').
    bmi : value for the 'bmi' column.
    rcount : value for the 'rcount' column.
    glucose : value for the 'glucose' column.
    creatinine : value for the 'creatinine' column.

    Returns
    -------
    The first (and only) element of ``rf.predict`` for the assembled row.
    """
    # Values supplied by the caller. `rcount` must come from the argument:
    # reading a module-level variable here would ignore what the caller (or
    # the UI slider) passed in, and fail if that variable does not exist.
    overrides = {
        'gender': gender,
        'bmi': bmi,
        'rcount': rcount,
        'glucose': glucose,
        'creatinine': creatinine,
    }

    # Build the single-row frame by walking X's own columns, so the column
    # order always matches the matrix the model was fitted on and no feature
    # can be forgotten. Columns without an override use their mean in X.
    input_data = pd.DataFrame({
        col: [overrides[col] if col in overrides else X[col].mean()]
        for col in X.columns
    })

    prediction = rf.predict(input_data)

    # Return the result
    return prediction[0]

In [65]:
# Example inputs for a single prediction (gender 1 is the code used for 'M').
gender = 1
bmi =  34
rcont = 5
glucose = 268
creatinine = 2
    
# Arguments are passed positionally: gender, bmi, rcount, glucose, creatinine.
predict_lengthofstay(gender, bmi, rcont, glucose, creatinine)

9

In [66]:
if (colab or kaggle):
    %pip install gradio

In [67]:
import gradio as gr

In [68]:
# Set the minimum, maximum, and default values for the variables
# NOTE(review): the next cell assigns every one of these names again (with
# creatinine_max = 2 instead of 3) before they are used, so the values set
# here are overwritten.
gender_min, gender_max, gender_default = 0, 1, 1
bmi_min, bmi_max, bmi_default = 15, 39, 34
rcount_min, rcount_max, rcount_default = 0, 5, 3
glucose_min, glucose_max, glucose_default = 0, 268, 200
creatinine_min, creatinine_max, creatinine_default = 0, 3, 1




In [69]:
# Slider bounds and starting values, one (min, max, default) triple per input.
gender_min, gender_max, gender_default = 0, 1, 1
bmi_min, bmi_max, bmi_default = 15, 39, 34
rcount_min, rcount_max, rcount_default = 0, 5, 3
glucose_min, glucose_max, glucose_default = 0, 268, 200
creatinine_min, creatinine_max, creatinine_default = 0, 2, 1

# One entry per slider, listed in the order of predict_lengthofstay's
# parameters: gender, bmi, rcount, glucose, creatinine.
slider_specs = [
    dict(minimum=gender_min, maximum=gender_max, value=gender_default, label="Gender", step=1),
    dict(minimum=bmi_min, maximum=bmi_max, value=bmi_default, label="BMI"),
    dict(minimum=rcount_min, maximum=rcount_max, value=rcount_default, label="Rcount", step=1),
    dict(minimum=glucose_min, maximum=glucose_max, value=glucose_default, label="Glucose Level"),
    dict(minimum=creatinine_min, maximum=creatinine_max, value=creatinine_default, label="Creatinine"),
]

# Build the interface: five sliders in, one text box out.
iface = gr.Interface(
    fn=predict_lengthofstay,
    inputs=[gr.components.Slider(**spec) for spec in slider_specs],
    outputs=gr.components.Textbox(label="Prediction"),
    title="Length of Stay Predictor",
    description="Enter the values to predict the length of stay.",
)

# Start the app; share=True additionally requests a public share link.
iface.launch(share=True)

Running on local URL:  http://127.0.0.1:7861

Could not create share link. Please check your internet connection or our status page: https://status.gradio.app.




In [70]:
# Predict the length of stay for every row of the loaded test set.
test_pred = rf.predict(test_data)

In [71]:
# Two-column submission frame: the test row id and its predicted length of
# stay, both cast to integers.
submission_df = pd.DataFrame({
    "id": test_data["id"].astype(int),
    "lengthofstay": test_pred.astype(int)
})

In [72]:
# (rows, columns) of the submission frame.
submission_df.shape

(30000, 2)

In [73]:
file_path = 'submission.csv'

# Write the DataFrame to a CSV file without the row index. `index` is
# documented as a boolean, so pass False explicitly instead of relying on
# None being falsy.
submission_df.to_csv(file_path, index=False)

In [74]:
# Read the written file back to inspect its contents.
pd.read_csv("submission.csv")

Unnamed: 0,id,lengthofstay
0,75721,2
1,80184,2
2,19864,7
3,76699,3
4,92991,3
...,...,...
29995,42648,6
29996,86306,3
29997,45466,6
29998,63724,5
