In [None]:
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import numpy as np
warnings.filterwarnings("ignore")
plt.style.use('ggplot')
pd.set_option('display.max_columns', None)
import plotly.graph_objects as go
import random
import math 

from dash import Dash, html, dcc
from jupyter_dash import JupyterDash
from dash.dependencies import Output, Input
from dash.exceptions import PreventUpdate

In [None]:
# Load the raw healthcare records; the path is relative to the notebook's working directory.
data = pd.read_csv('healthcare_dataset.csv')

# Jupyter only echoes the last expression of a cell, so the intermediate data-quality
# checks are printed explicitly instead of being computed and silently discarded.
print('Missing values per column:')
print(data.isna().sum())
print(f'\nDuplicated rows: {data.duplicated().sum()}\n')

data.info()

In [None]:
# Parse the admission and discharge columns as datetimes.
for date_col in ['Date of Admission', 'Discharge Date']:
    data[date_col] = pd.to_datetime(data[date_col])

# Length of stay in whole days, read directly from the timedelta instead of
# round-tripping through its string representation ("N days" -> "N" -> int).
data['Days of Stay'] = (data['Discharge Date'] - data['Date of Admission']).dt.days

# Drop the columns that are not used anywhere in the analysis below.
data = data.drop(columns=['Name', 'Doctor', 'Hospital', 'Room Number'])

In [None]:
# Frequency table for every object-typed (categorical) column, 'Hospital' excluded.
categorical_names = [
    name for name in data.columns
    if data[name].dtype == 'O' and name != 'Hospital'
]
for name in categorical_names:
    print('\n')
    print(data[name].value_counts())
    print('--')

In [None]:
# Bar plots of the category frequencies of every categorical column.

cat_col = data.select_dtypes(include='object').columns

plt.figure(figsize=(24, 12))
n_grid_cols = (len(cat_col) + 1) // 2  # two rows, enough columns to hold every plot
for i, column in enumerate(cat_col):
    plt.subplot(2, n_grid_cols, i + 1)
    # Name both output columns explicitly: the labels produced by
    # value_counts().reset_index() differ between pandas 1.x and 2.x.
    counts = data[column].value_counts().rename_axis(column).reset_index(name='Cnt')
    sns.barplot(data=counts, x=column, y='Cnt')
    plt.title(column)

# tight_layout() has to run before show(); once the figure is shown it has no effect.
plt.tight_layout()
plt.show()

In [None]:
# Histograms (with a KDE overlay) of every numeric column.

numeric_columns = data.select_dtypes(['int', 'float']).columns
plt.figure(figsize=(16, 8))

for i, column in enumerate(numeric_columns):
    # One row with one panel per column; the previous 3-row grid only ever
    # filled its first row and left the rest of the figure blank.
    plt.subplot(1, len(numeric_columns), i + 1)
    sns.histplot(data[column], kde=True)
    plt.title(column)

# tight_layout() has to run before show(); once the figure is shown it has no effect.
plt.tight_layout()
plt.show()

Age Analysis 

In [None]:
# Report the observed age range. Bare expressions in the middle of a cell are
# never displayed by Jupyter, so the values are printed explicitly.
print(f'Age range: {data.Age.min()} - {data.Age.max()}')

#Define each age group
def age_group(col):
    """Translate every age in ``col`` into its age-bracket label.

    Args:
        col: iterable of numeric ages (e.g. a pandas Series or a list).

    Returns:
        list[str]: one label per input value, in input order. A value is
        assigned the first bracket whose inclusive upper bound it does not
        exceed; anything that fits no bracket is labelled ``'Over 80'``.
    """
    # (inclusive upper bound, label), checked in ascending order.
    brackets = (
        (30, '18-30'),
        (40, '31-40'),
        (50, '41-50'),
        (60, '51-60'),
        (70, '61-70'),
        (80, '71-80'),
    )
    return [
        next((label for upper, label in brackets if age <= upper), 'Over 80')
        for age in col
    ]

# Store the bracket label of every patient's age in a new 'Age Group' column.
data['Age Group']=age_group(data['Age'])

In [None]:
# Age group distribution: number of patients per age bracket.
# rename_axis/reset_index(name=...) fixes the column names regardless of the
# pandas version (value_counts().reset_index() labels changed in pandas 2.0).
age_group_counts = (
    data['Age Group']
    .value_counts()
    .rename_axis('Age Group')
    .reset_index(name='Cnt')
)

px.bar(
    age_group_counts,
    x='Age Group',
    y='Cnt',
    color='Age Group',
    text_auto=True,
    title='Age Group Distribution'
)

In [None]:
# Gender counts within each age group.
# reset_index(name='Cnt') names the count column explicitly; relying on the
# Series name breaks on pandas 2.x, where value_counts() results are called 'count'.
gender_by_age = (
    data.groupby('Age Group')['Gender']
    .value_counts()
    .reset_index(name='Cnt')
)

px.bar(
    gender_by_age,
    x='Age Group',
    y='Cnt',
    color='Gender',
    barmode='group',
    title='Gender Value Count in each Age Group',
    text_auto=True
)

#same result with seaborn
# sns.barplot(
#     data = pd.DataFrame(data.groupby('Age Group')['Gender'].value_counts()).rename(columns={'Gender':'Cnt'}).reset_index(),
#     x='Age Group',
#     y='Cnt',
#     hue='Gender') 
    

In [None]:
def distribution_among_age_group(col):
    """Show one bar chart per age group with the value counts of ``col``.

    Args:
        col: name of the column of the global ``data`` frame to count.

    Side effects:
        Renders one Plotly figure for every distinct value of ``data['Age Group']``.
    """
    # ``group`` (not ``age_group``) so the loop variable does not shadow the
    # age_group() helper inside this function.
    for group in data['Age Group'].unique():
        subset = data.loc[data['Age Group'] == group]
        # Explicit column names: value_counts().reset_index() labels its columns
        # differently on pandas 1.x and 2.x.
        counts = subset[col].value_counts().rename_axis(col).reset_index(name='Cnt')

        fig = px.bar(
            counts,
            x=col,
            y='Cnt',
            color=col,
            title=f'{col} Distribution Among Patient Age Group {group}',
            text_auto=True
        )

        fig.show()


distribution_among_age_group('Medical Condition')
# distribution_among_age_group('Insurance Provider')
# distribution_among_age_group('Admission Type')
# distribution_among_age_group('Test Results')
# distribution_among_age_group('Blood Type')

In [None]:
def age_stay(col):
    """Plot, per age group, the mean 'Days of Stay' for each value of ``col``.

    Args:
        col: name of the column of the global ``data`` frame to group by.

    Side effects:
        Renders one Plotly bar chart per distinct age group, bars ordered from
        the longest to the shortest average stay.
    """
    for age in data['Age Group'].unique():
        in_group = data['Age Group'] == age
        avg_days = (
            data.loc[in_group]
            .groupby(col, as_index=False)['Days of Stay']
            .mean()
            .sort_values('Days of Stay', ascending=False)
        )

        chart_options = dict(
            x=col,
            y='Days of Stay',
            color=col,
            text_auto=True,
            title=f'Avg Days of Stay for Patient Age Group {age} in Different {col}',
        )
        px.bar(avg_days, **chart_options).show()


age_stay('Medical Condition')
# age_stay('Admission Type')
# age_stay('Test Results')
# age_stay('Medication')

In [None]:
# Interactive (Plotly Dash) version of the plot above: pick an age group from a dropdown.
app = JupyterDash(__name__)

app.layout = html.Div([
    html.H3('Age Group Selection'),
    dcc.Dropdown(id='dropdown',
                 # Sorted so the brackets are listed in ascending order.
                 options=sorted(data['Age Group'].unique()),
                 value='18-30'),
    dcc.Graph(id='visual')
])


@app.callback(Output('visual', 'figure'), Input('dropdown', 'value'))
def plot(age):
    """Dash callback: average days of stay per medical condition for one age group.

    Args:
        age: age-group label currently selected in the dropdown, or ``None``
            when the selection has been cleared.

    Returns:
        A Plotly bar figure for the ``visual`` graph component.

    Raises:
        PreventUpdate: when nothing is selected, so the last figure stays on
            screen instead of being replaced by an empty chart.
    """
    if age is None:
        raise PreventUpdate

    filtered = data.loc[data['Age Group'] == age]
    avg_days = filtered.groupby('Medical Condition')['Days of Stay'].mean().reset_index()\
                       .sort_values('Days of Stay', ascending=False)

    fig = px.bar(
        avg_days,
        x='Medical Condition',
        y='Days of Stay',
        color='Medical Condition',
        text_auto=True,
        title=f'Avg Days of Stay for Patient Age Group {age} in Different Medical Condition',
    )

    return fig


if __name__ == '__main__':
    app.run_server(mode='inline', debug=True, port=7001)

Gender Analysis 

In [None]:
# Printed explicitly: a bare expression that is not the last statement of a
# cell is never displayed by Jupyter.
print(data['Gender'].value_counts())


def gender_col_distribution(col):
    """Show one bar chart per gender with the value counts of ``col``.

    Args:
        col: name of the column of the global ``data`` frame to count.

    Side effects:
        Renders one Plotly figure for every distinct value of ``data['Gender']``.
    """
    for gender in data['Gender'].unique():
        subset = data.loc[data['Gender'] == gender]
        # Explicit column names: value_counts().reset_index() labels its columns
        # differently on pandas 1.x and 2.x.
        counts = subset[col].value_counts().rename_axis(col).reset_index(name='Cnt')

        fig = px.bar(
            counts,
            x=col,
            y='Cnt',
            color=col,
            text_auto=True,
            # The title follows the plotted column; it used to say
            # 'Medical Condition' no matter which column was passed in.
            title=f'{col} Distribution Among {gender}'
        )

        fig.show()


gender_col_distribution('Medical Condition')
# gender_col_distribution('Blood Type')
# gender_col_distribution('Insurance Provider')
# gender_col_distribution('Admission Type')
# gender_col_distribution('Medication')
# gender_col_distribution('Test Results')

In [None]:
def gender_stay(col):
    """Plot, per gender, the mean 'Days of Stay' for each value of ``col``.

    Args:
        col: name of the column of the global ``data`` frame to group by.

    Side effects:
        Renders one Plotly bar chart for every distinct value of ``data['Gender']``.
    """
    for gender in data['Gender'].unique():
        is_gender = data['Gender'] == gender
        mean_stay = (
            data.loc[is_gender]
            .groupby([col])['Days of Stay']
            .mean()
            .reset_index()
        )

        chart_options = dict(
            x=col,
            y='Days of Stay',
            color=col,
            text_auto=True,
            title=f'Avg Days of Stay for {gender} patient in Different {col}',
        )
        px.bar(mean_stay, **chart_options).show()


gender_stay('Medical Condition')
# gender_stay('Blood Type')
# gender_stay('Insurance Provider')
# gender_stay('Admission Type')
# gender_stay('Medication')
# gender_stay('Test Results')

In [None]:
# Interactive (Plotly Dash) version of the plot above: pick a gender from a dropdown.
app = JupyterDash(__name__)

app.layout = html.Div([
    html.H3('Gender Selection'),
    dcc.Dropdown(id='dropdown',
                 options=sorted(data['Gender'].unique()),
                 value='Male'),
    dcc.Graph(id='visual')
])


@app.callback(Output('visual', 'figure'), Input('dropdown', 'value'))
def plot(gender):
    """Dash callback: average days of stay per test result for one gender.

    Args:
        gender: gender currently selected in the dropdown, or ``None`` when
            the selection has been cleared.

    Returns:
        A Plotly bar figure for the ``visual`` graph component.

    Raises:
        PreventUpdate: when nothing is selected, so the last figure stays on
            screen instead of being replaced by an empty chart.
    """
    if gender is None:
        raise PreventUpdate

    df = data.loc[data['Gender'] == gender]
    dfm = df.groupby(['Test Results'])['Days of Stay'].mean().reset_index()

    fig = px.bar(
        dfm,
        x='Test Results',
        y='Days of Stay',
        color='Test Results',
        text_auto=True,
        title=f'Avg Days of Stay for {gender} Patient with Different Test Results',
    )

    return fig


if __name__ == '__main__':
    app.run_server(mode='inline', debug=True, port=7002)

Insurance Provider Analysis

In [None]:
# Number of records per insurance provider (echoed as the cell's output).
data['Insurance Provider'].value_counts()

In [None]:
# Insurance-provider counts split by gender.
# reset_index(name='Cnt') names the count column explicitly; relying on the
# Series name breaks on pandas 2.x, where value_counts() results are called 'count'.
insurance_by_gender = (
    data.groupby(['Gender'])['Insurance Provider']
    .value_counts()
    .reset_index(name='Cnt')
)

px.bar(
    insurance_by_gender,
    x='Insurance Provider',
    y='Cnt',
    color='Gender',
    barmode='group',
    text_auto=True,
    title='Gender Distribution among Different Insurance Provider'
)

In [None]:
# Overview table: mean billing amount per (insurance provider, medical condition).
# Printed explicitly, because a bare expression that is not the last statement
# of a cell is computed and then thrown away.
print(data.groupby(['Insurance Provider', 'Medical Condition'])['Billing Amount'].mean().reset_index())


def insurance_avg(col):
    """Plot, per insurance provider, the mean 'Billing Amount' for each value of ``col``.

    Args:
        col: name of the column of the global ``data`` frame to group by.

    Side effects:
        Renders one Plotly bar chart per distinct insurance provider, bars
        ordered from the highest to the lowest average bill.
    """
    for x in data['Insurance Provider'].unique():
        df = data.loc[data['Insurance Provider'] == x]
        dfm = df.groupby([col])['Billing Amount'].mean().reset_index()\
                .sort_values('Billing Amount', ascending=False)

        fig = px.bar(
            dfm,
            x=col,
            y='Billing Amount',
            color=col,
            text_auto=True,
            title=f'Avg Billing Amount for {x} for Patient under {col}'
        )

        fig.show()


insurance_avg('Medical Condition')
# insurance_avg('Admission Type')
# insurance_avg('Medication')
# insurance_avg('Test Results')


In [None]:
def insurance_avg_multi(col1, col2):
    """Plot, per insurance provider, the mean 'Billing Amount' by ``col1`` and ``col2``.

    Args:
        col1: column shown on the x axis.
        col2: column used to split (colour) the grouped bars.

    Side effects:
        Renders one grouped Plotly bar chart per distinct insurance provider.
    """
    for x in data['Insurance Provider'].unique():
        df = data.loc[data['Insurance Provider'] == x]
        dfm = df.groupby([col1, col2])['Billing Amount'].mean().reset_index()\
                .sort_values('Billing Amount', ascending=False)

        fig = px.bar(
            dfm,
            x=col1,
            y='Billing Amount',
            # Colour by the second grouping column; this used to be hard-coded
            # to 'Test Results', which only worked when col2 happened to be that.
            color=col2,
            barmode='group',
            text_auto=True,
            title=f'Avg Billing Amount for {x} for Patient under Different {col1} of Different {col2}'
        )

        fig.show()


# insurance_avg_multi('Medical Condition', 'Test Results')
insurance_avg_multi('Medication', 'Test Results')

In [None]:
# Preview the first rows of the working frame.
data.head()

Correlation

In [None]:
# Split the data into categorical (object-typed) and non-categorical columns.

# .copy() gives the categorical part its own storage, so code that edits it
# later cannot write through to ``data`` or raise SettingWithCopyWarning.
data_cat = data.select_dtypes(include='object').copy()

# Everything else except the raw timestamps. drop() without inplace returns a
# new frame instead of modifying a slice of ``data``.
data_num = (
    data.select_dtypes(exclude='object')
    .drop(['Date of Admission', 'Discharge Date'], axis=1)
)

In [None]:
def correlation(col):
    """Correlate the categorical column ``col`` with every other column.

    ``col`` is label-encoded; every other categorical column is replaced by the
    mean encoded ``col`` value of its category (target-mean encoding). The
    encoded categoricals are then joined with ``data_num`` and correlated.

    Args:
        col: name of a column of the global ``data_cat`` frame.

    Returns:
        pandas.Series: correlation of every column with ``col``, sorted in
        descending order.
    """
    from sklearn.preprocessing import LabelEncoder

    # Encode a private copy. Encoding ``data_cat`` in place left it overwritten
    # with the encoding for the first requested column, so every later call
    # worked on already-encoded values instead of the raw categories.
    encoded = data_cat.copy()
    encoded[col] = LabelEncoder().fit_transform(encoded[col])

    for x in encoded.columns:
        if x != col:
            target_means = encoded.groupby(x)[col].mean().to_dict()
            encoded[x] = encoded[x].map(target_means)

    # Built once after the loop, so it also exists when ``col`` is the only
    # categorical column.
    df = pd.concat([encoded, data_num], axis=1)

    return df.corr()[col].sort_values(ascending=False)


for x in data_cat.columns:
    print(f'-{x}-')
    print(correlation(x))
    print('\n')

Prediction 

In [None]:
def display_scores(scores):
    """Print the given scores, their mean and their standard deviation.

    Args:
        scores: object exposing ``mean()`` and ``std()`` (e.g. the numpy array
            returned by ``cross_val_score``).

    Returns:
        None. The report is written to stdout, each line framed by a rule.
    """
    rule = '==============================================='
    # (line template, statistic) pairs, evaluated lazily in print order.
    sections = (
        ('Scores: {}', lambda s: s),
        ('Mean Score: {}', lambda s: s.mean()),
        ('Standard Deviation of Scores: {}', lambda s: s.std()),
    )

    print(rule)
    for template, statistic in sections:
        print(template.format(statistic(scores)))
        print(rule)

    return None

In [None]:
def prediction(col, ml_model):
    """Train ``ml_model`` to predict the categorical column ``col`` and print a report.

    Pipeline: label-encode ``col``; target-mean-encode the other categorical
    columns; join with ``data_num``; select features with an L1 (Lasso) model;
    split 67/33; standardise; fit; report hold-out metrics, 10-fold CV accuracy
    on the training split and a KDE plot of predicted vs. actual labels.

    Args:
        col: name of the column of the global ``data_cat`` frame to predict.
        ml_model: unfitted scikit-learn compatible classifier; it is fitted in place.

    Returns:
        None. Everything is printed or plotted.
    """
    from sklearn.feature_selection import SelectFromModel
    from sklearn.linear_model import Lasso
    from sklearn.metrics import accuracy_score, confusion_matrix, mean_squared_error
    from sklearn.model_selection import cross_val_score, train_test_split
    from sklearn.preprocessing import LabelEncoder, StandardScaler

    # Encode a private copy: encoding ``data_cat`` in place left it overwritten
    # after the first call, so every later call (other models, other targets)
    # trained on already-encoded values instead of the raw categories.
    encoded = data_cat.copy()
    encoded[col] = LabelEncoder().fit_transform(encoded[col])

    # NOTE(review): the category means and the Lasso selection below are computed
    # on all rows, including those that end up in the test split, so hold-out
    # scores are slightly optimistic. Fit both on the training split only if
    # that matters for the conclusions.
    for x in encoded.columns:
        if x != col:
            target_means = encoded.groupby(x)[col].mean().to_dict()
            encoded[x] = encoded[x].map(target_means)

    df = pd.concat([encoded, data_num], axis=1)

    X = df.drop([col], axis=1)
    y = df[col]  # 1-D target; a single-column frame triggers sklearn shape warnings

    selector = SelectFromModel(Lasso(0.005))
    selector.fit(X, y)
    selected = X.columns[selector.get_support()]
    # If the L1 penalty zeroes out every coefficient, keep all features rather
    # than handing the classifier an empty matrix.
    if len(selected) > 0:
        X = X[selected]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

    ssc = StandardScaler()
    X_train = ssc.fit_transform(X_train)
    # Reuse the training statistics; re-fitting on the test split would put the
    # two splits on different scales.
    X_test = ssc.transform(X_test)

    ml_model.fit(X_train, y_train)
    pred = ml_model.predict(X_test)

    print(f'Predictions: {pred}')
    print(f'Training Score: {ml_model.score(X_train, y_train)}')
    print('\n')
    # sklearn metrics take (y_true, y_pred); the swapped order transposed the matrix.
    print(f'{confusion_matrix(y_test, pred)}')
    print('\n')
    print(f'Accuracy Score: {accuracy_score(y_test, pred)}')
    print(f'Mean Squared Error: {mean_squared_error(y_test, pred)}')

    scores = cross_val_score(ml_model,
                             X_train,
                             y_train,
                             scoring='accuracy',
                             cv=10)
    print('\n')
    display_scores(scores)

    plt.figure(figsize=(4, 2))
    # ``fill`` replaces the deprecated ``shade`` keyword of kdeplot.
    sns.kdeplot(pred, fill=True)
    sns.kdeplot(y_test.to_numpy(), fill=True)
    plt.legend(['pred', 'y_test'])

    print('\n')
    plt.show()

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Baseline run: an 11-neighbour KNN with Euclidean distance predicting 'Test Results'.
knn = KNeighborsClassifier(metric='euclidean', n_neighbors=11)
prediction(col='Test Results', ml_model=knn)

In [None]:
# Candidate classifiers used to predict a column.
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC

# Fixed seeds keep the randomised learners reproducible across kernel restarts;
# 42 matches the seed already used for train_test_split.
forest = RandomForestClassifier(random_state=42)
tree = DecisionTreeClassifier(ccp_alpha=0.01, random_state=42)
# math.sqrt(len(y_test))
knn = KNeighborsClassifier(n_neighbors=11, metric='euclidean')
xgb = XGBClassifier(random_state=42)
svc = SVC()

def predict_col(col):
    """Run ``prediction`` for ``col`` once with each candidate classifier.

    Args:
        col: name of the column to predict, forwarded to ``prediction``.

    Side effects:
        Prints each estimator followed by its report and a separator line.
    """
    candidates = (forest, tree, knn, xgb, svc)
    for estimator in candidates:
        print(estimator)
        prediction(col, estimator)
        print('__________')
        print('\n')

In [None]:
# Compare all candidate classifiers on the 'Test Results' column.
predict_col('Test Results')
# predict_col('Medical Condition')
# predict_col('Admission Type')
# predict_col('Medication')

Parameter Tuning

The function below takes the column we want to predict, the model we want to use, and the hyperparameter options we want to search over.

In [None]:
def tuning(col, ml_model, param_grid):
    """Report ``ml_model`` on ``col`` before and after a randomised hyperparameter search.

    The data preparation mirrors ``prediction``. The model is first fitted and
    reported as-is (hold-out metrics, 10-fold CV accuracy, KDE plot); then a
    ``RandomizedSearchCV`` over ``param_grid`` is run on the training split and
    the best estimator is reported the same way.

    Args:
        col: name of the column of the global ``data_cat`` frame to predict.
        ml_model: unfitted scikit-learn compatible classifier; it is fitted in place.
        param_grid: mapping of parameter name to candidate values, passed to
            ``RandomizedSearchCV`` as ``param_distributions``.

    Returns:
        None. Everything is printed or plotted.
    """
    from sklearn.model_selection import RandomizedSearchCV, cross_val_score

    X_train, X_test, y_train, y_test = _tuning_split(col)

    # --- Baseline: the model with its current parameters ---
    ml_model.fit(X_train, y_train)
    pred = ml_model.predict(X_test)
    _tuning_print_metrics(ml_model, pred, X_train, y_train, y_test)

    scores = cross_val_score(ml_model,
                             X_train,
                             y_train,
                             scoring='accuracy',
                             cv=10)
    print('\n')
    display_scores(scores)
    _tuning_plot_distributions(pred, y_test)

    # --- Randomised search ---
    random_search = RandomizedSearchCV(ml_model,
                                       param_distributions=param_grid,
                                       n_iter=10,
                                       cv=5,
                                       # Seeded so the sampled candidates (and hence
                                       # the reported winner) are reproducible.
                                       random_state=42)
    random_search.fit(X_train, y_train)

    best_model = random_search.best_estimator_
    tuned_pred = best_model.predict(X_test)

    print('Parameter Tuning Results:')
    print('\n')
    # Print the winning parameter values themselves, as the label promises.
    print(f'Best Params: {random_search.best_params_}')
    print('\n')
    _tuning_print_metrics(best_model, tuned_pred, X_train, y_train, y_test)
    _tuning_plot_distributions(tuned_pred, y_test)


def _tuning_split(col):
    """Encode, select features, split and scale the data for ``tuning``.

    Returns:
        tuple: ``(X_train, X_test, y_train, y_test)`` with standardised features.
    """
    from sklearn.feature_selection import SelectFromModel
    from sklearn.linear_model import Lasso
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import LabelEncoder, StandardScaler

    # Encode a private copy: encoding ``data_cat`` in place left it overwritten
    # after the first call, corrupting the input of every later call.
    encoded = data_cat.copy()
    encoded[col] = LabelEncoder().fit_transform(encoded[col])

    # NOTE(review): category means and Lasso selection use all rows, including
    # the future test split, so hold-out scores are slightly optimistic.
    for x in encoded.columns:
        if x != col:
            target_means = encoded.groupby(x)[col].mean().to_dict()
            encoded[x] = encoded[x].map(target_means)

    df = pd.concat([encoded, data_num], axis=1)

    X = df.drop([col], axis=1)
    y = df[col]  # 1-D target; a single-column frame triggers sklearn shape warnings

    selector = SelectFromModel(Lasso(0.005))
    selector.fit(X, y)
    selected = X.columns[selector.get_support()]
    # Keep all features if the L1 penalty eliminated every one of them.
    if len(selected) > 0:
        X = X[selected]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

    ssc = StandardScaler()
    X_train = ssc.fit_transform(X_train)
    # Reuse the training statistics instead of re-fitting on the test split.
    X_test = ssc.transform(X_test)

    return X_train, X_test, y_train, y_test


def _tuning_print_metrics(estimator, pred, X_train, y_train, y_test):
    """Print predictions, training score and hold-out metrics of a fitted estimator."""
    from sklearn.metrics import accuracy_score, confusion_matrix, mean_squared_error

    print(f'Predictions: {pred}')
    print(f'Training Score: {estimator.score(X_train, y_train)}')
    print('\n')
    # sklearn metrics take (y_true, y_pred); the swapped order transposed the matrix.
    print(f'{confusion_matrix(y_test, pred)}')
    print('\n')
    print(f'Accuracy Score: {accuracy_score(y_test, pred)}')
    print(f'Mean Squared Error: {mean_squared_error(y_test, pred)}')


def _tuning_plot_distributions(pred, y_test):
    """Overlay the KDEs of the predicted and the actual labels."""
    plt.figure(figsize=(4, 2))
    # ``fill`` replaces the deprecated ``shade`` keyword of kdeplot.
    sns.kdeplot(pred, fill=True)
    sns.kdeplot(y_test.to_numpy(), fill=True)
    plt.legend(['pred', 'y_test'])

    print('\n')
    plt.show()

In [None]:
# Parameter tuning for the decision tree.

tree_param_grid = {
    'max_depth': [3, 5, 10, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4, 8],
    # 'auto' is omitted: recent scikit-learn releases reject it for
    # DecisionTreeClassifier, and for classifiers it was an alias of 'sqrt'.
    'max_features': ['sqrt', 'log2', None],
    'criterion': ['gini', 'entropy'],
    'max_leaf_nodes': [None, 5, 10, 20]
}

tuning('Test Results', tree, tree_param_grid)

In [None]:
# Search space for the random forest, followed by the tuning run on 'Test Results'.

forest_param_grid = dict(
    n_estimators=[50, 100, 150],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=['sqrt', 'log2'],
)

tuning(col='Test Results', ml_model=forest, param_grid=forest_param_grid)

In [None]:
# Parameter tuning for KNN.

knn_param_grid = {
    'n_neighbors': [3, 5, 7, 10, 11, 13],
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    # ``knn`` is built with metric='euclidean', and ``p`` only affects the
    # Minkowski metric, so searching p=[1, 2] changed nothing. Searching the
    # metric itself covers the same two distances (p=1 -> manhattan, p=2 -> euclidean).
    'metric': ['manhattan', 'euclidean']
}

tuning('Test Results', knn, knn_param_grid)

In [None]:
# Search space for XGBoost, followed by the tuning run on 'Test Results'.
xgb_param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[3, 5, 7],
    learning_rate=[0.01, 0.1, 0.2],
    subsample=[0.5, 0.7, 0.9],
    colsample_bytree=[0.5, 0.7, 0.9],
    gamma=[0, 0.1, 0.2],
)

tuning(col='Test Results', ml_model=xgb, param_grid=xgb_param_grid)