# HopSlot - Patient Urgency/Severity Predictor

In [1]:
import pandas as pd
import numpy as np
import random
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import  train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from pathlib import Path
from scipy.sparse import hstack

In [2]:
BASE_DATASET_PATH = Path('./datasets/synt/patient_data.csv')

In [3]:
patient_df = pd.read_csv(BASE_DATASET_PATH)

patient_df

Unnamed: 0,Patient_Name,Patient_ID,Age,Cause_of_Visit,No_of_Previous_Visits,Cause_Matches_Previous_Visit,Doctor_Availability,Work_Shift_Time,Schedule_date_time,Delayed_by,Severity_of_Ailment
0,Arjun Patel,P0001,45,['change in bowel habits' 'raised' 'physical d...,4,No,Available,Evening,2024-11-28T07:20:38,0,Severe
1,Avani Kumar,P0002,48,['pain in the lower right abdomen' 'runny nose...,5,No,Unavailable,Evening,2024-04-25T16:54:40,10,Mild
2,Dhruv Gupta,P0003,65,['pain in the upper right abdomen' 'increased ...,0,Yes,Available,Evening,2024-01-21T16:49:33,0,Severe
3,Aditi Singh,P0004,68,['pain in the lower back or side' 'itchy' 'fat...,3,Yes,Unavailable,Evening,2024-06-08T06:27:27,0,Severe
4,Advik Sharma,P0005,68,['condition that affects the lungs' 'raised'\n...,2,No,Unavailable,Afternoon,2024-09-21T09:33:23,5,Moderate
...,...,...,...,...,...,...,...,...,...,...,...
995,Aditi Singh,P0996,6,['cough' 'damage to the kidneys that can lead ...,9,Yes,Unavailable,Morning,2024-03-04T22:27:33,15,Severe
996,Advik Patel,P0997,39,['elevated body temperature ' 'hypothyroidism'...,3,No,Unavailable,Evening,2024-08-21T00:56:49,0,Severe
997,Aarav Chauhan,P0998,39,['pain' 'excessive worry' 'memory loss'],8,Yes,Available,Afternoon,2024-02-03T19:54:02,15,Mild
998,Diya Reddy,P0999,66,['difficulty getting'\n 'condition in which th...,2,No,Unavailable,Afternoon,2024-10-30T01:36:36,0,Severe


In [4]:
X = patient_df.iloc[:, 2:-1]
y = patient_df.iloc[:, -1]

X, y

(     Age                                     Cause_of_Visit  \
 0     45  ['change in bowel habits' 'raised' 'physical d...   
 1     48  ['pain in the lower right abdomen' 'runny nose...   
 2     65  ['pain in the upper right abdomen' 'increased ...   
 3     68  ['pain in the lower back or side' 'itchy' 'fat...   
 4     68  ['condition that affects the lungs' 'raised'\n...   
 ..   ...                                                ...   
 995    6  ['cough' 'damage to the kidneys that can lead ...   
 996   39  ['elevated body temperature ' 'hypothyroidism'...   
 997   39           ['pain' 'excessive worry' 'memory loss']   
 998   66  ['difficulty getting'\n 'condition in which th...   
 999   60             ['swollen' 'abdominal pain' 'fatigue']   
 
      No_of_Previous_Visits Cause_Matches_Previous_Visit Doctor_Availability  \
 0                        4                           No           Available   
 1                        5                           No         Unava

In [5]:
X['Cause_of_Visit'].dtype

dtype('O')

In [6]:
'\',\''.join(X['Cause_of_Visit'][0].split('\' \''))

"['change in bowel habits','raised','physical dependence on alcohol']"

In [7]:
import ast

X['Cause_of_Visit'] = X['Cause_of_Visit'].apply(lambda symptom: ast.literal_eval('\',\''.join(symptom.split('\' \''))))

In [8]:
X['Cause_of_Visit'].shape

(1000,)

In [9]:
# Unique Symptoms Vocabulary

symptoms = X['Cause_of_Visit'].sum()
symptoms = list(set(symptoms))

symptoms = np.array(symptoms)

# symptoms = np.expand_dims(symptoms, axis=1)
symptoms

array(['disturbance in eating behavior that can lead to serious health problemssudden weakness',
       'difficulty gettingdifficulty urinating',
       'sexually transmitted infection that can cause burning during urinationpain in the lower right abdomen',
       'disturbance in eating behavior that can lead to serious health problemshyperthyroidism',
       'chronic autoimmune disease that can affect any part of the bodydiarrhea',
       'change in bowel habitsincreased pressure in the eye that can damage the optic nerveinflammation of the liver',
       'sudden weaknessdifficulty with social interaction',
       'widespread painsexually transmitted infection that can cause blisters on the genitalsphysical dependence on alcohol',
       'nauseacondition in which the heart cannot pump blood as well as it shouldbulge in the wall of the aorta',
       'sexually transmitted infection that can cause blisters on the genitalspainful urination',
       'increased pressure in the eye that can

In [10]:
def join_symptoms(symptoms_list):
  return ' '.join(symptoms_list) if isinstance(symptoms_list, list) else symptoms_list


In [11]:
X['Cause_of_Visit'] = X['Cause_of_Visit'].apply(join_symptoms)

In [12]:
symptoms_vectorizer = TfidfVectorizer(lowercase=False)

# X['Cause_of_Visit'] = X['Cause_of_Visit'].apply(lambda symptom: ' '.join(symptom))

X['Cause_of_Visit'] = symptoms_vectorizer.fit_transform(X['Cause_of_Visit']).toarray()

symptoms_vectorizer.transform(['fever']).toarray()

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 

In [13]:
def vectorize_symptoms(symptoms: list):
    print(f"Vectorizing {len(symptoms)}")
    print(f"Symptoms: {symptoms}")
    v = symptoms_vectorizer.transform([symptoms])
    # print(f"Vectorized Symptoms: {v.toarray()[0]}")
    return v

In [14]:
# X_tfidf = X['Cause_of_Visit'].apply(vectorize_symptoms)
# X['Cause_of_Visit'] = X['Cause_of_Visit'].apply(vectorize_symptoms)

# X = X.drop(columns=['Cause_of_Visit'])

# X = hstack((X_tfidf, X))

In [15]:
# Label Encode Doctor Availability
doc_avail_le = LabelEncoder()
X['Doctor_Availability'] = doc_avail_le.fit_transform(X['Doctor_Availability'])

X['Doctor_Availability']

0      0
1      1
2      0
3      1
4      1
      ..
995    1
996    1
997    0
998    1
999    0
Name: Doctor_Availability, Length: 1000, dtype: int64

In [16]:
# Label Encode Work Shift time

work_shift_le = LabelEncoder()
X['Work_Shift_Time'] = work_shift_le.fit_transform(X['Work_Shift_Time'])

X['Work_Shift_Time']

0      1
1      1
2      1
3      1
4      0
      ..
995    2
996    1
997    0
998    0
999    0
Name: Work_Shift_Time, Length: 1000, dtype: int64

In [17]:
# Label Encode prev match
prev_matching_le = LabelEncoder()

X['Cause_Matches_Previous_Visit'] = prev_matching_le.fit_transform(X['Cause_Matches_Previous_Visit'])

In [18]:
X.dtypes

Age                               int64
Cause_of_Visit                  float64
No_of_Previous_Visits             int64
Cause_Matches_Previous_Visit      int64
Doctor_Availability               int64
Work_Shift_Time                   int64
Schedule_date_time               object
Delayed_by                        int64
dtype: object

In [19]:
# X['Schedule_date_time'] = pd.to_datetime(X['Schedule_date_time'])
# 
# X['Schedule_date_time_day'] = X['Schedule_date_time'].dt.day
# X['Schedule_date_time_month'] = X['Schedule_date_time'].dt.month
# X['Schedule_date_time_year'] = X['Schedule_date_time'].dt.year
# 
# # Hour of Day
# X['Schedule_date_time_hour_of_day'] = X['Schedule_date_time'].dt.hour
# 
# # Cyclical Encoding of Hours
# X['Schedule_date_time_hour_sin'] = np.sin(2 * np.pi * X['Schedule_date_time_hour_of_day']/24)
# X['Schedule_date_time_hour_cos'] = np.cos(2 * np.pi * X['Schedule_date_time_hour_of_day']/24)

X = X.drop(columns=['Schedule_date_time'])

X.dtypes

Age                               int64
Cause_of_Visit                  float64
No_of_Previous_Visits             int64
Cause_Matches_Previous_Visit      int64
Doctor_Availability               int64
Work_Shift_Time                   int64
Delayed_by                        int64
dtype: object

In [20]:
target_le = LabelEncoder()

y = target_le.fit_transform(y)

y

array([2, 0, 2, 2, 1, 1, 0, 0, 1, 2, 2, 1, 2, 0, 1, 1, 1, 2, 1, 1, 2, 2,
       1, 2, 2, 1, 2, 1, 1, 1, 2, 2, 0, 2, 1, 2, 2, 0, 1, 0, 0, 0, 1, 1,
       0, 1, 2, 2, 2, 1, 2, 2, 2, 1, 0, 2, 2, 0, 1, 1, 1, 1, 2, 1, 2, 2,
       0, 2, 0, 1, 2, 1, 2, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 2, 2, 2,
       1, 0, 1, 2, 1, 1, 0, 2, 0, 0, 0, 1, 2, 1, 2, 2, 2, 0, 0, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 1, 2, 1, 1, 1, 0, 1, 0, 2, 0, 0, 0, 0, 1, 0, 2,
       2, 2, 1, 1, 1, 0, 0, 1, 2, 1, 2, 1, 2, 2, 1, 1, 0, 2, 2, 0, 0, 2,
       2, 1, 1, 1, 1, 0, 2, 2, 0, 0, 1, 1, 2, 1, 2, 0, 0, 1, 0, 0, 1, 1,
       2, 2, 1, 1, 0, 1, 1, 2, 2, 0, 2, 0, 2, 1, 0, 1, 1, 1, 2, 0, 2, 0,
       0, 1, 0, 2, 1, 1, 0, 0, 2, 1, 2, 1, 2, 2, 0, 2, 2, 2, 1, 0, 2, 1,
       2, 2, 2, 0, 0, 1, 0, 1, 0, 2, 2, 2, 0, 0, 1, 0, 0, 0, 0, 1, 2, 1,
       2, 0, 2, 1, 1, 2, 0, 0, 1, 0, 1, 1, 0, 0, 0, 2, 0, 2, 2, 2, 1, 1,
       0, 1, 1, 2, 2, 1, 1, 2, 1, 1, 1, 1, 1, 0, 1, 1, 2, 2, 1, 0, 2, 0,
       1, 2, 2, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0,

In [21]:
# X = X.drop(columns=['Cause_of_Visit'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

len(X_train), len(X_test), len(y_train), len(y_test)

(800, 200, 800, 200)

In [22]:
rfc_model = RandomForestClassifier(n_estimators=200, random_state=42, criterion='entropy')

rfc_model.fit(X_train, y_train)

In [23]:
y_preds = rfc_model.predict(X_test)

print(target_le.inverse_transform(y_preds))
accuracy_score(y_test, y_preds)

['Severe' 'Mild' 'Severe' 'Severe' 'Mild' 'Severe' 'Moderate' 'Mild'
 'Severe' 'Mild' 'Mild' 'Moderate' 'Moderate' 'Moderate' 'Severe'
 'Moderate' 'Mild' 'Severe' 'Mild' 'Mild' 'Moderate' 'Mild' 'Moderate'
 'Mild' 'Mild' 'Severe' 'Moderate' 'Mild' 'Severe' 'Moderate' 'Mild'
 'Mild' 'Mild' 'Moderate' 'Mild' 'Mild' 'Severe' 'Moderate' 'Severe'
 'Mild' 'Mild' 'Severe' 'Moderate' 'Severe' 'Severe' 'Mild' 'Moderate'
 'Severe' 'Severe' 'Severe' 'Mild' 'Severe' 'Moderate' 'Moderate' 'Mild'
 'Moderate' 'Severe' 'Mild' 'Mild' 'Moderate' 'Severe' 'Moderate' 'Mild'
 'Mild' 'Mild' 'Mild' 'Moderate' 'Mild' 'Severe' 'Severe' 'Moderate'
 'Mild' 'Severe' 'Moderate' 'Severe' 'Mild' 'Moderate' 'Mild' 'Moderate'
 'Mild' 'Moderate' 'Mild' 'Moderate' 'Severe' 'Mild' 'Severe' 'Moderate'
 'Severe' 'Moderate' 'Severe' 'Moderate' 'Severe' 'Severe' 'Mild'
 'Moderate' 'Mild' 'Moderate' 'Mild' 'Moderate' 'Moderate' 'Severe' 'Mild'
 'Mild' 'Mild' 'Moderate' 'Moderate' 'Severe' 'Severe' 'Moderate' 'Mild'
 'Mild' 'S

0.36