# HopSlot - Patient Urgency/Severity Predictor Copy

In [263]:
import pandas as pd
import numpy as np
import random
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import  train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from pathlib import Path
from sklearn.svm import SVC
from scipy.sparse import hstack

In [264]:
BASE_DATASET_PATH = Path('./datasets/synt/patient_data_2.csv')

In [265]:
patient_df = pd.read_csv(BASE_DATASET_PATH)

patient_df

Unnamed: 0,Age,Cause_of_Visit,No_of_Previous_Visits,Cause_Matches_Previous_Visit,Doctor_Availability,Work_Shift_Time,Schedule_date_time,Delayed_by,Severity_of_Ailment
0,4,"['diarrhea', 'bulge in the wall of the aorta',...",8,Yes,Unavailable,Evening,2024-11-17 10:03:49,0,Severe
1,23,"['decreased kidney function', 'cough that does...",0,No,Available,Evening,2024-05-18 02:05:25,5,Severe
2,34,"['chest pain', 'fever', 'difficulty getting']",9,Yes,Available,Evening,2024-01-02 06:56:37,10,Severe
3,80,"['fever', 'hyperthyroidism', 'widespread pain']",7,Yes,Unavailable,Evening,2024-11-06 08:43:16,0,Moderate
4,87,"['extreme mood swings', 'fever', 'sudden weakn...",5,Yes,Available,Morning,2024-10-25 05:12:30,10,Moderate
...,...,...,...,...,...,...,...,...,...
1995,81,"['nausea', 'pain in the lower back or side', '...",2,No,Available,Morning,2024-08-04 08:45:52,0,Severe
1996,23,"['difficulty urinating', 'rash', 'fever']",1,No,Available,Morning,2024-01-30 16:31:47,0,Mild
1997,21,"['lump', 'chronic autoimmune disease that can ...",5,Yes,Available,Afternoon,2024-02-27 21:11:08,15,Severe
1998,53,"['cough', 'pain in the lower back or side', 'c...",2,No,Available,Evening,2024-12-28 03:20:37,10,Moderate


In [266]:
X = patient_df.iloc[:, :-1]
y = patient_df.iloc[:, -1]

X, y

(      Age                                     Cause_of_Visit  \
 0       4  ['diarrhea', 'bulge in the wall of the aorta',...   
 1      23  ['decreased kidney function', 'cough that does...   
 2      34      ['chest pain', 'fever', 'difficulty getting']   
 3      80    ['fever', 'hyperthyroidism', 'widespread pain']   
 4      87  ['extreme mood swings', 'fever', 'sudden weakn...   
 ...   ...                                                ...   
 1995   81  ['nausea', 'pain in the lower back or side', '...   
 1996   23          ['difficulty urinating', 'rash', 'fever']   
 1997   21  ['lump', 'chronic autoimmune disease that can ...   
 1998   53  ['cough', 'pain in the lower back or side', 'c...   
 1999   35  ['sexually transmitted infection that can caus...   
 
       No_of_Previous_Visits Cause_Matches_Previous_Visit Doctor_Availability  \
 0                         8                          Yes         Unavailable   
 1                         0                           N

In [267]:
X['Cause_of_Visit'].dtype

dtype('O')

In [268]:
'\',\''.join(X['Cause_of_Visit'][0].split('\' \''))

"['diarrhea', 'bulge in the wall of the aorta', 'difficulty getting']"

In [269]:
import ast

X['Cause_of_Visit'] = X['Cause_of_Visit'].apply(lambda symptom: ast.literal_eval('\',\''.join(symptom.split('\' \''))))

In [270]:
X['Cause_of_Visit'].shape

(2000,)

In [271]:
# Unique Symptoms Vocabulary

symptoms = X['Cause_of_Visit'].sum()
symptoms = list(set(symptoms))

symptoms = np.array(symptoms)

# symptoms = np.expand_dims(symptoms, axis=1)
symptoms

array(['disturbance in eating behavior that can lead to serious health problemssudden weakness',
       'difficulty gettingdifficulty urinating',
       'sexually transmitted infection that can cause burning during urinationpain in the lower right abdomen',
       'disturbance in eating behavior that can lead to serious health problemshyperthyroidism',
       'chronic autoimmune disease that can affect any part of the bodydiarrhea',
       'change in bowel habitsincreased pressure in the eye that can damage the optic nerveinflammation of the liver',
       'widespread painsexually transmitted infection that can cause blisters on the genitalsphysical dependence on alcohol',
       'sudden weaknessdifficulty with social interaction',
       'sexually transmitted infection that can cause blisters on the genitalspainful urination',
       'increased pressure in the eye that can damage the optic nervepain in the lower right abdomen',
       'diarrheaincreased pressure in the eye that can da

In [272]:
def join_symptoms(symptoms_list):
  return ' '.join(symptoms_list) if isinstance(symptoms_list, list) else symptoms_list


def tokens(symptom):
    return symptom

In [273]:
# X['Cause_of_Visit'] = X['Cause_of_Visit'].apply(join_symptoms)

In [274]:
symptoms_vectorizer = TfidfVectorizer(lowercase=False, token_pattern=None, tokenizer=tokens)

# X['Cause_of_Visit'] = X['Cause_of_Visit'].apply(lambda symptom: ' '.join(symptom))

X['Cause_of_Visit'] = symptoms_vectorizer.fit_transform(X['Cause_of_Visit']).toarray()

symptoms_vectorizer.transform(['fever']).toarray()

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 

In [275]:
def vectorize_symptoms(symptoms: list):
    print(f"Vectorizing {len(symptoms)}")
    print(f"Symptoms: {symptoms}")
    v = symptoms_vectorizer.transform([symptoms])
    # print(f"Vectorized Symptoms: {v.toarray()[0]}")
    return v

In [276]:
# X_tfidf = X['Cause_of_Visit'].apply(vectorize_symptoms)
# X['Cause_of_Visit'] = X['Cause_of_Visit'].apply(vectorize_symptoms)

# X = X.drop(columns=['Cause_of_Visit'])

# X = hstack((X_tfidf, X))

In [277]:
# Label Encode Doctor Availability
doc_avail_le = LabelEncoder()
X['Doctor_Availability'] = doc_avail_le.fit_transform(X['Doctor_Availability'])

X['Doctor_Availability']

0       1
1       0
2       0
3       1
4       0
       ..
1995    0
1996    0
1997    0
1998    0
1999    0
Name: Doctor_Availability, Length: 2000, dtype: int64

In [278]:
# Label Encode Work Shift time

work_shift_le = LabelEncoder()
X['Work_Shift_Time'] = work_shift_le.fit_transform(X['Work_Shift_Time'])

X['Work_Shift_Time']

0       1
1       1
2       1
3       1
4       2
       ..
1995    2
1996    2
1997    0
1998    1
1999    2
Name: Work_Shift_Time, Length: 2000, dtype: int64

In [279]:
# Label Encode prev match
prev_matching_le = LabelEncoder()

X['Cause_Matches_Previous_Visit'] = prev_matching_le.fit_transform(X['Cause_Matches_Previous_Visit'])

X.drop(columns=['Cause_Matches_Previous_Visit'])

Unnamed: 0,Age,Cause_of_Visit,No_of_Previous_Visits,Doctor_Availability,Work_Shift_Time,Schedule_date_time,Delayed_by
0,4,0.0,8,1,1,2024-11-17 10:03:49,0
1,23,0.0,0,0,1,2024-05-18 02:05:25,5
2,34,0.0,9,0,1,2024-01-02 06:56:37,10
3,80,0.0,7,1,1,2024-11-06 08:43:16,0
4,87,0.0,5,0,2,2024-10-25 05:12:30,10
...,...,...,...,...,...,...,...
1995,81,0.0,2,0,2,2024-08-04 08:45:52,0
1996,23,0.0,1,0,2,2024-01-30 16:31:47,0
1997,21,0.0,5,0,0,2024-02-27 21:11:08,15
1998,53,0.0,2,0,1,2024-12-28 03:20:37,10


In [280]:
X.dtypes

Age                               int64
Cause_of_Visit                  float64
No_of_Previous_Visits             int64
Cause_Matches_Previous_Visit      int64
Doctor_Availability               int64
Work_Shift_Time                   int64
Schedule_date_time               object
Delayed_by                        int64
dtype: object

In [281]:
# X['Schedule_date_time'] = pd.to_datetime(X['Schedule_date_time'])
# 
# X['Schedule_date_time_day'] = X['Schedule_date_time'].dt.day
# X['Schedule_date_time_month'] = X['Schedule_date_time'].dt.month
# X['Schedule_date_time_year'] = X['Schedule_date_time'].dt.year
# 
# # Hour of Day
# X['Schedule_date_time_hour_of_day'] = X['Schedule_date_time'].dt.hour
# 
# # Cyclical Encoding of Hours
# X['Schedule_date_time_hour_sin'] = np.sin(2 * np.pi * X['Schedule_date_time_hour_of_day']/24)
# X['Schedule_date_time_hour_cos'] = np.cos(2 * np.pi * X['Schedule_date_time_hour_of_day']/24)

X = X.drop(columns=['Schedule_date_time'])

X.dtypes

Age                               int64
Cause_of_Visit                  float64
No_of_Previous_Visits             int64
Cause_Matches_Previous_Visit      int64
Doctor_Availability               int64
Work_Shift_Time                   int64
Delayed_by                        int64
dtype: object

In [282]:
target_le = LabelEncoder()

y = target_le.fit_transform(y)

y

array([2, 2, 2, ..., 2, 1, 2])

In [283]:
# X = X.drop(columns=['Cause_of_Visit'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

len(X_train), len(X_test), len(y_train), len(y_test)

(1600, 400, 1600, 400)

In [284]:
X_train.shape, X_test.shape

((1600, 7), (400, 7))

In [285]:
rfc_model = RandomForestClassifier(n_estimators=300, criterion='entropy', random_state=42)

rfc_model.fit(X_train, y_train)

In [286]:
y_preds = rfc_model.predict(X_test)

print(target_le.inverse_transform(y_preds))
accuracy_score(y_test, y_preds)

['Moderate' 'Severe' 'Severe' 'Severe' 'Moderate' 'Moderate' 'Moderate'
 'Moderate' 'Severe' 'Moderate' 'Severe' 'Moderate' 'Moderate' 'Severe'
 'Severe' 'Mild' 'Mild' 'Moderate' 'Severe' 'Moderate' 'Severe' 'Severe'
 'Mild' 'Severe' 'Severe' 'Severe' 'Mild' 'Moderate' 'Mild' 'Severe'
 'Severe' 'Moderate' 'Mild' 'Mild' 'Mild' 'Severe' 'Severe' 'Mild' 'Mild'
 'Mild' 'Severe' 'Moderate' 'Moderate' 'Moderate' 'Moderate' 'Severe'
 'Moderate' 'Moderate' 'Severe' 'Severe' 'Moderate' 'Moderate' 'Mild'
 'Moderate' 'Mild' 'Moderate' 'Moderate' 'Mild' 'Moderate' 'Moderate'
 'Severe' 'Moderate' 'Moderate' 'Moderate' 'Severe' 'Moderate' 'Mild'
 'Severe' 'Severe' 'Severe' 'Severe' 'Mild' 'Moderate' 'Severe' 'Mild'
 'Mild' 'Moderate' 'Moderate' 'Severe' 'Mild' 'Moderate' 'Moderate' 'Mild'
 'Moderate' 'Mild' 'Moderate' 'Severe' 'Mild' 'Moderate' 'Mild' 'Severe'
 'Severe' 'Moderate' 'Moderate' 'Severe' 'Severe' 'Severe' 'Moderate'
 'Severe' 'Mild' 'Mild' 'Severe' 'Severe' 'Moderate' 'Mild' 'Severe'
 '

0.3625

In [287]:
svc_model = SVC(random_state=42)

svc_model.fit(X_train, y_train)

In [288]:
y_preds_svc = svc_model.predict(X_test)

print(accuracy_score(y_test, y_preds_svc))

0.38


In [289]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=5, p=2)

knn.fit(X_train, y_train)

In [290]:
y_preds_knn = knn.predict(X_test)

print(accuracy_score(y_test, y_preds_knn))

0.3475
