# HopSlot Severity Predictor

## Preprocessing

### Importing libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report
import ast
import joblib, dill
from pathlib import Path

### Read train Dataset

In [2]:
# Show full cell contents instead of truncating long values such as the evidence lists.
pd.set_option('display.max_colwidth', None)

# Training split of the patient dataset.
df = pd.read_csv('./datasets/final/release_train_patients.csv')

### Convert list strings to actual python lists for Evidences

In [3]:
# Parse the stringified evidence lists into real Python lists. Cells that are
# already parsed are left untouched, so re-running this cell does not raise.
df["EVIDENCES"] = df["EVIDENCES"].apply(
    lambda cell: ast.literal_eval(cell) if isinstance(cell, str) else cell
)

In [4]:
df.head()

Unnamed: 0,AGE,DIFFERENTIAL_DIAGNOSIS,SEX,PATHOLOGY,EVIDENCES,INITIAL_EVIDENCE
0,18,"[['Bronchitis', 0.19171203430383882], ['Pneumonia', 0.17579340398940366], ['URTI', 0.1607809719801254], ['Bronchiectasis', 0.12429044460990353], ['Tuberculosis', 0.11367177304035844], ['Influenza', 0.11057936110639896], ['HIV (initial infection)', 0.07333003867293564], ['Chagas', 0.04984197229703562]]",M,URTI,"[E_48, E_50, E_53, E_54_@_V_161, E_54_@_V_183, E_55_@_V_89, E_55_@_V_108, E_55_@_V_167, E_56_@_4, E_57_@_V_123, E_58_@_3, E_59_@_3, E_77, E_79, E_91, E_97, E_201, E_204_@_V_10, E_222]",E_91
1,21,"[['HIV (initial infection)', 0.5189500564407601], ['Chagas', 0.3217819010436332], ['Scombroid food poisoning', 0.13496758062695968], ['Sarcoidosis', 0.024300461888647054]]",M,HIV (initial infection),"[E_9, E_27, E_50, E_51, E_53, E_54_@_V_198, E_55_@_V_62, E_55_@_V_166, E_55_@_V_167, E_56_@_7, E_57_@_V_123, E_58_@_7, E_59_@_2, E_91, E_115, E_129, E_130_@_V_138, E_131_@_V_10, E_132_@_0, E_133_@_V_90, E_133_@_V_91, E_133_@_V_95, E_133_@_V_110, E_133_@_V_111, E_134_@_6, E_135_@_V_12, E_136_@_0, E_148, E_162, E_189, E_204_@_V_10]",E_50
2,19,"[['Bronchitis', 0.11278064619119596], ['Pneumonia', 0.10048134562119852], ['Unstable angina', 0.08462979865697953], ['Possible NSTEMI / STEMI', 0.08343170881871906], ['Bronchiectasis', 0.07764173703530038], ['Boerhaave', 0.07708363099826726], ['Spontaneous rib fracture', 0.07682349455456361], ['Pericarditis', 0.07504681799224715], ['URTI', 0.06705952510476514], ['Stable angina', 0.06667817325012103], ['GERD', 0.06455232704351052], ['Scombroid food poisoning', 0.058790058668404095], ['Chagas', 0.02923384001251551], ['Sarcoidosis', 0.016406922262218876], ['Panic attack', 0.009359973789993502]]",F,Pneumonia,"[E_53, E_54_@_V_179, E_54_@_V_192, E_55_@_V_29, E_55_@_V_55, E_55_@_V_56, E_55_@_V_170, E_55_@_V_171, E_56_@_5, E_57_@_V_123, E_58_@_2, E_59_@_6, E_77, E_78, E_79, E_94, E_106, E_118, E_123, E_124, E_129, E_130_@_V_156, E_131_@_V_12, E_132_@_4, E_133_@_V_54, E_133_@_V_85, E_134_@_0, E_135_@_V_10, E_136_@_0, E_144, E_175, E_181, E_201, E_204_@_V_10]",E_77
3,34,"[['URTI', 0.23859396799565236], ['Cluster headache', 0.18566159722723558], ['Bronchitis', 0.1773471504190298], ['Chronic rhinosinusitis', 0.14960995823729417], ['Acute rhinosinusitis', 0.10748772366243657], ['Chagas', 0.07064980122917579], ['Anemia', 0.07064980122917579]]",F,URTI,"[E_48, E_53, E_54_@_V_183, E_55_@_V_89, E_55_@_V_109, E_55_@_V_124, E_55_@_V_166, E_55_@_V_167, E_56_@_7, E_57_@_V_123, E_58_@_5, E_59_@_4, E_97, E_181, E_201, E_204_@_V_10]",E_53
4,36,"[['URTI', 0.23677812769175735], ['Influenza', 0.1842006792554458], ['Bronchitis', 0.1651209569301098], ['Cluster headache', 0.1430457852409739], ['Chronic rhinosinusitis', 0.139295948156182], ['Chagas', 0.06577925136276556], ['Anemia', 0.06577925136276556]]",M,URTI,"[E_49, E_50, E_53, E_54_@_V_183, E_55_@_V_62, E_55_@_V_124, E_55_@_V_166, E_55_@_V_167, E_56_@_5, E_57_@_V_123, E_58_@_4, E_59_@_2, E_97, E_144, E_181, E_201, E_204_@_V_10]",E_201


### Select only required columns

In [5]:
selected_columns = ["PATHOLOGY", "EVIDENCES"]
# Take an explicit copy: the EVIDENCES column of new_df is overwritten later, and
# assigning into a slice of df would trigger pandas' SettingWithCopyWarning.
new_df = df[selected_columns].copy()

# Write the new DataFrame to a new CSV file
# new_df.to_csv("new.csv", index=False)

In [6]:
new_df.head()

Unnamed: 0,PATHOLOGY,EVIDENCES
0,URTI,"[E_48, E_50, E_53, E_54_@_V_161, E_54_@_V_183, E_55_@_V_89, E_55_@_V_108, E_55_@_V_167, E_56_@_4, E_57_@_V_123, E_58_@_3, E_59_@_3, E_77, E_79, E_91, E_97, E_201, E_204_@_V_10, E_222]"
1,HIV (initial infection),"[E_9, E_27, E_50, E_51, E_53, E_54_@_V_198, E_55_@_V_62, E_55_@_V_166, E_55_@_V_167, E_56_@_7, E_57_@_V_123, E_58_@_7, E_59_@_2, E_91, E_115, E_129, E_130_@_V_138, E_131_@_V_10, E_132_@_0, E_133_@_V_90, E_133_@_V_91, E_133_@_V_95, E_133_@_V_110, E_133_@_V_111, E_134_@_6, E_135_@_V_12, E_136_@_0, E_148, E_162, E_189, E_204_@_V_10]"
2,Pneumonia,"[E_53, E_54_@_V_179, E_54_@_V_192, E_55_@_V_29, E_55_@_V_55, E_55_@_V_56, E_55_@_V_170, E_55_@_V_171, E_56_@_5, E_57_@_V_123, E_58_@_2, E_59_@_6, E_77, E_78, E_79, E_94, E_106, E_118, E_123, E_124, E_129, E_130_@_V_156, E_131_@_V_12, E_132_@_4, E_133_@_V_54, E_133_@_V_85, E_134_@_0, E_135_@_V_10, E_136_@_0, E_144, E_175, E_181, E_201, E_204_@_V_10]"
3,URTI,"[E_48, E_53, E_54_@_V_183, E_55_@_V_89, E_55_@_V_109, E_55_@_V_124, E_55_@_V_166, E_55_@_V_167, E_56_@_7, E_57_@_V_123, E_58_@_5, E_59_@_4, E_97, E_181, E_201, E_204_@_V_10]"
4,URTI,"[E_49, E_50, E_53, E_54_@_V_183, E_55_@_V_62, E_55_@_V_124, E_55_@_V_166, E_55_@_V_167, E_56_@_5, E_57_@_V_123, E_58_@_4, E_59_@_2, E_97, E_144, E_181, E_201, E_204_@_V_10]"


## Load Multi Label Binarizer

In [7]:
import joblib
from pathlib import Path

# The trained artifacts live in the sibling HopSlot-FullStack checkout, resolved
# relative to the directory the notebook is started from.
pwd = Path.cwd()
path = pwd.parent.joinpath("HopSlot-FullStack/django-api/predict_symptoms/trained_models")

In [8]:
# Load the previously saved evidence encoder from the trained_models directory.
# NOTE: joblib.load unpickles the file, so only load artifacts from a trusted source.
evidenceEncoder = joblib.load(path.joinpath("evidence_mb_encoder.pkl"))

#### Test loaded encoder

In [9]:
# Sanity check: encode a two-code sample and count how many columns are switched on.
test_s = ['E_53', 'E_23']
s = evidenceEncoder.transform([test_s])

print(int(np.sum(s[0] == 1)))

2


### Perform Vectorization on Evidence Column

In [10]:
# Replace each evidence list with its encoded form. transform() is handed a
# one-item batch per row, so every cell ends up holding a single-row 2D result.
new_df.loc[:, "EVIDENCES"] = new_df["EVIDENCES"].apply(
    lambda evidence_list: evidenceEncoder.transform([evidence_list])
)

In [11]:
new_df.EVIDENCES.head(2).shape

(2,)

### Function To Expand 2d Evidence column to normal columns

In [12]:
def expand2DCellToColumns(df: pd.DataFrame, featureName: str) -> pd.DataFrame:
    """Spread a column of single-row 2D cells into one numeric column per position.

    Each cell of ``df[featureName]`` is indexed with ``[0]`` to obtain its row
    vector; the vectors are stacked into one matrix and returned as a DataFrame
    that keeps ``df``'s index and uses positional column labels (0, 1, 2, ...).

    Args:
        df: Frame containing the column to expand.
        featureName: Name of the column whose cells hold the row vectors.

    Returns:
        A DataFrame with one row per row of ``df`` and one column per vector
        position. An empty input yields an empty frame with the same index.

    Raises:
        ValueError: If the row vectors do not all have the same length.
    """
    cells = df[featureName]
    if len(cells) == 0:
        # np.vstack rejects an empty sequence; there is nothing to expand anyway.
        return pd.DataFrame(index=df.index)

    # Stacking once is far cheaper than building a pd.Series for every row.
    matrix = np.vstack([cell[0] for cell in cells])
    return pd.DataFrame(matrix, index=df.index)

In [13]:
X = expand2DCellToColumns(new_df, "EVIDENCES")

In [14]:
X.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,962,963,964,965,966,967,968,969,970,971
0,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,1,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [15]:
X.shape

(1025602, 972)

## Training the Model


In [16]:
# Labels and feature matrix as plain NumPy arrays for scikit-learn.
# np.asarray returns X unchanged when it is already an array, so re-running
# this cell no longer fails on a missing `.values` attribute.
y = new_df["PATHOLOGY"].to_numpy()
X = np.asarray(X)

In [17]:
type(X), type(y), X.shape, y.shape

(numpy.ndarray, numpy.ndarray, (1025602, 972), (1025602,))

### Train Logistic Regression Model

In [18]:
lr_model = LogisticRegression(random_state=42, solver='lbfgs', max_iter=200)

In [19]:
# Train the model
lr_model.fit(X, y)

### Train Support Vector Classifier (fitting is currently commented out)

In [20]:
svc_model = SVC(random_state=42, kernel='rbf', gamma='auto')

In [None]:
# svc_model.fit(X, y)

## Testing the Models

### Load Test set

In [36]:
# Load the test split, parsing the stringified evidence lists while reading.
df_test = pd.read_csv('./datasets/final/release_test_patients.csv', converters={
    'EVIDENCES': ast.literal_eval,
})
pd.set_option('display.max_colwidth', None)

# Explicit copy: the EVIDENCES column is overwritten further down, and assigning
# into a slice of df_test would trigger pandas' SettingWithCopyWarning.
new_df_test = df_test[selected_columns].copy()

new_df_test.head(5)

Unnamed: 0,PATHOLOGY,EVIDENCES
0,GERD,"[E_53, E_54_@_V_112, E_54_@_V_161, E_54_@_V_180, E_54_@_V_181, E_55_@_V_29, E_55_@_V_101, E_55_@_V_103, E_56_@_6, E_57_@_V_29, E_57_@_V_101, E_58_@_3, E_59_@_2, E_70, E_78, E_98, E_140, E_167, E_173, E_201, E_204_@_V_10, E_217]"
1,Bronchitis,"[E_53, E_54_@_V_181, E_55_@_V_55, E_55_@_V_148, E_56_@_5, E_57_@_V_123, E_58_@_7, E_59_@_4, E_66, E_123, E_181, E_201, E_204_@_V_10, E_214]"
2,Acute dystonic reactions,"[E_15, E_128, E_147, E_168, E_172, E_193, E_204_@_V_10]"
3,Acute laryngitis,"[E_48, E_49, E_53, E_54_@_V_181, E_55_@_V_20, E_55_@_V_21, E_55_@_V_148, E_56_@_2, E_57_@_V_123, E_58_@_4, E_59_@_4, E_79, E_91, E_201, E_204_@_V_10, E_212]"
4,URTI,"[E_41, E_50, E_53, E_54_@_V_161, E_54_@_V_183, E_55_@_V_62, E_55_@_V_89, E_55_@_V_108, E_55_@_V_109, E_55_@_V_124, E_56_@_7, E_57_@_V_123, E_58_@_3, E_59_@_0, E_91, E_97, E_144, E_181, E_201, E_204_@_V_10]"


In [37]:
new_df_test["EVIDENCES"].size

134529

### Apply Encoding and Expand Columns

In [39]:
# Encode the test evidences the same way as the training ones: one single-item
# transform() call per row, storing the resulting single-row 2D result in the cell.
new_df_test.loc[:, "EVIDENCES"] = new_df_test["EVIDENCES"].apply(
    lambda evidence_list: evidenceEncoder.transform([evidence_list])
)

In [40]:
X_test = expand2DCellToColumns(new_df_test, "EVIDENCES")

In [42]:
X_test.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,962,963,964,965,966,967,968,969,970,971
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,1,0,0


### Prediction

In [44]:
# Test features and labels as plain NumPy arrays.
# np.asarray returns X_test unchanged when it is already an array, so this cell
# can be re-run without failing on a missing `.values` attribute.
X_test = np.asarray(X_test)
y_test = new_df_test["PATHOLOGY"].to_numpy()

#### Logistic Regression Prediction

In [45]:
y_pred = lr_model.predict(X_test)

# Print classification report
print(classification_report(y_test, y_pred))

                                          precision    recall  f1-score   support

     Acute COPD exacerbation / infection       1.00      1.00      1.00      2153
                Acute dystonic reactions       1.00      1.00      1.00      3302
                        Acute laryngitis       1.00      1.00      1.00      3217
                      Acute otitis media       1.00      1.00      1.00      3516
                   Acute pulmonary edema       1.00      1.00      1.00      2598
                    Acute rhinosinusitis       1.00      0.85      0.92      1829
                      Allergic sinusitis       1.00      1.00      1.00      2411
                             Anaphylaxis       1.00      1.00      1.00      3799
                                  Anemia       1.00      1.00      1.00      6842
                     Atrial fibrillation       1.00      1.00      1.00      2831
                               Boerhaave       1.00      1.00      1.00      2083
               

## Save the model

In [20]:
# Persist the fitted logistic-regression model into the trained_models directory
# of the sibling HopSlot-FullStack checkout (an existing model.pkl is overwritten).
pwd = Path.cwd()
path = pwd.parent.joinpath("HopSlot-FullStack/django-api/predict_symptoms/trained_models")

joblib.dump(lr_model, path.joinpath('model.pkl'))

['/Volumes/Arham/A2/BMS College stufff/PBL/Final Year Project - HopSlot/Dev/Code/HopSlot-FullStack/django-api/predict_symptoms/trained_models/model.pkl']

## Testing for API Request

In [None]:
def transformAPIRequest(appointment):
    """Flatten an appointment's symptoms into a list of evidence codes.

    The encoding depends on each symptom's ``"type"``:

    * ``"B"``: the symptom name is used as-is.
    * ``"M"``: one ``<name>_@_<value>`` code per entry of ``symptom["values"]``.
    * anything else: a single ``<name>_@_<values>`` code built from
      ``symptom["values"]``.

    Args:
        appointment: Mapping with a ``"symptoms"`` list; every symptom needs
            ``"name"`` and ``"type"``, and ``"values"`` unless its type is ``"B"``.

    Returns:
        The evidence codes in the order the symptoms were given.

    Raises:
        KeyError: If a required key is missing from the appointment or a symptom.
    """
    evidences = []

    for symptom in appointment["symptoms"]:
        name = symptom["name"]
        kind = symptom["type"]
        if kind == "B":
            evidences.append(name)
        elif kind == "M":
            evidences.extend(f"{name}_@_{value}" for value in symptom["values"])
        else:
            # f-string formatting also accepts non-string values (e.g. an int),
            # which plain string concatenation would reject with a TypeError.
            evidences.append(f"{name}_@_{symptom['values']}")

    return evidences  # Your model inputs which will be used for prediction


def mapPathologyToSeverity(pathology):
    """Look up the severity recorded for a pathology in the conditions file.

    Reads ``datasets/final/release_conditions.json`` (relative to the current
    working directory) on every call and returns the ``severity`` of the first
    record whose ``condition_name`` equals ``pathology``.

    Args:
        pathology: Condition name to look up.

    Returns:
        The ``severity`` value stored for that condition.

    Raises:
        KeyError: If no condition with that name exists in the file.
    """
    conditions = pd.read_json("datasets/final/release_conditions.json").T
    records = conditions[["condition_name", "severity"]].to_dict(orient="records")

    for record in records:
        if record["condition_name"] == pathology:
            return record["severity"]

    # A bare next() would surface as an unexplained StopIteration here.
    raise KeyError(f"Unknown pathology: {pathology!r}")


In [None]:
# Hand-written sample batch of appointment payloads for trying out the API helpers.
# Each appointment has an "appointmentId" and a list of "symptoms"; every symptom
# carries a "name" (evidence code), a free-text "description" and a "type".
# Symptoms of type "M" additionally list their "values"; type "B" symptoms have none.
appointmentBatch = [
  {
    "appointmentId": "12233-asdas-21232",
    "symptoms": [{
      "name": "E_123",
      "description": "fever for 2 days",
      "type": "B"
    }]
  },
  {
    "appointmentId": "34232asdas-asda-2123",
    "symptoms": [{
      "name": "E_132",
      "description": "fever for 1 days",
      "type": "B"
      
    }]
  },
  {
    "appointmentId": "64232asdas-asda-2123",
    "symptoms": [{
      "name": "E_144",
      "description": "last night fatigue",
      "type": "B",
    }]
  },
   {
    "appointmentId": "74232asdas-asda-2123",
    "symptoms": [{
      "name": "E_243",
      "description": "suffocating last night",
      "type": "B",
    }]
  },
   {
    "appointmentId": "84232asdas-asda-2123",
    "symptoms": [{
      "name": "E_122",
      "description": "dairrhea",
      "type": "B",
    }]
  },
   {
    "appointmentId": "14232asdas-asda-2123",
    "symptoms": [{
      "name": "E_233",
      "description": "rash on hand",
      "type": "M",
      "values": ["V_23"]
    },
    {
      "name": "E_55",
      "description": "back pain",
      "type": "M",
      "values": ["V_123", "V_234", "V_43"]
    },
    {
      "name": "E_123",
      "description": "from morning unconsciousness",
      "type": "B",
    }]
  }
]

In [None]:

predictions = []

# Turn every appointment into its list of evidence codes.
evidences = [transformAPIRequest(appointment) for appointment in appointmentBatch]

# The classifier was fitted on the encoded evidence matrix, so the raw codes have
# to pass through the same encoder before prediction. The fitted estimator in
# this notebook is `lr_model`; no variable named `model` is ever defined.
# NOTE(review): codes the encoder has never seen are presumably dropped with a
# warning rather than raising — confirm for the sample codes used here.
encoded_evidences = evidenceEncoder.transform(evidences)
pathology = lr_model.predict(encoded_evidences)
print(pathology)

# TODO: map each predicted pathology to a severity and build the API response:
# for appointment, predicted in zip(appointmentBatch, pathology):
#     severity = mapPathologyToSeverity(predicted)
#     predictions.append(
#         {"appointmentId": appointment.get("appointmentId"), "severity": severity}
#     )
# print(predictions)

In [None]:
# Predict with the fitted logistic-regression model on the expanded test matrix.
# The EVIDENCES column holds one nested array per row and cannot be passed to
# predict() directly, and no variable named `model` exists in this notebook.
y_pred = lr_model.predict(X_test)

# Print classification report
# print(classification_report(new_df_test['PATHOLOGY'], y_pred))

In [None]:
new_df_test.head()

In [None]:
type(new_df_test['EVIDENCES'])

In [None]:
# Predict the pathology for one hand-picked set of evidence codes.
# The codes must be a real list (not its string representation) and must be
# encoded with the same encoder the model was trained with.
sample_evidences = [
    'E_53', 'E_55_@_V_101', 'E_59_@_2', 'E_70', 'E_78', 'E_98', 'E_140', 'E_167',
    'E_173', 'E_201', 'E_204_@_V_10', 'E_217', 'E_128', 'E_147', 'E_168', 'E_172',
    'E_193',
]
y_pred_2 = lr_model.predict(evidenceEncoder.transform([sample_evidences]))

y_pred_2