# HopSlot Severity Predictor

## Preprocessing

### Importing libraries

In [2]:
import ast
from pathlib import Path

import joblib, dill
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

### Read the training dataset

In [3]:
# Show full cell contents (no truncation) when frames are displayed.
pd.set_option('display.max_colwidth', None)

# Raw training patients, exactly as stored on disk.
df = pd.read_csv('./datasets/final/release_train_patients.csv')

### Convert the EVIDENCES list strings to actual Python lists

In [4]:
# Parse each stringified list in EVIDENCES into a real Python list.
# The isinstance guard keeps this cell re-runnable: once the column holds lists,
# passing them to ast.literal_eval again would raise ValueError.
df["EVIDENCES"] = df["EVIDENCES"].apply(
    lambda cell: ast.literal_eval(cell) if isinstance(cell, str) else cell
)

In [5]:
# Show the first five rows of the training frame.
df.head()

Unnamed: 0,AGE,DIFFERENTIAL_DIAGNOSIS,SEX,PATHOLOGY,EVIDENCES,INITIAL_EVIDENCE
0,18,"[['Bronchitis', 0.19171203430383882], ['Pneumonia', 0.17579340398940366], ['URTI', 0.1607809719801254], ['Bronchiectasis', 0.12429044460990353], ['Tuberculosis', 0.11367177304035844], ['Influenza', 0.11057936110639896], ['HIV (initial infection)', 0.07333003867293564], ['Chagas', 0.04984197229703562]]",M,URTI,"[E_48, E_50, E_53, E_54_@_V_161, E_54_@_V_183, E_55_@_V_89, E_55_@_V_108, E_55_@_V_167, E_56_@_4, E_57_@_V_123, E_58_@_3, E_59_@_3, E_77, E_79, E_91, E_97, E_201, E_204_@_V_10, E_222]",E_91
1,21,"[['HIV (initial infection)', 0.5189500564407601], ['Chagas', 0.3217819010436332], ['Scombroid food poisoning', 0.13496758062695968], ['Sarcoidosis', 0.024300461888647054]]",M,HIV (initial infection),"[E_9, E_27, E_50, E_51, E_53, E_54_@_V_198, E_55_@_V_62, E_55_@_V_166, E_55_@_V_167, E_56_@_7, E_57_@_V_123, E_58_@_7, E_59_@_2, E_91, E_115, E_129, E_130_@_V_138, E_131_@_V_10, E_132_@_0, E_133_@_V_90, E_133_@_V_91, E_133_@_V_95, E_133_@_V_110, E_133_@_V_111, E_134_@_6, E_135_@_V_12, E_136_@_0, E_148, E_162, E_189, E_204_@_V_10]",E_50
2,19,"[['Bronchitis', 0.11278064619119596], ['Pneumonia', 0.10048134562119852], ['Unstable angina', 0.08462979865697953], ['Possible NSTEMI / STEMI', 0.08343170881871906], ['Bronchiectasis', 0.07764173703530038], ['Boerhaave', 0.07708363099826726], ['Spontaneous rib fracture', 0.07682349455456361], ['Pericarditis', 0.07504681799224715], ['URTI', 0.06705952510476514], ['Stable angina', 0.06667817325012103], ['GERD', 0.06455232704351052], ['Scombroid food poisoning', 0.058790058668404095], ['Chagas', 0.02923384001251551], ['Sarcoidosis', 0.016406922262218876], ['Panic attack', 0.009359973789993502]]",F,Pneumonia,"[E_53, E_54_@_V_179, E_54_@_V_192, E_55_@_V_29, E_55_@_V_55, E_55_@_V_56, E_55_@_V_170, E_55_@_V_171, E_56_@_5, E_57_@_V_123, E_58_@_2, E_59_@_6, E_77, E_78, E_79, E_94, E_106, E_118, E_123, E_124, E_129, E_130_@_V_156, E_131_@_V_12, E_132_@_4, E_133_@_V_54, E_133_@_V_85, E_134_@_0, E_135_@_V_10, E_136_@_0, E_144, E_175, E_181, E_201, E_204_@_V_10]",E_77
3,34,"[['URTI', 0.23859396799565236], ['Cluster headache', 0.18566159722723558], ['Bronchitis', 0.1773471504190298], ['Chronic rhinosinusitis', 0.14960995823729417], ['Acute rhinosinusitis', 0.10748772366243657], ['Chagas', 0.07064980122917579], ['Anemia', 0.07064980122917579]]",F,URTI,"[E_48, E_53, E_54_@_V_183, E_55_@_V_89, E_55_@_V_109, E_55_@_V_124, E_55_@_V_166, E_55_@_V_167, E_56_@_7, E_57_@_V_123, E_58_@_5, E_59_@_4, E_97, E_181, E_201, E_204_@_V_10]",E_53
4,36,"[['URTI', 0.23677812769175735], ['Influenza', 0.1842006792554458], ['Bronchitis', 0.1651209569301098], ['Cluster headache', 0.1430457852409739], ['Chronic rhinosinusitis', 0.139295948156182], ['Chagas', 0.06577925136276556], ['Anemia', 0.06577925136276556]]",M,URTI,"[E_49, E_50, E_53, E_54_@_V_183, E_55_@_V_62, E_55_@_V_124, E_55_@_V_166, E_55_@_V_167, E_56_@_5, E_57_@_V_123, E_58_@_4, E_59_@_2, E_97, E_144, E_181, E_201, E_204_@_V_10]",E_201


### Select only required columns

In [6]:
# Only the target (PATHOLOGY) and the evidence list are used for modelling.
selected_columns = ["PATHOLOGY", "EVIDENCES"]

# Take an explicit copy: EVIDENCES is overwritten in place further down, and
# assigning into a bare column slice of `df` triggers SettingWithCopyWarning
# and makes it ambiguous whether the raw frame is modified too.
new_df = df[selected_columns].copy()

# Write the new DataFrame to a new CSV file
# new_df.to_csv("new.csv", index=False)

In [7]:
# Show the first five rows of the two-column training frame.
new_df.head()

Unnamed: 0,PATHOLOGY,EVIDENCES
0,URTI,"[E_48, E_50, E_53, E_54_@_V_161, E_54_@_V_183, E_55_@_V_89, E_55_@_V_108, E_55_@_V_167, E_56_@_4, E_57_@_V_123, E_58_@_3, E_59_@_3, E_77, E_79, E_91, E_97, E_201, E_204_@_V_10, E_222]"
1,HIV (initial infection),"[E_9, E_27, E_50, E_51, E_53, E_54_@_V_198, E_55_@_V_62, E_55_@_V_166, E_55_@_V_167, E_56_@_7, E_57_@_V_123, E_58_@_7, E_59_@_2, E_91, E_115, E_129, E_130_@_V_138, E_131_@_V_10, E_132_@_0, E_133_@_V_90, E_133_@_V_91, E_133_@_V_95, E_133_@_V_110, E_133_@_V_111, E_134_@_6, E_135_@_V_12, E_136_@_0, E_148, E_162, E_189, E_204_@_V_10]"
2,Pneumonia,"[E_53, E_54_@_V_179, E_54_@_V_192, E_55_@_V_29, E_55_@_V_55, E_55_@_V_56, E_55_@_V_170, E_55_@_V_171, E_56_@_5, E_57_@_V_123, E_58_@_2, E_59_@_6, E_77, E_78, E_79, E_94, E_106, E_118, E_123, E_124, E_129, E_130_@_V_156, E_131_@_V_12, E_132_@_4, E_133_@_V_54, E_133_@_V_85, E_134_@_0, E_135_@_V_10, E_136_@_0, E_144, E_175, E_181, E_201, E_204_@_V_10]"
3,URTI,"[E_48, E_53, E_54_@_V_183, E_55_@_V_89, E_55_@_V_109, E_55_@_V_124, E_55_@_V_166, E_55_@_V_167, E_56_@_7, E_57_@_V_123, E_58_@_5, E_59_@_4, E_97, E_181, E_201, E_204_@_V_10]"
4,URTI,"[E_49, E_50, E_53, E_54_@_V_183, E_55_@_V_62, E_55_@_V_124, E_55_@_V_166, E_55_@_V_167, E_56_@_5, E_57_@_V_123, E_58_@_4, E_59_@_2, E_97, E_144, E_181, E_201, E_204_@_V_10]"


## Load the multi-label binarizer

In [8]:
from pathlib import Path

import joblib

# Directory holding the serialized artefacts, resolved relative to the parent
# of the current working directory.
pwd = Path.cwd()
path = pwd.parent / "HopSlot-FullStack/django-api/predict_symptoms/trained_models"

In [9]:
# Load the previously saved evidence encoder from the trained_models directory.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code --
# only load artefacts produced by a trusted source.

evidenceEncoder = joblib.load(path / "evidence_mb_encoder.pkl")

#### Test loaded encoder

In [10]:
# Smoke-test the loaded encoder on a hand-written evidence list.
test_s = ['E_53', 'E_23']
s = evidenceEncoder.transform([test_s])

# Count how many positions of the encoded row are switched on.
print(sum(1 for flag in s[0] if flag == 1))

2


### Perform Vectorization on Evidence Column

In [11]:
# Encode every evidence list with the loaded encoder. transform() is called on a
# one-element batch per row, so each cell ends up holding that call's result and
# the expansion helper defined below reads entry [0] of it.
# NOTE(review): one transform call per row is slow on a frame this large, and the
# cell is not re-runnable (a second run would pass already-encoded arrays back
# into transform).
new_df.loc[:, "EVIDENCES"] = new_df.EVIDENCES.apply(lambda x: evidenceEncoder.transform([x]))

In [12]:
# Shape of the first two EVIDENCES entries viewed as a Series: each encoded
# result is stored as a single cell, so this is one entry per row.
new_df.EVIDENCES.head(2).shape

(2,)

### Function to expand the 2D evidence column into regular columns

In [13]:
def expand2DCellToColumns(df: pd.DataFrame, featureName) -> pd.DataFrame:
    """Spread a column of encoded vectors into one column per vector position.

    Every cell of ``df[featureName]`` is indexed with ``[0]`` to obtain a 1-D
    vector; the vectors are stacked row-wise into a new frame.

    Args:
        df: Frame containing the encoded column.
        featureName: Name of the column whose cells hold the encoded vectors.

    Returns:
        A DataFrame with the same index as ``df`` and integer column labels
        ``0 .. n-1``, where ``n`` is the vector length. An empty input yields
        an empty frame that keeps ``df``'s index.

    Raises:
        ValueError: If the vectors do not all have the same length.
    """
    cells = df[featureName]
    if len(cells) == 0:
        # np.vstack rejects an empty sequence; return an empty frame instead.
        return pd.DataFrame(index=df.index)

    # Stack all rows in a single NumPy call. Building one pd.Series per row
    # through apply() is dramatically slower on frames with many rows.
    matrix = np.vstack([cell[0] for cell in cells])
    return pd.DataFrame(matrix, index=df.index)

In [14]:
# Build the training feature frame: one column per position of the encoded evidence vector.
X = expand2DCellToColumns(new_df, "EVIDENCES")

In [15]:
# Preview the first five rows of the expanded feature frame.
X.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,962,963,964,965,966,967,968,969,970,971
0,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,1,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [16]:
# (rows, feature columns) of the expanded training frame.
X.shape

(1025602, 972)

## Training the Model


In [17]:
# Labels and features as plain NumPy arrays for scikit-learn.
y = new_df["PATHOLOGY"].to_numpy()

# np.asarray returns an ndarray unchanged, so this cell can be re-run safely;
# `.values` only exists on the DataFrame and would raise AttributeError once X
# has already been converted.
X = np.asarray(X)

In [18]:
# Sanity-check the container types and shapes of the features and labels before fitting.
type(X), type(y), X.shape, y.shape

(numpy.ndarray, numpy.ndarray, (1025602, 972), (1025602,))

### Train Logistic Regression Model

In [19]:
# Logistic regression using the L-BFGS solver, a fixed random_state, and an
# iteration cap of 200.
lr_model = LogisticRegression(random_state=42, solver='lbfgs', max_iter=200)

In [20]:
# Fit the classifier on the encoded evidence matrix X and the pathology labels y.
lr_model.fit(X, y)

### Train Support Vector Classifier (TODO: no SVC cell in this notebook yet)

## Testing the Models

### Load Test set

In [21]:
# Read the held-out test patients; the converter parses the stringified
# EVIDENCES lists into Python lists while loading.
df_test = pd.read_csv('./datasets/final/release_test_patients.csv', converters={
    'EVIDENCES': ast.literal_eval,
})
pd.set_option('display.max_colwidth', None)

# Explicit copy: EVIDENCES is overwritten in place further down, and assigning
# into a bare column slice of `df_test` triggers SettingWithCopyWarning.
new_df_test = df_test[selected_columns].copy()

new_df_test.head(5)

Unnamed: 0,PATHOLOGY,EVIDENCES
0,GERD,"[E_53, E_54_@_V_112, E_54_@_V_161, E_54_@_V_180, E_54_@_V_181, E_55_@_V_29, E_55_@_V_101, E_55_@_V_103, E_56_@_6, E_57_@_V_29, E_57_@_V_101, E_58_@_3, E_59_@_2, E_70, E_78, E_98, E_140, E_167, E_173, E_201, E_204_@_V_10, E_217]"
1,Bronchitis,"[E_53, E_54_@_V_181, E_55_@_V_55, E_55_@_V_148, E_56_@_5, E_57_@_V_123, E_58_@_7, E_59_@_4, E_66, E_123, E_181, E_201, E_204_@_V_10, E_214]"
2,Acute dystonic reactions,"[E_15, E_128, E_147, E_168, E_172, E_193, E_204_@_V_10]"
3,Acute laryngitis,"[E_48, E_49, E_53, E_54_@_V_181, E_55_@_V_20, E_55_@_V_21, E_55_@_V_148, E_56_@_2, E_57_@_V_123, E_58_@_4, E_59_@_4, E_79, E_91, E_201, E_204_@_V_10, E_212]"
4,URTI,"[E_41, E_50, E_53, E_54_@_V_161, E_54_@_V_183, E_55_@_V_62, E_55_@_V_89, E_55_@_V_108, E_55_@_V_109, E_55_@_V_124, E_56_@_7, E_57_@_V_123, E_58_@_3, E_59_@_0, E_91, E_97, E_144, E_181, E_201, E_204_@_V_10]"


In [22]:
# Number of entries in the test EVIDENCES column (one per test row).
new_df_test["EVIDENCES"].size

134529

### Apply Encoding and Expand Columns

In [23]:
# Encode the test evidence lists with the same encoder used for the training
# frame. transform() is called on a one-element batch per row, and the expansion
# helper later reads entry [0] of each stored result.
# NOTE(review): not re-runnable -- a second run would pass already-encoded
# arrays back into transform.
new_df_test.loc[:, "EVIDENCES"] = new_df_test.EVIDENCES.apply(lambda x: evidenceEncoder.transform([x]))

In [24]:
# Build the test feature frame: one column per position of the encoded evidence vector.
X_test = expand2DCellToColumns(new_df_test, "EVIDENCES")

In [25]:
# Preview the first five rows of the expanded test feature frame.
X_test.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,962,963,964,965,966,967,968,969,970,971
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,1,0,0


### Prediction

In [26]:
# np.asarray returns an ndarray unchanged, so this cell can be re-run safely;
# `.values` only exists on the DataFrame and would raise AttributeError once
# X_test has already been converted.
X_test = np.asarray(X_test)

# Ground-truth pathology labels for the test rows.
y_test = new_df_test["PATHOLOGY"].to_numpy()

#### LR Prediction

In [55]:
# Predict a pathology label for every test row with the fitted model.
y_pred = lr_model.predict(X_test)

# Build the classification report as a dict (output_dict=True) so the
# following cells can reshape it and export it, then print the raw dict.
report = classification_report(y_test, y_pred, output_dict=True)
print(report)

{'Acute COPD exacerbation / infection': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 2153.0}, 'Acute dystonic reactions': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 3302.0}, 'Acute laryngitis': {'precision': 0.9956481193658688, 'recall': 0.9956481193658688, 'f1-score': 0.9956481193658688, 'support': 3217.0}, 'Acute otitis media': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 3516.0}, 'Acute pulmonary edema': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 2598.0}, 'Acute rhinosinusitis': {'precision': 0.9961538461538462, 'recall': 0.8496446145434664, 'f1-score': 0.9170846857480083, 'support': 1829.0}, 'Allergic sinusitis': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 2411.0}, 'Anaphylaxis': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 3799.0}, 'Anemia': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 6842.0}, 'Atrial fibrillation': {'precision': 1.0, 'recall': 1.0, 'f1-scor

In [56]:
# "accuracy" is read here as a single value, while the other entries are
# accessed as per-metric dicts. Wrap it in the same shape so every entry of the
# report has identical keys. The isinstance guard makes the cell safe to
# re-run: without it, a second run would nest the already-wrapped dict inside
# "f1-score".
if not isinstance(report["accuracy"], dict):
    report["accuracy"] = {
        "precision": None,
        "recall": None,
        "f1-score": report["accuracy"],
        "support": report['macro avg']['support'],
    }

In [60]:
# Transpose so each report entry becomes a row and each metric a column.
report_pd = pd.DataFrame(report).T

In [63]:
# Write the report table to report.csv, relative to the current working directory.
report_pd.to_csv("./report.csv")

## Save the model

In [20]:
# Resolve the trained_models directory relative to the parent of the current
# working directory, then serialize the fitted classifier into it.
pwd = Path.cwd()
path = pwd.parent / "HopSlot-FullStack/django-api/predict_symptoms/trained_models"

joblib.dump(lr_model, path / 'model.pkl')

['/Volumes/Arham/A2/BMS College stufff/PBL/Final Year Project - HopSlot/Dev/Code/HopSlot-FullStack/django-api/predict_symptoms/trained_models/model.pkl']