# HopSlot - Machine Learning Model

## Import modules and dataset 

In [186]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [187]:
disease_df = pd.read_csv("./datasets/diseases_dataset.csv")
patient_df = pd.read_csv("./datasets/patient_data.csv")

## Disease Prediction based on Symptoms ML Model

### Data Preprocessing

In [188]:
# Convert the csv of symptoms to list in python
disease_df['symptoms'] = disease_df.apply(lambda x: x['symptoms'].split(","), axis=1)

disease_df

Unnamed: 0,disease,symptoms,cures,doctor,risk level
0,flu,"[fever, cough, sore throat, runny or stuffy no...","over-the-counter medications,rest,fluids","family doctor,urgent care",low (0.1%
1,bronchitis,"[cough, mucus production, shortness of breath,...","antibiotics,over-the-counter medications,rest,...","family doctor,pulmonologist",low (0.5%
2,pneumonia,"[fever, cough, shortness of breath, chest pain...","antibiotics,over-the-counter medications,rest,...","family doctor,pulmonologist",moderate (1%)
3,heart attack,"[chest pain, shortness of breath, nausea, vomi...",emergency medical services,cardiologist,high (20%)
4,stroke,"[sudden weakness, numbness on one side of the ...",emergency medical services,neurologist,high (15%)
...,...,...,...,...,...
94,epilepsy,[seizures],"medication,surgery",neurologist,varies
95,fibromyalgia,"[widespread pain, fatigue, tenderness]","medication,lifestyle changes",rheumatologist,varies
96,graves' disease,"[hyperthyroidism, which is an overactive thyroid]","medication,surgery",endocrinologist,varies
97,hashimoto's thyroiditis,"[hypothyroidism, which is an underactive thyro...",medication,endocrinologist,varies


In [189]:
# Drop Unnecessary Columns
disease_df = disease_df.drop(columns=['cures', 'doctor', 'risk level'])

disease_df

Unnamed: 0,disease,symptoms
0,flu,"[fever, cough, sore throat, runny or stuffy no..."
1,bronchitis,"[cough, mucus production, shortness of breath,..."
2,pneumonia,"[fever, cough, shortness of breath, chest pain..."
3,heart attack,"[chest pain, shortness of breath, nausea, vomi..."
4,stroke,"[sudden weakness, numbness on one side of the ..."
...,...,...
94,epilepsy,[seizures]
95,fibromyalgia,"[widespread pain, fatigue, tenderness]"
96,graves' disease,"[hyperthyroidism, which is an overactive thyroid]"
97,hashimoto's thyroiditis,"[hypothyroidism, which is an underactive thyro..."


In [190]:
# Extract data frame
disease_X = disease_df.iloc[:, 1].values
disease_y = disease_df.iloc[:, 0].values

disease_X, disease_y

(array([list(['fever', 'cough', 'sore throat', 'runny or stuffy nose', 'muscle aches', 'headache', 'fatigue']),
        list(['cough', 'mucus production', 'shortness of breath', 'chest pain']),
        list(['fever', 'cough', 'shortness of breath', 'chest pain', 'fatigue']),
        list(['chest pain', 'shortness of breath', 'nausea', 'vomiting', 'lightheadedness', 'sweating']),
        list(['sudden weakness', 'numbness on one side of the body', 'confusion', 'difficulty speaking', 'trouble seeing in one eye', 'severe headache']),
        list(['lump', 'unexplained weight loss', 'fatigue', 'changes in bowel', 'bladder habits', 'persistent cough', 'indigestion', 'unexplained bleeding or discharge']),
        list(['increased thirst', 'frequent urination', 'unexplained weight loss', 'fatigue', 'blurred vision', 'cuts that are slow to heal']),
        list(['memory loss', 'confusion', 'difficulty thinking', 'changes in personality or behavior']),
        list(['pain', 'stiffness', 'swelli

In [191]:
# disease_X = disease_X.squeeze()

disease_X.shape, disease_X

((99,),
 array([list(['fever', 'cough', 'sore throat', 'runny or stuffy nose', 'muscle aches', 'headache', 'fatigue']),
        list(['cough', 'mucus production', 'shortness of breath', 'chest pain']),
        list(['fever', 'cough', 'shortness of breath', 'chest pain', 'fatigue']),
        list(['chest pain', 'shortness of breath', 'nausea', 'vomiting', 'lightheadedness', 'sweating']),
        list(['sudden weakness', 'numbness on one side of the body', 'confusion', 'difficulty speaking', 'trouble seeing in one eye', 'severe headache']),
        list(['lump', 'unexplained weight loss', 'fatigue', 'changes in bowel', 'bladder habits', 'persistent cough', 'indigestion', 'unexplained bleeding or discharge']),
        list(['increased thirst', 'frequent urination', 'unexplained weight loss', 'fatigue', 'blurred vision', 'cuts that are slow to heal']),
        list(['memory loss', 'confusion', 'difficulty thinking', 'changes in personality or behavior']),
        list(['pain', 'stiffness',

In [192]:
# Vectorize Symptoms

symptoms_vectorizer = TfidfVectorizer(tokenizer=lambda doc:doc, lowercase=False, token_pattern=None)
disease_X = symptoms_vectorizer.fit_transform(disease_X)

In [193]:
list(disease_X)

[<1x183 sparse matrix of type '<class 'numpy.float64'>'
 	with 7 stored elements in Compressed Sparse Row format>,
 <1x183 sparse matrix of type '<class 'numpy.float64'>'
 	with 4 stored elements in Compressed Sparse Row format>,
 <1x183 sparse matrix of type '<class 'numpy.float64'>'
 	with 5 stored elements in Compressed Sparse Row format>,
 <1x183 sparse matrix of type '<class 'numpy.float64'>'
 	with 6 stored elements in Compressed Sparse Row format>,
 <1x183 sparse matrix of type '<class 'numpy.float64'>'
 	with 6 stored elements in Compressed Sparse Row format>,
 <1x183 sparse matrix of type '<class 'numpy.float64'>'
 	with 8 stored elements in Compressed Sparse Row format>,
 <1x183 sparse matrix of type '<class 'numpy.float64'>'
 	with 6 stored elements in Compressed Sparse Row format>,
 <1x183 sparse matrix of type '<class 'numpy.float64'>'
 	with 4 stored elements in Compressed Sparse Row format>,
 <1x183 sparse matrix of type '<class 'numpy.float64'>'
 	with 4 stored elements

In [194]:
# Split train and test

disease_X_train, disease_X_test, disease_y_train, disease_y_test = train_test_split(disease_X, disease_y, test_size=.2, random_state=42)

### Train Model

In [195]:
rfc_model = RandomForestClassifier(n_estimators=200, criterion='entropy', random_state=42)
rfc_model.fit(disease_X_train, disease_y_train)

In [196]:
def vectorize_input_symptoms(symptoms: list):
    return symptoms_vectorizer.transform([symptoms])

In [197]:
y_preds = rfc_model.predict(disease_X_test)

accuracy_score(disease_y_test, y_preds)

0.1

In [198]:
sample_symptoms = vectorize_input_symptoms(['fever', 'sore throat', 'cough'])

sample_pred = rfc_model.predict(sample_symptoms)

print(sample_pred)

['measles']


In [204]:
import dill
import pickle
from pathlib import Path

BASE_PATH = Path('./models/disease_ml')
BASE_PATH.mkdir(parents=True, exist_ok=True)

VECTORIZED_MODEL_PATH = BASE_PATH / 'symptoms_vectorizer.pkl'
MODEL_SAVE_PATH = BASE_PATH / 'diseases_rfc_model.pkl'

# Save the vectorizer
with open(VECTORIZED_MODEL_PATH, 'wb') as f:
    dill.dump(symptoms_vectorizer, f)

# Save the model
with open(MODEL_SAVE_PATH, 'wb') as f:
    dill.dump(rfc_model, f)


In [205]:
import dill

# Load the vectorizer
with open(VECTORIZED_MODEL_PATH, 'rb') as f:
    loaded_vectorizer = dill.load(f)

# Load the model
with open(MODEL_SAVE_PATH, 'rb') as f:
    loaded_model = dill.load(f)

In [210]:
sample_symptoms = loaded_vectorizer.transform([['fever', 'sore throat', 'cough']])

sample_pred = loaded_model.predict(sample_symptoms)

print(sample_pred)

['measles']
