In [23]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn import model_selection
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from pickle import dump

In [2]:
dfTrain = pd.read_csv('dataset.csv')

In [3]:
dfTrain.head()

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
0,Fungal infection,itching,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,
1,Fungal infection,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
2,Fungal infection,itching,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
3,Fungal infection,itching,skin_rash,dischromic _patches,,,,,,,,,,,,,,
4,Fungal infection,itching,skin_rash,nodal_skin_eruptions,,,,,,,,,,,,,,


In [4]:
dfTrain["Disease"].value_counts()

Hepatitis B                                120
Psoriasis                                  120
Allergy                                    120
Alcoholic hepatitis                        120
Paralysis (brain hemorrhage)               120
Heart attack                               120
Malaria                                    120
Hypothyroidism                             120
(vertigo) Paroymsal  Positional Vertigo    120
Gastroenteritis                            120
Cervical spondylosis                       120
Diabetes                                   120
Pneumonia                                  120
Common Cold                                120
Hyperthyroidism                            120
Dengue                                     120
AIDS                                       120
Hepatitis E                                120
Dimorphic hemmorhoids(piles)               120
Drug Reaction                              120
Tuberculosis                               120
Peptic ulcer 

In [5]:
dfTrain.iloc[:,1:] = dfTrain.iloc[:,1:].apply(lambda x: x.str.lower())
dfTrain.iloc[:,1:] = dfTrain.iloc[:,1:].apply(lambda x: x.str.replace(' ',''))

In [6]:
X = []
for i in range(len(dfTrain)):
    s = ' '.join(dfTrain.iloc[i, 1:].dropna().tolist())
    X.append(s)

In [7]:
X

['itching skin_rash nodal_skin_eruptions dischromic_patches',
 'skin_rash nodal_skin_eruptions dischromic_patches',
 'itching nodal_skin_eruptions dischromic_patches',
 'itching skin_rash dischromic_patches',
 'itching skin_rash nodal_skin_eruptions',
 'skin_rash nodal_skin_eruptions dischromic_patches',
 'itching nodal_skin_eruptions dischromic_patches',
 'itching skin_rash dischromic_patches',
 'itching skin_rash nodal_skin_eruptions',
 'itching skin_rash nodal_skin_eruptions dischromic_patches',
 'continuous_sneezing shivering chills watering_from_eyes',
 'shivering chills watering_from_eyes',
 'continuous_sneezing chills watering_from_eyes',
 'continuous_sneezing shivering watering_from_eyes',
 'continuous_sneezing shivering chills',
 'shivering chills watering_from_eyes',
 'continuous_sneezing chills watering_from_eyes',
 'continuous_sneezing shivering watering_from_eyes',
 'continuous_sneezing shivering chills',
 'continuous_sneezing shivering chills watering_from_eyes',
 'stomac

In [8]:
st = set()
for row in X:
    for word in row.split():
        st.add(word)

In [9]:
st

{'abdominal_pain',
 'abnormal_menstruation',
 'acidity',
 'acute_liver_failure',
 'altered_sensorium',
 'anxiety',
 'back_pain',
 'belly_pain',
 'blackheads',
 'bladder_discomfort',
 'blister',
 'blood_in_sputum',
 'bloody_stool',
 'blurred_and_distorted_vision',
 'breathlessness',
 'brittle_nails',
 'bruising',
 'burning_micturition',
 'chest_pain',
 'chills',
 'cold_hands_and_feets',
 'coma',
 'congestion',
 'constipation',
 'continuous_feel_of_urine',
 'continuous_sneezing',
 'cough',
 'cramps',
 'dark_urine',
 'dehydration',
 'depression',
 'diarrhoea',
 'dischromic_patches',
 'distention_of_abdomen',
 'dizziness',
 'drying_and_tingling_lips',
 'enlarged_thyroid',
 'excessive_hunger',
 'extra_marital_contacts',
 'family_history',
 'fast_heart_rate',
 'fatigue',
 'fluid_overload',
 'foul_smell_ofurine',
 'headache',
 'high_fever',
 'hip_joint_pain',
 'history_of_alcohol_consumption',
 'increased_appetite',
 'indigestion',
 'inflammatory_nails',
 'internal_itching',
 'irregular_sugar

In [10]:
X = np.array(X)

In [11]:
le = LabelEncoder()
le.fit(dfTrain["Disease"])
le.classes_
Y = le.transform(dfTrain["Disease"])

In [12]:
X_train, X_val, Y_train, Y_val = model_selection.train_test_split(X, Y, test_size = 0.2, random_state=100)

In [13]:
# bag of words model
vectorizer = CountVectorizer()
X_train_model = vectorizer.fit_transform(X_train)
X_val_model = vectorizer.transform(X_val)

X_train_model = X_train_model.toarray()
X_val_model = X_val_model.toarray()


In [14]:
clf = MultinomialNB()
clf.fit(X_train_model, Y_train)

MultinomialNB()

In [15]:
Y_train_model_pred = clf.predict(X_train_model)
accuracy_score(Y_train, Y_train_model_pred)

1.0

In [16]:
clf.predict_proba(X_train_model)

array([[9.99996198e-01, 9.71972584e-10, 1.37219108e-09, ...,
        1.02358518e-09, 9.97159734e-11, 1.23562338e-09],
       [4.90600903e-12, 4.16292339e-13, 7.07901025e-13, ...,
        4.55168184e-13, 1.73209886e-14, 7.86410632e-10],
       [1.14901893e-17, 8.02701701e-20, 2.33346739e-19, ...,
        1.08793287e-21, 1.41889685e-22, 3.76788048e-17],
       ...,
       [8.95393567e-11, 3.76370156e-10, 5.31343970e-10, ...,
        3.96355741e-10, 3.86123200e-11, 5.25782211e-12],
       [1.57160691e-16, 1.94500500e-15, 3.98389964e-15, ...,
        2.20799307e-15, 2.74781657e-13, 9.14490305e-19],
       [1.19518826e-19, 2.53805427e-18, 6.26184213e-18, ...,
        2.99144753e-18, 1.59804890e-18, 1.97031486e-20]])

In [17]:
Y_train

array([ 0,  3, 11, ...,  5, 12, 24])

In [18]:
Y_val_model_pred = clf.predict(X_val_model)
accuracy_score(Y_val, Y_val_model_pred)

1.0

In [19]:
def n_probable_diseases(clf, X_test, n=5):
    Y_pred = clf.predict_proba(X_test)
    
    for i in range(len(Y_pred)):
        ls = Y_pred[i]
        zipls = zip(ls, list(range(len(Y_pred[0]))))
        zipls = list(sorted(zipls, reverse=True))
        #print(i , len(ls)," : ")
        for j in range(n):
            prob, label = zipls[j]
            print(le.inverse_transform([label])[0], f"( {prob*100:.5f} % )", end='      ')
        print('\n')

In [20]:
X_test = ['chest_pain high_fever headache', 'chest_pain acidity', 'high_fever headache']
X_test = vectorizer.transform(X_test)
X_test = X_test.toarray()

In [21]:
X_test.shape

(3, 132)

In [22]:
n_probable_diseases(clf, X_test, 3)

0 41  : 
Common Cold ( 66.77025 % )      Hypertension  ( 17.17649 % )      Malaria ( 5.53449 % )      

1 41  : 
GERD ( 95.01661 % )      Heart attack ( 1.97168 % )      Hypertension  ( 1.38591 % )      

2 41  : 
Malaria ( 35.65076 % )      Chicken pox ( 19.64053 % )      Typhoid ( 16.04890 % )      



In [25]:
dump(clf, open('classifier.pkl', 'wb'))
dump(le, open('labelencoder.pkl', 'wb'))
dump(vectorizer, open('countvectorizer', 'wb'))