# **Disease Prediction from Symptoms Data**

Import some libraries

In [59]:
import sklearn
print(sklearn.__version__)

1.4.1.post1


Read the dataset of disease-symptom.

In [60]:
import pandas as pd
import numpy as np
import regex as re
from sklearn.metrics import accuracy_score
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [61]:
import chardet

with open("dataset.csv", "rb") as f:
    char_encoder = chardet.detect(f.read())
char_encoder

{'encoding': 'ISO-8859-1', 'confidence': 0.73, 'language': ''}

In [62]:
df = pd.read_csv("dataset.csv", engine="python", encoding=char_encoder["encoding"])
df.count()

Disease                         134
Count of Disease Occurrence     134
Symptom                        1865
dtype: int64

Check for null or invalid entries

In [63]:
df = df.fillna(0)
df.head()

Unnamed: 0,Disease,Count of Disease Occurrence,Symptom
0,UMLS:C0020538_hypertensive disease,3363.0,UMLS:C0008031_pain chest
1,0,0.0,UMLS:C0392680_shortness of breath
2,0,0.0,UMLS:C0012833_dizziness
3,0,0.0,UMLS:C0004093_asthenia
4,0,0.0,UMLS:C0085639_fall


In [64]:
fill = df["Disease"].iloc[0]
for i in range(1, 1867):
    if df["Disease"].iloc[i] == 0:
        df["Disease"].iloc[i] = fill
    else:
        fill = df["Disease"].iloc[i]
df["Disease"]

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df["Disease"].iloc[i] = fill
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Disease"].iloc[i] = fill
You a

0       UMLS:C0020538_hypertensive disease
1       UMLS:C0020538_hypertensive disease
2       UMLS:C0020538_hypertensive disease
3       UMLS:C0020538_hypertensive disease
4       UMLS:C0020538_hypertensive disease
                       ...                
1862           UMLS:C0233472_affect labile
1863         UMLS:C0011127_decubitus ulcer
1864         UMLS:C0011127_decubitus ulcer
1865         UMLS:C0011127_decubitus ulcer
1866         UMLS:C0011127_decubitus ulcer
Name: Disease, Length: 1867, dtype: object

In [65]:


fill = df["Count of Disease Occurrence"].iloc[0]
for i in range(1, 1867):
    if df["Count of Disease Occurrence"].iloc[i] == 0.0:
        df["Count of Disease Occurrence"].iloc[i] = fill
    else:
        fill = df["Count of Disease Occurrence"].iloc[i]
df["Count of Disease Occurrence"]

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df["Count of Disease Occurrence"].iloc[i] = fill
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Count of D

0       3363.0
1       3363.0
2       3363.0
3       3363.0
4       3363.0
         ...  
1862      45.0
1863      42.0
1864      42.0
1865      42.0
1866      42.0
Name: Count of Disease Occurrence, Length: 1867, dtype: float64

In [66]:
df = df[df.Symptom != 0]
df

Unnamed: 0,Disease,Count of Disease Occurrence,Symptom
0,UMLS:C0020538_hypertensive disease,3363.0,UMLS:C0008031_pain chest
1,UMLS:C0020538_hypertensive disease,3363.0,UMLS:C0392680_shortness of breath
2,UMLS:C0020538_hypertensive disease,3363.0,UMLS:C0012833_dizziness
3,UMLS:C0020538_hypertensive disease,3363.0,UMLS:C0004093_asthenia
4,UMLS:C0020538_hypertensive disease,3363.0,UMLS:C0085639_fall
...,...,...,...
1861,UMLS:C0233472_affect labile,45.0,UMLS:C0425251_bedridden^UMLS:C0741453_bedridden
1862,UMLS:C0233472_affect labile,45.0,UMLS:C0242453_prostatism
1863,UMLS:C0011127_decubitus ulcer,42.0,UMLS:C0232257_systolic murmur
1864,UMLS:C0011127_decubitus ulcer,42.0,UMLS:C0871754_frail


In [67]:
df["Symptom"] = df["Symptom"].apply(lambda x: x.split("^"))
df["Symptom"]

0                              [UMLS:C0008031_pain chest]
1                     [UMLS:C0392680_shortness of breath]
2                               [UMLS:C0012833_dizziness]
3                                [UMLS:C0004093_asthenia]
4                                    [UMLS:C0085639_fall]
                              ...                        
1861    [UMLS:C0425251_bedridden, UMLS:C0741453_bedrid...
1862                           [UMLS:C0242453_prostatism]
1863                      [UMLS:C0232257_systolic murmur]
1864                                [UMLS:C0871754_frail]
1865                                [UMLS:C0015967_fever]
Name: Symptom, Length: 1865, dtype: object

In [68]:
df = df.explode("Symptom").reset_index()

In [69]:
df.Symptom = df.Symptom.apply(lambda x: x.split("_")[1])
df

Unnamed: 0,index,Disease,Count of Disease Occurrence,Symptom
0,0,UMLS:C0020538_hypertensive disease,3363.0,pain chest
1,1,UMLS:C0020538_hypertensive disease,3363.0,shortness of breath
2,2,UMLS:C0020538_hypertensive disease,3363.0,dizziness
3,3,UMLS:C0020538_hypertensive disease,3363.0,asthenia
4,4,UMLS:C0020538_hypertensive disease,3363.0,fall
...,...,...,...,...
1902,1861,UMLS:C0233472_affect labile,45.0,bedridden
1903,1862,UMLS:C0233472_affect labile,45.0,prostatism
1904,1863,UMLS:C0011127_decubitus ulcer,42.0,systolic murmur
1905,1864,UMLS:C0011127_decubitus ulcer,42.0,frail


**Cleaned dataset**

In [70]:
df["Disease"] = df["Disease"].apply(lambda x: x.split("^"))
df = df.explode("Disease").reset_index()
df.Disease = df.Disease.apply(lambda x: x.split("_")[1])
df

Unnamed: 0,level_0,index,Disease,Count of Disease Occurrence,Symptom
0,0,0,hypertensive disease,3363.0,pain chest
1,1,1,hypertensive disease,3363.0,shortness of breath
2,2,2,hypertensive disease,3363.0,dizziness
3,3,3,hypertensive disease,3363.0,asthenia
4,4,4,hypertensive disease,3363.0,fall
...,...,...,...,...,...
2124,1902,1861,affect labile,45.0,bedridden
2125,1903,1862,affect labile,45.0,prostatism
2126,1904,1863,decubitus ulcer,42.0,systolic murmur
2127,1905,1864,decubitus ulcer,42.0,frail


In [71]:
df.drop(["index", "level_0", "Count of Disease Occurrence"], axis=1, inplace=True)
df

Unnamed: 0,Disease,Symptom
0,hypertensive disease,pain chest
1,hypertensive disease,shortness of breath
2,hypertensive disease,dizziness
3,hypertensive disease,asthenia
4,hypertensive disease,fall
...,...,...
2124,affect labile,bedridden
2125,affect labile,prostatism
2126,decubitus ulcer,systolic murmur
2127,decubitus ulcer,frail


**One Hot Encoding**

In [72]:
df_sparse = (pd.get_dummies(df, columns=["Symptom"]).drop("Symptom_", axis=1).drop_duplicates())
df_sparse.head()

Unnamed: 0,Disease,Symptom_Heberden's node,Symptom_Murphy's sign,Symptom_Stahli's line,Symptom_abdomen acute,Symptom_abdominal bloating,Symptom_abdominal tenderness,Symptom_abnormal sensation,Symptom_abnormally hard consistency,Symptom_abortion,...,Symptom_vision blurred,Symptom_vomiting,Symptom_weepiness,Symptom_weight gain,Symptom_welt,Symptom_wheelchair bound,Symptom_wheezing,Symptom_withdraw,Symptom_worry,Symptom_yellow sputum
0,hypertensive disease,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,hypertensive disease,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,hypertensive disease,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,hypertensive disease,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,hypertensive disease,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [73]:
df_sparse = df_sparse.groupby("Disease").sum().reset_index()
df_sparse.head()

Unnamed: 0,Disease,Symptom_Heberden's node,Symptom_Murphy's sign,Symptom_Stahli's line,Symptom_abdomen acute,Symptom_abdominal bloating,Symptom_abdominal tenderness,Symptom_abnormal sensation,Symptom_abnormally hard consistency,Symptom_abortion,...,Symptom_vision blurred,Symptom_vomiting,Symptom_weepiness,Symptom_weight gain,Symptom_welt,Symptom_wheelchair bound,Symptom_wheezing,Symptom_withdraw,Symptom_worry,Symptom_yellow sputum
0,Alzheimer's disease,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,HIV,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Pneumocystis carinii pneumonia,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,accident cerebrovascular,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,acquired immuno-deficiency syndrome,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [74]:
X = df_sparse[df_sparse.columns[1:]]
Y = df_sparse["Disease"]

Writing our cleaned data

### Trying out our classifier to learn diseases from the symptoms

In [75]:
from sklearn.model_selection import train_test_split

In [76]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

### Training a Classifier

In [77]:
from sklearn.tree import DecisionTreeClassifier

In [78]:
from sklearn.ensemble import GradientBoostingClassifier
xgb_clf = GradientBoostingClassifier()
xgb_clf.fit(X, Y)
score = xgb_clf.score(X, Y)
print(score)

0.8993288590604027


In [79]:
print ("DecisionTree")
clf = DecisionTreeClassifier()
model = clf.fit(X, Y)
print ("Acurracy: ", model.score(X, Y))

DecisionTree
Acurracy:  0.8993288590604027


In [80]:
model.predict(x_test)

array(['sepsis (invertebrate)', 'neutropenia', 'failure heart congestive',
       'hypertensive disease', 'affect labile',
       'mitral valve insufficiency', 'tachycardia sinus',
       'carcinoma breast', 'paranoia', 'incontinence', 'confusion',
       'dependence', 'encephalopathy', 'cardiomyopathy',
       'deep vein thrombosis', 'schizophrenia', 'obesity morbid',
       'hernia\xa0hiatal', 'bipolar disorder', 'depression mental',
       'dehydration', 'anemia', 'infection', 'carcinoma of lung',
       'gastritis', 'lymphoma', 'ketoacidosis diabetic', 'cholecystitis',
       'dementia', 'sepsis (invertebrate)'], dtype=object)

In [81]:
model.score(x_test, y_test)

0.9333333333333333

## 3.3 Training our model for custom symptoms data

In [82]:
input_data = pd.read_csv("Training.csv")
input_data.head()

Unnamed: 0,itching,skin_rash,nodal_skin_eruptions,continuous_sneezing,shivering,chills,joint_pain,stomach_pain,acidity,ulcers_on_tongue,...,blackheads,scurring,skin_peeling,silver_like_dusting,small_dents_in_nails,inflammatory_nails,blister,red_sore_around_nose,yellow_crust_ooze,prognosis
0,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection
1,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection
2,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection
3,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection
4,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection


In [83]:
test_data = pd.read_csv("Testing.csv")
test_data.head()

Unnamed: 0,itching,skin_rash,nodal_skin_eruptions,continuous_sneezing,shivering,chills,joint_pain,stomach_pain,acidity,ulcers_on_tongue,...,blackheads,scurring,skin_peeling,silver_like_dusting,small_dents_in_nails,inflammatory_nails,blister,red_sore_around_nose,yellow_crust_ooze,prognosis
0,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection
1,0,0,0,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Allergy
2,0,0,0,0,0,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,GERD
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Chronic cholestasis
4,1,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,Drug Reaction


In [84]:
input_data.shape

(4920, 133)

**Check for any Null values**

In [85]:
# seeing any null values are there with descending format
input_data.isnull().sum().sort_values(ascending=False)

itching                 0
spinning_movements      0
altered_sensorium       0
muscle_pain             0
irritability            0
                       ..
abdominal_pain          0
constipation            0
back_pain               0
pain_behind_the_eyes    0
prognosis               0
Length: 133, dtype: int64

**See the Target Variable Distribution**

In [86]:
# looking how much percent each diseases having
input_data['prognosis'].value_counts(normalize=True)

prognosis
Fungal infection                           0.02439
Hepatitis C                                0.02439
Hepatitis E                                0.02439
Alcoholic hepatitis                        0.02439
Tuberculosis                               0.02439
Common Cold                                0.02439
Pneumonia                                  0.02439
Dimorphic hemmorhoids(piles)               0.02439
Heart attack                               0.02439
Varicose veins                             0.02439
Hypothyroidism                             0.02439
Hyperthyroidism                            0.02439
Hypoglycemia                               0.02439
Osteoarthristis                            0.02439
Arthritis                                  0.02439
(vertigo) Paroymsal  Positional Vertigo    0.02439
Acne                                       0.02439
Urinary tract infection                    0.02439
Psoriasis                                  0.02439
Hepatitis D          

In [87]:
# as we can see each no. diseases having the same percentage through bar chart
input_data['prognosis'].value_counts(normalize = True).plot.bar(color='red')
plt.subplots_adjust(left = 0.9, right = 2 , top = 2, bottom = 1)

Error in callback <function flush_figures at 0x00000268A5131000> (for post_execute), with arguments args (),kwargs {}:



KeyboardInterrupt



**Check the relationship between the variables by applying the correlation **

In [None]:
corr = input_data.corr()

mask = np.triu(np.ones_like(corr, dtype=bool))

# plt.figure(figsize=(16, 10))

# sns.heatmap(corr, mask=mask, vmax=0.9, square=True, annot=True, cmap="YlGnBu", fmt=".2f")

# plt.subplots_adjust(left=0.5, right=0.95, top=0.9, bottom=0.5)

# plt.show()


ValueError: could not convert string to float: 'Fungal infection'

**Check for Alternate Hypothesis**

In [None]:
# took two high correlation variables and analysing if it is satisfying null hypothesis or alternate hypothesis
pd.crosstab(input_data['cold_hands_and_feets'],input_data['weight_gain'])

weight_gain,0,1
cold_hands_and_feets,Unnamed: 1_level_1,Unnamed: 2_level_1
0,4800,6
1,6,108


**As p value is  0.0  which is less than 0.05 then they are actually different from each other which satisfy the alternate hypothesis **

In [None]:
# imported the chi square contingency
from scipy.stats import chi2_contingency
# as p value is  0.0  which is less than 0.05 then they are actually different from each other which satisfy the alternate hypothesis 
chi2_contingency(pd.crosstab(input_data['cold_hands_and_feets'],input_data['weight_gain']))

Chi2ContingencyResult(statistic=4362.40173527245, pvalue=0.0, dof=1, expected_freq=array([[4.69464146e+03, 1.11358537e+02],
       [1.11358537e+02, 2.64146341e+00]]))

**Train for Custom Training Data**

In [89]:
x = input_data.drop(['prognosis'],axis =1)
y = input_data['prognosis']

Chi2ContingencyResult(statistic=4362.40173527245, pvalue=0.0, dof=1, expected_freq=array([[4.69464146e+03, 1.11358537e+02],
       [1.11358537e+02, 2.64146341e+00]]))

In [None]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)

**Multinomial Naive Bayes**

In [None]:
# imported naive_baye algorithm
from sklearn.naive_bayes import MultinomialNB

# fitted the model
mnb = MultinomialNB()
mnb = mnb.fit(x_train, y_train)

score = mnb.score(x_test, y_test)
print("Accuracy Score: ",score)

Accuracy Score:  1.0


In [90]:
gbm_clf = GradientBoostingClassifier()
gbm_clf.fit(x_train, y_train)
score = gbm_clf.score(x_train, y_train)
print(score)

0.9159663865546218


**K-Fold Cross-Validation**

In [None]:
# by cross validating we got mean also 100%
from sklearn.model_selection import cross_val_score

scores = cross_val_score(mnb, x_test, y_test, cv=3)
print(scores)
print(scores.mean())

[1. 1. 1.]
1.0


In [91]:
scores = cross_val_score(gbm_clf, x_test, y_test, cv=10)
print(scores)
print(scores.mean())

ValueError: n_splits=10 cannot be greater than the number of members in each class.

In [None]:
real_diseases = y_test.values
y_pred = gbm_clf.predict(x_test)
# for the cross checking purpose i want to see if predicted values and actual values are same else it gives me worng prediction
for i in range(0, 20):
    if y_pred[i] == real_diseases[i]:
        print("Pred: {0} Actual:{1}".format(y_pred[i], real_diseases[i]))
    else:
        print("worng prediction")
        print("Pred: {0} Actual:{1}".format(y_pred[i], real_diseases[i]))

Pred: Acne Actual:Acne
Pred: Acne Actual:Acne
Pred: Hyperthyroidism Actual:Hyperthyroidism
Pred: AIDS Actual:AIDS
Pred: Chronic cholestasis Actual:Chronic cholestasis
Pred: Hypertension  Actual:Hypertension 
Pred: Hypoglycemia Actual:Hypoglycemia
Pred: Arthritis Actual:Arthritis
Pred: Hepatitis B Actual:Hepatitis B
Pred: Migraine Actual:Migraine
Pred: Urinary tract infection Actual:Urinary tract infection
Pred: Diabetes  Actual:Diabetes 
Pred: Hepatitis D Actual:Hepatitis D
Pred: Psoriasis Actual:Psoriasis
Pred: Alcoholic hepatitis Actual:Alcoholic hepatitis
Pred: Alcoholic hepatitis Actual:Alcoholic hepatitis
Pred: Dimorphic hemmorhoids(piles) Actual:Dimorphic hemmorhoids(piles)
Pred: Hepatitis E Actual:Hepatitis E
Pred: Diabetes  Actual:Diabetes 
Pred: Cervical spondylosis Actual:Cervical spondylosis


In [94]:
# imported Kfold
from sklearn.model_selection import KFold

## Function to run multiple algorithms with different K values of KFold.
def evaluate(train_data, kmax, algo):
    test_scores = {}
    train_scores = {}
    for i in range(2, kmax, 2):
        kf = KFold(n_splits=i)
        sum_train = 0
        sum_test = 0
        data = input_data
        for train, test in kf.split(data):
            train_data = data.iloc[train, :]
            test_data = data.iloc[test, :]
            x_train = train_data.drop(["prognosis"], axis=1)
            y_train = train_data["prognosis"]
            x_test = test_data.drop(["prognosis"], axis=1)
            y_test = test_data["prognosis"]
            algo_model = algo.fit(x_train, y_train)
            sum_train += algo_model.score(x_train, y_train)
            y_pred = algo_model.predict(x_test)
            sum_test += accuracy_score(y_test, y_pred)
        average_test = sum_test / i
        average_train = sum_train / i
        test_scores[i] = average_test
        train_scores[i] = average_train
        print("kvalue: ", i)
    return (train_scores, test_scores)

In [93]:
from sklearn.ensemble import GradientBoostingClassifier

gbm = GradientBoostingClassifier()
nb = MultinomialNB()
from sklearn.linear_model import LogisticRegression

log = LogisticRegression()
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier(criterion="entropy")

from sklearn.ensemble import RandomForestClassifier

ran = RandomForestClassifier(n_estimators=10)

In [92]:

# algo_dict={"l_o_g":log, "d_t":dt,"r_a_n": ran,"NB":nb, "G_8":gbm}
algo_dict={}
algo_train_scores = {}

algo_test_scores = {}

**Gradient Boosted Tree is the most efficient algorithm with K=2**

In [95]:
from sklearn.model_selection import cross_val_score, KFold

max_kfold = 11

for algo_name in algo_dict.keys():
    print(algo_name)

    tr_scores = cross_val_score(algo_dict[algo_name], input_data, cv=KFold(n_splits=max_kfold))
    tst_score = tr_scores.mean()

    algo_train_scores[algo_name] = tr_scores
    algo_test_scores[algo_name] = tst_score

    print(algo_train_scores)
    print(algo_test_scores)

In [96]:
df_test = pd.DataFrame(algo_test_scores)
df_train = pd.DataFrame(algo_train_scores)

df_test.plot(grid = True)
plt.show()

TypeError: no numeric data to plot

In [97]:
# building the model at k value 2
test_scores = {}
train_scores = {}
for i in range(2, 4, 2):
    kf = KFold(n_splits=i)
    sum_train = 0
    sum_test = 0
    data = input_data
    for train, test in kf.split(data):
        train_data = data.iloc[train, :]
        test_data = data.iloc[test, :]
        x_train = train_data.drop(["prognosis"], axis=1)
        y_train = train_data["prognosis"]
        x_test = test_data.drop(["prognosis"], axis=1)
        y_test = test_data["prognosis"]
        algo_model = gbm.fit(x_train, y_train)
        sum_train += gbm.score(x_train, y_train)
        y_pred = gbm.predict(x_test)
        sum_test += accuracy_score(y_test, y_pred)
    average_test = sum_test / i
    average_train = sum_train / i
    test_scores[i] = average_test
    train_scores[i] = average_train
    print("kvalue: ", i)

kvalue:  2


In [98]:
print(train_scores)
print(test_scores)

{2: 1.0}
{2: 0.9792682926829268}


**Model**

In [99]:
gbm.__getstate__()

{'n_estimators': 100,
 'learning_rate': 0.1,
 'loss': 'log_loss',
 'criterion': 'friedman_mse',
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'min_weight_fraction_leaf': 0.0,
 'subsample': 1.0,
 'max_features': None,
 'max_depth': 3,
 'min_impurity_decrease': 0.0,
 'ccp_alpha': 0.0,
 'init': None,
 'random_state': None,
 'alpha': 0.9,
 'verbose': 0,
 'max_leaf_nodes': None,
 'warm_start': False,
 'validation_fraction': 0.1,
 'n_iter_no_change': None,
 'tol': 0.0001,
 'feature_names_in_': array(['itching', 'skin_rash', 'nodal_skin_eruptions',
        'continuous_sneezing', 'shivering', 'chills', 'joint_pain',
        'stomach_pain', 'acidity', 'ulcers_on_tongue', 'muscle_wasting',
        'vomiting', 'burning_micturition', 'spotting_ urination',
        'fatigue', 'weight_gain', 'anxiety', 'cold_hands_and_feets',
        'mood_swings', 'weight_loss', 'restlessness', 'lethargy',
        'patches_in_throat', 'irregular_sugar_level', 'cough',
        'high_fever', 'sunken_eyes', 'breat

In [100]:
state_dict = gbm.__getstate__()
classes_array = state_dict['classes_']
features = state_dict['feature_names_in_']
classes_list = np.ndarray.tolist(classes_array)

**Symptoms Similarirty Matching [Future Scope]**

In [101]:
symptoms = x.columns
regex = re.compile('_')
symptoms = [i if regex.search(i) == None else i.replace('_', ' ') for i in symptoms ]

In [102]:
# Function to find all close matches of  
# input string in given list of possible strings 
from difflib import get_close_matches  
def closeMatches(patterns, word): 
    print(get_close_matches(word, patterns, n=2, cutoff=0.7))

In [103]:
word = 'sivering'
closeMatches(symptoms, word)

['shivering']


**-------------------------------**

In [104]:
from flashtext import KeywordProcessor
keyword_processor = KeywordProcessor()
keyword_processor.add_keywords_from_list(symptoms)

In [None]:
text = 'I have itching, joint pain and fatigue'
keyword_processor.extract_keywords(text)

['itching', 'joint pain', 'fatigue']

In [105]:
text = 'I have itching, joint pain and fatigue'
keyword_processor.extract_keywords(text)

['itching', 'joint pain', 'fatigue']

In [106]:
def predict_disease(query):
    matched_keyword = keyword_processor.extract_keywords(query)
    if len(matched_keyword) == 0:
        print("No Matches")
    else:
        regex = re.compile(" ")
        processed_keywords = [
            i if regex.search(i) == None else i.replace(" ", "_")
            for i in matched_keyword
        ]
        coded_features = []
        for keyword in processed_keywords:
            coded_features.append(np.where(features == keyword)[0][0])
        sample_x = []
        for i in range(len(features)):
            try:
                sample_x.append(i / coded_features[coded_features.index(i)])
            except:
                sample_x.append(i * 0)
        sample_x = np.array(sample_x).reshape(1, len(sample_x))
        probs = gbm.predict_proba(sample_x)[0]
        output = np.column_stack((classes_list, probs))

        filtered_probs = []
        filtered_classes = []

        # Filter the predicted probabilities based on a threshold of 0.2
        threshold = 0.2
        for i in range(len(classes_array)):
            if probs[i] > threshold:
                filtered_probs.append(probs[i])
                filtered_classes.append(classes_array[i])

        filtered_output = np.column_stack((filtered_classes, filtered_probs))

        output = output[np.argsort(output[:,1])[::-1]]
        filtered_output = filtered_output[np.argsort(filtered_output[:,1])[::-1]]

        print("\n Detected Symptoms: \n", processed_keywords)        
        print("\n Predicted Disease Probabilities: \n", output)
        print("\n Filtered Diesease: \n", filtered_output)
        print()


**Testing model**

In [110]:
query = 'I have cough and redness of eyes'
predict_disease(query)


 Detected Symptoms: 
 ['cough', 'redness_of_eyes']

 Predicted Disease Probabilities: 
 [['Common Cold' '0.9840701974054467']
 ['Chronic cholestasis' '0.0014774096507758367']
 ['Gastroenteritis' '0.0005249915061594321']
 ['Fungal infection' '0.0005199694238543121']
 ['Bronchial Asthma' '0.0004963790484871352']
 ['Paralysis (brain hemorrhage)' '0.0004937434064939385']
 ['Allergy' '0.00048679449740742']
 ['Heart attack' '0.00047383285257779155']
 ['Drug Reaction' '0.00047368341364998484']
 ['Acne' '0.0004678910200043594']
 ['AIDS' '0.00046789101994738544']
 ['(vertigo) Paroymsal  Positional Vertigo' '0.0004597841779265203']
 ['Impetigo' '0.0004411402652784268']
 ['Chicken pox' '0.00043126952974037425']
 ['Urinary tract infection' '0.00042872680371990946']
 ['GERD' '0.0004146835905298653']
 ['Varicose veins' '0.00041195951791323583']
 ['Arthritis' '0.0004072131773343609']
 ['Cervical spondylosis' '0.0004050542391896123']
 ['Dimorphic hemmorhoids(piles)' '0.00039679332551990833']
 ['Hepat



**Saving Model**

In [108]:
import pickle
# import xgboost as xgb

# Save the gbm object to a pickle file
with open('model.pkl', 'wb') as f:
    pickle.dump(gbm, f)