In [1]:
import pandas as pd
import numpy as np
import sklearn

In [2]:
# Load the raw dataset from a project-relative CSV path into `df`.
df=pd.read_csv('datasets/fyp_data.csv')

In [3]:
# Explicitly select scikit-learn's default transform output container.
# NOTE(review): the transformed matrix displayed later in this notebook is a NumPy array.
sklearn.set_config(transform_output="default")


In [4]:
# Preview the first rows of the freshly loaded frame.
df.head()

Unnamed: 0,Symptoms,Age,Sex,Disease,Treatment,Nature,Unnamed: 6,Unnamed: 7,Unnamed: 8
0,coughing,5.0,female,Asthma,Omalizumab,high,,,
1,tight feeling in the chest,4.0,female,Asthma,Mepolizumab,high,,,
2,wheezing,6.0,male,Asthma,Mepolizumab,high,,,
3,shortness of breath,7.0,male,Asthma,Mepolizumab,high,,,
4,shortness of breath,9.0,male,Asthma,Mepolizumab,high,,,


In [9]:
# Keep only the four model features plus the target ('Disease'); the remaining
# columns of the raw frame are dropped.
# .copy() makes the result an independent frame, so the column assignments in
# later cells modify it directly and do not raise SettingWithCopyWarning.
df = df[['Symptoms', 'Age', 'Sex', 'Nature', 'Disease']].copy()

In [11]:
# Preview the frame after restricting it to the selected columns.
df.head()

Unnamed: 0,Symptoms,Age,Sex,Nature,Disease
0,coughing,5.0,female,high,Asthma
1,tight feeling in the chest,4.0,female,high,Asthma
2,wheezing,6.0,male,high,Asthma
3,shortness of breath,7.0,male,high,Asthma
4,shortness of breath,9.0,male,high,Asthma


In [23]:
# Normalise symptom labels: trim surrounding whitespace, then lower-case,
# so spelling variants such as 'Wheezing' and 'wheezing' become one category.
df['Symptoms'] = (
    df['Symptoms']
    .str.strip()
    .str.lower()
)



In [25]:
# Frequency of each symptom label.
df['Symptoms'].value_counts()

Symptoms
wheezing                           1869
shortness of breath                1716
fatigue                            1344
chest pain                         1200
mucus                              1152
yellow cough                       1118
chronic cough                       864
cold                                845
low-grade fever                     685
nausea                              672
feeling run-down or tired           624
whistling sound while breathing     538
runny nose                          535
stuffy nose                         489
fever                               461
chest congestion                    440
loss of appetite                    336
shaking                             336
low energy                          336
shallow breathing                   336
rapid breathing                     336
vomiting                            336
sweating                            336
cough with blood                    336
greenish cough                 

In [21]:
# List the distinct symptom labels.
# NOTE(review): the recorded output below still shows mixed-case labels and NaN,
# so it looks like it was captured before the normalisation/imputation cells ran;
# re-run the notebook top to bottom to refresh it.
df['Symptoms'].unique()

array(['coughing', 'tight feeling in the chest', 'wheezing',
       'shortness of breath', 'fever', 'cold', 'allergy', 'Runny nose',
       'stuffy nose', 'Low-grade fever', 'Chest congestion', 'Wheezing',
       'whistling sound while breathing', 'yellow cough',
       'Feeling run-down or tired', 'mucus', 'chest pain',
       'chronic cough', 'Fatigue', nan, 'greenish cough',
       'cough with blood', 'Fever', 'sweating', 'shaking',
       'Rapid breathing', 'shallow breathing', 'low energy',
       'Loss of appetite', 'fatigue', 'Nausea', 'vomiting'], dtype=object)

In [15]:
# Number of missing values in each column.
df.isnull().sum()

Symptoms    144
Age          74
Sex         506
Nature      842
Disease       0
dtype: int64

In [17]:
# (rows, columns) of the working frame.
df.shape

(18069, 5)

In [19]:
# Impute missing values column by column in a single fillna call:
#   - categorical columns receive an explicit "unknown" placeholder label
#   - Age receives the median of the observed ages
# NOTE(review): the median is computed on the full frame, i.e. before the
# train/test split further down.
df = df.fillna(
    value={
        'Symptoms': 'Unknown',
        'Age': df['Age'].median(),
        'Sex': 'unknown',
        'Nature': 'Unknown',
    }
)


In [21]:
# Summarise dtypes and non-null counts to confirm the imputation left no gaps.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18069 entries, 0 to 18068
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Symptoms  18069 non-null  object 
 1   Age       18069 non-null  float64
 2   Sex       18069 non-null  object 
 3   Nature    18069 non-null  object 
 4   Disease   18069 non-null  object 
dtypes: float64(1), object(4)
memory usage: 705.9+ KB


In [23]:
# Quick look at the first three rows of the cleaned frame.
df.head(3)

Unnamed: 0,Symptoms,Age,Sex,Nature,Disease
0,coughing,5.0,female,high,Asthma
1,tight feeling in the chest,4.0,female,high,Asthma
2,wheezing,6.0,male,high,Asthma


In [25]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation. Features are every column except
# 'Disease', which is the target. The fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    df.drop(columns=['Disease']),
    df['Disease'],
    test_size=0.2,
    random_state=42,
)

In [26]:
# Column groups used by the preprocessing step.
cate = [
    'Symptoms',
    'Sex',
]  # passed to the one-hot encoder
order = [
    'Nature',
]  # passed to the ordinal encoder

In [29]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder,OrdinalEncoder
from sklearn.pipeline import Pipeline

In [31]:
# Column-wise preprocessing:
#   - `cate` columns are one-hot encoded into a dense array
#   - `order` columns are ordinal encoded using the explicit ranking
#     'Unknown' < 'medium' < 'high'
#   - every other column is passed through unchanged (remainder='passthrough')
processing = ColumnTransformer(
    transformers=[
        # handle_unknown='ignore': a category that was not present when the
        # encoder was fitted is encoded as an all-zero row instead of raising
        # at transform time.
        (
            'categorical_encoding',
            OneHotEncoder(sparse_output=False, handle_unknown='ignore'),
            cate,
        ),
        # NOTE(review): any 'Nature' value outside this list will make fit
        # raise; confirm the column only holds these three labels.
        (
            'ordinal_encoding',
            OrdinalEncoder(categories=[['Unknown', 'medium', 'high']]),
            order,
        ),
    ],
    remainder='passthrough',
)

# Single-step pipeline wrapping the preprocessing so it can be fitted once and
# reused for every split.
pip = Pipeline(
    steps=[
        ('preprocess', processing),
    ]
)

In [33]:
# Preview the training features before encoding.
x_train.head()

Unnamed: 0,Symptoms,Age,Sex,Nature
3941,yellow cough,33.0,female,medium
13357,chest pain,11.0,male,medium
7677,yellow cough,83.0,male,medium
1236,shortness of breath,65.0,male,high
2900,chronic cough,67.0,female,high


In [35]:
# Fit the preprocessing on the training split only, then apply the already
# fitted encoders to the test split. Re-fitting on the test data (fit_transform)
# would leak test information and could yield a different one-hot column
# layout than the one the model is trained on.
x_train_trf = pip.fit_transform(x_train)
x_test_trf = pip.transform(x_test)

In [37]:
# Display the encoded test matrix.
x_test_trf

array([[ 0.,  0.,  0., ...,  0.,  1., 21.],
       [ 0.,  0.,  0., ...,  0.,  1., 34.],
       [ 0.,  0.,  0., ...,  0.,  1., 17.],
       ...,
       [ 0.,  0.,  0., ...,  0.,  1., 15.],
       [ 0.,  0.,  0., ...,  0.,  2., 51.],
       [ 0.,  0.,  0., ...,  0.,  2., 51.]])

In [39]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score,confusion_matrix,precision_score,recall_score,f1_score


In [68]:
# Depth-limited decision tree. random_state is fixed (same seed as the
# train/test split) so that re-running the notebook produces the same tree
# and therefore the same metrics.
cls = DecisionTreeClassifier(max_depth=10, random_state=42)
cls.fit(x_train_trf, y_train)

In [70]:
# Predict the disease label for every row of the encoded test matrix.
y_pred=cls.predict(x_test_trf)

In [72]:
# Report hold-out performance. Precision, recall and F1 are averaged across
# classes with 'weighted' averaging; accuracy takes no averaging argument.
print('accuracy_score...............', accuracy_score(y_test, y_pred))
for label, scorer in (
    ('precision_score...............', precision_score),
    ('recall_score...............', recall_score),
    ('f1_score...............', f1_score),
):
    print(label, scorer(y_test, y_pred, average='weighted'))
print('confusion_matrix:\n', confusion_matrix(y_test, y_pred))

accuracy_score............... 0.9216934144991699
precision_score............... 0.9212115714701336
recall_score............... 0.9216934144991699
f1_score............... 0.9132318016699925
confusion_matrix:
 [[  71   14   81    0   46]
 [   0  328   31   10   26]
 [   0    6 1217    9    0]
 [   0   15   20  966    0]
 [   9   16    0    0  749]]
