In [23]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder,OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score,confusion_matrix,precision_score,recall_score,f1_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier


In [15]:
df=pd.read_csv('datasets/fyp_data.csv')

In [17]:
sklearn.set_config(transform_output="default")


In [25]:
df=df[['Symptoms','Age','Sex','Nature','Disease']]

df['Symptoms'] = df['Symptoms'].fillna('Unknown')
df['Age'] = df['Age'].fillna(df['Age'].median())
df['Sex'] = df['Sex'].fillna('unknown')
df['Nature'] = df['Nature'].fillna('Unknown')

x_train,x_test,y_train,y_test=train_test_split(df.drop(columns=['Disease']),df['Disease'],test_size=0.2,random_state=42)

cate=['Symptoms','Sex']
order=['Nature']

processing=ColumnTransformer(
    transformers=[
        ('categorical_encoding',OneHotEncoder(sparse_output=False),cate),
        ('ordinal_encoding',OrdinalEncoder(categories=[['Unknown','medium','high']]),order)
    ],
    remainder='passthrough'
)

pip=Pipeline(
    steps=[
        ('preprocess',processing)
    ]
)

x_train_trf=pip.fit_transform(x_train)
x_test_trf=pip.fit_transform(x_test)



In [27]:
dtc=DecisionTreeClassifier(max_depth=10)
knc=KNeighborsClassifier()
rfc=RandomForestClassifier()

In [37]:
estimators=[
    ('decision tree.....',dtc),
    ('kneighbor.........',knc),
    ('random forest.....',rfc)
]

In [39]:
for i in estimators:
    x=cross_val_score(i[1],x_train_trf,y_train,scoring= 'accuracy', cv=10)
    print(i[0],np.round(x.mean(),2))

decision tree..... 0.91
kneighbor......... 0.99
random forest..... 0.98


In [41]:
from sklearn.ensemble import VotingClassifier

In [55]:
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
y_train_transform=le.fit_transform(y_train)

In [67]:
# hard voting
vc=VotingClassifier(estimators=estimators)
x=cross_val_score(vc,x_train_trf,y_train_transform,scoring= 'accuracy', cv=10)
print(np.round(np.mean(x),2))

0.98


In [73]:
# soft voting
vcs=VotingClassifier(estimators=estimators,voting='soft')
y=cross_val_score(vcs,x_train_trf,y_train,scoring='accuracy',cv=10)
print(np.round(np.mean(x),3))

0.985
