In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC

from sklearn.metrics import accuracy_score

In [None]:
# Location of the dataset CSV, kept in one named constant so it is easy to repoint.
DATA_PATH = "/kaggle/input/disease-symptoms-and-patient-profile-dataset/Disease_symptom_and_patient_profile_dataset.csv"
df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Dimensions of the frame as (rows, columns).
df.shape

In [None]:
# Count of missing values in each column.
df.isna().sum()

# General age distribution

In [None]:
# Histogram of 'Age' over the whole frame, drawn in green with a KDE curve overlaid.
sns.histplot(data=df, x='Age', color='g', kde=True)

In [None]:
def plots(df, y, title=None):
    """Draw a three-panel summary of how the categorical column ``y`` relates to age.

    Panels, left to right:
      1. histogram of ``Age`` coloured by ``y`` with a KDE overlay,
      2. bar chart of the mean ``Age`` within each category of ``y``,
      3. pie chart of how often each category of ``y`` occurs.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to plot; must contain an ``'Age'`` column and the column named by ``y``.
    y : str
        Name of the categorical column to break the data down by.
    title : str, optional
        Figure-level title. Useful when the same column is plotted for several
        subsets of the data. No figure title is drawn when omitted.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    # Compute the per-category mean once and reuse it for both bar-chart arguments.
    mean_age = df.groupby(y)['Age'].mean()
    freqs = df[y].value_counts()

    fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(20, 10))
    if title is not None:
        fig.suptitle(title, fontsize=18)

    sns.histplot(df, x='Age', hue=y, kde=True, ax=axes[0])
    axes[0].set_title('Age distribution by {}'.format(y))

    sns.barplot(x=mean_age.index, y=mean_age, ax=axes[1])
    axes[1].set_title('Mean age by {}'.format(y))
    for container in axes[1].containers:
        # bar_label defaults to '%g'; a fixed two-decimal format keeps the
        # labels uniform across bars.
        axes[1].bar_label(container, fmt='%.2f', size=15, color='black')

    axes[2].pie(freqs,
                labels=freqs.index,
                autopct='%0.2f%%')
    axes[2].set_title('Share of each {} category'.format(y))

    plt.show()

# Blood pressure and cholesterol levels by age - general data

In [None]:
# Categorical health indicators to compare against age.
cat_vis = ['Blood Pressure', 'Cholesterol Level']

# One figure per indicator, drawn over the full dataset.
for column in cat_vis:
    plots(df, column)

# Blood pressure and cholesterol by age in healthy individuals (tested Negative for disease)

In [None]:
# Keep only the rows whose outcome is 'Negative', then plot each indicator.
negative_cases = df[df['Outcome Variable'] == 'Negative']
for column in cat_vis:
    plots(negative_cases, column)

# Blood pressure and cholesterol by age in unhealthy individuals (tested Positive for a disease)

In [None]:
# Keep only the rows whose outcome is 'Positive', then plot each indicator.
positive_cases = df[df['Outcome Variable'] == 'Positive']
for column in cat_vis:
    plots(positive_cases, column)

# Healthy individuals appear to have mostly normal blood pressure and cholesterol levels compared to their unhealthy peers

# General distribution by age of who tends to be sick and who does not. Check for class imbalance before classification

In [None]:
# Age breakdown by outcome; the pie panel shows how the two classes are balanced.
plots(df, 'Outcome Variable')

# Observations

It seems that people of all ages tend to have health issues of some kind, whether mild or serious.

No class imbalance observed, no need for SMOTE technique

# Encoding data

In [None]:
# Label-encode every column except the numeric 'Age'.
#
# A fresh LabelEncoder is created for each column. Reusing one instance would
# leave every entry of `encoders` pointing at the same object, fitted only on
# the last column, so earlier columns could not be decoded afterwards.
#
# NOTE: this overwrites `df` in place. Re-running the cell on already-encoded
# data refits the encoders on the integer codes rather than the original labels.
cols = df.columns
encoders = []  # fitted encoders, in the same order as the encoded columns
for i in cols:
    if i != 'Age':
        le = LabelEncoder()
        df[i] = le.fit_transform(df[i])
        encoders.append(le)

In [None]:
# Preview the frame after label encoding.
df.head()

In [None]:
# Number of distinct values in the 'Disease' column.
df['Disease'].nunique()

In [None]:
# Every column but the last is a feature; the last column is the target.
x, y = df.iloc[:, :-1].values, df.iloc[:, -1].values

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

# Training and evaluation pipeline

In [None]:
def training(model, name):
    """Fit ``model`` on the training split and report its test accuracy.

    The model is fitted on the module-level ``x_train``/``y_train`` and scored
    on ``x_test``/``y_test``. The accuracy is printed as a percentage rounded
    to two decimals, prefixed with ``name``.

    Returns the fitted ``model``.
    """
    model.fit(x_train, y_train)
    predictions = model.predict(x_test)
    percent = round(accuracy_score(y_test, predictions) * 100, 2)
    print("{}: {}%".format(name, percent))
    return model

# Defining models and setting their hyperparameters

In [None]:
# Shared seed so the stochastic learners give the same scores on every run,
# matching the seed already used for the train/test split.
RANDOM_STATE = 42

rfc = RandomForestClassifier(max_depth=60, random_state=RANDOM_STATE)
dtc = DecisionTreeClassifier(max_depth=70, random_state=RANDOM_STATE)
abc = AdaBoostClassifier(learning_rate=0.01, random_state=RANDOM_STATE)
# SVC is left unseeded: its random_state only affects probability estimates,
# which are not enabled here.
svc = SVC(C=0.1)
xgb = XGBClassifier(random_state=RANDOM_STATE)

# Parallel lists: names[k] is the display label for models[k].
models = [rfc, dtc, abc, svc, xgb]
names = ['Random Forest', 'Decision Tree', 'Ada Boost', 'SVC', 'XGB']

# Results

In [None]:
# Fit and score each model in turn, keeping the fitted estimators.
trained = []
for model, label in zip(models, names):
    trained.append(training(model, label))
    print()  # blank line between results