In [10]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score, GridSearchCV
from imblearn.over_sampling import SMOTE

# Load your dataset
df = pd.read_csv('./data/dataset.csv')

# Data Preparation
df['Fever'] = df['Fever'].map({'Yes': 1, 'No': 0})
df['Cough'] = df['Cough'].map({'Yes': 1, 'No': 0})
df['Fatigue'] = df['Fatigue'].map({'Yes': 1, 'No': 0})
df['Difficulty Breathing'] = df['Difficulty Breathing'].map({'Yes': 1, 'No': 0})
df['Gender'] = df['Gender'].map({'Male': 1, 'Female': 0})
df['Blood Pressure'] = df['Blood Pressure'].map({'Low': 0, 'Normal': 1, 'High': 2})
df['Cholesterol Level'] = df['Cholesterol Level'].map({'Low': 0, 'Normal': 1, 'High': 2})
df['Outcome Variable'] = df['Outcome Variable'].map({'Positive': 1, 'Negative': 0})

# Split data into features (X) and target (y)
X = df.drop(columns=['Disease'])  # Features
y = df['Outcome Variable']  # Target

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Handle class imbalance with SMOTE
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Normalization
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_resampled)
X_test_scaled = scaler.transform(X_test)

# Model Training with Decision Tree Classifier
clf = DecisionTreeClassifier(random_state=42, max_depth=3, min_samples_split=2)
clf.fit(X_train_scaled, y_train_resampled)
y_pred_clf = clf.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred_clf)
print(f'Classification Model Accuracy: {accuracy}')

cv_scores = cross_val_score(clf, X, y, cv=5)
print(f'Cross-validation scores: {cv_scores}')

param_grid = {'max_depth': [3, 5, 10, None], 'min_samples_split': [2, 5, 10]}
grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=3)
grid_search.fit(X_train_scaled, y_train_resampled)
print(f'Best parameters found: {grid_search.best_params_}')


Classification Model Accuracy: 1.0
Cross-validation scores: [1. 1. 1. 1. 1.]
Best parameters found: {'max_depth': 3, 'min_samples_split': 2}
