# Load Dataset

In [24]:
# import library
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

In [25]:
# Load every split from its CSV file and convert it directly to a NumPy array.
# Feature tables keep the layout pandas gives them; label tables are flattened
# with ravel() so each target ends up as a 1-D vector.

# training split
x_train = pd.read_csv('x_train.csv').values
y_train = pd.read_csv('y_train.csv').values.ravel()

# validation split
x_val = pd.read_csv('x_val.csv').values
y_val = pd.read_csv('y_val.csv').values.ravel()

# test split
x_test = pd.read_csv('x_test.csv').values
y_test = pd.read_csv('y_test.csv').values.ravel()

# Model Training

In [26]:
# Base estimator: a decision tree with class_weight='balanced', so classes are
# weighted inversely to their frequency in y_train. random_state is pinned so
# repeated runs build the same trees.
model = DecisionTreeClassifier(class_weight='balanced', random_state=42)

# Hyperparameter grid.
# NOTE: 'log_loss' is deliberately not listed. scikit-learn documents
# 'log_loss' and 'entropy' as the same criterion (Shannon information gain),
# so including both only refits identical trees: 240 candidates instead of 160,
# each fitted once per CV fold. Because tied candidates resolve to the one
# that appears first, the selected parameters are unchanged by dropping it.
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 3, 5, 7, 10],
    'min_samples_split': [2, 5, 10, 20],
    'min_samples_leaf': [1, 2, 5, 10]
}

# Exhaustive search over the grid: 5-fold cross-validation, candidates ranked
# by F1 score, folds evaluated in parallel on all available cores (n_jobs=-1).
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, scoring='f1', cv=5, n_jobs=-1)

# Run the search on the training split only.
grid_search.fit(x_train, y_train)

# Report the winning combination and keep the corresponding fitted estimator.
print("\nBest Parameters:", grid_search.best_params_)
best_model = grid_search.best_estimator_


Best Parameters: {'criterion': 'entropy', 'max_depth': 3, 'min_samples_leaf': 5, 'min_samples_split': 2}


In [27]:
# Take the fitted winner from the completed grid search and echo the
# hyperparameters it was selected with.
best_model = grid_search.best_estimator_
print("\nBest Parameters:", grid_search.best_params_)

# Predict labels for the held-out test features with the tuned tree.
y_pred = best_model.predict(x_test)


Best Parameters: {'criterion': 'entropy', 'max_depth': 3, 'min_samples_leaf': 5, 'min_samples_split': 2}


# Model Evaluation

In [28]:
# Compute test-set metrics for the tuned decision tree by comparing the
# predictions against the true test labels. precision/recall/F1 are called
# with scikit-learn's defaults (binary averaging on the positive label).
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)


# The header names the model that was actually evaluated: best_model comes
# from a grid search over DecisionTreeClassifier, not LogisticRegression.
print("Results using sklearn DecisionTreeClassifier:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

Results using sklearn LogisticRegression:
Accuracy: 0.6152
Precision: 0.2348
Recall: 0.6696
F1 Score: 0.3476


In [29]:
# Per-class precision, recall, F1 and support on the test split.
# This is scikit-learn's classification report, not a confusion matrix,
# so the label says so.
print('Classification report:')
print(classification_report(y_test, y_pred))

The details for confusion matrix is:
              precision    recall  f1-score   support

           0       0.91      0.61      0.73       636
           1       0.23      0.67      0.35       115

    accuracy                           0.62       751
   macro avg       0.57      0.64      0.54       751
weighted avg       0.81      0.62      0.67       751

