Question 3: Decision Trees with Cross-Validation<br>
Task: Implement cross-validation for decision trees to find the best performing model on a
classification problem.

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import classification_report
import unittest


# Function to load data
def load_data():
    try:
        data = load_iris()
        return data.data, data.target
    except Exception as e:
        raise ValueError(f"Error loading data: {e}")


# Function for train-test split
def split_data(X, y, test_size=0.3):
    try:
        return train_test_split(X, y, test_size=test_size, random_state=42)
    except Exception as e:
        raise ValueError(f"Error splitting data: {e}")


# Function for decision tree model training
def train_decision_tree(X_train, y_train, max_depth=None, min_samples_split=2, criterion='gini'):
    try:
        dt_model = DecisionTreeClassifier(max_depth=max_depth, min_samples_split=min_samples_split, criterion=criterion, random_state=42)
        dt_model.fit(X_train, y_train)
        return dt_model
    except Exception as e:
        raise ValueError(f"Error training the decision tree model: {e}")


# Function for performing cross-validation
def cross_validate_model(model, X_train, y_train, cv=10):
    try:
        return cross_val_score(model, X_train, y_train, cv=cv, scoring='accuracy')
    except Exception as e:
        raise ValueError(f"Error performing cross-validation: {e}")


# Function for grid search hyperparameter tuning
def grid_search(X_train, y_train):
    param_grid = {
        'max_depth': [3, 5, 10, None],
        'min_samples_split': [2, 5, 10],
        'criterion': ['gini', 'entropy']
    }
    try:
        grid_search = GridSearchCV(estimator=DecisionTreeClassifier(random_state=42), param_grid=param_grid, cv=10, n_jobs=-1, scoring='accuracy')
        grid_search.fit(X_train, y_train)
        return grid_search
    except Exception as e:
        raise ValueError(f"Error during grid search: {e}")


# Unit Tests
class TestDecisionTreeModel(unittest.TestCase):
    def test_data_loading(self):
        X, y = load_data()
        self.assertEqual(X.shape[0], len(y))

    def test_model_training(self):
        X, y = load_data()
        X_train, X_test, y_train, y_test = split_data(X, y)
        model = train_decision_tree(X_train, y_train)
        self.assertTrue(hasattr(model, "predict"))

    def test_cross_validation(self):
        X, y = load_data()
        X_train, X_test, y_train, y_test = split_data(X, y)
        dt_model = train_decision_tree(X_train, y_train)
        cv_scores = cross_validate_model(dt_model, X_train, y_train)
        self.assertGreater(cv_scores.mean(), 0)

    def test_grid_search(self):
        X, y = load_data()
        X_train, X_test, y_train, y_test = split_data(X, y)
        grid_search_result = grid_search(X_train, y_train)
        self.assertIsNotNone(grid_search_result.best_params_)


# Run main logic
if __name__ == "__main__":
    try:
        X, y = load_data()
        X_train, X_test, y_train, y_test = split_data(X, y)

        # Cross-validation and grid search
        dt_model = DecisionTreeClassifier(random_state=42)
        cv_scores = cross_validate_model(dt_model, X_train, y_train)

        print(f"Cross-validation scores: {cv_scores}")
        print(f"Mean cross-validation score: {cv_scores.mean():.4f}")
        print(f"Standard deviation of cross-validation score: {cv_scores.std():.4f}")

        grid_search_result = grid_search(X_train, y_train)
        print(f"Best hyperparameters: {grid_search_result.best_params_}")
        print(f"Best cross-validation score from Grid Search: {grid_search_result.best_score_:.4f}")

        # Evaluate the best model
        best_model = grid_search_result.best_estimator_
        y_pred = best_model.predict(X_test)

        # Classification report
        print("Classification Report on Test Data:")
        print(classification_report(y_test, y_pred))

        # Visualize the Decision Tree
        plt.figure(figsize=(12, 8))
        plot_tree(best_model, filled=True, feature_names=data.feature_names, class_names=data.target_names)
        plt.title("Decision Tree (Best Model from Grid Search)")
        plt.show()

        # Running unit tests
        unittest.main(argv=[''], exit=False)
        
    except Exception as e:
        print(f"Error: {e}")


Cross-validation scores: [0.90909091 1.         1.         0.72727273 0.81818182 1.
 1.         0.8        1.         0.9       ]
Mean cross-validation score: 0.9155
Standard deviation of cross-validation score: 0.0971
Best hyperparameters: {'criterion': 'gini', 'max_depth': 3, 'min_samples_split': 2}
Best cross-validation score from Grid Search: 0.9245
Classification Report on Test Data:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        19
           1       1.00      1.00      1.00        13
           2       1.00      1.00      1.00        13

    accuracy                           1.00        45
   macro avg       1.00      1.00      1.00        45
weighted avg       1.00      1.00      1.00        45

Error: name 'data' is not defined


<Figure size 1200x800 with 0 Axes>