In [2]:
# Import libraries and the project-local dataset loader
import os

import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split, TimeSeriesSplit
from sklearn.tree import DecisionTreeClassifier, export_text

from load import load

# Resolve the working directory; the "datasets" input folder is expected
# to live directly underneath it.
current_directory = os.getcwd()
datasets_folder = os.path.join(current_directory, "datasets")

# Entries of the datasets folder that must not be treated as datasets to run
# (helper scripts and datasets that are deliberately skipped).
exclude_datasets = ['rice', 'timeseries.py', 'analysis.py', 'normalized_nyse_stock_data.data', 'pca_normalized_nyse_stock_data.data', 'media.data', 'reliance.data']

# Collect every remaining, non-hidden entry. os.listdir() returns names in
# arbitrary (filesystem-dependent) order, so the listing is sorted to make the
# processing order - and therefore the notebook output - reproducible.
datasets = [
    dataset
    for dataset in sorted(os.listdir(datasets_folder))
    if not (dataset.startswith('.') or dataset in exclude_datasets)
]

# _______________________________________________________________________________
# Experiment configuration (identical for every dataset, so defined once)
# _______________________________________________________________________________

# Number of folds produced by the time series split
num_splits = 5

# Fixed seed for the train/test split
split_seed = 42

# Tree depths to evaluate: 2, 3, ..., 20
depth_range = range(2, 21)

# Number of samples in each validation fold of the time series split
time_series_split_test_size = 15

# Fold generator shared by all datasets and depths
tscv = TimeSeriesSplit(n_splits=num_splits, test_size=time_series_split_test_size)

# Root folder under which one sub-folder per dataset is created
results_folder = os.path.join(current_directory, "results")


def _macro_metrics(y_true, y_pred):
    """Return ``(accuracy, macro F1, macro precision, macro recall)``.

    Precision and recall are computed with ``zero_division=1``, so a class
    for which the score is undefined contributes 1 instead of raising a
    warning-driven 0.
    """
    return (
        accuracy_score(y_true, y_pred),
        f1_score(y_true, y_pred, average='macro'),  # Use macro F1 score
        precision_score(y_true, y_pred, average='macro', zero_division=1),
        recall_score(y_true, y_pred, average='macro', zero_division=1),
    )


def _cross_validate_depth(X_frame, y_frame, folds, depth):
    """Cross-validate a gini decision tree of the given ``max_depth``.

    Parameters:
        X_frame: training attributes as a DataFrame (indexed positionally).
        y_frame: training labels as a DataFrame (indexed positionally).
        folds:   sequence of ``(train_index, test_index)`` position arrays.
        depth:   ``max_depth`` passed to the classifier.

    Returns:
        ``(mean accuracy, mean macro F1, mean macro precision,
        mean macro recall)`` averaged over all folds.
    """
    fold_metrics = []
    for train_index, test_index in folds:
        # Fixed random_state keeps tie-breaking between equally good splits
        # reproducible from run to run.
        tree_classifier = DecisionTreeClassifier(criterion='gini', max_depth=depth, random_state=21)
        tree_classifier.fit(X_frame.iloc[train_index], y_frame.iloc[train_index])
        y_pred = tree_classifier.predict(X_frame.iloc[test_index])
        fold_metrics.append(_macro_metrics(y_frame.iloc[test_index], y_pred))

    # Transpose "one tuple per fold" into "one tuple per metric" and average.
    accuracies, f1_scores, precisions, recalls = zip(*fold_metrics)
    return (
        sum(accuracies) / len(accuracies),
        sum(f1_scores) / len(f1_scores),
        sum(precisions) / len(precisions),
        sum(recalls) / len(recalls),
    )


# _______________________________________________________________________________
# Variable description:
# _______________________________________________________________________________
#     X               -   Data attributes.
#     Y               -   Corresponding labels for X.
#     X_train         -   Data attributes for training (80% of the dataset).
#     y_train         -   Corresponding labels for X_train.
#     X_test          -   Data attributes for testing (20% of the dataset).
#     y_test          -   Corresponding labels for X_test.

for dataset_name in datasets:
    # importing dataset
    print(dataset_name)
    X, Y = load(dataset_name)

    # Split the data into training and testing sets with the fixed seed.
    # NOTE(review): train_test_split shuffles rows by default, so the
    # "time series" folds below are expanding windows over shuffled rows,
    # not chronological ones - confirm this is intended.
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=split_seed)

    # The DataFrame views and the fold indices do not depend on the tree
    # depth, so they are built once per dataset instead of once per fold.
    X_train_frame = pd.DataFrame(X_train)
    y_train_frame = pd.DataFrame(y_train)
    cv_folds = list(tscv.split(X_train))

    # One record per evaluated depth
    results = []
    for depth in depth_range:
        mean_accuracy, mean_f1, mean_precision, mean_recall = _cross_validate_depth(
            X_train_frame, y_train_frame, cv_folds, depth
        )
        results.append({
            'Split Seed': split_seed,
            'Depth of Tree': depth,
            'Mean Accuracy': mean_accuracy,
            'Mean F1 Score': mean_f1,
            'Mean Precision': mean_precision,
            'Mean Recall': mean_recall
        })

    # Create a DataFrame from the results
    results_df = pd.DataFrame(results)

    # Find the depth with the maximum mean F1 score (first one on ties)
    best_depth_row = results_df.loc[results_df['Mean F1 Score'].idxmax()]

    # Selecting a row of mixed int/float columns upcasts the depth to float
    # (e.g. 3.0); cast it back so an integer depth is printed and saved.
    best_depth = int(best_depth_row['Depth of Tree'])
    print(best_depth)

    # Train the decision tree with the best depth on the entire training data
    best_tree_classifier = DecisionTreeClassifier(criterion='gini', max_depth=best_depth, random_state=21)
    best_tree_classifier.fit(X_train, y_train)

    # Make predictions on the held-out test data
    y_pred_test_dt = best_tree_classifier.predict(X_test)
    print(best_tree_classifier)

    # Calculate evaluation metrics on the test data
    accuracy_test_dt, f1_test_dt, precision_test_dt, recall_test_dt = _macro_metrics(y_test, y_pred_test_dt)

    # Print the evaluation metrics on the test data
    print("Test Accuracy:", accuracy_test_dt)
    print("Test F1 Score:", f1_test_dt)
    print("Test Precision:", precision_test_dt)
    print("Test Recall:", recall_test_dt)

    # Print the fitted tree as text (X must expose .columns, i.e. be a DataFrame)
    tree_text = export_text(best_tree_classifier, feature_names=list(X.columns))
    print(X.columns)
    print(tree_text)

    # Collected so the test results can be saved both as CSV and as NumPy
    evaluation_metrics_dt_test = {
        "Accuracy": accuracy_test_dt,
        "F1 Score": f1_test_dt,
        "Precision": precision_test_dt,
        "Recall": recall_test_dt,
        "Depth" : best_depth
    }

    # results/<dataset_name>/DT_gini ; makedirs also creates the intermediate
    # per-dataset folder, so a single call is enough.
    dataset_results_folder = os.path.join(results_folder, dataset_name)
    dt_folder = os.path.join(dataset_results_folder, "DT_gini")
    os.makedirs(dt_folder, exist_ok=True)

    # Save the per-depth cross-validation table to CSV
    results_csv_path = os.path.join(dt_folder, "all_splitseed_with_depth.csv")
    results_df.to_csv(results_csv_path, index=False)

    # Save the raw per-depth records (a list of dicts) to NumPy. This is stored
    # as a pickled object array, so reading it back needs
    # np.load(..., allow_pickle=True).
    results_npy_path = os.path.join(dt_folder, "all_splitseed_with_depth.npy")
    np.save(results_npy_path, results)

    # Save the test metrics as a two-column (Metric, Value) CSV
    dt_metrics_csv_path = os.path.join(dt_folder, "evaluation_metrics_dt_test.csv")
    pd.DataFrame(evaluation_metrics_dt_test.items(), columns=["Metric", "Value"]).to_csv(dt_metrics_csv_path, index=False)

    # Save the test metrics dict to NumPy (also a pickled object array)
    dt_metrics_npy_path = os.path.join(dt_folder, "evaluation_metrics_dt_test.npy")
    np.save(dt_metrics_npy_path, evaluation_metrics_dt_test)
        

ionosphere
3.0
DecisionTreeClassifier(max_depth=3, random_state=21)
Test Accuracy: 0.8591549295774648
Test F1 Score: 0.8426418439716312
Test Precision: 0.8867647058823529
Test Recall: 0.8276578073089701
Index([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33],
      dtype='int64')
|--- 4 <= 0.04
|   |--- class: 0
|--- 4 >  0.04
|   |--- 26 <= 1.00
|   |   |--- 2 <= 0.14
|   |   |   |--- class: 0
|   |   |--- 2 >  0.14
|   |   |   |--- class: 1
|   |--- 26 >  1.00
|   |   |--- 21 <= -0.07
|   |   |   |--- class: 1
|   |   |--- 21 >  -0.07
|   |   |   |--- class: 0

iris
2.0
DecisionTreeClassifier(max_depth=2, random_state=21)
Test Accuracy: 0.9666666666666667
Test F1 Score: 0.9658994032395567
Test Precision: 0.9722222222222222
Test Recall: 0.9629629629629629
Index([0, 1, 2, 3], dtype='int64')
|--- 3 <= 0.80
|   |--- class: 0
|--- 3 >  0.80
|   |--- 2 <= 4.75
|   |   |--- class: 1
|   |--- 2 >  4

In [3]:
# # importing dataset
# from load import load
# X, Y = load("breastcancerwisconsin")

In [4]:
# import pandas as pd
# from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
# from sklearn.model_selection import train_test_split, TimeSeriesSplit
# from sklearn.tree import DecisionTreeClassifier

# # Define the number of splits for time series split
# num_splits = 5

# # Fix the seed value for splitting data
# split_seed = 42

# # Define the range of depths for the decision tree
# depth_range = range(2, 21)

# # Initialize a list to store the results
# results = []

# # declaring test size for time series split
# time_series_split_test_size = 15

# # _______________________________________________________________________________

# # Variable description:
# # _______________________________________________________________________________

# #     X               -   Data attributes.
# #     y               -   Corresponding labels for X.
# #     X_train         -   Data attributes for training (80% of the dataset).
# #     y_train         -   Corresponding labels for X_train.
# #     X_test          -   Data attributes for testing (20% of the dataset).
# #     y_test          -   Corresponding labels for X_test.

# # Split the data into training and testing sets with the fixed seed
# X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=split_seed)

# # Perform time series split
# tscv = TimeSeriesSplit(n_splits=num_splits, test_size=time_series_split_test_size)

# # Iterate over different depths for the decision tree
# for depth in depth_range:
#     # Initialize lists to store evaluation metrics for each fold
#     accuracies = []
#     f1_scores = []
#     precisions = []
#     recalls = []

#     # Perform time series split
#     for split_num, (train_index, test_index) in enumerate(tscv.split(X_train), 1):
#         X_cv_train, X_cv_test = X_train.iloc[train_index], X_train.iloc[test_index]
#         y_cv_train, y_cv_test = y_train.iloc[train_index], y_train.iloc[test_index]

#         # Train the decision tree model with the current depth
#         tree_classifier = DecisionTreeClassifier(max_depth=depth)
#         tree_classifier.fit(X_cv_train, y_cv_train)

#         # Make predictions
#         y_pred = tree_classifier.predict(X_cv_test)

#         # Calculate evaluation metrics for this fold
#         accuracy = accuracy_score(y_cv_test, y_pred)
#         f1 = f1_score(y_cv_test, y_pred, average='macro')  # Use macro F1 score
#         precision = precision_score(y_cv_test, y_pred, average='macro', zero_division=1)
#         recall = recall_score(y_cv_test, y_pred, average='macro', zero_division=1)

#         # Append metrics to the lists
#         accuracies.append(accuracy)
#         f1_scores.append(f1)
#         precisions.append(precision)
#         recalls.append(recall)

#     # Calculate mean metrics across all folds
#     mean_accuracy = sum(accuracies) / len(accuracies)
#     mean_f1 = sum(f1_scores) / len(f1_scores)
#     mean_precision = sum(precisions) / len(precisions)
#     mean_recall = sum(recalls) / len(recalls)

#     # Store the results for this depth
#     results.append({
#         'Split Seed': split_seed,
#         'Depth of Tree': depth,
#         'Mean Accuracy': mean_accuracy,
#         'Mean F1 Score': mean_f1,
#         'Mean Precision': mean_precision,
#         'Mean Recall': mean_recall
#     })

# # Create a DataFrame from the results
# results_df = pd.DataFrame(results)

In [5]:
# # Find the depth with the maximum F1 score
# best_depth_row = results_df.loc[results_df['Mean F1 Score'].idxmax()]

# # Train the decision tree with the best depth on the entire training data
# best_depth = best_depth_row['Depth of Tree']
# print(best_depth)
# best_tree_classifier = DecisionTreeClassifier(max_depth=int(best_depth))
# best_tree_classifier.fit(X_train, y_train)

# # Make predictions on the test data
# y_pred_test_dt = best_tree_classifier.predict(X_test)

# # Calculate evaluation metrics on the test data
# accuracy_test_dt = accuracy_score(y_test, y_pred_test_dt)
# f1_test_dt = f1_score(y_test, y_pred_test_dt, average='macro')
# precision_test_dt = precision_score(y_test, y_pred_test_dt, average='macro', zero_division=1)
# recall_test_dt = recall_score(y_test, y_pred_test_dt, average='macro', zero_division=1)

# # Print the evaluation metrics on the test data
# print("Test Accuracy:", accuracy_test_dt)
# print("Test F1 Score:", f1_test_dt)
# print("Test Precision:", precision_test_dt)
# print("Test Recall:", recall_test_dt)

# # Creating this because we want to save the result in form of csv and numpy
# evaluation_metrics_dt_test = {
#     "Accuracy": accuracy_test_dt,
#     "F1 Score": f1_test_dt,
#     "Precision": precision_test_dt,
#     "Recall": recall_test_dt
# }

In [6]:
# import os
# import numpy as np

# # Get the current working directory
# current_directory = os.getcwd()

# # Define the path to the results folder
# results_folder = os.path.join(current_directory, "results")

# # Define the dataset name
# dataset_name = "iris"

# # Create a folder for the current dataset within the results directory
# dataset_results_folder = os.path.join(results_folder, dataset_name)
# os.makedirs(dataset_results_folder, exist_ok=True)

# # Create folder for decision tree results
# dt_folder = os.path.join(dataset_results_folder, "DT")
# os.makedirs(dt_folder, exist_ok=True)
# # Save results_df to CSV
# results_csv_path = os.path.join(dt_folder, "all_splitseed_with_depth.csv")
# results_df.to_csv(results_csv_path, index=False)

# # Save results_df to NumPy
# results_npy_path = os.path.join(dt_folder, "all_splitseed_with_depth.npy")
# np.save(results_npy_path, results)

# # Path for DT csv file
# dt_metrics_csv_path = os.path.join(dt_folder, "evaluation_metrics_dt_test.csv")
# # saving this to DT folder
# pd.DataFrame(evaluation_metrics_dt_test.items(), columns=["Metric", "Value"]).to_csv(dt_metrics_csv_path, index=False)

# # Path for DT numpy file
# dt_metrics_npy_path = os.path.join(dt_folder, "evaluation_metrics_dt_test.npy")
# # saving it to DT folder
# np.save(dt_metrics_npy_path, evaluation_metrics_dt_test)