In [6]:
import sys
import numpy as np
import re

# Function to perform Effort to Compress (ETC)
def etc(arr):
    # Check if the array is empty
    if len(arr) == 0:
        print("Error: Empty array provided.")
        return 0

    iteration = 0  # Initialize the iteration count
    while True:
        iteration += 1  # Increment the iteration count
        hash = [0] * 256  # Initialize a hash array to count occurrences of characters
        n = len(arr)  # Get the length of the array
        mp = {}  # Initialize a dictionary to store pairs of adjacent elements
        # Iterate through the array to count character occurrences and pairs of adjacent elements
        for i in range(n-1):
            a = arr[i]
            hash[a] += 1
            b = arr[i+1]
            if (a, b) in mp:
                mp[(a, b)] += 1
            else:
                mp[(a, b)] = 1
        hash[arr[n-1]] += 1  # Count the occurrence of the last element
        finish = False  # Flag to indicate if all characters are the same
        # Check if all characters are the same
        for i in range(256):
            if hash[i] == n:
                finish = True
                break
        if finish:
            # Algorithm ends as all characters are the same
            break

        mx = (0, 0)  # Initialize a pair to store the most frequent adjacent elements
        mxcnt = 0  # Initialize a counter for the most frequent pair
        # Find the most frequent pair of adjacent elements
        for key, value in mp.items():
            if value > mxcnt:
                mx = key
                mxcnt = value
        replace = -1  # Initialize a variable to store the character to be replaced
        # Find a character that is not used in the array
        for i in range(256):
            if hash[i] == 0:
                replace = i
                break
        if replace == -1:
            print("Error: all characters are used")
            return iteration

        newarr = []  # Initialize a new array to store the transformed elements
        i = 0  # Initialize an index variable
        while i < n-1:
            a = arr[i]
            b = arr[i+1]
            if a == mx[0] and b == mx[1]:
                # Replace the most frequent pair with the character that is not used
                newarr.append(replace)
                i += 1
            else:
                newarr.append(a)
            i += 1
        if i == n - 1:
            newarr.append(arr[i])
        # Append the last element if it's not part of the most frequent pair
        if arr[n-1] == mx[1] and arr[n-2] == mx[0]:
            pass
        else:
            newarr.append(arr[n-1])
        arr = np.array(newarr)  # Convert the new array to numpy array for further processing
    return iteration - 1

In [7]:
class Node():
    def __init__(self, feature_index=None, threshold=None, left=None, right=None, info_gain=None, value=None):
        ''' constructor '''

        # for decision node
        self.feature_index = feature_index
        self.threshold = threshold
        self.left = left
        self.right = right
        self.info_gain = info_gain

        # for leaf node
        self.value = value

In [8]:
class DecisionTreeClassifier():
    def __init__(self, min_samples_split=2, max_depth=2):
        ''' constructor '''

        # initialize the root of the tree
        self.root = None

        # stopping conditions
        self.min_samples_split = min_samples_split
        self.max_depth = max_depth

    def build_tree(self, dataset, curr_depth=0):
        ''' recursive function to build the tree '''

        X, Y = dataset[:,:-1], dataset[:,-1]
        num_samples, num_features = np.shape(X)

        # split until stopping conditions are met
        if num_samples>=self.min_samples_split and curr_depth<=self.max_depth:
            # find the best split
            best_split = self.get_best_split(dataset, num_samples, num_features)
            # check if information gain is positive
            if best_split["info_gain"]>0:
                # recur left
                left_subtree = self.build_tree(best_split["dataset_left"], curr_depth+1)
                # recur right
                right_subtree = self.build_tree(best_split["dataset_right"], curr_depth+1)
                # return decision node
                return Node(best_split["feature_index"], best_split["threshold"],
                            left_subtree, right_subtree, best_split["info_gain"])

        # compute leaf node
        leaf_value = self.calculate_leaf_value(Y)
        # return leaf node
        return Node(value=leaf_value)

    def get_best_split(self, dataset, num_samples, num_features):
        ''' function to find the best split '''

        # dictionary to store the best split
        best_split = {}
        max_info_gain = -float("inf")

        # loop over all the features
        for feature_index in range(num_features):
            feature_values = dataset[:, feature_index]
            possible_thresholds = np.unique(feature_values)
            # loop over all the feature values present in the data
            for threshold in possible_thresholds:
                # get current split
                dataset_left, dataset_right = self.split(dataset, feature_index, threshold)
                # check if childs are not null
                if len(dataset_left)>0 and len(dataset_right)>0:
                    y, left_y, right_y = dataset[:, -1], dataset_left[:, -1], dataset_right[:, -1]
                    # compute information gain
                    curr_info_gain = self.information_gain(y, left_y, right_y, "etc")
                    # update the best split if needed
                    if curr_info_gain>max_info_gain:
                        best_split["feature_index"] = feature_index
                        best_split["threshold"] = threshold
                        best_split["dataset_left"] = dataset_left
                        best_split["dataset_right"] = dataset_right
                        best_split["info_gain"] = curr_info_gain
                        max_info_gain = curr_info_gain

        # return best split
        return best_split

    def split(self, dataset, feature_index, threshold):
        ''' function to split the data '''

        dataset_left = np.array([row for row in dataset if row[feature_index]<=threshold])
        dataset_right = np.array([row for row in dataset if row[feature_index]>threshold])
        return dataset_left, dataset_right

    def etc(self, y):
        y = list(y.astype(int))

        "Computes ETC of the given sequence"
        if len(y) == 1:
            return 0
        else:
            out =  etc(y)
            return out
    ############## START OF ETC CODE ###########
    def etc_gain(self, parent, l_child, r_child):
        left = l_child
        right = r_child
        """
        Computes the etc gain from splitting the parent dataset into two datasets.

        Parameters:
            parent (ndarray): Input parent dataset.
            left (ndarray): Subset of the parent dataset after split on a feature.
            right (ndarray): Subset of the parent dataset after split on a feature.

        Returns:
            etc_gain_ (float): effor to compress (etc) gain of the split.
        """

        # set initial information gain to 0
        etc_gain_ = 0
        # compute etc for parent
        parent_etc = self.etc(parent)
        # calculate weight for left and right nodes
        weight_left = len(left) / len(parent)
        weight_right= len(right) / len(parent)
        # compute etc for left and right nodes

        etc_left, etc_right = self.etc(left), self.etc(right)
        # calculate weighted entropy
        weighted_etc = weight_left * etc_left + weight_right * etc_right
        # calculate etc gain
        etc_gain_ = parent_etc - weighted_etc
        #print("ETC Gain = ", etc_gain_)
        return etc_gain_
###########################

    def information_gain(self, parent, l_child, r_child, mode="etc"):
        ''' function to compute information gain '''

        weight_l = len(l_child) / len(parent)
        weight_r = len(r_child) / len(parent)
        if mode=="etc":
            gain = self.etc_gain(parent, l_child, r_child)

        return gain

    def calculate_leaf_value(self, Y):
        ''' function to compute leaf node '''

        Y = list(Y)
        return max(Y, key=Y.count)

    def print_tree(self, tree=None, indent=" "):
        ''' function to print the tree '''

        if not tree:
            tree = self.root

        if tree.value is not None:
            print(tree.value)

        else:
            print("X_"+str(tree.feature_index), "<=", tree.threshold, "?", tree.info_gain)
            print("%sleft:" % (indent), end="")
            self.print_tree(tree.left, indent + indent)
            print("%sright:" % (indent), end="")
            self.print_tree(tree.right, indent + indent)

    def fit(self, X, Y):
        Y = np.array(Y).reshape(-1, 1)
        dataset = np.concatenate((X, Y), axis=1)
        self.root = self.build_tree(dataset)

    def predict(self, X):
        ''' function to predict new dataset '''

        preditions = [self.make_prediction(x, self.root) for x in X]
        return preditions

    def make_prediction(self, x, tree):
        ''' function to predict a single data point '''

        if tree.value!=None: return tree.value
        feature_val = x[tree.feature_index]
        if feature_val<=tree.threshold:
            return self.make_prediction(x, tree.left)
        else:
            return self.make_prediction(x, tree.right)

In [9]:
# importing dataset
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split, TimeSeriesSplit
from sklearn.utils import shuffle
import os
import numpy as np
from load import load
import time
# Get the current working directory
current_directory = os.getcwd()
# Define the path to the datasets folder
datasets_folder = os.path.join(current_directory, "datasets")
# List all the datasets in the folder, excluding hidden files
exclude_datasets = ['ionosphere', 'breastcancerwisconsin', 'wine', 'appendicitis', 'diabetespimaindian', 'sonar', 'iris', 'rice', 'timeseries.py']

datasets = [dataset for dataset in os.listdir(datasets_folder) if not (dataset.startswith('.') or dataset in exclude_datasets)]
# st = time.time()


for dataset_name in datasets:
    # importing dataset
    print(dataset_name)
    X, Y = load(dataset_name)

    # Define the number of splits for time series split
    num_splits = 5

    # Fix the seed value for splitting data
    split_seed = 42

    # Define different seed values for shuffling training data
    shuffle_seed_values = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21]

    # Define the range of depths for the decision tree
    depth_range = range(2, 21)

    # Initialize a list to store the results
    results = []
    # declaring test size for time series split
    time_series_split_test_size = 15

    # _______________________________________________________________________________

    # Variable description:
    # _______________________________________________________________________________

    #     X               -   Data attributes.
    #     y               -   Corresponding labels for X.
    #     X_train         -   Data attributes for training (80% of the dataset).
    #     y_train         -   Corresponding labels for X_train.
    #     X_test          -   Data attributes for testing (20% of the dataset).
    #     y_test          -   Corresponding labels for X_test.
    # Split the data into training and testing sets with the fixed seed

    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=split_seed)
    # print("hi boss", flush=True)

    # Perform experiments for each seed value for shuffling training data
    for shuffle_seed in shuffle_seed_values:
        # Shuffle only the training data with the current seed
        X_train_shuffled, y_train_shuffled = shuffle(X_train, y_train, random_state=shuffle_seed)
        # print("hello", flush=True)

        # Perform time series split
        tscv = TimeSeriesSplit(n_splits=num_splits,test_size=time_series_split_test_size)

        # Iterate over different depths for the decision tree
        for depth in depth_range:
            # Initialize lists to store evaluation metrics for each fold
            accuracies = []
            f1_scores = []
            precisions = []
            recalls = []

            # Perform time series split
            for split_num, (train_index, test_index) in enumerate(tscv.split(X_train_shuffled), 1):
                X_cv_train, X_cv_test = pd.DataFrame(X_train_shuffled).iloc[train_index].to_numpy(),pd.DataFrame(X_train_shuffled).iloc[test_index].to_numpy()
                y_cv_train, y_cv_test = pd.DataFrame(y_train_shuffled).iloc[train_index].to_numpy(), pd.DataFrame(y_train_shuffled).iloc[test_index].to_numpy()
                # print("check", flush=True)
                # curr_time = time.time()
                # print("Current time is: ", curr_time - st)

                # Train the decision tree model with the current depth
                tree_classifier = DecisionTreeClassifier(max_depth=depth)
                tree_classifier.fit(X_cv_train, y_cv_train)

                # Make predictions
                y_pred = tree_classifier.predict(X_cv_test)

                # Calculate evaluation metrics for this fold
                accuracy = accuracy_score(y_cv_test, y_pred)
                f1 = f1_score(y_cv_test, y_pred, average='macro')  # Use macro F1 score
                precision = precision_score(y_cv_test, y_pred, average='macro', zero_division = 1)
                recall = recall_score(y_cv_test, y_pred, average='macro', zero_division = 1)

                # Append metrics to the lists
                accuracies.append(accuracy)
                f1_scores.append(f1)
                precisions.append(precision)
                recalls.append(recall)

            # Calculate mean metrics across all folds
            mean_accuracy = sum(accuracies) / len(accuracies)
            mean_f1 = sum(f1_scores) / len(f1_scores)
            mean_precision = sum(precisions) / len(precisions)
            mean_recall = sum(recalls) / len(recalls)

            # Store the results for this seed value and depth
            results.append({
                'Split Seed': split_seed,
                'Shuffle Seed': shuffle_seed,
                'Depth of Tree': depth,
                'Mean Accuracy': mean_accuracy,
                'Mean F1 Score': mean_f1,
                'Mean Precision': mean_precision,
                'Mean Recall': mean_recall
            })

    # print("reached here")

    # Create a DataFrame from the results
    results_df = pd.DataFrame(results)
    # Group by 'Shuffle Seed' and find the maximum F1 score within each group
    max_f1_score_per_group = results_df.groupby('Shuffle Seed')['Mean F1 Score'].transform(max)

    # Filter the DataFrame to keep rows where 'Mean F1 Score' is equal to the maximum F1 score within each group
    max_f1_rows = results_df[results_df['Mean F1 Score'] == max_f1_score_per_group]

    # Group the max_f1_rows DataFrame by 'Shuffle Seed' and find the row with the minimum 'Depth of Tree' within each group
    min_depth_for_max_f1 = max_f1_rows.groupby('Shuffle Seed').apply(lambda x: x.loc[x['Depth of Tree'].idxmin()])

    # Extracting desired columns
    selected_columns = min_depth_for_max_f1[['Split Seed', 'Shuffle Seed', 'Depth of Tree']]

    # # Printing the extracted values
    # print("Shuffle seed values with corresponding depth for maximum F1 score:")
    # for _, row in selected_columns.iterrows():
    #     print(f"{{'Split Seed': {row['Split Seed']}, 'Shuffle Seed': {row['Shuffle Seed']}, 'Depth': {row['Depth of Tree']}}}")

    # Convert the selected columns to a numpy array and save it
    parameters_array = selected_columns.to_numpy()

    # Convert each row of the numpy array to a dictionary
    parameters_list = [{'Split Seed': row[0], 'Shuffle Seed': row[1], 'Depth': row[2]} for row in parameters_array]

        # Initialize a list to store the evaluation metrics for each model
    evaluation_metrics = []

    # Iterate over the top parameters
    for i, params in enumerate(parameters_list, start=1):
        split_seed = int(params['Split Seed'])  # Convert to integer
        shuffle_seed = int(params['Shuffle Seed'])
        max_depth = int(params['Depth'])

        # Split the dataset
        X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=split_seed)

        # Shuffle only the training data with the fixed seed
        X_train_shuffled, y_train_shuffled = shuffle(X_train, y_train, random_state=shuffle_seed)

        # print("arham")
        # Train the decision tree model with the entire training data
        tree_classifier = DecisionTreeClassifier(max_depth=max_depth)
        tree_classifier.fit(X_train_shuffled, y_train_shuffled)
        # print("jain")
        #  # Printing the decision tree
        # print(f"Model {i} Decision Tree:")
        # tree_classifier.print_tree()
        # print()  # empty line for readability

        # Make predictions
        y_pred_train_data = tree_classifier.predict(X_train_shuffled.values)

        # Calculate evaluation metrics
        accuracy_train = accuracy_score(y_train_shuffled, y_pred_train_data)
        f1_train = f1_score(y_train_shuffled, y_pred_train_data, average='macro')
        precision_train = precision_score(y_train_shuffled, y_pred_train_data, average='macro',zero_division=1)
        recall_train = recall_score(y_train_shuffled, y_pred_train_data, average='macro', zero_division=1)

        # Append the evaluation metrics to the list
        evaluation_metrics.append({
            'Model': i,
            'Split Seed': split_seed,
            'Shuffle Seed': shuffle_seed,
            'Max Depth': max_depth,
            'Accuracy_train': accuracy_train,
            'F1 Score_train': f1_train,
            'Precision_train': precision_train,
            'Recall_Train' : recall_train
        })

            # Find the row with the maximum F1 score
    max_f1_row = results_df.loc[results_df['Mean F1 Score'].idxmax()]

    # Extract the shuffle seed value with maximum F1 score
    shuffle_seed_with_max_f1_score = max_f1_row['Shuffle Seed']

    # Find the index of the row with shuffle_seed_with_max_f1_score in parameters list
    max_f1_index = None
    for idx, params in enumerate(parameters_list):
        if params['Shuffle Seed'] == shuffle_seed_with_max_f1_score:
            max_f1_index = idx
            break

    if max_f1_index is not None:
        max_f1_params = parameters_list[max_f1_index]

    # Finding the depth corresponding to the shuffle seed value
    corresponding_depth = max_f1_params['Depth']

    # # Print the chosen shuffle seed value
    # print("Chosen Shuffle Seed for Training:", shuffle_seed_with_max_f1_score)
    # print("Corresponding depth:", corresponding_depth)

    # Split the data into training and testing sets using the chosen shuffle seed value
    X_train_pdt, X_test_pdt, y_train_pdt, y_test_pdt = train_test_split(X, Y, test_size=0.2, random_state=split_seed)

    # Shuffle only the training data with the chosen shuffle seed value
    X_train_shuffled_pdt, y_train_shuffled_pdt = shuffle(X_train_pdt, y_train_pdt, random_state=int(shuffle_seed_with_max_f1_score))

    # Train a decision tree classifier with the chosen depth
    classifier_pdt = DecisionTreeClassifier(max_depth=corresponding_depth)
    classifier_pdt.fit(X_train_shuffled_pdt, y_train_shuffled_pdt)

    # Predict labels for the testing data
    y_pred_pdt = tree_classifier.predict(X_test_pdt.values)

    # Calculate evaluation metrics
    accuracy_pdt = accuracy_score(y_test_pdt, y_pred_pdt)
    f1_score_pdt = f1_score(y_test_pdt, y_pred_pdt, average='macro')
    precision_pdt = precision_score(y_test_pdt, y_pred_pdt, average='macro', zero_division=1)
    recall_pdt = recall_score(y_test_pdt, y_pred_pdt, average='macro', zero_division=1)

    # Print the evaluation metrics
    print("Evaluation Metrics PDT:")
    print(f"Accuracy: {accuracy_pdt}")
    print(f"F1 Score: {f1_score_pdt}")
    print(f"Precision: {precision_pdt}")
    print(f"Recall: {recall_pdt}")
    print(f"best_Shuffleseed_used_for_pdt: {shuffle_seed_with_max_f1_score}")
    print(f"Corresponding Depth: {corresponding_depth}")

    # Creating this because we want to save the result in form of csv and numpy
    evaluation_metrics_pdt = {
        "Accuracy_PDT": accuracy_pdt,
        "F1 Score_PDT": f1_score_pdt,
        "Precision_PDT": precision_pdt,
        "Recall_PDT": recall_pdt,
        "best_Shuffleseed_used_for_pdt": shuffle_seed_with_max_f1_score,
        "Corresponding Depth" : corresponding_depth
    }


    # Initialize a list to store the top shuffle seeds along with their corresponding depths
    top_shuffle_seeds = []

    # Iterate over the top parameters
    for params in parameters_list:
        split_seed = int(params['Split Seed'])
        shuffle_seed = int(params['Shuffle Seed'])
        depth = int(params['Depth'])
        top_shuffle_seeds.append({'Split Seed':split_seed, 'Shuffle Seed': shuffle_seed, 'Depth': depth})

    # Load the dataset
    X_train_pdf, X_test_pdf, y_train_pdf, y_test_pdf = train_test_split(X, Y, test_size=0.2, random_state=split_seed)

    # Create an empty matrix to store predictions
    predictions_matrix = np.zeros((len(X_test), len(top_shuffle_seeds)), dtype=int)

    # Iterate over each test sample
    # for i, x_test_sample in enumerate(X_test):
        # Make predictions using each model
    for j, params in enumerate(top_shuffle_seeds):
        split_seed = (params['Split Seed'])
        shuffle_seed = params['Shuffle Seed']
        max_depth = params['Depth']
        
        # Shuffle only the training data with the fixed seed
        X_train_shuffled_pdf, y_train_shuffled_pdf = shuffle(X_train_pdf, y_train_pdf, random_state=shuffle_seed)

        # Train the decision tree model with the entire training data
        tree_classifier = DecisionTreeClassifier(max_depth=max_depth)
        # print("reached checkpoint1")
        tree_classifier.fit(X_train_shuffled_pdf, y_train_shuffled_pdf)
        # print("reached checkpoint2")
        # Make prediction for the current test sample using the trained model
        prediction = tree_classifier.predict(X_test_pdf.values)

        # Store the prediction in the matrix
        predictions_matrix[:, j] = prediction

    # Initialize lists to store final predictions and evaluation metrics
    final_predictions = []
    accuracies = []
    precisions = []
    f1_scores = []

    # Iterate over each row in the predictions matrix
    for i in range(predictions_matrix.shape[0]):
        # Find the majority occurring element in the row
        majority_prediction = np.bincount(predictions_matrix[i,:]).argmax()

        # Append the majority prediction to the final predictions list
        final_predictions.append(majority_prediction)
    final_prediction = predictions_matrix[:,-1]
    # Calculate accuracy, precision, F1-score and recall using the majority prediction and actual test label
    accuracy_test_pdf = accuracy_score(y_test_pdf, final_predictions)
    precision_test_pdf = precision_score(y_test_pdf, final_predictions, average='macro', zero_division=1)
    f1_test_pdf = f1_score(y_test_pdf, final_predictions, average='macro')
    recall_test_pdf = recall_score(y_test_pdf, final_predictions, average='macro', zero_division=1)



    # Print the final evaluation metrics
    print("Evaluation Metrics PDF:")
    print(f"Accuracy_test_pdf: {accuracy_test_pdf}")
    print(f"Precision_test_pdf: {precision_test_pdf}")
    print(f"F1 Score_test_pdf: {f1_test_pdf}")
    print(f"Recall_test_pdf: {recall_test_pdf}")

    # Creating this because we want to save the result in form of csv and numpy
    evaluation_metrics_pdf = {
        "Accuracy": accuracy_test_pdf,
        "F1 Score": f1_test_pdf,
        "Precision": precision_test_pdf,
        "Recall": recall_test_pdf
    }

    # Define the path to the results folder
    results_folder = os.path.join(current_directory, "results")

    # Define the dataset name
    dataset_name = dataset_name

    # Create a folder for the current dataset within the results directory
    dataset_results_folder = os.path.join(results_folder, dataset_name)
    os.makedirs(dataset_results_folder, exist_ok=True)


    # Create folders for PDT and pdf within the dataset results folder
    pdt_folder = os.path.join(dataset_results_folder, "PDT")
    pdf_folder = os.path.join(dataset_results_folder, "PDF")
    os.makedirs(pdt_folder, exist_ok=True)
    os.makedirs(pdf_folder, exist_ok=True)

    # Path for PDT csv file
    pdt_metrics_csv_path = os.path.join(pdt_folder, "evaluation_metrics_pdt_test.csv")
    # saving this to PDT folder
    pd.DataFrame(evaluation_metrics_pdt.items(), columns=["Metric", "Value"]).to_csv(pdt_metrics_csv_path, index=False)

    # Path for PDT numpy file
    pdt_metrics_npy_path = os.path.join(pdt_folder, "evaluation_metrics_pdt_test.npy")
    #saving it to PDT folder
    np.save(pdt_metrics_npy_path, evaluation_metrics_pdt)

    # Convert evaluation_metrics to DataFrame
    training_data_df = pd.DataFrame(evaluation_metrics)
    # Path for Training metrics csv file
    training_metrics_csv_path = os.path.join(pdt_folder, "evaluation_metrics_training_data_of_top_paramaters.csv")
    # saving this to PDT folder
    training_data_df.to_csv(training_metrics_csv_path, index=False)

    # Path for Training metrics numpy file
    training_metrics_npy_path = os.path.join(pdt_folder, "evaluation_metrics_training_data_of_top_paramaters.npy")
    # saving this to PDT folder
    np.save(training_metrics_npy_path, evaluation_metrics)

    # Save parameters_list to CSV
    parameters_csv_path = os.path.join(pdt_folder, "top_parameters_list.csv")
    pd.DataFrame(parameters_list).to_csv(parameters_csv_path, index=False)

    # Save parameters_list to NumPy
    parameters_npy_path = os.path.join(pdt_folder, "top_parameters_list.npy")
    np.save(parameters_npy_path, parameters_list)

    # Save results_df to CSV
    results_csv_path = os.path.join(pdt_folder, "all_shuffleseed_with_depth.csv")
    results_df.to_csv(results_csv_path, index=False)

    # Save results_df to NumPy
    results_npy_path = os.path.join(pdt_folder, "all_shuffleseed_with_depth.npy")
    np.save(results_npy_path, results)

    # Path for PDF csv file
    pdf_metrics_csv_path = os.path.join(pdf_folder, "evaluation_metrics_pdf_test.csv")
    # saving this to PDT folder
    pd.DataFrame(evaluation_metrics_pdf.items(), columns=["Metric", "Value"]).to_csv(pdf_metrics_csv_path, index=False)

    # Path for PDF numpy file
    pdf_metrics_npy_path = os.path.join(pdf_folder, "evaluation_metrics_pdf_test.npy")
    #saving it to PDF folder
    np.save(pdf_metrics_npy_path, evaluation_metrics_pdf)

nyse_stock_data.data


  max_f1_score_per_group = results_df.groupby('Shuffle Seed')['Mean F1 Score'].transform(max)
  min_depth_for_max_f1 = max_f1_rows.groupby('Shuffle Seed').apply(lambda x: x.loc[x['Depth of Tree'].idxmin()])


Evaluation Metrics PDT:
Accuracy: 0.971830985915493
F1 Score: 0.9678442028985508
Precision: 0.9678442028985508
Recall: 0.9678442028985508
best_Shuffleseed_used_for_pdt: 1.0
Corresponding Depth: 2.0
Evaluation Metrics PDF:
Accuracy_test_pdf: 0.971830985915493
Precision_test_pdf: 0.96
F1 Score_test_pdf: 0.9685283687943262
Recall_test_pdf: 0.9791666666666667
timeseries.py
Dataset not found.


TypeError: Expected sequence or array-like, got <class 'NoneType'>

In [None]:
# import pandas as pd
# from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
# from sklearn.model_selection import train_test_split, TimeSeriesSplit
# from sklearn.utils import shuffle

# # Define the number of splits for time series split
# num_splits = 5

# # Fix the seed value for splitting data
# split_seed = 42

# # Define different seed values for shuffling training data
# shuffle_seed_values = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21]

# # Define the range of depths for the decision tree
# depth_range = range(2, 21)

# # Initialize a list to store the results
# results = []
# # declaring test size for time series split
# time_series_split_test_size = 15

# # _______________________________________________________________________________

# # Variable description:
# # _______________________________________________________________________________

# #     X               -   Data attributes.
# #     y               -   Corresponding labels for X.
# #     X_train         -   Data attributes for training (80% of the dataset).
# #     y_train         -   Corresponding labels for X_train.
# #     X_test          -   Data attributes for testing (20% of the dataset).
# #     y_test          -   Corresponding labels for X_test.
# # Split the data into training and testing sets with the fixed seed

# X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=split_seed)

# # Perform experiments for each seed value for shuffling training data
# for shuffle_seed in shuffle_seed_values:
#     # Shuffle only the training data with the current seed
#     X_train_shuffled, y_train_shuffled = shuffle(X_train, y_train, random_state=shuffle_seed)

#     # Perform time series split
#     tscv = TimeSeriesSplit(n_splits=num_splits,test_size=time_series_split_test_size)

#     # Iterate over different depths for the decision tree
#     for depth in depth_range:
#         # Initialize lists to store evaluation metrics for each fold
#         accuracies = []
#         f1_scores = []
#         precisions = []
#         recalls = []

#         # Perform time series split
#         for split_num, (train_index, test_index) in enumerate(tscv.split(X_train_shuffled), 1):
#             X_cv_train, X_cv_test = X_train_shuffled[train_index], X_train_shuffled[test_index]
#             y_cv_train, y_cv_test = y_train_shuffled[train_index], y_train_shuffled[test_index]

#             # Train the decision tree model with the current depth
#             tree_classifier = DecisionTreeClassifier(max_depth=depth)
#             tree_classifier.fit(X_cv_train, y_cv_train)

#             # Make predictions
#             y_pred = tree_classifier.predict(X_cv_test)

#             # Calculate evaluation metrics for this fold
#             accuracy = accuracy_score(y_cv_test, y_pred)
#             f1 = f1_score(y_cv_test, y_pred, average='macro')  # Use macro F1 score
#             precision = precision_score(y_cv_test, y_pred, average='macro', zero_division = 1)
#             recall = recall_score(y_cv_test, y_pred, average='macro', zero_division = 1)

#             # Append metrics to the lists
#             accuracies.append(accuracy)
#             f1_scores.append(f1)
#             precisions.append(precision)
#             recalls.append(recall)

#         # Calculate mean metrics across all folds
#         mean_accuracy = sum(accuracies) / len(accuracies)
#         mean_f1 = sum(f1_scores) / len(f1_scores)
#         mean_precision = sum(precisions) / len(precisions)
#         mean_recall = sum(recalls) / len(recalls)

#         # Store the results for this seed value and depth
#         results.append({
#             'Split Seed': split_seed,
#             'Shuffle Seed': shuffle_seed,
#             'Depth of Tree': depth,
#             'Mean Accuracy': mean_accuracy,
#             'Mean F1 Score': mean_f1,
#             'Mean Precision': mean_precision,
#             'Mean Recall': mean_recall
#         })

# # Create a DataFrame from the results
# results_df = pd.DataFrame(results)

In [None]:
# import pandas as pd
# import numpy as np

# # Group by 'Shuffle Seed' and find the maximum F1 score within each group
# max_f1_score_per_group = results_df.groupby('Shuffle Seed')['Mean F1 Score'].transform(max)

# # Filter the DataFrame to keep rows where 'Mean F1 Score' is equal to the maximum F1 score within each group
# max_f1_rows = results_df[results_df['Mean F1 Score'] == max_f1_score_per_group]

# # Group the max_f1_rows DataFrame by 'Shuffle Seed' and find the row with the minimum 'Depth of Tree' within each group
# min_depth_for_max_f1 = max_f1_rows.groupby('Shuffle Seed').apply(lambda x: x.loc[x['Depth of Tree'].idxmin()])

# # Extracting desired columns
# selected_columns = min_depth_for_max_f1[['Split Seed', 'Shuffle Seed', 'Depth of Tree']]

# # # Printing the extracted values
# # print("Shuffle seed values with corresponding depth for maximum F1 score:")
# # for _, row in selected_columns.iterrows():
# #     print(f"{{'Split Seed': {row['Split Seed']}, 'Shuffle Seed': {row['Shuffle Seed']}, 'Depth': {row['Depth of Tree']}}}")

# # Convert the selected columns to a numpy array and save it
# parameters_array = selected_columns.to_numpy()

# # Convert each row of the numpy array to a dictionary
# parameters_list = [{'Split Seed': row[0], 'Shuffle Seed': row[1], 'Depth': row[2]} for row in parameters_array]

In [None]:
# import pandas as pd
# from sklearn.metrics import accuracy_score, f1_score, precision_score
# from sklearn.utils import shuffle
# from sklearn.model_selection import train_test_split
# import numpy as np

# # Initialize a list to store the evaluation metrics for each model
# evaluation_metrics = []

# # Iterate over the top parameters
# for i, params in enumerate(parameters_list, start=1):
#     split_seed = int(params['Split Seed'])  # Convert to integer
#     shuffle_seed = int(params['Shuffle Seed'])
#     max_depth = int(params['Depth'])

#     # Split the dataset
#     X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=split_seed)

#     # Shuffle only the training data with the fixed seed
#     X_train_shuffled, y_train_shuffled = shuffle(X_train, y_train, random_state=shuffle_seed)

#     # Train the decision tree model with the entire training data
#     tree_classifier = DecisionTreeClassifier(max_depth=max_depth)
#     tree_classifier.fit(X_train_shuffled, y_train_shuffled)

#     #  # Printing the decision tree
#     # print(f"Model {i} Decision Tree:")
#     # tree_classifier.print_tree()
#     # print()  # empty line for readability

#     # Make predictions on the training data
#     y_pred_train_data = tree_classifier.predict(X_train_shuffled)

#     # Calculate evaluation metrics
#     accuracy_train = accuracy_score(y_train_shuffled, y_pred_train_data)
#     f1_train = f1_score(y_train_shuffled, y_pred_train_data, average='macro')
#     precision_train = precision_score(y_train_shuffled, y_pred_train_data, average='macro',zero_division=1)

#     # Append the evaluation metrics to the list
#     evaluation_metrics.append({
#         'Model': i,
#         'Split Seed': split_seed,
#         'Shuffle Seed': shuffle_seed,
#         'Max Depth': max_depth,
#         'Accuracy_train': accuracy_train,
#         'F1 Score_train': f1_train,
#         'Precision_train': precision_train
#     })

# # # Display the evaluation metrics for each model
# # for metrics in evaluation_metrics:
# #     print(f"Model {metrics['Model']} Evaluation Metrics:")
# #     print(metrics)

In [None]:
# import os
# # Find the row with the maximum F1 score
# max_f1_row = results_df.loc[results_df['Mean F1 Score'].idxmax()]

# # Extract the shuffle seed value with maximum F1 score
# shuffle_seed_with_max_f1_score = max_f1_row['Shuffle Seed']

# # Find the index of the row with shuffle_seed_with_max_f1_score in parameters list
# max_f1_index = None
# for idx, params in enumerate(parameters_list):
#     if params['Shuffle Seed'] == shuffle_seed_with_max_f1_score:
#         max_f1_index = idx
#         break

# if max_f1_index is not None:
#     max_f1_params = parameters_list[max_f1_index]

# # Finding the depth corresponding to the shuffle seed value
# corresponding_depth = max_f1_params['Depth']

# # # Print the chosen shuffle seed value
# # print("Chosen Shuffle Seed for Training:", shuffle_seed_with_max_f1_score)
# # print("Corresponding depth:", corresponding_depth)

# # Split the data into training and testing sets using the chosen shuffle seed value
# X_train_pdt, X_test_pdt, y_train_pdt, y_test_pdt = train_test_split(X, Y, test_size=0.2, random_state=split_seed)

# # Shuffle only the training data with the chosen shuffle seed value
# X_train_shuffled_pdt, y_train_shuffled_pdt = shuffle(X_train_pdt, y_train_pdt, random_state=int(shuffle_seed_with_max_f1_score))

# # Train a decision tree classifier with the chosen depth
# classifier_pdt = DecisionTreeClassifier(max_depth=corresponding_depth)
# classifier_pdt.fit(X_train_shuffled_pdt, y_train_shuffled_pdt)

# # Predict labels for the testing data
# y_pred_pdt = classifier_pdt.predict(X_test_pdt)

# # Calculate evaluation metrics
# accuracy_pdt = accuracy_score(y_test_pdt, y_pred_pdt)
# f1_score_pdt = f1_score(y_test_pdt, y_pred_pdt, average='macro')
# precision_pdt = precision_score(y_test_pdt, y_pred_pdt, average='macro', zero_division=1)
# recall_pdt = recall_score(y_test_pdt, y_pred_pdt, average='macro', zero_division=1)

# # Print the evaluation metrics
# print("Evaluation Metrics:")
# print(f"Accuracy: {accuracy_pdt}")
# print(f"F1 Score: {f1_score_pdt}")
# print(f"Precision: {precision_pdt}")
# print(f"Recall: {recall_pdt}")

# # Creating this because we want to save the result in form of csv and numpy
# evaluation_metrics_pdt = {
#     "Accuracy": accuracy_pdt,
#     "F1 Score": f1_score_pdt,
#     "Precision": precision_pdt,
#     "Recall": recall_pdt
# }


In [None]:
# import numpy as np
# from sklearn.metrics import accuracy_score, f1_score, precision_score
# from sklearn.utils import shuffle
# from sklearn.model_selection import train_test_split

# # Initialize a list to store the top shuffle seeds along with their corresponding depths
# top_shuffle_seeds = []

# # Iterate over the top parameters
# for params in parameters_list:
#     split_seed = int(params['Split Seed'])
#     shuffle_seed = int(params['Shuffle Seed'])
#     depth = int(params['Depth'])
#     top_shuffle_seeds.append({'Split Seed':split_seed, 'Shuffle Seed': shuffle_seed, 'Depth': depth})

# # Load the dataset
# X_train_pdf, X_test_pdf, y_train_pdf, y_test_pdf = train_test_split(X, Y, test_size=0.2, random_state=split_seed)

# # Create an empty matrix to store predictions
# predictions_matrix = np.zeros((len(X_test), len(top_shuffle_seeds)), dtype=int)

# # Iterate over each test sample
# # for i, x_test_sample in enumerate(X_test):
#     # Make predictions using each model
# for j, params in enumerate(top_shuffle_seeds):
#     split_seed = (params['Split Seed'])
#     shuffle_seed = params['Shuffle Seed']
#     max_depth = params['Depth']

#     # Shuffle only the training data with the fixed seed
#     X_train_shuffled_pdf, y_train_shuffled_pdf = shuffle(X_train_pdf, y_train_pdf, random_state=shuffle_seed)

#     # Train the decision tree model with the entire training data
#     tree_classifier = DecisionTreeClassifier(max_depth=max_depth)
#     tree_classifier.fit(X_train_shuffled_pdf, y_train_shuffled_pdf)

#     # Make prediction for the current test sample using the trained model
#     prediction = tree_classifier.predict(X_test_pdf)

#     # Store the prediction in the matrix
#     predictions_matrix[:, j] = prediction

# # Initialize lists to store final predictions and evaluation metrics
# final_predictions = []
# accuracies = []
# precisions = []
# f1_scores = []

# # Iterate over each row in the predictions matrix
# for i in range(predictions_matrix.shape[0]):
#     # Find the majority occurring element in the row
#     majority_prediction = np.bincount(predictions_matrix[i,:]).argmax()

#     # Append the majority prediction to the final predictions list
#     final_predictions.append(majority_prediction)
# final_prediction = predictions_matrix[:,-1]
# # Calculate accuracy, precision, F1-score and recall using the majority prediction and actual test label
# accuracy_test_pdf = accuracy_score(y_test_pdf, final_predictions)
# precision_test_pdf = precision_score(y_test_pdf, final_predictions, average='macro', zero_division=1)
# f1_test_pdf = f1_score(y_test_pdf, final_predictions, average='macro')
# recall_test_pdf = recall_score(y_test_pdf, final_predictions, average='macro', zero_division=1)



# # Print the final evaluation metrics
# print("Overall Evaluation Metrics:")
# print(f"Accuracy_test_pdf: {accuracy_test_pdf}")
# print(f"Precision_test_pdf: {precision_test_pdf}")
# print(f"F1 Score_test_pdf: {f1_test_pdf}")
# print(f"Recall_test_pdf: {recall_test_pdf}")

# # Creating this because we want to save the result in form of csv and numpy
# evaluation_metrics_pdf = {
#     "Accuracy": accuracy_test_pdf,
#     "F1 Score": f1_test_pdf,
#     "Precision": precision_test_pdf,
#     "Recall": recall_test_pdf
# }


In [None]:
# import os
# # Get the current working directory
# current_directory = os.getcwd()

# # Define the path to the results folder
# results_folder = os.path.join(current_directory, "results")

# # Define the dataset name
# dataset_name = dataset_name

# # Create a folder for the current dataset within the results directory
# dataset_results_folder = os.path.join(results_folder, dataset_name)
# os.makedirs(dataset_results_folder, exist_ok=True)


# # Create folders for PDT and pdf within the dataset results folder
# pdt_folder = os.path.join(dataset_results_folder, "PDT")
# pdf_folder = os.path.join(dataset_results_folder, "PDF")
# os.makedirs(pdt_folder, exist_ok=True)
# os.makedirs(pdf_folder, exist_ok=True)

# # Path for PDT csv file
# pdt_metrics_csv_path = os.path.join(pdt_folder, "evaluation_metrics_pdt_test.csv")
# # saving this to PDT folder
# pd.DataFrame(evaluation_metrics_pdt.items(), columns=["Metric", "Value"]).to_csv(pdt_metrics_csv_path, index=False)

# # Path for PDT numpy file
# pdt_metrics_npy_path = os.path.join(pdt_folder, "evaluation_metrics_pdt_test.npy")
# #saving it to PDT folder
# np.save(pdt_metrics_npy_path, evaluation_metrics_pdt)

# # Save parameters_list to CSV
# parameters_csv_path = os.path.join(pdt_folder, "top_parameters_list.csv")
# pd.DataFrame(parameters_list).to_csv(parameters_csv_path, index=False)

# # Save parameters_list to NumPy
# parameters_npy_path = os.path.join(pdt_folder, "top_parameters_list.npy")
# np.save(parameters_npy_path, parameters_list)

# # Save results_df to CSV
# results_csv_path = os.path.join(pdt_folder, "all_shuffleseed_with_depth.csv")
# results_df.to_csv(results_csv_path, index=False)

# # Save results_df to NumPy
# results_npy_path = os.path.join(pdt_folder, "all_shuffleseed_with_depth.npy")
# np.save(results_npy_path, results)

# # Path for PDF csv file
# pdf_metrics_csv_path = os.path.join(pdf_folder, "evaluation_metrics_pdf_test.csv")
# # saving this to PDT folder
# pd.DataFrame(evaluation_metrics_pdf.items(), columns=["Metric", "Value"]).to_csv(pdf_metrics_csv_path, index=False)

# # Path for PDF numpy file
# pdf_metrics_npy_path = os.path.join(pdf_folder, "evaluation_metrics_pdf_test.npy")
# #saving it to PDF folder
# np.save(pdf_metrics_npy_path, evaluation_metrics_pdf)