In [1]:
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split, TimeSeriesSplit
from sklearn.ensemble import RandomForestClassifier
import os
import numpy as np
from load import load
# Get the current working directory
current_directory = os.getcwd()
# Define the path to the datasets folder
datasets_folder = os.path.join(current_directory, "datasets")
# datasets = [dataset for dataset in os.listdir(datasets_folder) if not (dataset.startswith('.') or dataset in exclude_datasets)]
datasets = [dataset for dataset in os.listdir(datasets_folder) if not (dataset.startswith('.'))]

for dataset_name in datasets:
    # importing dataset
    print(dataset_name)
    X, Y = load(dataset_name)
        # Define the number of splits for time series split
    num_splits = 5

    # Fix the seed value for splitting data
    split_seed = 42

    # Define the range of depths for the decision tree
    depth_range = range(2, 21)

    # Initialize a list to store the results
    results = []

    # Define the range of estimators for the random forest
    estimators_range = [10, 50, 100, 150, 1000]

    # declaring test size for time series split
    time_series_split_test_size = 15

    # Perform time series split
    tscv = TimeSeriesSplit(n_splits=num_splits, test_size=time_series_split_test_size)

    # _______________________________________________________________________________

    # Variable description:
    # _______________________________________________________________________________

    #     X               -   Data attributes.
    #     y               -   Corresponding labels for X.
    #     X_train         -   Data attributes for training (80% of the dataset).
    #     y_train         -   Corresponding labels for X_train.
    #     X_test          -   Data attributes for testing (20% of the dataset).
    #     y_test          -   Corresponding labels for X_test.


    # Split the data into training and testing sets with the fixed seed
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=split_seed)

    # Iterate over different numbers of estimators for Random Forest
    for n_estimators in estimators_range:
        # Iterate over different depths for the decision tree
        for depth in depth_range:
            # Initialize lists to store evaluation metrics for each fold
            accuracies = []
            f1_scores = []
            precisions = []
            recalls = []

            # Perform time series split
            for train_index, test_index in tscv.split(X_train):
                X_cv_train, X_cv_test = pd.DataFrame(X_train).iloc[train_index], pd.DataFrame(X_train).iloc[test_index]
                y_cv_train, y_cv_test = pd.DataFrame(y_train).iloc[train_index], pd.DataFrame(y_train).iloc[test_index]

                # Train the Random Forest model with the current number of estimators and depth
                rf_classifier = RandomForestClassifier(criterion='gini', n_estimators=n_estimators, max_depth=depth, random_state= 21)
                y_cv_train = np.ravel(y_cv_train)
                rf_classifier.fit(X_cv_train, y_cv_train)

                # Make predictions
                y_pred = rf_classifier.predict(X_cv_test)

                # Calculate evaluation metrics for this fold
                accuracy = accuracy_score(y_cv_test, y_pred)
                f1 = f1_score(y_cv_test, y_pred, average='macro')  # Use macro F1 score
                precision = precision_score(y_cv_test, y_pred, average='macro', zero_division=1)
                recall = recall_score(y_cv_test, y_pred, average='macro', zero_division=1)

                # Append metrics to the lists
                accuracies.append(accuracy)
                f1_scores.append(f1)
                precisions.append(precision)
                recalls.append(recall)

            # Calculate mean metrics across all folds
            mean_accuracy = sum(accuracies) / len(accuracies)
            mean_f1 = sum(f1_scores) / len(f1_scores)
            mean_precision = sum(precisions) / len(precisions)
            mean_recall = sum(recalls) / len(recalls)

            # Store the results for this seed value and depth
            results.append({
                'Split Seed': split_seed,
                'N_estimators': n_estimators,
                'Depth of Tree': depth,
                'Mean Accuracy': mean_accuracy,
                'Mean F1 Score': mean_f1,
                'Mean Precision': mean_precision,
                'Mean Recall': mean_recall
            })

    # Create a DataFrame from the results
    results_df = pd.DataFrame(results)
    # Group by 'N_estimators' and find the row with maximum 'Mean F1 Score' for each group
    max_f1_per_estimator = results_df.loc[results_df.groupby('N_estimators')['Mean F1 Score'].idxmax()]

    # Find the row with the maximum 'Mean F1 Score' across all 'N_estimators'
    max_f1_row = max_f1_per_estimator.loc[max_f1_per_estimator['Mean F1 Score'].idxmax()]

    # Extract the maximum n_estimator and corresponding depth with the minimum value
    max_n_estimator = max_f1_row['N_estimators']
    max_depth_for_estimator = max_f1_per_estimator.loc[max_f1_per_estimator['N_estimators'] == max_n_estimator, 'Depth of Tree'].min()

    print("\nMaximum n_estimator and corresponding depth:")
    print("n_estimator:", max_n_estimator)
    print("Depth:", max_depth_for_estimator) 

        # Train Random Forest classifier with the best parameters
    best_rf_classifier = RandomForestClassifier(criterion='gini',n_estimators=int(max_n_estimator), max_depth=max_depth_for_estimator, random_state= 21)
    y_train = np.ravel(y_train)
    best_rf_classifier.fit(X_train, y_train)

    # Make predictions on the test set
    y_pred_test_rf = best_rf_classifier.predict(X_test)

    # Calculate evaluation metrics
    accuracy_test_rf = accuracy_score(y_test, y_pred_test_rf)
    precision_test_rf = precision_score(y_test, y_pred_test_rf, average='macro', zero_division=1)
    recall_test_rf = recall_score(y_test, y_pred_test_rf, average='macro', zero_division=1)
    f1_test_rf = f1_score(y_test, y_pred_test_rf, average='macro')

    print("\nPerformance metrics on test set:")
    print("Accuracy_test_rf:", accuracy_test_rf)
    print("F1 Score_test_rf:", f1_test_rf)
    print("Precision_test_rf:", precision_test_rf)
    print("Recall_test_rf:", recall_test_rf)

    # Creating this because we want to save the result in form of csv and numpy
    evaluation_metrics_rf_test = {
        "Accuracy": accuracy_test_rf,
        "F1 Score": f1_test_rf,
        "Precision": precision_test_rf,
        "Recall": recall_test_rf,
        "best_N_estimator_":max_n_estimator,
        "Corresponding_depth" : max_depth_for_estimator
    }

    # Define the path to the results folder
    results_folder = os.path.join(current_directory, "results")

    # Define the dataset name
    dataset_name = dataset_name

    # Create a folder for the current dataset within the results directory
    dataset_results_folder = os.path.join(results_folder, dataset_name)
    os.makedirs(dataset_results_folder, exist_ok=True)

    # Create folder for random forest results
    rf_folder = os.path.join(dataset_results_folder, "RF_Gini")
    os.makedirs(rf_folder, exist_ok=True)

    # Save results_df to CSV
    results_csv_path = os.path.join(rf_folder, "all_nestimator_with_depth.csv")
    results_df.to_csv(results_csv_path, index=False)

    # Save results_rf_gini to NumPy
    results_npy_path = os.path.join(rf_folder, "all_nestimator_with_depth.npy")
    np.save(results_npy_path, results)

    # Path for RF csv file
    rf_metrics_csv_path = os.path.join(rf_folder, "evaluation_metrics_rf_test.csv")
    # saving this to DT folder
    pd.DataFrame(evaluation_metrics_rf_test.items(), columns=["Metric", "Value"]).to_csv(rf_metrics_csv_path, index=False)

    # Path for RF numpy file
    rf_metrics_npy_path = os.path.join(rf_folder, "evaluation_metrics_rf_test.npy")
    # saving it to RF_Gini folder
    np.save(rf_metrics_npy_path, evaluation_metrics_rf_test)

nyse_stock_data.data

Maximum n_estimator and corresponding depth:
n_estimator: 10.0
Depth: 4

Performance metrics on test set:
Accuracy_test_rf: 0.9859154929577465
F1 Score_test_rf: 0.9840985442329226
Precision_test_rf: 0.9791666666666667
Recall_test_rf: 0.9895833333333333
timeseries.py
Dataset not found.


TypeError: Expected sequence or array-like, got <class 'NoneType'>