## Import the libraries

In [None]:
import argparse
import json
import os
import sys
import traceback
import warnings

import joblib
import numpy as np
import pandas as pd
import yaml
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    recall_score,
    f1_score,
    precision_score,
    classification_report,
)
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from tqdm import tqdm
from xgboost import XGBClassifier

## Utility constants and helper functions

In [None]:
# The four-letter alphabet every longer table below is derived from.
single_nucleosides = ["A", "C", "G", "T"]

# All 16 two-letter combinations in lexicographic order ("AA", "AC", ..., "TT").
di_nucleosides = [
    first + second for first in single_nucleosides for second in single_nucleosides
]

# All 64 three-letter combinations: each di-nucleoside extended by one letter,
# which keeps the lexicographic order ("AAA", "AAC", ..., "TTT").
tri_nucleosides = [
    prefix + letter for prefix in di_nucleosides for letter in single_nucleosides
]


# All 256 four-letter combinations, built the same way from the 64 above
# ("AAAA", "AAAC", ..., "TTTT").
tetra_nucleosides = [
    prefix + letter for prefix in tri_nucleosides for letter in single_nucleosides
]


def dump(value=None, filename=None):
    """Persist ``value`` to ``filename`` using joblib.

    Args:
        value: Object to serialise; must not be ``None``.
        filename: Destination path; must not be ``None``.

    Raises:
        ValueError: If either ``value`` or ``filename`` is ``None``.
    """
    if value is None or filename is None:
        raise ValueError("Value & Filename should be passed.".capitalize())

    joblib.dump(value=value, filename=filename)


def load(filename=None):
    """Load and return the object stored in ``filename`` by joblib.

    Args:
        filename: Path of the joblib file; must not be ``None``.

    Raises:
        ValueError: If ``filename`` is ``None``.
    """
    if filename is None:
        raise ValueError(
            "Filename should be passed in an appropriate manner".capitalize()
        )

    return joblib.load(filename=filename)


def config():
    """Read ``../config.yml`` and return its parsed YAML content.

    The path is relative to the current working directory and the file is
    re-read on every call.
    """
    with open("../config.yml", "r") as stream:
        settings = yaml.safe_load(stream)

    return settings


def hyperparameter_tuning(model: str = "RF"):
    """Return the grid-search parameter grid for one of the supported models.

    Args:
        model: Model key: ``"RF"``, ``"DT"``, ``"LR"``, ``"XGB"`` or ``"NB"``.

    Returns:
        A new dict mapping estimator parameter names to the candidate values
        to try.

    Raises:
        ValueError: If ``model`` is not one of the supported keys.
    """
    grids = {
        "RF": {
            "n_estimators": [100, 200, 300],
            "criterion": ["gini", "entropy"],
            "max_features": ["sqrt", "log2"],
        },
        "DT": {
            "criterion": ["gini", "entropy"],
            "max_depth": [None, 10, 20, 30],
            "min_samples_split": [2, 5],
            "min_samples_leaf": [1, 2],
        },
        # The default solver only supports the "l2" penalty, so "l1" and
        # "elasticnet" candidates could never be fitted (and the string
        # "none" is rejected by recent scikit-learn releases). "saga" is
        # pinned because it supports both "l1" and "l2".
        "LR": {
            "penalty": ["l1", "l2"],
            "solver": ["saga"],
            "C": [0.001, 0.01, 0.1, 1, 10],
            "max_iter": [100, 200, 300],
        },
        "XGB": {
            "learning_rate": [0.01, 0.1, 1],
            "max_depth": [3, 5, 7],
            "n_estimators": [100, 200, 300],
        },
        # The "NB" model in this file is MultinomialNB, which has no
        # ``var_smoothing`` parameter (that belongs to GaussianNB); tune its
        # own smoothing parameters instead.
        "NB": {
            "alpha": [0.01, 0.1, 0.5, 1.0],
            "fit_prior": [True, False],
        },
    }

    try:
        return grids[model]
    except (KeyError, TypeError):
        # TypeError covers unhashable keys, which are just as unsupported.
        raise ValueError(
            "The model name is not supported. Please check the model name and try again".capitalize()
        ) from None

## Feature Generator for DNA-Sequence

In [None]:
# Silence every warning process-wide from this point on.
warnings.filterwarnings("ignore")


class FeatureGenerator:
    """Turn the ``sequence`` column of the raw DNA dataset into numeric features.

    The raw data is read from ``../data/raw/DNA-Classification.csv`` when the
    object is created. ``feature_generator`` then appends the requested
    feature columns to ``self.dataset`` and writes the result to
    ``processed_dataset.csv`` inside the configured processed folder.

    Args:
        approaches: Feature families to build. Recognised entries are
            ``"single"``, ``"di"``, ``"tri"``, ``"tetra"`` and
            ``"gc-content"`` (the spelling ``"gc_content"`` is accepted as
            well). ``None`` selects all of them.
    """

    def __init__(self, approaches: list = None):
        # Build the default per instance: a list literal in the signature
        # would be one shared, mutable object for every FeatureGenerator.
        self.approaches = (
            ["single", "di", "tri", "tetra", "gc-content"]
            if approaches is None
            else approaches
        )

        self.X = list()
        self.y = list()

        self.GC_Content = list()

        # Only the first five rows are used because generating the features
        # for the whole dataset takes a very long time.
        self.dataset = pd.read_csv("../data/raw/DNA-Classification.csv")[0:5]

    @staticmethod
    def _one_hot_kmers(sequences, kmers, size, suffix="", index=None, progress=None):
        """Return a 0/1 frame marking which k-mer starts at each position.

        Args:
            sequences: Iterable of sequence strings.
            kmers: Ordered k-mers to flag; each must be ``size`` characters.
            size: Length of the k-mers.
            suffix: Text appended to every column name.
            index: Row index for the returned frame (default: a RangeIndex).
            progress: Optional callable wrapping the row iterable, e.g. ``tqdm``.

        Returns:
            A ``DataFrame`` with one ``"<kmer>_pos_<position><suffix>"`` column
            per (position, k-mer) pair, ordered by position and then by the
            order of ``kmers``. Windows that match none of ``kmers`` leave
            their columns at 0.
        """
        sequences = list(sequences)
        longest = max((len(sequence) for sequence in sequences), default=0)
        n_positions = max(longest - size + 1, 0)
        width = len(kmers)

        columns = [
            f"{kmer}_pos_{pos}{suffix}"
            for pos in range(n_positions)
            for kmer in kmers
        ]
        # A dict lookup replaces comparing every window against every k-mer.
        slot_of = {kmer: slot for slot, kmer in enumerate(kmers)}

        # Fill one matrix and build the frame once; inserting columns one by
        # one and writing single cells through .loc is what made this slow.
        flags = np.zeros((len(sequences), n_positions * width), dtype=np.int64)

        rows = sequences if progress is None else progress(sequences)
        for row, sequence in enumerate(rows):
            for pos in range(len(sequence) - size + 1):
                slot = slot_of.get(sequence[pos : pos + size])
                if slot is not None:
                    flags[row, pos * width + slot] = 1

        return pd.DataFrame(flags, columns=columns, index=index)

    def feature_generator(self):
        """Append the requested feature columns and save the processed dataset.

        For each selected k-mer family, one 0/1 column per (position, k-mer)
        pair is added; ``"gc-content"`` adds a ``GC-Content`` column holding
        the fraction of ``G``/``C`` characters (0 for an empty sequence).

        A failure while saving is printed together with its traceback and is
        not re-raised.
        """
        sequences = self.dataset["sequence"].tolist()

        # (approach key, k-mer table, k-mer length, column-name suffix)
        kmer_families = [
            ("single", single_nucleosides, 1, ""),
            ("di", di_nucleosides, 2, "_di_nucleoside"),
            ("tri", tri_nucleosides, 3, "_tri_nucleoside"),
            ("tetra", tetra_nucleosides, 4, "_tetra_nucleoside"),
        ]

        generated = [
            self._one_hot_kmers(
                sequences,
                kmers,
                size,
                suffix,
                index=self.dataset.index,
                progress=tqdm,
            )
            for approach, kmers, size, suffix in kmer_families
            if approach in self.approaches
        ]

        # DataLoader spells this approach "gc_content", so accept both forms.
        if "gc-content" in self.approaches or "gc_content" in self.approaches:
            self.GC_Content = [
                (
                    (sequence.count("G") + sequence.count("C")) / len(sequence)
                    if len(sequence) > 0
                    else 0
                )
                for sequence in tqdm(sequences)
            ]
            generated.append(
                pd.DataFrame(
                    {"GC-Content": self.GC_Content}, index=self.dataset.index
                )
            )

        if generated:
            features = pd.concat(generated, axis=1)
            # Drop stale copies first so a repeated call replaces the feature
            # columns instead of duplicating them.
            untouched = self.dataset.drop(columns=features.columns, errors="ignore")
            self.dataset = pd.concat([untouched, features], axis=1)

        try:
            processed_path = config()["path"]["processed_path"]
            # The index column is written on purpose: DataLoader.split_dataset
            # selects columns of this file by position.
            self.dataset.to_csv(
                os.path.join(processed_path, "processed_dataset.csv")
            )
        except Exception as e:
            print(
                "Cannot saved the dataset in the processed file, & error: {}".capitalize().format(
                    e
                )
            )
            traceback.print_exc()
        else:
            print(
                "the dataset stored in the {} folder".format(
                    processed_path
                ).capitalize()
            )


if __name__ == "__main__":
    # Demo run: build only the per-position single-nucleoside features and
    # save the processed dataset.
    generator = FeatureGenerator(approaches=["single"])
    generator.feature_generator()

## Dataloader

In [None]:
class DataLoader:
    """Generate, split and summarise the processed DNA dataset.

    Args:
        dataset: Path of the raw dataset. It is stored on the instance but is
            not read by any method of this class.
        split_size: Fraction of the rows placed in the test split.
        approaches: Feature families forwarded to ``FeatureGenerator``.
            ``None`` selects ``["single", "di", "tri", "tetra", "gc-content"]``.
    """

    def __init__(
        self,
        dataset=None,
        split_size: float = 0.20,
        approaches: list = None,
    ):
        self.dataset = dataset
        self.split_size = split_size
        # Built per instance to avoid a shared mutable default. The last entry
        # is spelled "gc-content" because that is the key FeatureGenerator
        # checks; the former "gc_content" default never matched it.
        self.approaches = (
            ["single", "di", "tri", "tetra", "gc-content"]
            if approaches is None
            else approaches
        )

    def split_dataset(self):
        """Split ``processed_dataset.csv`` into train/test parts and save them.

        The four parts are written as ``X_train.csv``, ``X_test.csv``,
        ``y_train.csv`` and ``y_test.csv`` into the processed folder.

        Returns:
            A dict with the keys ``X_train``, ``X_test``, ``y_train`` and
            ``y_test``, or ``None`` when the processed folder does not exist.
        """
        processed_path = config()["path"]["processed_path"]
        if not os.path.exists(processed_path):
            return None

        self.processed_data = pd.read_csv(
            os.path.join(processed_path, "processed_dataset.csv")
        )

        # NOTE(review): features and target are selected by column position;
        # presumably column 3 is "labels" and everything after it is a
        # feature. Verify against the layout of the processed CSV.
        X = self.processed_data.iloc[:, 4:]
        y = self.processed_data.iloc[:, 3]

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=self.split_size, random_state=42
        )

        splits = {
            "X_train": X_train,
            "X_test": X_test,
            "y_train": y_train,
            "y_test": y_test,
        }

        for split_name, split_data in splits.items():
            split_data.to_csv(
                os.path.join(processed_path, f"{split_name}.csv"),
                index=False,
            )

        print(
            "Training and testing dataset is stored in the folder {}".format(
                processed_path
            )
        )

        return splits

    def feature_generator(self):
        """Run ``FeatureGenerator`` with the configured approaches.

        Any exception raised during generation is printed, not re-raised.

        Raises:
            ValueError: If ``approaches`` is not a list.
        """
        if not isinstance(self.approaches, list):
            raise ValueError("Approaches must be a list".capitalize())

        self.generator = FeatureGenerator(approaches=self.approaches)

        try:
            self.generator.feature_generator()
        except Exception as e:
            print(f"An error occurred: {e}".capitalize())
        else:
            print(
                "Feature generation completed successfully and store in the folder {}".format(
                    os.path.join(config()["path"]["processed_path"])
                )
            )

    @staticmethod
    def dataset_history():
        """Write a one-row summary of the processed dataset.

        The summary (NaN presence, feature and row counts, shape and label
        counts) is saved as ``dataset_history.csv`` in the configured files
        folder. Nothing happens when the processed folder does not exist.
        """
        paths = config()["path"]
        if not os.path.exists(paths["processed_path"]):
            return

        dataset = pd.read_csv(
            os.path.join(paths["processed_path"], "processed_dataset.csv")
        )

        information = {}

        information["isNaN".title()] = (
            "NaN".capitalize()
            if dataset.isnull().sum().sum() > 0
            else "no NaN".capitalize()
        )

        information["total_features".title()] = str(dataset.shape[1])
        information["total_instances".title()] = str(dataset.shape[0])
        information["dataset_shape".title()] = str(dataset.shape)
        information["target_ratio".title()] = str(
            dataset["labels"].value_counts(ascending=False).to_dict()
        )

        pd.DataFrame(information, index=[0]).to_csv(
            os.path.join(paths["files_path"], "dataset_history.csv")
        )

        print(
            "Dataset history is stored in the folder {}".format(paths["files_path"])
        )


if __name__ == "__main__":
    loader = DataLoader(
        dataset="../data/raw/DNA-Classification.csv",
        split_size=0.2,
        # Only "single" is enabled here: "di", "tri" and "tetra" add a very
        # large number of columns and take much longer to generate.
        approaches=["single"],
    )

    loader.feature_generator()

    # Write X_train/X_test/y_train/y_test; the helper functions further down
    # read these files from the processed folder.
    loader.split_dataset()

    DataLoader.dataset_history()

## Define the ML models

In [None]:
class MachineLearningModel:
    """Factory that builds an untrained classifier from a short model key.

    Args:
        model: One of ``"RF"``, ``"DT"``, ``"NB"``, ``"LR"`` or ``"XGB"``.
    """

    def __init__(self, model: str = "RF"):
        self.model = model

    def define_model(self):
        """Return a new classifier with default settings for ``self.model``.

        Raises:
            TypeError: If ``self.model`` is not a supported key.
        """
        key = self.model

        # Reject unknown keys before touching the estimator table.
        if key not in ("RF", "DT", "NB", "LR", "XGB"):
            raise TypeError(
                "Select the appropriate machine learning model to train the model".capitalize()
            )

        estimators = {
            "RF": RandomForestClassifier,
            "DT": DecisionTreeClassifier,
            "NB": MultinomialNB,
            "LR": LogisticRegression,
            "XGB": XGBClassifier,
        }
        return estimators[key]()


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Define the model for the DNA-Classification".title()
    )
    parser.add_argument(
        "--model",
        type=str,
        default=config()["model"]["model_name"],
        choices=["RF", "DT", "NB", "LR", "XGB"],
        help="Define the model name for the model".capitalize(),
    )

    # The parser used to be built but never parsed, so --model was ignored.
    # parse_known_args is used instead of parse_args so that the extra
    # arguments a notebook kernel places in sys.argv do not abort the cell.
    args, _ = parser.parse_known_args()

    model = MachineLearningModel(model=args.model)
    model.define_model()

## Helper methods

In [None]:
def dataset_initialization():
    """Load the saved train/test splits from the processed folder.

    Returns:
        A dict with the four frames read from ``X_train.csv``, ``X_test.csv``,
        ``y_train.csv`` and ``y_test.csv``, plus ``training_dataset`` and
        ``testing_dataset``: each feature frame joined column-wise with its
        target frame and then stripped of its last column.
    """
    processed_dir = config()["path"]["processed_path"]

    frames = {
        split: pd.read_csv(os.path.join(processed_dir, f"{split}.csv"))
        for split in ("X_train", "X_test", "y_train", "y_test")
    }

    # Join features and target side by side, then drop the final column.
    frames["training_dataset"] = pd.concat(
        [frames["X_train"], frames["y_train"]], axis=1
    ).iloc[:, :-1]
    frames["testing_dataset"] = pd.concat(
        [frames["X_test"], frames["y_test"]], axis=1
    ).iloc[:, :-1]

    return frames


def features_extraction_technique():
    """Reduce the saved features with PCA and store the resulting splits.

    A full PCA is fitted on the training features to find the smallest number
    of components whose cumulative explained-variance ratio reaches 0.90. A
    PCA with that many components is then fitted on the training features and
    applied to the test features. The transformed rows are pooled, split
    again with ``train_test_split`` and written to the ``PCA-dataset``
    sub-folder of the processed folder.

    Returns:
        A dict with the keys ``X_train``, ``X_test``, ``y_train`` and
        ``y_test`` holding the re-split, PCA-transformed data.

    Raises:
        Exception: Whatever loading the splits or fitting the PCA raises; the
            error is printed first and then re-raised.
    """
    try:
        dataset = dataset_initialization()
    except Exception as e:
        # Re-raise after reporting: continuing without the data used to end
        # in an unrelated NameError a few lines further down.
        print("An error is occured: ", e)
        raise

    training_dataset = dataset["training_dataset"]
    testing_dataset = dataset["testing_dataset"]

    try:
        explained_variance = np.cumsum(
            PCA().fit(training_dataset).explained_variance_ratio_
        )
    except Exception as e:
        print("An error is occured: ", e)
        raise

    # Index of the first cumulative ratio >= 0.90, converted to a count.
    best_n_components = np.argmax(explained_variance >= 0.90) + 1

    pca = PCA(n_components=best_n_components)

    X_train_transformed = pca.fit_transform(training_dataset)
    X_test_transformed = pca.transform(testing_dataset)

    X_train_df = pd.DataFrame(
        X_train_transformed,
        index=dataset["X_train"].index,
    )
    X_test_df = pd.DataFrame(
        X_test_transformed,
        index=dataset["X_test"].index,
    )

    y_train = dataset["y_train"].reset_index(drop=True)["labels"]
    y_test = dataset["y_test"].reset_index(drop=True)["labels"]

    # NOTE(review): the transformed train and test rows are pooled and split
    # again, so rows the PCA was fitted on can land in the returned test set.
    # Confirm this is intended.
    pooled = pd.concat(
        [
            pd.concat([X_train_df, y_train], axis=1),
            pd.concat([X_test_df, y_test], axis=1),
        ],
        axis=0,
    )

    X = pooled.drop("labels", axis=1)
    y = pooled["labels"]

    settings = config()

    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=settings["dataloader"]["split_size"],
        random_state=42,
    )

    output_dir = os.path.join(settings["path"]["processed_path"], "PCA-dataset")
    os.makedirs(output_dir, exist_ok=True)

    splits = {
        "X_train": X_train,
        "X_test": X_test,
        "y_train": y_train,
        "y_test": y_test,
    }

    for dataset_name, data in splits.items():
        data.to_csv(os.path.join(output_dir, f"{dataset_name}.csv"), index=False)

    return splits


def features_selection_technique(importance_threshold: float = 0.001):
    """Keep only the features a random forest considers important.

    A 300-tree random forest is fitted on the saved training split. Features
    whose importance is at least ``importance_threshold`` are kept, ordered
    from most to least important, in both the train and test frames. The four
    splits are written to the ``Feature-Importance`` sub-folder of the
    processed folder.

    Args:
        importance_threshold: Minimum feature importance required to keep a
            column.

    Returns:
        A dict with the keys ``X_train``, ``X_test``, ``y_train`` and
        ``y_test``. When any step fails, the error and its traceback are
        printed and ``None`` is returned.
    """
    RF = RandomForestClassifier(n_estimators=300, criterion="gini", random_state=42)

    try:
        dataset = dataset_initialization()

        X_train, X_test = dataset["X_train"], dataset["X_test"]
        y_train, y_test = dataset["y_train"], dataset["y_test"]

        # y_train comes from a CSV as a one-column frame; flatten it so the
        # forest is fitted on a 1-D target.
        RF.fit(X_train, np.ravel(y_train))

        importances = pd.Series(
            RF.feature_importances_, index=X_train.columns
        ).sort_values(ascending=False)

        columns = importances[importances >= importance_threshold].index

        X_train = X_train.loc[:, columns]
        X_test = X_test.loc[:, columns]

        output_dir = os.path.join(
            config()["path"]["processed_path"], "Feature-Importance"
        )
        os.makedirs(output_dir, exist_ok=True)

        splits = {
            "X_train": X_train,
            "X_test": X_test,
            "y_train": y_train,
            "y_test": y_test,
        }

        for dataset_name, data in splits.items():
            data.to_csv(
                os.path.join(output_dir, f"{dataset_name}.csv"),
                index=False,
            )

        return splits

    except Exception as e:
        print("An error occurred: ", e)
        traceback.print_exc()


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Helper method for the DNA-Classifier".title()
    )
    parser.add_argument(
        "--FE",
        type=str,
        default="PCA",
        help="Features Extraction Technique".capitalize(),
    )
    parser.add_argument(
        "--FS",
        # type=bool turns every non-empty string (even "False") into True, so
        # the text is interpreted explicitly. A bare "--FS" also enables it.
        type=lambda raw: str(raw).strip().lower() in {"1", "true", "yes", "y"},
        nargs="?",
        const=True,
        default=False,
        help="Features Selection Technique".capitalize(),
    )

    # parse_known_args tolerates the extra arguments a notebook kernel places
    # in sys.argv, which make parse_args abort.
    args, _ = parser.parse_known_args()

    # --FE defaults to a non-empty string, so it has to be checked second;
    # checked first it made the selection branch unreachable.
    if args.FS:
        _ = features_selection_technique()
    elif args.FE:
        _ = features_extraction_technique()

## Trainer Method

In [None]:
import warnings

# Silence every warning process-wide before the training code runs.
warnings.filterwarnings("ignore")


class Trainer:
    """Train one of the supported classifiers and save its evaluation metrics.

    Args:
        model: Model key passed to ``MachineLearningModel``.
        features_extraction: Use the output of
            ``features_extraction_technique`` as the dataset.
        features_selection: Use the output of ``features_selection_technique``
            as the dataset (only when ``features_extraction`` is false).
        hyperparameter_tuning: Run a grid search on the train split instead of
            plain K-fold cross-validation.
        KFold: Number of cross-validation folds.
    """

    def __init__(
        self,
        model: str = "RF",
        features_extraction: bool = False,
        features_selection: bool = False,
        hyperparameter_tuning: bool = False,
        KFold: int = 5,
    ):
        self.steps = "training"
        self.model = model
        self.features_extraction = features_extraction
        self.features_selection = features_selection
        self.hyperparameter_tuning = hyperparameter_tuning
        self.KFold = KFold

        self.accuracy = []
        self.precision = []
        self.recall = []
        self.f1_score = []

    @staticmethod
    def _as_1d(labels):
        """Return ``labels`` (Series, one-column frame, list) as a 1-D array."""
        return np.asarray(labels).ravel()

    def choose_dataset(self):
        """Return the train/test data selected by the constructor flags.

        Returns:
            A dict with the keys ``X_train``, ``X_test``, ``y_train`` and
            ``y_test``. Without extraction or selection, the four CSV files
            in the processed folder are loaded.

        Raises:
            FileNotFoundError: If the processed folder does not exist.
        """
        if self.features_extraction:
            return features_extraction_technique()
        if self.features_selection:
            return features_selection_technique()

        paths = config()["path"]
        # The rest of the file names this entry "processed_path"; the
        # "processed_data" spelling originally used here is honoured when the
        # configuration defines it.
        path = paths.get("processed_data", paths["processed_path"])

        if not os.path.exists(path):
            raise FileNotFoundError(
                "Make sure the processed data is in the right path".capitalize()
            )

        # Load the data itself: train() needs frames, not file names.
        return {
            name: pd.read_csv(os.path.join(path, f"{name}.csv"))
            for name in ("X_train", "X_test", "y_train", "y_test")
        }

    def select_the_model(self):
        """Return a new, untrained classifier for ``self.model``.

        Raises:
            TypeError: If ``self.model`` is not supported (raised by
                ``MachineLearningModel.define_model``).
        """
        return MachineLearningModel(model=self.model).define_model()

    def model_evaluation(self, **kwargs):
        """Write the evaluation metrics to ``./evaluation.json``.

        Args:
            **kwargs: Must contain ``accuracy``, ``precision``, ``recall`` and
                ``f1_score`` (a number or a sequence; the mean is stored,
                rounded to two decimals) together with ``actual_labels`` and
                ``predicted_labels`` for the classification report.
        """
        with open("./evaluation.json", "w") as file:
            json.dump(
                {
                    "Accuracy": np.mean(kwargs["accuracy"]).round(2),
                    "Precision": np.mean(kwargs["precision"]).round(2),
                    "Recall": np.mean(kwargs["recall"]).round(2),
                    "F1 Score": np.mean(kwargs["f1_score"]).round(2),
                    "Classification Report": classification_report(
                        kwargs["actual_labels"],
                        kwargs["predicted_labels"],
                        output_dict=True,
                    ),
                },
                file,
                indent=4,
            )

    def train(self):
        """Fit the selected model and save its metrics to ``evaluation.json``.

        With ``hyperparameter_tuning`` a grid search is fitted on the train
        split and scored on the test split. Otherwise the train and test
        splits are pooled and evaluated with shuffled K-fold cross-validation;
        the per-fold metrics are kept in ``self.accuracy``,
        ``self.precision``, ``self.recall`` and ``self.f1_score``.
        """
        dataset = self.choose_dataset()
        classifier = self.select_the_model()

        # Start from empty lists so a second call does not mix in old folds.
        self.accuracy, self.precision = [], []
        self.recall, self.f1_score = [], []

        if self.hyperparameter_tuning:
            classifier = GridSearchCV(
                estimator=classifier,
                param_grid=hyperparameter_tuning(model=self.model),
                cv=self.KFold,
                scoring="accuracy",
            )

            y_train = self._as_1d(dataset["y_train"])
            y_test = self._as_1d(dataset["y_test"])

            classifier.fit(dataset["X_train"], y_train)

            predicted = classifier.predict(dataset["X_test"])

            print("The best parameters are: ".capitalize(), classifier.best_params_)
            # best_score_ is the mean cross-validated score, not a parameter.
            print("Best cross-validated accuracy: ", classifier.best_score_)

            # scikit-learn metrics take (y_true, y_pred); the swapped order
            # used before exchanged weighted precision and recall.
            self.model_evaluation(
                accuracy=accuracy_score(y_test, predicted),
                precision=precision_score(y_test, predicted, average="weighted"),
                recall=recall_score(y_test, predicted, average="weighted"),
                f1_score=f1_score(y_test, predicted, average="weighted"),
                actual_labels=y_test,
                predicted_labels=predicted,
            )

        else:
            predicted_labels = []
            actual_labels = []

            KFoldCV = KFold(n_splits=self.KFold, shuffle=True, random_state=42)

            X = pd.concat([dataset["X_train"], dataset["X_test"]], axis=0).reset_index(
                drop=True
            )
            # Flattened so one-column label frames are fitted as a 1-D target.
            y = self._as_1d(
                pd.concat([dataset["y_train"], dataset["y_test"]], axis=0)
            )

            for index, (train_index, test_index) in enumerate(KFoldCV.split(X, y)):
                print(f"{'*' * 10} KFold CV - {index + 1} is executing {'*' * 10}")

                X_train_fold, y_train_fold = X.iloc[train_index, :], y[train_index]
                X_test_fold, y_test_fold = X.iloc[test_index, :], y[test_index]

                classifier.fit(X_train_fold, y_train_fold)
                predicted = classifier.predict(X_test_fold)

                self.accuracy.append(accuracy_score(y_test_fold, predicted))
                self.precision.append(
                    precision_score(y_test_fold, predicted, average="weighted")
                )
                self.recall.append(
                    recall_score(y_test_fold, predicted, average="weighted")
                )
                self.f1_score.append(
                    f1_score(y_test_fold, predicted, average="weighted")
                )

                predicted_labels.extend(predicted)
                actual_labels.extend(y_test_fold)

            self.model_evaluation(
                accuracy=self.accuracy,
                precision=self.precision,
                recall=self.recall,
                f1_score=self.f1_score,
                predicted_labels=predicted_labels,
                actual_labels=actual_labels,
            )

        print(
            "The evaluation metrics are saved in the evaluation.json file".capitalize()
        )


if __name__ == "__main__":
    # Demo run: random forest on the PCA-reduced features, evaluated with
    # 2-fold cross-validation and no grid search.
    trainer = Trainer(
        features_extraction=True, hyperparameter_tuning=False, model="RF", KFold=2
    )
    trainer.train()