## Import the libraries

In [None]:
import os
import sys
import yaml
import joblib
import argparse
import warnings
import traceback
import pandas as pd
from tqdm import tqdm

# Utils file

In [None]:
# The four DNA bases; every longer k-mer table below is derived from this
# alphabet so the tables cannot drift out of sync with each other.
single_nucleosides = ["A", "C", "G", "T"]

# All 16 two-base combinations in lexicographic order ("AA", "AC", ..., "TT").
di_nucleosides = [
    first + second
    for first in single_nucleosides
    for second in single_nucleosides
]

# All 64 three-base combinations in lexicographic order ("AAA", ..., "TTT").
tri_nucleosides = [
    pair + base for pair in di_nucleosides for base in single_nucleosides
]


# All 256 four-base combinations in lexicographic order ("AAAA", ..., "TTTT").
tetra_nucleosides = [
    triple + base for triple in tri_nucleosides for base in single_nucleosides
]


def dump(value=None, filename=None):
    """Persist ``value`` to ``filename`` with ``joblib.dump``.

    Raises:
        ValueError: if either ``value`` or ``filename`` is ``None``.
    """
    # Reject incomplete calls up front so the happy path stays unindented.
    if value is None or filename is None:
        raise ValueError("Value & Filename should be passed.".capitalize())

    joblib.dump(value=value, filename=filename)


def load(filename=None):
    """Return the object that ``joblib.load`` reads from ``filename``.

    Raises:
        ValueError: if ``filename`` is ``None``.
    """
    if filename is None:
        raise ValueError(
            "Filename should be passed in an appropriate manner".capitalize()
        )

    return joblib.load(filename=filename)


def config():
    """Parse ``../config.yml`` with ``yaml.safe_load`` and return the result.

    The path is relative, so it is resolved against the current working
    directory of the process.
    """
    with open("../config.yml", "r") as stream:
        settings = yaml.safe_load(stream)

    return settings


def hyperparameter_tuning(model: str = "RF"):
    """Return the hyperparameter search grid for the given model code.

    Args:
        model: One of ``"RF"``, ``"DT"``, ``"LR"``, ``"XGB"`` or ``"NB"``.

    Returns:
        A dict mapping parameter names to the list of candidate values.

    Raises:
        ValueError: if ``model`` is not one of the supported codes.
    """
    # The table is rebuilt on every call so each caller receives its own
    # dict and lists, exactly as separate literal ``return`` branches would.
    search_spaces = {
        "RF": {
            "n_estimators": [100, 200, 300],
            "criterion": ["gini", "entropy"],
            "max_features": ["sqrt", "log2"],
        },
        "DT": {
            "criterion": ["gini", "entropy"],
            "max_depth": [None, 10, 20, 30],
            "min_samples_split": [2, 5],
            "min_samples_leaf": [1, 2],
        },
        "LR": {
            "penalty": ["l1", "l2", "elasticnet", "none"],
            "C": [0.001, 0.01, 0.1, 1, 10],
            "max_iter": [100, 200, 300],
        },
        "XGB": {
            "learning_rate": [0.01, 0.1, 1],
            "max_depth": [3, 5, 7],
            "n_estimators": [100, 200, 300],
        },
        "NB": {
            "var_smoothing": [1e-09],
        },
    }

    # Non-string (possibly unhashable) inputs can never match a model code.
    grid = search_spaces.get(model) if isinstance(model, str) else None

    if grid is None:
        raise ValueError(
            "The model name is not supported. Please check the model name and try again".capitalize()
        )

    return grid

## Feature Generator for DNA-Sequence

In [None]:
# Install a global filter that ignores every warning raised after this point.
warnings.filterwarnings("ignore")


class FeatureGenerator:
    """Derive positional k-mer indicator features and GC content from DNA.

    The input CSV must provide a ``sequence`` column holding strings. For each
    requested approach, ``feature_generator`` appends columns to
    ``self.dataset`` and then writes the frame to ``processed_dataset.csv``
    inside the directory named by ``config()["path"]["processed_path"]``.

    Args:
        approaches: Any of ``"single"``, ``"di"``, ``"tri"``, ``"tetra"`` and
            ``"gc-content"``. ``None`` (the default) enables all five.
        dataset_path: CSV file to read.
        sample_size: Number of leading rows to keep; ``None`` keeps every row.
            The default of 5 matches the previous hard-coded sub-sample.
    """

    def __init__(
        self,
        approaches: list = None,
        dataset_path: str = "../data/raw/DNA-Classification.csv",
        sample_size: int = 5,
    ):
        # A list literal in the signature would be one object shared by every
        # instance, so the default is materialised per instance instead.
        self.approaches = (
            ["single", "di", "tri", "tetra", "gc-content"]
            if approaches is None
            else approaches
        )

        self.X = list()
        self.y = list()

        self.GC_Content = list()

        # Copy the slice so that later column assignments act on an
        # independent frame rather than on a view of the full CSV.
        self.dataset = pd.read_csv(dataset_path)[0:sample_size].copy()

    @staticmethod
    def _indicator_rows(sequences, kmers, n_positions):
        """Return one 0/1 row per sequence, laid out position-major.

        The cell at ``pos * len(kmers) + kmers.index(kmer)`` is 1 when the
        sequence contains ``kmer`` starting at ``pos``. Substrings that are
        not listed in ``kmers`` (for example ones containing ``N``) leave the
        row untouched. ``n_positions`` must cover the longest sequence.
        """
        k = len(kmers[0])
        width = len(kmers)
        # Direct lookup replaces comparing every window against every k-mer.
        slot = {kmer: offset for offset, kmer in enumerate(kmers)}

        rows = []
        for sequence in sequences:
            row = [0] * (n_positions * width)
            for pos in range(len(sequence) - k + 1):
                offset = slot.get(sequence[pos : pos + k])
                if offset is not None:
                    row[pos * width + offset] = 1
            rows.append(row)
        return rows

    @staticmethod
    def _gc_fraction(sequence):
        """Return the share of ``G`` and ``C`` characters, or 0 if empty."""
        if len(sequence) == 0:
            return 0
        return (sequence.count("G") + sequence.count("C")) / len(sequence)

    def feature_generator(self):
        """Append the requested feature columns and save the dataset as CSV.

        Column names are ``{kmer}_pos_{pos}`` for ``"single"`` and
        ``{kmer}_pos_{pos}_{di|tri|tetra}_nucleoside`` for the longer k-mers,
        ordered by position and then by k-mer; ``"gc-content"`` adds a
        ``GC-Content`` column. Groups are always emitted in the order single,
        di, tri, tetra, GC content, whatever the order of ``self.approaches``.

        A failure while saving is reported on stdout together with the
        traceback and is not re-raised.
        """
        schemes = (
            ("single", single_nucleosides, "{kmer}_pos_{pos}"),
            ("di", di_nucleosides, "{kmer}_pos_{pos}_di_nucleoside"),
            ("tri", tri_nucleosides, "{kmer}_pos_{pos}_tri_nucleoside"),
            ("tetra", tetra_nucleosides, "{kmer}_pos_{pos}_tetra_nucleoside"),
        )
        selected = [scheme for scheme in schemes if scheme[0] in self.approaches]

        if selected:
            sequences = self.dataset["sequence"].tolist()
            # default=0 keeps an empty dataset from raising inside max().
            longest = max(map(len, sequences), default=0)

            encoded = []
            for _, kmers, template in selected:
                n_positions = max(longest - len(kmers[0]) + 1, 0)
                columns = [
                    template.format(kmer=kmer, pos=pos)
                    for pos in range(n_positions)
                    for kmer in kmers
                ]
                if not columns:
                    continue

                rows = self._indicator_rows(tqdm(sequences), kmers, n_positions)
                encoded.append(
                    pd.DataFrame(
                        rows,
                        index=self.dataset.index,
                        columns=columns,
                        dtype="int64",
                    )
                )

            if encoded:
                features = pd.concat(encoded, axis=1)
                # Drop columns left by an earlier call so a re-run replaces
                # them instead of producing duplicate column names.
                stale = [
                    name
                    for name in features.columns
                    if name in self.dataset.columns
                ]
                # A single concat avoids inserting thousands of columns one
                # at a time, which is slow and fragments the frame.
                self.dataset = pd.concat(
                    [self.dataset.drop(columns=stale), features], axis=1
                )

        if "gc-content" in self.approaches:
            self.GC_Content = [
                self._gc_fraction(sequence)
                for sequence in tqdm(self.dataset["sequence"].tolist())
            ]
            self.dataset["GC-Content"] = self.GC_Content

        try:
            processed_dir = config()["path"]["processed_path"]
            self.dataset.to_csv(
                os.path.join(processed_dir, "processed_dataset.csv")
            )
        except Exception as e:
            print(
                "Cannot saved the dataset in the processed file, & error: {}".capitalize().format(
                    e
                )
            )
            traceback.print_exc()
        else:
            # Capitalise the template before formatting: capitalising the
            # formatted text would lower-case the directory name it reports.
            print(
                "the dataset stored in the {} folder".capitalize().format(
                    processed_dir
                )
            )


# Script entry point: request only the "single" approach and run the
# generator, which also attempts to write the processed CSV.
if __name__ == "__main__":
    generator = FeatureGenerator(approaches=["single"])
    generator.feature_generator()