# GNB-RDP
## Gaussian Naive Bayes on the Ribosomal Database Project (RDP)

* RDP resources: https://rdp.cme.msu.edu/misc/resources.jsp
* scikit-learn Gaussian Naive Bayes guide: https://scikit-learn.org/stable/modules/naive_bayes.html#gaussian-naive-bayes

## Iris

In [1]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
# Baseline sanity check: fit Gaussian Naive Bayes on the Iris data with a
# fixed 50/50 train/test split and report how many test samples are mislabeled.
X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=0
)

gnb = GaussianNB()
gnb.fit(X_train, y_train)
y_pred = gnb.predict(X_test)

n_test = X_test.shape[0]
n_wrong = (y_test != y_pred).sum()
print("Number of mislabeled points out of a total %d points : %d" % (n_test, n_wrong))


Number of mislabeled points out of a total 75 points : 4


## RDP

In [2]:
from bioinformatics import na_read
from bioinformatics import KmerVectors as kvec
from bioinformatics import NCBIDataset as nds
from bioinformatics import FASTADataset as fads

In [3]:
# Directory holding the RDP FASTA files. The trailing slash matters: the file
# names below are appended to it directly.
RDP_PATH = "../data/bioinformatics/rdp/202208/"

# Full paths to the unaligned FASTA files, kept as plain strings.
archaea_file = f"{RDP_PATH}current_Archaea_unaligned.fa"
bacteria_file = f"{RDP_PATH}current_Bacteria_unaligned.fa"
fungi_file = f"{RDP_PATH}current_Fungi_unaligned.fa"

In [4]:
# Load each FASTA file into a FASTADataset and report how many entries it holds.
# NOTE(review): limit=0 presumably means "no limit" - confirm against FASTADataset.
archaea_fads = fads.FASTADataset('archaea', archaea_file, limit=0)
archaea_count = len(archaea_fads.fasta_dataset)
print(f'archaea: [{archaea_count}]')

bacteria_fads = fads.FASTADataset('bacteria', bacteria_file, limit=0)
bacteria_count = len(bacteria_fads.fasta_dataset)
print(f'bacteria: [{bacteria_count}]')

archaea: [160767]
bacteria: [3196041]


In [5]:
# Build the k-mer vectoriser over the four-letter alphabet, fed by both datasets.
# NOTE(review): the second positional argument is presumably the k-mer length
# (the printed dictionary lists 8-letter words) - confirm against KmerVectors.
kv = kvec.KmerVectors(
    ['A', 'G', 'C', 'T'],
    8,
    fastadatasets=[archaea_fads, bacteria_fads],
)

# Show the mapping from dataset name to class label.
print(kv.labels)

KmerVectors Object -
alphabet [['A', 'G', 'C', 'T']]
dict: [['AAAAAAAA', 'AAAAAAAG', 'AAAAAAAC', 'AAAAAAAT']]...[['TTTTTTTA', 'TTTTTTTG', 'TTTTTTTC', 'TTTTTTTT']]
Labels: [{'archaea': 1, 'bacteria': 2}]
[archaea]
[../data/bioinformatics/rdp/202208/current_Archaea_unaligned.fa]
[bacteria]
[../data/bioinformatics/rdp/202208/current_Bacteria_unaligned.fa]
{'archaea': 1, 'bacteria': 2}


In [6]:
# Encode the sequences as numpy vectors of k-mer codes.
# NOTE(review): judging by the logged output, dataset_limit caps the number of
# sequences taken per dataset and length_min drops short sequences; the exact
# meaning of base_count_max is not visible here - confirm against KmerVectors.
e = kv.seq2KmerEncodedNumpyVectors(
    base_count_max=4,
    length_min=500,
    dataset_limit=10000,
)

FASTA Dataset
fasta dataset: [archaea], limit: [10000]
10002000300040005000600070008000900010000capped at [10000]
-
Total:             [10785]
Using :               [10001]
skip_count_minlength: [660]
skip_count_alphabet:  [125]
fasta dataset: [bacteria], limit: [10000]
10002000300040005000600070008000900010000capped at [10000]
-
Total:             [10584]
Using :               [10001]
skip_count_minlength: [421]
skip_count_alphabet:  [163]


In [7]:
# First element of the encoder result, used below as the feature matrix.
# NOTE(review): presumably one row of k-mer codes per sequence - confirm.
X = e[0]
X

array([[30342, 55834, 26729, ..., 21321, 19749, 13462],
       [30342, 55834, 26729, ..., 19749, 13462, 53850],
       [30342, 55834, 26729, ..., 19749, 13462, 53850],
       ...,
       [22375, 23964, 30320, ..., 44359, 46367, 54397],
       [63831, 58718, 38266, ..., 11109, 44438, 46683],
       [42588, 39280, 26051, ..., 20770, 17545,  4646]])

In [8]:
# Second element of the encoder result, used below as the class-label vector.
# NOTE(review): values presumably follow kv.labels (archaea=1, bacteria=2) - confirm.
y = e[1]
y

array([1, 1, 1, ..., 2, 2, 2])

In [9]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the encoded sequences for testing; the fixed seed keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [10]:
# Report the size of every split; features and labels should match pairwise.
for split_name, split in (
    ("X_train", X_train),
    ("y_train", y_train),
    ("X_test", X_test),
    ("y_test", y_test),
):
    print(f'{split_name} len: [{len(split)}]')

X_train len: [13400]
y_train len: [13400]
X_test len: [6600]
y_test len: [6600]


In [11]:
# Fit Gaussian Naive Bayes on the k-mer encoded RDP sequences and report how
# many held-out sequences are assigned the wrong label.
# NOTE(review): the features look like integer k-mer codes; whether a Gaussian
# likelihood is a sensible model for them is worth confirming.
gnb = GaussianNB()
gnb.fit(X_train, y_train)
y_pred = gnb.predict(X_test)

n_test = X_test.shape[0]
n_wrong = (y_test != y_pred).sum()
print("Number of mislabeled points out of a total %d points : %d" % (n_test, n_wrong))


Number of mislabeled points out of a total 6600 points : 1246
