# LDA-SILVA
## Linear Discriminant Analysis on SILVA Databases for ARB

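* [Linear Discriminant Analysis with Python (Machine Learning Mastery tutorial)](https://machinelearningmastery.com/linear-discriminant-analysis-with-python/)
* [SILVA ARB file downloads](https://www.arb-silva.de/download/arb-files/)
* [scikit-learn `LinearDiscriminantAnalysis` API reference](https://scikit-learn.org/stable/modules/generated/sklearn.discriminant_analysis.LinearDiscriminantAnalysis.html)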

In [1]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

## SILVA Data

In [2]:
from bioinformatics import na_read
from bioinformatics import KmerVectors as kvec
from bioinformatics import NCBIDataset as nds
from bioinformatics import FASTADataset as fads

In [3]:
# Directory holding the SILVA FASTA exports; the trailing slash is required
# because file names are appended to it by plain string concatenation.
SILVA_PATH = "../data/bioinformatics/silva/20220823/fasta/"

# One FASTA export per domain, each resolved relative to SILVA_PATH.
archaea_file, bacteria_file, eukaryota_file = (
    SILVA_PATH + fasta_name
    for fasta_name in (
        "archaea_arb-silva.de_2022-08-23_id1197800_tax_silva.fasta",
        "bacteria_bacteroidotaarb-silva.de_2022-08-23_id1197801_tax_silva.fasta",
        "eukaryota_amorphea_arb-silva.de_2022-08-23_id1197812_tax_silva.fasta",
    )
)

In [4]:
# Wrap the archaea export in a FASTADataset and report the size of its
# fasta_dataset collection.
# NOTE(review): limit=0 presumably means "no limit" — confirm against FASTADataset.
archaea_fads = fads.FASTADataset('archaea', archaea_file, limit=0)
archaea_count = len(archaea_fads.fasta_dataset)
print(f'archaea: [{archaea_count}]')

# Same steps for the bacteria export.
bacteria_fads = fads.FASTADataset('bacteria', bacteria_file, limit=0)
bacteria_count = len(bacteria_fads.fasta_dataset)
print(f'bacteria: [{bacteria_count}]')

archaea: [347020]
bacteria: [1078234]


In [5]:
# Build the k-mer vectoriser over both loaded datasets, using the four-letter
# RNA alphabet and a k-mer length of 8.
kmer_alphabet = ['A', 'G', 'C', 'U']
kmer_datasets = [archaea_fads, bacteria_fads]
kv = kvec.KmerVectors(kmer_alphabet, 8, fastadatasets=kmer_datasets)

# Show the dataset-name -> integer class label mapping.
print(kv.labels)

KmerVectors Object -
alphabet [['A', 'G', 'C', 'U']]
dict: [['AAAAAAAA', 'AAAAAAAG', 'AAAAAAAC', 'AAAAAAAU']]...[['UUUUUUUA', 'UUUUUUUG', 'UUUUUUUC', 'UUUUUUUU']]
Labels: [{'archaea': 1, 'bacteria': 2}]
[archaea]
[../data/bioinformatics/silva/20220823/fasta/archaea_arb-silva.de_2022-08-23_id1197800_tax_silva.fasta]
[bacteria]
[../data/bioinformatics/silva/20220823/fasta/bacteria_bacteroidotaarb-silva.de_2022-08-23_id1197801_tax_silva.fasta]
{'archaea': 1, 'bacteria': 2}


In [6]:
# Encode the sequences of every dataset as numpy vectors; the result is indexed
# as e[0] (features) and e[1] (labels) further down.
# NOTE(review): the keyword arguments look like per-sequence filters
# (alphabet/base count, minimum length) and a per-dataset cap — confirm
# against KmerVectors.seq2KmerEncodedNumpyVectors.
e = kv.seq2KmerEncodedNumpyVectors(
    base_count_max=4,
    length_min=500,
    dataset_limit=10000,
)

FASTA Dataset
fasta dataset: [archaea], limit: [10000]
10002000300040005000600070008000900010000capped at [10000]
-
Total:             [10909]
Using :               [10001]
skip_count_minlength: [358]
skip_count_alphabet:  [551]
fasta dataset: [bacteria], limit: [10000]
1000200030004000500060007000800090001000011000capped at [10000]
-
Total:             [11421]
Using :               [10001]
skip_count_minlength: [607]
skip_count_alphabet:  [814]


In [7]:
# First element of the encoder result: the matrix passed to the classifier as features.
X = e[0]
X

array([[23958, 30297, 55655, ..., 29819, 53743, 18364],
       [27141, 43028, 41043, ..., 34629,  7445, 29780],
       [49879,  2911, 11645, ..., 43726, 43833, 44263],
       ...,
       [ 4605, 18420,  8147, ..., 58800, 38594, 23307],
       [ 4605, 18420,  8147, ..., 25494, 36443, 14700],
       [ 4605, 18420,  8147, ...,  1321,  5285, 21142]])

In [8]:
# Second element of the encoder result: the integer class label of each row of e[0].
y = e[1]
y

array([1, 1, 1, ..., 2, 2, 2])

In [9]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows for evaluation; the fixed random_state makes the
# shuffle (and therefore the split) reproducible.
split_parts = train_test_split(X, y, test_size=0.33, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [10]:
# Report the number of rows in each part of the split, in the order
# X_train, y_train, X_test, y_test.
for split_label, split_data in (
    ('X_train', X_train),
    ('y_train', y_train),
    ('X_test', X_test),
    ('y_test', y_test),
):
    print(f'{split_label} len: [{len(split_data)}]')

X_train len: [13400]
y_train len: [13400]
X_test len: [6600]
y_test len: [6600]


In [11]:
# Fit an LDA classifier on the training split and classify one encoded sequence.
from sklearn.datasets import make_classification  # not used in this cell; import kept for any cell that relies on it
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# define model
model = LinearDiscriminantAnalysis()
# fit model
model.fit(X_train, y_train)

# Take a single row of the full feature matrix as the sample to classify.
# NOTE(review): X[12] comes from the unsplit data, so it may be a row the model
# was trained on — use a row of X_test for a genuinely unseen sample.
row = X[12]
# predict() works on a batch, so wrap the single row in a list.
yhat = model.predict([row])
# predict() returns a one-element array; index it so '%d' receives a scalar.
# Formatting the array itself relies on implicit array-to-scalar conversion,
# which NumPy deprecated in 1.25 for arrays with ndim > 0.
print('Predicted Class: %d' % yhat[0])

Predicted Class: 1


In [12]:
# Classify another single row of the full feature matrix with the fitted model.
# NOTE(review): X[500] comes from the unsplit data, so it may be a training row.
row = X[500]
# predict() works on a batch, so wrap the single row in a list.
yhat = model.predict([row])
# predict() returns a one-element array; index it so '%d' receives a scalar.
# Formatting the array itself relies on implicit array-to-scalar conversion,
# which NumPy deprecated in 1.25 for arrays with ndim > 0.
print('Predicted Class: %d' % yhat[0])

Predicted Class: 2


In [13]:
# Mean accuracy of the fitted model on the held-out split.
test_accuracy = model.score(X_test, y_test)
test_accuracy

0.9348484848484848