### Working with Biological Sequences

In [16]:
from seqanalyzer import DNASequence, RNASequence, AminoAcidSequence

In [17]:
# DNA sequence example: wrap a short DNA string and transcribe it.
# In the recorded output "ATCG" becomes "AUCG", i.e. T is replaced by U.
dna_seq = DNASequence("ATCG")
rna_seq = dna_seq.transcribe()
print(rna_seq) 

AUCG


In [21]:
# RNA sequence example: RNA uses uracil (U) where DNA has thymine (T), so the
# demo string is written in the RNA alphabet. Swapping T for U leaves the
# number of G and C bases untouched, so the GC content is the same as before.
rna_seq = RNASequence("AUCGAUCG")
gc_content = rna_seq.gc_content()
print(f"GC Content: {gc_content}")

GC Content: 50.00


In [22]:
# Amino acid sequence example: compute the hydrophobicity of a short peptide.
# NOTE(review): the recorded value (33.33 for six residues) looks like a
# percentage of hydrophobic residues — confirm against compute_hydrophobicity.
amino_acid_seq = AminoAcidSequence("ACDEFG")
hydrophobicity = amino_acid_seq.compute_hydrophobicity()
print(f"Hydrophobicity: {hydrophobicity:.2f}")

Hydrophobicity: 33.33


#### OpenFasta context manager

In [7]:
from bio_files_processor import OpenFasta

In [9]:
# Open a multi-record FASTA file through the OpenFasta context manager and
# print every record it yields.
fasta_file_path = "data/multiple_genes_example.fasta"
with OpenFasta(fasta_file_path) as fasta:
    for record in fasta:
        print(record)
        # print('\n') emits two newlines, leaving two blank lines between records.
        print('\n')

>GTD323452 5S_rRNA NODE_272_length_223_cov_0.720238:18-129(+)
ACGGCCATAGGACTTTGAAAGCACCGCATCCCGTCCGATCTGCGAAGTTAACCAAGATGCCGCCTGGTTAGTACC
ATGGTGGGGGACCACATGGGAATCCCTGGTGCTGTG


>GTD678345 16S_rRNA NODE_80_length_720_cov_1.094737:313-719(+)
TTGGCTTCTTAGAGGGACTTTTGATGTTTAATCAAAGGAAGTTTGAGGCAATAACAGGTCTGTGATGCCCTTAGA
TGTTCTGGGCCGCACGCGCGCTACACTGAGCCCTTGGGAGTGGTCCATTTGAGCCGGCAACGGCACGTTTGGACT
GCAAACTTGGGCAAACTTGGTCATTTAGAGGAAGTAAAAGTCGTAACAAGGT


>GTD174893 16S_rRNA NODE_1_length_2558431_cov_75.185164:2153860-2155398(+)
TTGAAGAGTTTGATCATGGCTCAGATTGAACGCTGGCGGCAGGCCTAACACATGCAAGTCGAACGGTAACAGGAA
ACAGCTTGCTGTTTCGCTGACGAGTGGGAAGTAGGTAGCTTAACCTTCGGGAGGGCGCTTACCACTTTGTGATTC
ATGACTGGGGTGAAGTCGTAACAAGGTAACCGTAGGGGAACCTGCGGTTGGATCACCTCCTT


>GTD906783 16S_rRNA NODE_1_length_2558431_cov_75.185164:793941-795479(-)
TTGAAGAGTTTGATCATGGCTCAGATTGAACGCTGGCGGCAGGCCTAACACATGCAAGTCGAACGGTAACAGGAA
ACAGCTTGCTGTTTCGCTGACGAGTGGGAAGTAGGTAGCTTAACCTTCGGGAGGGCGCTTACCACTTTGTGATTC
ATGACTGGGGTGAAGTCGTAACAAGGTAACCGTAGG

#### Genscan API

In [1]:
from seqanalyzer import run_genscan

In [10]:
# Run Genscan on the sequence stored in a FASTA file: no inline sequence and
# no custom sequence name are passed.
genscan_output = run_genscan(
    sequence=None,
    sequence_file_path='data/single_gene_example.fasta',
    organism="Vertebrate",
    exon_cutoff=1.00,
    sequence_name="",
)

In [11]:
# Status code stored on the result object.
# NOTE(review): presumably the HTTP status of the Genscan request — confirm in run_genscan.
genscan_output.status

200

In [12]:
# Predicted exons; each entry's repr shows its number, type, start and end.
genscan_output.exon_list

[Exon 1: Type=Init, Start=235, End=420,
 Exon 2: Type=Intr, Start=1027, End=1072,
 Exon 3: Type=Intr, Start=1260, End=1307,
 Exon 4: Type=Term, Start=1609, End=2030,
 Exon 5: Type=PlyA, Start=2806, End=2811]

In [13]:
# Introns between the predicted exons; in the recorded output each intron
# spans from one exon's end + 1 to the next exon's start - 1.
genscan_output.intron_list

[Intron 1: Start=421, End=1026,
 Intron 2: Start=1073, End=1259,
 Intron 3: Start=1308, End=1608,
 Intron 4: Start=2031, End=2805]

In [14]:
# Predicted peptide records; the recorded output holds a single FASTA-style
# entry (header line followed by the amino-acid sequence).
genscan_output.cds_list

[>/tmp/04_27_24-12:29:12.fasta|GENSCAN_predicted_peptide_1|233_aa 
 MSTESMIRDVELAEEALPKKTGGPQGSRRCLFLSLFSFLIVAGATTLFCLLHFGVIGPQREEFPRDLSLISPLAQ
 AVRSSSRTPSDKPVAHVVANPQAEGQLQWLNRRANALLANGVELRDNQLVVPSEGLYLIYSQVLFKGQGCPSTHV
 LLTHTISRIAVSYQTKVNLLSAIKSPCQRETPEGAEAKPWYEPIYLGGVFQLEKGDRLSAEINRPDYLDFAESGQ
 VYFGIIAL]

#### Custom RandomForestClassifier

In [4]:
from sklearn.datasets import make_classification
from cutom_random_forest import RandomForestClassifierCustom

# Synthetic classification data for the timing demo. The generator is seeded
# so that the dataset — and therefore the work done by fit/predict below — is
# the same on every Restart & Run All, matching the already-seeded forest.
X, y = make_classification(n_samples=100000, random_state=42)
random_forest = RandomForestClassifierCustom(max_depth=30, n_estimators=10,
                                             max_features=2, random_state=42)

In [5]:
%%time

# Baseline timing: fit with n_jobs=1.
# NOTE(review): n_jobs is presumably the number of parallel workers — confirm in RandomForestClassifierCustom.fit.
random_forest.fit(X, y, n_jobs=1)

CPU times: user 12.6 s, sys: 14.8 ms, total: 12.6 s
Wall time: 12.6 s


In [3]:
%%time

# Same fit with n_jobs=2, timed for comparison with the n_jobs=1 run.
random_forest.fit(X, y, n_jobs=2)

CPU times: user 121 ms, sys: 328 ms, total: 449 ms
Wall time: 9.68 s


In [4]:
%%time

# Predict on the training matrix with n_jobs=1; kept for the equality check further down.
results_1 = random_forest.predict(X, n_jobs=1)

CPU times: user 174 ms, sys: 167 µs, total: 174 ms
Wall time: 172 ms


In [5]:
%%time

# Same prediction with n_jobs=2. In the recorded run this was slower in wall
# time than the n_jobs=1 call (490 ms vs 172 ms).
results_2 = random_forest.predict(X, n_jobs=2)

CPU times: user 63.2 ms, sys: 151 ms, total: 215 ms
Wall time: 490 ms


In [10]:
# Sanity check: the n_jobs=1 and n_jobs=2 predictions must agree element-wise.
(results_1 == results_2).all()

True

### Unit tests

In [14]:
# Run the test module as a script via a shell escape. The recorded run shows
# the tests writing filtered FASTQ output under fastq_filtrator_results/.
! python test_seqanalyzer.py

.....Filtered sequences saved to fastq_filtrator_results/test_input_filtered.fastq
.Filtered sequences saved to fastq_filtrator_results/test_input_filtered.fastq
.Filtered sequences saved to fastq_filtrator_results/test_input_filtered.fastq
.Filtered sequences saved to fastq_filtrator_results/test_input_filtered.fastq
.
----------------------------------------------------------------------
Ran 9 tests in 0.012s

OK
