# Machine Learning in Bioinformatics

## Import the library

In [1]:
# Import library
from Bio import SeqIO

## Import FASTA dataset

In [2]:
# Path to the nucleotide FASTA file. It is a bare relative name, so it is
# resolved against the notebook's current working directory when opened.
file_path = "sequence.fasta"

In [4]:
# Load the FASTA file as a list of raw text lines (newline characters kept),
# then echo the whole list once the file handle has been closed.
with open(file_path) as fasta_handle:
    fasta_content = list(fasta_handle)

print(fasta_content)

['>NG_047557.1 Staphylococcus aureus N315 bleO gene for bleomycin binding protein, complete CDS\n', 'CGGGCCATTTTGCGTAATAAGAAAAAGGATTAATTATGAGCGAATTGAATTAATAATAAGGTAATAGATT\n', 'TACATTAGAAAATGAAAGGGGATTTTATGCGTGAGAATGTTACAGTCTATCCCGGCATTGCCAGTCGGGG\n', 'ATATTAAAAAGAGTATAGGTTTTTATTGCGATAAACTAGGTTTCACTTTGGTTCACCATGAAGATGGATT\n', 'CGCAGTTCTAATGTGTAATGAGGTTCGGATTCATCTATGGGAGGCAAGTGATGAAGGCTGGCGCTCTCGT\n', 'AGTAATGATTCACCGGTTTGTACAGGTGCGGAGTCGTTTATTGCTGGTACTGCTAGTTGCCGCATTGAAG\n', 'TAGAGGGAATTGATGAATTATATCAACATATTAAGCCTTTGGGCATTTTGCACCCCAATACATCATTAAA\n', 'AGATCAGTGGTGGGATGAACGAGACTTTGCAGTAATTGATCCCGACAACAATTTGATTAGCTTTTTTCAA\n', 'CAAATAAAAAGCTAAAATCTATTATTAATCTGTTCAGCAATCGGGCGCGATTGCTGAATAAAAGATACGA\n', 'GAGACCTCTCTTGTATCTTTTTTATTTTGAGTGGTTTTGTCCGTT\n', '\n']


In [5]:
# Show the first 20 lines of the file (all of them when the file is shorter).
# Being the last expression in the cell, the slice is rendered as the cell output.
fasta_content[:20]

['>NG_047557.1 Staphylococcus aureus N315 bleO gene for bleomycin binding protein, complete CDS\n',
 'CGGGCCATTTTGCGTAATAAGAAAAAGGATTAATTATGAGCGAATTGAATTAATAATAAGGTAATAGATT\n',
 'TACATTAGAAAATGAAAGGGGATTTTATGCGTGAGAATGTTACAGTCTATCCCGGCATTGCCAGTCGGGG\n',
 'ATATTAAAAAGAGTATAGGTTTTTATTGCGATAAACTAGGTTTCACTTTGGTTCACCATGAAGATGGATT\n',
 'CGCAGTTCTAATGTGTAATGAGGTTCGGATTCATCTATGGGAGGCAAGTGATGAAGGCTGGCGCTCTCGT\n',
 'AGTAATGATTCACCGGTTTGTACAGGTGCGGAGTCGTTTATTGCTGGTACTGCTAGTTGCCGCATTGAAG\n',
 'TAGAGGGAATTGATGAATTATATCAACATATTAAGCCTTTGGGCATTTTGCACCCCAATACATCATTAAA\n',
 'AGATCAGTGGTGGGATGAACGAGACTTTGCAGTAATTGATCCCGACAACAATTTGATTAGCTTTTTTCAA\n',
 'CAAATAAAAAGCTAAAATCTATTATTAATCTGTTCAGCAATCGGGCGCGATTGCTGAATAAAAGATACGA\n',
 'GAGACCTCTCTTGTATCTTTTTTATTTTGAGTGGTTTTGTCCGTT\n',
 '\n']

## Import protein dataset

In [6]:
# Path to the protein FASTA file. This rebinds file_path, so the earlier
# "sequence.fasta" value is no longer available after this cell runs.
file_path = "protein.fasta"

In [7]:
# Load the file currently named by file_path as a list of raw text lines
# (newline characters kept), replacing any previous fasta_content, then echo
# the whole list once the file handle has been closed.
with open(file_path) as fasta_handle:
    fasta_content = list(fasta_handle)

print(fasta_content)

[">YP_010963491.1 nucleocapsid protein [Bird's-foot trefoil nucleorhabdovirus]\n", 'MADITLEELKAIRPQYTSLSSVLRPEISSGQCVHREYTFQDAARFPIYKVRDLKNDEIVSIFKNITEVKK\n', 'SLNERDLYNIVSIALNVKDPFTNQRTIPDPFTPDVRCADFETAQPSESSEVNMLKEGGVHTITTIVPTSV\n', 'PMETSDSPTESIDEQASAICFLFAWLTRYVVKSPSQALSIQYSKVQDTYMKFYQKSSKIFDKFQADKLWI\n', 'LSLRNAFDAFLRVRNTLVLYVANAETVSKATPRIFNLLRYLFFQNLEFMGMHAYVSIVMIMSRIALPPAQ\n', 'ILTWLRMSGSELAIDEAYKIMANHDNGMKEGGATSERLWKYARILDAGYFNQLQTSYAAELIATLAYIEI\n', 'KLGISKEAGHSSPLNITVIADNKHIRDIGKAKAEAFMECKARSISLARDASAVDKIYAKRMGLDVSNLPP\n', 'PERPGHSQKRKEPVEAPRSEPVLPIKKPRDPPGRNIPPPPPF\n', '\n']


In [8]:
# Show the first 20 lines of the file (all of them when the file is shorter).
# Being the last expression in the cell, the slice is rendered as the cell output.
fasta_content[:20]

[">YP_010963491.1 nucleocapsid protein [Bird's-foot trefoil nucleorhabdovirus]\n",
 'MADITLEELKAIRPQYTSLSSVLRPEISSGQCVHREYTFQDAARFPIYKVRDLKNDEIVSIFKNITEVKK\n',
 'SLNERDLYNIVSIALNVKDPFTNQRTIPDPFTPDVRCADFETAQPSESSEVNMLKEGGVHTITTIVPTSV\n',
 'PMETSDSPTESIDEQASAICFLFAWLTRYVVKSPSQALSIQYSKVQDTYMKFYQKSSKIFDKFQADKLWI\n',
 'LSLRNAFDAFLRVRNTLVLYVANAETVSKATPRIFNLLRYLFFQNLEFMGMHAYVSIVMIMSRIALPPAQ\n',
 'ILTWLRMSGSELAIDEAYKIMANHDNGMKEGGATSERLWKYARILDAGYFNQLQTSYAAELIATLAYIEI\n',
 'KLGISKEAGHSSPLNITVIADNKHIRDIGKAKAEAFMECKARSISLARDASAVDKIYAKRMGLDVSNLPP\n',
 'PERPGHSQKRKEPVEAPRSEPVLPIKKPRDPPGRNIPPPPPF\n',
 '\n']

## Applying machine learning to bio-data

In [1]:
# Import library
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

In [2]:
# Toy training set: three six-character DNA strings, each paired by position
# with a binary class label.
sequences = [
    "ATGCGT",
    "ATCGTG",
    "GCTTAA",
]
labels = [1, 0, 1]  # 1: promoter, 0: non-promoter

In [3]:
# Encode every sequence as counts of its character bigrams: analyzer='char'
# tokenizes per character and ngram_range=(2, 2) keeps only 2-character n-grams.
# The vocabulary is learned from `sequences` in the same call that builds X.
vectorizer = CountVectorizer(
    ngram_range=(2, 2),
    analyzer='char',
)
X = vectorizer.fit_transform(sequences)

In [None]:
# Train a multinomial Naive Bayes classifier on the bigram-count matrix X,
# using the per-sequence class labels as targets.
classifier = MultinomialNB()
classifier.fit(X, labels)

In [None]:
# Classify a single sequence with the trained model.
# NOTE: "ATGCGT" is also the first training sequence, so this only checks that
# the model reproduces a label it was trained on, not how it handles unseen data.
new_seq = ["ATGCGT"]
# Use transform (not fit_transform) so the already-fitted vectorizer encodes the
# new sequence instead of being refitted on it.
X_new = vectorizer.transform(new_seq)
print(f"Prediction: {classifier.predict(X_new)}")