In [26]:
import re
import os

def load_fasta(fasta_path):
    """Without Using Biopython"""
    fasta = open(fasta_path, 'r').read()
    record_dict = dict()
    records = fasta.split('>')
    for rec in records:
        if rec:
            rec_lines = rec.split('\n')
            header = rec_lines[0]
            seq = ''.join(rec_lines[1:]).replace(' ', '')
            record_dict[header] = seq
    return record_dict

def check_seq_for_motif(seq, motif):
    """
    Check seq for motif
    Input:
    seq: str, sequence
    motif: str, motif to search seq for
    """
    r_all = re.findall(motif, seq)
    r_first = re.search(motif, seq)
    motif_count = len(r_all)
    if r_first:
        start, end = r_first.span()
        print(f'Found motif: {motif}, {motif_count} time(s).')
        print(f'Motif first found at positions {start} - {end}\n\n')
    else:
        print(f'Unable to find {motif}\n\n')
        
    return motif_count
        
        
def check_all_seqs(fasta_path, motif):
    """
    Input:
    fasta_path: str, path to fasta file
    motif: str, motif to search for within fasta
    
    """
    if os.path.exists(fasta_path):
        record_dict = load_fasta(fasta_path)
    else:
        print(f'Fasta path {fasta_path} does not exist.')
        raise OSError
    total_count = 0
    N_records = 0
    for header in record_dict:
        N_records += 1
        print(f'Fasta header: {header}\n')
        tcount = check_seq_for_motif(record_dict[header], motif)
        total_count += tcount
    print(f'Motif discovered {total_count} time(s) across {N_records} records.')

#check_all_seqs('BRCA1.faa', 'QKVNE')

try_count = 0
while try_count < 3:
    try:
        fasta_path = input('Enter local Fasta: ')
        motif = input('Enter Motif: ')
        print('\n')
        check_all_seqs(fasta_path, motif)
        break
    except Exception:
        print('Issue with fasta input, check fasta.')
    try_count += 1

Enter local Fasta:  BRCA1.faa
Enter Motif:  QKVNE




Fasta header: NP_009225.1 BRCA1 [organism=Homo sapiens] [GeneID=672] [isoform=1]

Found motif: QKVNE, 1 time(s).
Motif first found at positions 379 - 384


Fasta header: NP_009228.2 BRCA1 [organism=Homo sapiens] [GeneID=672] [isoform=3]

Found motif: QKVNE, 1 time(s).
Motif first found at positions 332 - 337


Fasta header: NP_009229.2 BRCA1 [organism=Homo sapiens] [GeneID=672] [isoform=4]

Unable to find QKVNE


Fasta header: NP_009230.2 BRCA1 [organism=Homo sapiens] [GeneID=672] [isoform=5]

Unable to find QKVNE


Fasta header: NP_009231.2 BRCA1 [organism=Homo sapiens] [GeneID=672] [isoform=2]

Found motif: QKVNE, 1 time(s).
Motif first found at positions 379 - 384


Motif discovered 3 time(s) across 5 records.


In [7]:
import re
def load_fasta(fasta_path):
    """Without Using Biopython"""
    fasta = open(fasta_path, 'r').read()
    record_dict = dict()
    records = fasta.split('>')
    for rec in records:
        if rec:
            rec_lines = rec.split('\n')
            header = rec_lines[0]
            seq = ''.join(rec_lines[1:]).replace(' ', '')
            record_dict[header] = seq
    return record_dict

records = load_fasta('BRCA1.faa')
for header in records:
    r = re.search('\w+\.\w', header)
    print(r.group())

NP_009225.1
NP_009228.2
NP_009229.2
NP_009230.2
NP_009231.2


In [40]:
import re
text =  """Several rapidly developing RNA interference (RNAi)
methodologies hold the promise to selectively inhibit gene expression in
mammals. RNAi is an innate cellular process activated when a
double-stranded RNA (dsRNA) molecule of greater than 19 duplex
nucleotides enters the cell, causing the degradation of not only the
invading dsRNA molecule, but also single-stranded (ssRNAs) RNAs of
identical sequences, including endogenous mRNAs."""

results = re.finditer('[\w\(\)]*RNA[\w\(\)]*', text)
match_list = []
for r in results:
    string = r.group()
    end = r.span()[1]
    if string == 'RNA':
        continue
    if string not in match_list:
        match_list.append(string)
    else:
        continue
    format_line = f'{string} ends at position {end}\n'
    print(format_line)

(RNAi) ends at position 50

RNAi ends at position 137

(dsRNA) ends at position 212

dsRNA ends at position 331

(ssRNAs) ends at position 375

RNAs ends at position 380

mRNAs ends at position 431



In [47]:
def load_fasta(fasta_path):
    """Without Using Biopython"""
    fasta = open(fasta_path, 'r').read()
    record_dict = dict()
    records = fasta.split('>')
    for rec in records:
        if rec:
            rec_lines = rec.split('\n')
            header = rec_lines[0].split(' ')[0]
            seq = ''.join(rec_lines[1:]).replace(' ', '')
            record_dict[header] = seq[:100]
    print(f'Loaded {len(record_dict)} records')
    return record_dict

rec_dict = load_fasta('BRCA1.faa')
rec_dict#['NP_009225.1']

print(rec_dict['NP_009225.1'])

Loaded 5 records


'MDLSALRVEEVQNVINAMQKILECPICLELIKEPVSTKCDHIFCKFCMLKLLNQKKGPSQCPLCKNDITKRSLQESTRFSQLVEELLKIICAFQLDTGLE'

In [60]:
def load_fasta(fasta_path):
    """Without Using Biopython"""
    fasta = open(fasta_path, 'r').read()
    record_dict = dict()
    records = fasta.split('>')
    for rec in records:
        if rec:
            rec_lines = rec.split('\n')
            header = rec_lines[0]
            seq = ''.join(rec_lines[1:]).replace(' ', '')
            record_dict[header] = seq
    return record_dict

gencode = {
'ATA':'I', 'ATC':'I', 'ATT':'I', 'ATG':'M',
'ACA':'T', 'ACC':'T', 'ACG':'T', 'ACT':'T',
'AAC':'N', 'AAT':'N', 'AAA':'K', 'AAG':'K',
'AGC':'S', 'AGT':'S', 'AGA':'R', 'AGG':'R',
'CTA':'L', 'CTC':'L', 'CTG':'L', 'CTT':'L',
'CCA':'P', 'CCC':'P', 'CCG':'P', 'CCT':'P',
'CAC':'H', 'CAT':'H', 'CAA':'Q', 'CAG':'Q',
'CGA':'R', 'CGC':'R', 'CGG':'R', 'CGT':'R',
'GTA':'V', 'GTC':'V', 'GTG':'V', 'GTT':'V',
'GCA':'A', 'GCC':'A', 'GCG':'A', 'GCT':'A',
'GAC':'D', 'GAT':'D', 'GAA':'E', 'GAG':'E',
'GGA':'G', 'GGC':'G', 'GGG':'G', 'GGT':'G',
'TCA':'S', 'TCC':'S', 'TCG':'S', 'TCT':'S',
'TTC':'F', 'TTT':'F', 'TTA':'L', 'TTG':'L',
'TAC':'Y', 'TAT':'Y', 'TAA':'_', 'TAG':'_',
'TGC':'C', 'TGT':'C', 'TGA':'_', 'TGG':'W'}

def get_aa(seq):
    seq_len = len(seq)
    aa_seq = ''
    for ind in range(0, seq_len, 3):
        end = ind + 3
        codon = seq[ind:end]
        if len(codon) != 3:
            continue
        aa = gencode[codon]
        aa_seq += aa
    print(aa_seq)

inpu = input('Enter in fasta file path: ')
records = load_fasta(inpu)
for header in records:
    print('Header: ', header)
    aa = get_aa(records[header])
    print('AA seq: ', aa)


Enter in fasta file path:  seq1.txt


Header:  seq1
AWTHRSISRRGTFLEDVEGLEQLPRAVRKRRLQCGR_IAS_MERLG_RPGEKCLSPRGFTTSQELYKTNRTSSFLRCGQCSLIPLLQGGI_ATKSMLRFAQKGRFSIYFMSNRLRNYLCREVSGTNKHNRGSV
AA seq:  None


In [79]:
import re
dna = open('dna.txt', 'r').read().strip('\n')

re_pattern = 'A[ATGC]TAAT'

results = re.finditer(re_pattern, dna)
seqs = []

start = 0
for r in results:
    end = r.span()[1] - 3
    seq = dna[start:end]
    seqs.append(seq)
    start = end
seq = dna[start:-1]
seqs.append(seq)
seq_lens = [len(seq) for seq in seqs]
print('Fragment lengths: ', seq_lens)
print('Average sequence length: ', sum(seq_lens) / len(seq_lens))

(1140, 1146)
0 1143
(1625, 1631)
1143 1628
Fragment lengths:  [1143, 485, 383]
Average sequence length:  670.3333333333334


In [120]:
import pandas as pd
import re
enzy_dict = pd.read_csv('bionet.txt', sep='\s{3,}', header=None, skiprows=9, engine='python', index_col=0).to_dict()[1]

enzy_dict2 = dict()
with open('bionet.txt', 'r') as bionet:
    bionet = bionet.read()
    results = re.findall('.+I.+', bionet)
    for r in results:
        key, val = re.split('\s{3,}', r)
        enzy_dict2[key] = val
    

enzy_dict2

{'AaaI (XmaIII)': 'C^GGCCG',
 'AacI (BamHI)': 'GGATCC',
 'AaeI (BamHI)': 'GGATCC',
 'AagI (ClaI)': 'AT^CGAT',
 'AaqI (ApaLI)': 'GTGCAC',
 'AarI': '^NNNNNNNNGCAGGTG',
 'AatI (StuI)': 'AGG^CCT',
 'AatII': 'GACGT^C',
 'AauI (Bsp1407I)': 'T^GTACA',
 'AbaI (BclI)': 'T^GATCA',
 'AbeI (BbvCI)': 'GC^TGAGG',
 'AbrI (XhoI)': 'C^TCGAG',
 'AcaI (AsuII)': 'TTCGAA',
 'AcaII (BamHI)': 'GGATCC',
 'AcaIII (MstI)': 'TGCGCA',
 'AcaIV (HaeIII)': 'GGCC',
 'AccI': 'GT^MKAC',
 'AccII (FnuDII)': 'CG^CG',
 'AccIII (BspMII)': 'T^CCGGA',
 'Acc16I (MstI)': 'TGC^GCA',
 'Acc36I (BspMI)': '^NNNNNNNNGCAGGT',
 'Acc38I (EcoRII)': 'CCWGG',
 'Acc65I (KpnI)': 'G^GTACC',
 'Acc113I (ScaI)': 'AGT^ACT',
 'AccB1I (HgiCI)': 'G^GYRCC',
 'AccB2I (HaeII)': 'RGCGC^Y',
 'AccB7I (PflMI)': 'CCANNNN^NTGG',
 'AccBSI (BsrBI)': 'GAG^CGG',
 'AccEBI (BamHI)': 'G^GATCC',
 'AceI (TseI)': 'G^CWGC',
 'AceII (NheI)': 'GCTAG^C',
 'AceIII': '^NNNNNNNNNNNGAGCTG',
 'AciI': 'G^CGG',
 'AclI': 'AA^CGTT',
 'AclNI (SpeI)': 'A^CTAGT',
 'AclWI (BinI)': 'GG