# Parsing data

In [11]:
from __future__ import print_function

In [12]:
import re
import os
import glob

In [13]:
# Directory holding the scraped input files, expressed relative to the
# notebook's working directory (one level up, then "data-scrapes").
scrape_dir = os.path.join("..", "data-scrapes")
print(scrape_dir)

..\data-scrapes


In [14]:
import datetime
import time

# Capture the wall-clock time once; the formatted stamp `st` is reused as the
# filename suffix for the output files produced by this notebook run.
ts = time.time()
st = datetime.datetime.fromtimestamp(ts).strftime('%Y%m%d-%H%M%S')

# Destination for the converted sequences: ../data/protein-seqs_<stamp>.txt
out_file = os.path.join('..', 'data', 'protein-seqs_' + st + '.txt')
print("Converting sequences. . . to [" + out_file + "]")

Converting sequences. . . to [..\data\protein-seqs_20180714-095845.txt]


In [15]:
# Running count of FASTA header lines seen so far. The conversion loop
# increments it across all input files and stops once it exceeds its cap.
num_proteins_done = 0

# glob.glob returns matches in arbitrary (filesystem-dependent) order. Because
# the conversion loop only keeps a limited number of records, sort the paths
# so the same files -- and therefore the same proteins -- are processed first
# on every run and platform.
fasta_files = sorted(glob.glob(scrape_dir + "/*fasta"))
print(fasta_files)

['..\\data-scrapes\\all-human-0001.fasta']


In [16]:
def dump_to_file(protein_id, seq, path=None):
    """Append one ``<protein_id>,<seq>`` line to a text file.

    Args:
        protein_id: Identifier string written before the comma.
        seq: Sequence string written after the comma.
        path: File to append to. When omitted, the notebook-level ``out_file``
            is used, so existing two-argument calls behave exactly as before.
    """
    target = out_file if path is None else path
    print('Writing', protein_id, seq, "to file", target)
    # Append mode: every call adds a new line and keeps earlier records.
    with open(target, "a") as f:
        f.write(protein_id + "," + seq + "\n")

In [17]:
# Upper bound on the number of FASTA records converted in total; the counter
# `num_proteins_done` is shared across all input files.
MAX_PROTEINS = 10

for fname in fasta_files:
    print("Converting %s:" % fname)

    with open(fname, 'r') as f:
        # Accumulators for the record currently being read; an empty
        # protein_id means "no unflushed record".
        protein_seq = ''
        protein_id = ''

        for line in f:
            # Header lines have the form ">xx|ID|..."; group 2 is the ID.
            match = re.search(r'^>([a-z]{2})\|([A-Z0-9]*)\|', line)
            if match:
                # A new header terminates the previous record: write it out.
                if protein_id != '':
                    dump_to_file(protein_id, protein_seq)
                    # Mark the record as flushed. Without this, hitting the
                    # cap below would leave protein_id set and the post-loop
                    # flush would write the same record a second time.
                    protein_id = ''

                num_proteins_done += 1
                if num_proteins_done > MAX_PROTEINS:
                    break

                protein_id = match.group(2)
                protein_seq = ''
            else:
                # NOTE(review): every non-matching line lands here, including
                # a '>' header that does not fit the pattern above; its text
                # would be appended to the current sequence -- confirm inputs
                # never contain such headers.
                protein_seq += line.strip()

        # Flush the final record of the file (no trailing header triggers it).
        if protein_id != '':
            dump_to_file(protein_id, protein_seq)

Converting ..\data-scrapes\all-human-0001.fasta:
Writing P27361 MAAAAAQGGGGGEPRRTEGVGPGVPGEVEMVKGQPFDVGPRYTQLQYIGEGAYGMVSSAYDHVRKTRVAIKKISPFEHQTYCQRTLREIQILLRFRHENVIGIRDILRASTLEAMRDVYIVQDLMETDLYKLLKSQQLSNDHICYFLYQILRGLKYIHSANVLHRDLKPSNLLINTTCDLKICDFGLARIADPEHDHTGFLTEYVATRWYRAPEIMLNSKGYTKSIDIWSVGCILAEMLSNRPIFPGKHYLDQLNHILGILGSPSQEDLNCIINMKARNYLQSLPSKTKVAWAKLFPKSDSKALDLLDRMLTFNPNKRITVEEALAHPYLEQYYDPTDEPVAEEPFTFAMELDDLPKERLKELIFQETARFQPGVLEAP to file ..\data\protein-seqs_20180714-095845.txt
Writing P53779 MSLHFLYYCSEPTLDVKIAFCQGFDKQVDVSYIAKHYNMSKSKVDNQFYSVEVGDSTFTVLKRYQNLKPIGSGAQGIVCAAYDAVLDRNVAIKKLSRPFQNQTHAKRAYRELVLMKCVNHKNIISLLNVFTPQKTLEEFQDVYLVMELMDANLCQVIQMELDHERMSYLLYQMLCGIKHLHSAGIIHRDLKPSNIVVKSDCTLKILDFGLARTAGTSFMMTPYVVTRYYRAPEVILGMGYKENVDIWSVGCIMGEMVRHKILFPGRDYIDQWNKVIEQLGTPCPEFMKKLQPTVRNYVENRPKYAGLTFPKLFPDSLFPADSEHNKLKASQARDLLSKMLVIDPAKRISVDDALQHPYINVWYDPAEVEAPPPQIYDKQLDEREHTIEEWKELIYKEVMNSEEKTKNGVVKGQPSPSGAAVNSSESLPPSSSVNDISSMSTDQTLASDTDSSLEASAGPLGCCR to file ..\data\protein-seq

In [18]:
# Function identifiers to keep. The annotation-parsing cell compares these
# against the text captured after "GO:" on each annotation line, so they are
# written without the "GO:" prefix.
# NOTE(review): 0005524 is presumably the Gene Ontology term for ATP binding
# -- confirm against the GO database.
target_functions = ['0005524']

# Output path for the matching protein IDs; shares the run timestamp `st`
# with the sequence output file.
out_file_fns = os.path.join("..", "data", "protein_fun_" + st + ".txt")
print("Converting functions to [", out_file_fns, "]")

Converting functions to [ ..\data\protein_fun_20180714-095845.txt ]


In [19]:
# glob.glob returns matches in arbitrary (filesystem-dependent) order; sort so
# the annotation files are parsed -- and the resulting ID list is ordered --
# the same way on every run and platform.
annot_files = sorted(glob.glob(scrape_dir + "/*annotations.txt"))
print(annot_files)

['..\\data-scrapes\\all-human-0001-annotations.txt']


In [20]:
import json

# Protein IDs whose annotation line carries one of the target function IDs,
# in the order encountered.
func_list = []

for fname in annot_files:
    print("Parsing", fname)
    with open(fname, 'r') as f:
        for line in f:
            # group 1: the ID token immediately before " GO:";
            # group 2: the text between "GO:" and "; F:".
            match = re.search(r'([A-Z0-9]*)\sGO:(.*);\sF:.*;', line)
            if not match:
                continue

            protein_id = match.group(1)
            function = match.group(2)

            if function in target_functions:
                func_list.append(protein_id)

# Write the result once, after every annotation file has been parsed, instead
# of rewriting the whole output file after each input file. This also yields
# a valid (empty-list) JSON file when no annotation files were found.
with open(out_file_fns, 'w') as fp:
    json.dump(func_list, fp)

# Preview only the first few IDs to keep the cell output short.
print(func_list[:10])

Parsing ..\data-scrapes\all-human-0001-annotations.txt
['P27361', 'P53779', 'Q9UHC1', 'Q9NYL2', 'O15440', 'P33527', 'Q92887', 'O15438', 'O15439', 'Q5T3U5']
