In [1]:
import os
import gzip

from collections import defaultdict

In [21]:
# Input: tab-separated gene table (symbol, Entrez ID, float p), parsed by read_p_file.
symbol_entrez_p_file = '../data_example/symbol_entrez_p.example.tsv'
# Input: tab-separated sentence tagging file, parsed by read_agac.
sent_tagging_file = '../data_example/sent_tagging.example.txt'
# Output: "Type<TAB>Phrase" term table written by read_agac.
bio_concept_save_file = '../data_example/Bio-concept.example.txt'
# Output: per-gene summary (a GENE_LINE header followed by sentence lines) written by read_agac.
entrez_p_sent_save_file = '../data_example/summary_data.example.txt'

# Tag types that read_agac exports to the term table and checks when filtering sentences.
save_tag_type = {'CPA', 'MPA', 'Disease', 'Phenotype'}


In [23]:

def read_p_file(p_file: str):
    """Load the gene table and index it by Entrez ID.

    Each non-blank line of ``p_file`` is split on tabs; the first three
    fields are read as gene symbol, Entrez ID and a float ``p`` value.
    If an Entrez ID occurs more than once, the last occurrence wins.

    Args:
        p_file: Path to the tab-separated gene table.

    Returns:
        A tuple ``(entrez_to_p, entrez_to_symbol)`` of dicts keyed by the
        Entrez ID string.

    Raises:
        IndexError: If a non-blank line has fewer than three fields.
        ValueError: If the third field cannot be parsed as a float.
    """
    entrez_to_p = {}
    entrez_to_symbol = {}
    with open(p_file) as f:
        for line in f:
            stripped = line.strip()
            if not stripped:
                # Blank (or whitespace-only) lines, e.g. a trailing empty line,
                # carry no record; indexing into them would raise IndexError.
                continue
            fields = stripped.split('\t')

            gene_symbol = fields[0]
            entrez = fields[1]
            p = float(fields[2])

            entrez_to_p[entrez] = p
            entrez_to_symbol[entrez] = gene_symbol

    print(f'gene count: {len(entrez_to_p)}')
    return entrez_to_p, entrez_to_symbol

# Read the AGAC annotation file, apply quality control, and save the merged files.
def read_agac(agac_file: str, term_save_file: str, entrez_p_sent_save_file: str, entrez_to_p: dict,
              entrez_symbol_map: dict = None, keep_tag_types=None, max_sentences: int = 10):
    """Parse a sentence tagging file and write a term table and a per-gene summary.

    Input format (as consumed here): one record per non-blank line, tab-separated.
    Field 1 is the PMID, field 2 the sentence, and every field from index 3 on is
    the Python-literal text of one tag tuple whose element 0 is the phrase/symbol,
    element 1 the tag type and element 2 an identifier (the Entrez ID for
    ``'Gene'`` tags). Tags are pooled per sentence text, regardless of PMID.

    Outputs:
      * ``term_save_file``: a ``Type\\tPhrase`` header followed by one
        ``{type}-{phrase}\\t{phrase}`` line per distinct (type, phrase) whose type
        is in ``keep_tag_types``, in sorted order.
      * ``entrez_p_sent_save_file``: for every gene in ``entrez_to_p`` (ascending
        ``p``) that has at least one sentence, a ``GENE_LINE:`` header followed by
        up to ``max_sentences`` lines of ``pmid``, sentence and the sentence's tag set.

    Sentence selection: genes with more than ``max_sentences`` sentences keep the
    ones with the most tags of a type in ``keep_tag_types``; ties (and genes under
    the limit) are ordered by (pmid, sentence) so that reruns give identical files.

    Args:
        agac_file: Path of the tagging file to read.
        term_save_file: Path of the term table to write.
        entrez_p_sent_save_file: Path of the per-gene summary to write.
        entrez_to_p: Mapping Entrez ID -> p value; defines which genes are written
            and in which order.
        entrez_symbol_map: Mapping Entrez ID -> gene symbol. Defaults to the
            notebook-level ``entrez_to_symbol``.
        keep_tag_types: Collection of tag types counted as informative. Defaults
            to the notebook-level ``save_tag_type``.
        max_sentences: Maximum number of sentences kept per gene.

    Returns:
        None. Results are written to the two output files; progress is printed.

    Raises:
        IndexError: If a non-blank line has fewer than three fields or a tag has
            fewer than three elements.
        KeyError: If a written gene is missing from ``entrez_symbol_map``.
    """
    # Fall back to the notebook globals so the original four-argument call still works.
    if entrez_symbol_map is None:
        entrez_symbol_map = entrez_to_symbol
    if keep_tag_types is None:
        keep_tag_types = save_tag_type

    entrez_to_text = defaultdict(set)      # Entrez ID -> {(pmid, sentence)}
    sentence_to_tags = defaultdict(set)    # sentence text -> {tag tuple}
    concept_terms = set()                  # {(tag type, phrase)} destined for the term table

    with open(agac_file) as f:
        for line_count, line in enumerate(f, start=1):
            if line_count % 50000 == 0:
                print(line_count)

            stripped = line.strip()
            if not stripped:
                # Blank lines carry no record; indexing into them would raise IndexError.
                continue
            fields = stripped.split('\t')
            pmid = fields[1]
            sentence = fields[2]
            for raw_tag in fields[3:]:
                # NOTE(security): eval() executes arbitrary code from the input file.
                # Only use trusted tagging files; if every tag is a plain literal
                # tuple, ast.literal_eval is the safe drop-in -- confirm against the
                # real data before switching.
                tag = eval(raw_tag)
                sentence_to_tags[sentence].add(tag)

                symbol = tag[0]
                tag_type = tag[1]
                entrez = tag[2]

                if tag_type in keep_tag_types:
                    concept_terms.add((tag_type, symbol))
                if len(tag) == 3:
                    # Three-element tags are never used as gene evidence.
                    continue
                if tag_type == 'Gene':
                    entrez_to_text[entrez].add((pmid, sentence))

    with open(term_save_file, 'w') as wf:
        wf.write('Type\tPhrase\n')
        # Sort (via str so mixed value types cannot break the comparison) to make
        # the file independent of set iteration order.
        for tag_type, symbol in sorted(concept_terms, key=lambda t: (str(t[0]), str(t[1]))):
            wf.write(f'{tag_type}-{symbol}\t{symbol}\n')

    print(f'sentence_recall_entrez: {len(entrez_to_text)}')

    def quality_count(item):
        # Number of tags on this sentence whose *type* (element 1) is informative.
        # The type lives at index 1, exactly as in the parsing loop above; index 2
        # is the identifier and never matches a tag type.
        _, sent = item
        return sum(1 for t in sentence_to_tags[sent] if t[1] in keep_tag_types)

    entrez_to_quality_text = {}
    for entrez, sentences in entrez_to_text.items():
        if len(sentences) <= max_sentences:
            entrez_to_quality_text[entrez] = sorted(sentences)
        else:
            # Highest count first; (pmid, sentence) breaks ties deterministically.
            ranked = sorted(sentences, key=lambda item: (-quality_count(item), item))
            entrez_to_quality_text[entrez] = ranked[:max_sentences]
    print('sentence filter done.')

    recall_count = 0
    with open(entrez_p_sent_save_file, 'w') as wf:
        for entrez in sorted(entrez_to_p, key=lambda x: entrez_to_p[x]):
            kept = entrez_to_quality_text.get(entrez)
            if not kept:
                continue
            recall_count += 1
            p = entrez_to_p[entrez]
            symbol = entrez_symbol_map[entrez]
            wf.write(f'GENE_LINE:\t{symbol}\t{entrez}\t{p}\n')
            for (pmid, sentence) in kept:
                tag = str(sentence_to_tags[sentence])
                wf.write(f'{pmid}\t{sentence}\t{tag}\n')

    print(f'sentence_map_recall_entrez: {recall_count}')
    print(f'{entrez_p_sent_save_file} save done.')



In [None]:
# Load the gene table. read_agac relies on the notebook-level entrez_to_symbol
# produced here (in addition to the entrez_to_p argument), so run this cell first.
entrez_to_p, entrez_to_symbol = read_p_file(symbol_entrez_p_file)

In [24]:
# Write the term table and the per-gene sentence summary. Requires the cell that
# defines entrez_to_p / entrez_to_symbol to have been executed beforehand.
read_agac(sent_tagging_file, bio_concept_save_file, entrez_p_sent_save_file, entrez_to_p)

50000
100000
150000
200000
250000
300000
350000
400000
450000
500000
550000
600000
650000
sentence_recall_entrez: 20632
sentence filter done.
sentence_map_recall_entrez: 10498
