In [1]:
import pandas as pd
from Bio import SeqIO
from matplotlib import pyplot as plt
from tqdm import tqdm
import glob
import datetime

In [2]:
res = pd.read_table("/mnt/nfs/wangd/project/paper_db/peps_all_info_to_str.tsv")
res["pep_id"] = res.index

In [3]:
res_split = res['all_info'].str.split(';', expand=True).stack().reset_index(level=1, drop=True).to_frame('all_info')
res = res.drop('all_info', axis=1).join(res_split).reset_index(drop=True)

res_split2 = res['all_info'].str.split(',', expand=True)
res_split2.columns = ['sample', 'protein', 'gene', 'transcript', 'gene_biotype', 'gene_symbol']
res = pd.concat([res, res_split2], axis=1)
res = res.drop('all_info', axis=1)

In [4]:
res["gtf_index"] = res["gene"].str.split(".").apply(lambda x: x[0]) + "," + res["gene_symbol"] + "," + res["gene_biotype"]
res["xref_index"] = res["gene"].str.split(".").apply(lambda x: x[0]) + "," + res["transcript"].str.split(".").apply(lambda x: x[0]) + "," + res["protein"].str.split(".").apply(lambda x: x[0])
res.drop_duplicates()
res.fillna("",inplace=True)

### add gtf info

In [5]:
def add_gtf(file):
    gtf = pd.read_table(file, sep='\t', comment='#', header=None, names=['chrom', 'source','feature','start','end','score ','strand','frame','attribute'], low_memory=False)
    
    gtf['gene_id'] = gtf['attribute'].str.extract('gene_id "(.*?)";')
    gtf['gene_version'] = gtf['attribute'].str.extract('gene_version "(.*?)";')
    gtf['gene_name'] = gtf['attribute'].str.extract('gene_name "(.*?)";')
    gtf['gene_source'] = gtf['attribute'].str.extract('gene_source "(.*?)";')
    gtf['gene_biotype'] = gtf['attribute'].str.extract('gene_biotype "(.*?)";')
    gtf['projection_parent_gene'] = gtf['attribute'].str.extract('projection_parent_gene "(.*?)";')

    gtf.fillna("",inplace=True)
    gtf.index = gtf["gene_id"] + "," + gtf["gene_name"] + "," + gtf["gene_biotype"]

    gtf.drop(['attribute', 'gene_id', 'gene_name', 'gene_biotype'], axis=1, inplace=True)
    
    gtf_dict = {}
    for index, row in gtf.iterrows():
        if index in gtf_dict:
            gtf_dict[index].append( ','.join(map(str, row)))
        else:
            gtf_dict[index] = [ ','.join(map(str, row))]
    
    return gtf_dict

### add xref info

In [6]:
def add_xref(file):
    xref = pd.read_table(file)

    xref.fillna("",inplace=True)
    xref.index = xref["gene_stable_id"] + "," + xref["transcript_stable_id"] + "," + xref["protein_stable_id"]
    xref.drop(['gene_stable_id', 'transcript_stable_id', 'protein_stable_id'], axis=1, inplace=True)
    
    xref_dict = {}
    for index, row in xref.iterrows():
        if index in xref_dict:
            xref_dict[index].append( ','.join(map(str, row)))
        else:
            xref_dict[index] = [ ','.join(map(str, row))]
    return xref_dict

### add gtf And xref info

In [7]:
def get_info(info_dict,index):
    info = info_dict.get(index)
    if info:
        return ';'.join(info) 
    else:
        return ""

In [8]:
start_time = datetime.datetime.now()

all_dfs = []
grouped_dfs = res.groupby("sample")
for sample, group_df in tqdm(grouped_dfs):
    gtf_pattern = "/mnt/nfs/wangd/project/paper_db/" + "gtf/*" + sample + "*"
    xref_pattern = "/mnt/nfs/wangd/project/paper_db/" + "xref/*" + sample + "*"
    matching_gtf = glob.glob(gtf_pattern)[0]
    matching_xref = glob.glob(xref_pattern)[0]
    gtf_dict = add_gtf(matching_gtf)
    xref_dict = add_xref(matching_xref)

    group_df["gtf_info"] = group_df["gtf_index"].apply(lambda x: get_info(gtf_dict,x))
    group_df["xref_info"] = group_df["xref_index"].apply(lambda x: get_info(xref_dict,x))

    all_dfs.append(group_df)

end_time = datetime.datetime.now()
time_taken = end_time - start_time
print("Time consumption :", time_taken)

100%|██████████| 97/97 [5:12:46<00:00, 193.47s/it]  

Time consumption : 5:12:46.576471





In [9]:
all_res = pd.concat(all_dfs)
all_res.to_csv("/mnt/nfs/wangd/project/paper_db/add_gtf_xref_info.tsv", header=1, sep="\t",index = None)

In [10]:
pd.read_table("/mnt/nfs/wangd/project/paper_db/add_gtf_xref_info.tsv")

Unnamed: 0,sequence_x,protein_accessions,charge,scan_number,peptidoform,exp_mass_to_charge,calc_mass_to_charge,seq,tr,modifications,...,sample,protein,gene,transcript,gene_biotype,gene_symbol,gtf_index,xref_index,gtf_info,xref_info
0,AAAAAAAAAPAAAATAATTAATTAATAAQ,GCA_109298,2,28971,AAAAAAAAAPAAAATAATTAATTAATAAQ,1172.107178,1171.600967,AAAAAAAAAPAAAATAATTAATTAATAAQ,4215.9185,,...,GCA_009914755.4,ENSP05220080001.1,ENSG05220046501.1,ENST05220179403.1,protein_coding,SRP14,"ENSG05220046501,SRP14,protein_coding","ENSG05220046501,ENST05220179403,ENSP05220080001","15,ensembl,gene,37841027,37844451,.,-,.,1,ense...","GO:0005786,GO:0005786,signal recognition parti..."
1,AAAAAAAAAPAAAATAATTAATTAATAAQ,GCA_109298,2,28971,AAAAAAAAAPAAAATAATTAATTAATAAQ,1172.107178,1171.600967,AAAAAAAAAPAAAATAATTAATTAATAAQ,4215.9185,,...,GCA_009914755.4,ENSP05220080004.1,ENSG05220046501.1,ENST05220179408.1,protein_coding,SRP14,"ENSG05220046501,SRP14,protein_coding","ENSG05220046501,ENST05220179408,ENSP05220080004","15,ensembl,gene,37841027,37844451,.,-,.,1,ense...","GO:0005786,GO:0005786,signal recognition parti..."
2,AAAAAAAAAPAAAATAATTAATTAATAAQ,GCA_109298,2,28971,AAAAAAAAAPAAAATAATTAATTAATAAQ,1172.107178,1171.600967,AAAAAAAAAPAAAATAATTAATTAATAAQ,4215.9185,,...,GCA_009914755.4,ENSP05220080007.1,ENSG05220046501.1,ENST05220179411.1,protein_coding,SRP14,"ENSG05220046501,SRP14,protein_coding","ENSG05220046501,ENST05220179411,ENSP05220080007","15,ensembl,gene,37841027,37844451,.,-,.,1,ense...","GO:0005786,GO:0005786,signal recognition parti..."
3,AAAFYKNILGAQVSEVVPLPEHGVSVVFVNLGNTK,GCA_133924,4,62173,AAAFYKNILGAQ[Deamidated]VSEVVPLPEHGVSVVFVNLGNTK,918.255432,917.997627,AAAFYKNILGAQVSEVVPLPEHGVSVVFVNLGNTK,5808.4260,12|Deamidated,...,GCA_009914755.4,ENSP05220077422.1,ENSG05220045157.1,ENST05220173529.1,protein_coding,MCEE,"ENSG05220045157,MCEE,protein_coding","ENSG05220045157,ENST05220173529,ENSP05220077422","2,ensembl,gene,71120697,71141279,.,-,.,1,ensem...","UPI0000369399,UPI0000369399,,UniParc,CHECKSUM,,,"
4,AAASHLFPFEK,"GCA_27750,GCA_107325,GCA_279617,GCA_313537",3,24015,AAASHLFPFEK,406.549468,406.548631,AAASHLFPFEK,2901.2000,,...,GCA_009914755.4,ENSP05220065711.1,ENSG05220038055.1,ENST05220146556.1,protein_coding,XPC,"ENSG05220038055,XPC,protein_coding","ENSG05220038055,ENST05220146556,ENSP05220065711","3,ensembl,gene,14147095,14180485,.,-,.,1,ensem...","A0A024R2M8,A0A024R2M8,,UniProtKB generic acces..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
912085,YVQSAQSQIHNTCWAMMGLMAVR,GCA_102815,3,22873,YVQSAQSQ[Deamidated]IHNTC[Carbamidomethyl]WAMM...,900.753479,900.411420,YVQSAQSQIHNTCWAMMGLMAVR,3952.7363,8|Deamidated|13|Carbamidomethyl|17|Oxidation,...,GCA_021951015.1,ENSP05525043594.1,ENSG05525025770.1,ENST05525097559.1,protein_coding,LSS,"ENSG05525025770,LSS,protein_coding","ENSG05525025770,ENST05525097559,ENSP05525043594","21,ensembl,gene,38336946,38377856,.,-,.,1,ense...","GO:0005811,GO:0005811,lipid droplet,GO,DEPENDE..."
912086,YVSLENQHCEHCNSCTSK,GCA_156290,3,5609,YVSLENQHC[Carbamidomethyl]EHC[Carbamidomethyl]...,751.640869,751.642109,YVSLENQHCEHCNSCTSK,823.0072,9|Carbamidomethyl|12|Carbamidomethyl|15|Carbam...,...,GCA_021951015.1,ENSP05525070939.1,ENSG05525041216.1,ENST05525158826.1,protein_coding,ZCCHC4,"ENSG05525041216,ZCCHC4,protein_coding","ENSG05525041216,ENST05525158826,ENSP05525070939","4,ensembl,gene,25451616,25509248,.,+,.,1,ensem...","GO:0003676,GO:0003676,nucleic acid binding,GO,..."
912087,YVSLENQHCEHCNSCTSK,GCA_156290,3,5609,YVSLENQHC[Carbamidomethyl]EHC[Carbamidomethyl]...,751.640869,751.642109,YVSLENQHCEHCNSCTSK,823.0072,9|Carbamidomethyl|12|Carbamidomethyl|15|Carbam...,...,GCA_021951015.1,ENSP05525070944.1,ENSG05525041216.1,ENST05525158838.1,protein_coding,ZCCHC4,"ENSG05525041216,ZCCHC4,protein_coding","ENSG05525041216,ENST05525158838,ENSP05525070944","4,ensembl,gene,25451616,25509248,.,+,.,1,ensem...","GO:0003676,GO:0003676,nucleic acid binding,GO,..."
912088,YVSLENQHCEHCNSCTSKDGR,GCA_156290,5,9951,YVSLENQ[Deamidated]HC[Carbamidomethyl]EHC[Carb...,517.421082,517.214883,YVSLENQHCEHCNSCTSKDGR,1444.7499,7|Deamidated|9|Carbamidomethyl|12|Carbamidomet...,...,GCA_021951015.1,ENSP05525070939.1,ENSG05525041216.1,ENST05525158826.1,protein_coding,ZCCHC4,"ENSG05525041216,ZCCHC4,protein_coding","ENSG05525041216,ENST05525158826,ENSP05525070939","4,ensembl,gene,25451616,25509248,.,+,.,1,ensem...","GO:0003676,GO:0003676,nucleic acid binding,GO,..."


### gtf test

In [180]:
gtf = pd.read_csv('/mnt/nfs/wangd/project/paper_db/gtf/Homo_sapiens-GCA_018505865.1-2022_07-genes.gtf', sep='\t', comment='#', header=None, names=['chrom', 'source','feature','start','end','score ','strand','frame','attribute'])

In [181]:
gtf['gene_id'] = gtf['attribute'].str.extract('gene_id "(.*?)";')
gtf['gene_version'] = gtf['attribute'].str.extract('gene_version "(.*?)";')
gtf['gene_name'] = gtf['attribute'].str.extract('gene_name "(.*?)";')
gtf['gene_source'] = gtf['attribute'].str.extract('gene_source "(.*?)";')
gtf['gene_biotype'] = gtf['attribute'].str.extract('gene_biotype "(.*?)";')
gtf['projection_parent_gene'] = gtf['attribute'].str.extract('projection_parent_gene "(.*?)";')

gtf.fillna("",inplace=True)
gtf.index = gtf["gene_id"] + "," + gtf["gene_name"] + "," + gtf["gene_biotype"]

gtf.drop(['attribute', 'gene_id', 'gene_name', 'gene_biotype'], axis=1, inplace=True)

In [190]:
gtf_dict = {}
for index, row in tqdm(gtf.iterrows()):
    if index in gtf_dict:
        gtf_dict[index].append( ','.join(map(str, row)))
    else:
        gtf_dict[index] = [ ','.join(map(str, row))]

3100070it [01:39, 31058.78it/s]


### xref test

In [86]:
xref = pd.read_table("/mnt/nfs/wangd/project/paper_db/xref/Homo_sapiens-GCA_018852615.1-2022_07-xref.tsv")

xref.fillna("",inplace=True)
xref.index = xref["gene_stable_id"] + "," + xref["transcript_stable_id"] + "," + xref["protein_stable_id"]
xref.drop(['gene_stable_id', 'transcript_stable_id', 'protein_stable_id'], axis=1, inplace=True)

In [53]:
xref_dict = {}
for index, row in tqdm(xref.iterrows()):
    if index in xref_dict:
        xref_dict[index].append( ','.join(map(str, row)))
    else:
        xref_dict[index] = [ ','.join(map(str, row))]

848450it [00:27, 30632.87it/s]
