In [1]:
import requests
import pandas as pd

In [2]:
def get_pfam_from_interpro_dict(uniprot_id, timeout=30):
    """Fetch 'pfam-n' domain locations for a protein from the InterPro API.

    Queries the InterPro protein endpoint (with ``ida`` and
    ``extra_features`` enabled) and keeps every returned entry whose
    ``source_database`` is ``'pfam-n'``.

    Parameters
    ----------
    uniprot_id : str
        UniProt accession to look up.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).
        Without a timeout a stalled connection would block forever.

    Returns
    -------
    dict or list
        On success, ``{accession: [{'start': ..., 'end': ...}, ...]}`` with
        one start/end pair per reported location. On failure (request error
        or any status other than 200) an error line is printed and an empty
        list is returned, so callers can tell failure from success by type.
    """
    url = f'https://www.ebi.ac.uk/interpro/api/protein/uniprot/{uniprot_id}?ida=ida&extra_features=extra_features'
    try:
        response = requests.get(url, headers={"Accept": "application/json"}, timeout=timeout)
    except requests.RequestException as exc:
        # Timeouts / connection failures are reported the same way as bad
        # status codes so one unreachable request does not abort a batch run.
        print(f"Error fetching data from interpro for {uniprot_id}: {exc}")
        return []

    if response.status_code != 200:
        print(f"Error fetching data from interpro for {uniprot_id}: {response.status_code}")
        return []

    pfam_dict = {}
    for entry in response.json().values():
        if entry['source_database'] != 'pfam-n':
            continue
        # Only the first fragment of each location is recorded.
        # NOTE(review): locations with several fragments lose the later
        # ones here - confirm this is intended.
        pfam_dict[entry['accession']] = [
            {
                'start': location['fragments'][0]['start'],
                'end': location['fragments'][0]['end'],
            }
            for location in entry['locations']
        ]
    return pfam_dict

In [3]:
def get_pfam_from_uniprot_dict(uniprot_id, timeout=30):
    """Fetch Pfam cross-references for a protein from the EBI Proteins API.

    Reads the ``dbReferences`` list of the protein record and keeps the
    references whose ``type`` is ``'Pfam'`` or ``'Pfam-B'``.

    Parameters
    ----------
    uniprot_id : str
        UniProt accession to look up.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).
        Without a timeout a stalled connection would block forever.

    Returns
    -------
    dict or list
        On success, ``{pfam_id: entry_name}`` (empty when the record has no
        ``dbReferences`` or no Pfam references). On failure (request error
        or any status other than 200) an error line is printed and an empty
        list is returned, so callers can tell failure from success by type.
    """
    url = f"https://www.ebi.ac.uk/proteins/api/proteins/{uniprot_id}"
    try:
        response = requests.get(url, headers={"Accept": "application/json"}, timeout=timeout)
    except requests.RequestException as exc:
        # Timeouts / connection failures are reported the same way as bad
        # status codes so one unreachable request does not abort a batch run.
        print(f"Error fetching data from uniprot for {uniprot_id}: {exc}")
        return []

    if response.status_code != 200:
        print(f"Error fetching data from uniprot for {uniprot_id}: {response.status_code}")
        return []

    protein_data = response.json()
    # 'Pfam' and 'Pfam-B' references are handled identically.
    return {
        reference['id']: reference['properties']['entry name']
        for reference in protein_data.get('dbReferences', [])
        if reference['type'] in ('Pfam', 'Pfam-B')
    }

In [4]:
# Fill the 'Domain' column of the gene table with Pfam annotations.
# Preferred source is InterPro (domain accession + coordinates), written as
#   "<name> <accession> <start>-<end> <start>-<end> ..."
# with domains separated by ", ". When InterPro yields no domains the
# UniProt cross-references are used instead ("<name> <accession>", no
# coordinates). When both lookups fail the cell is set to ''.
genes = pd.read_csv('../v2.0/EpiGenes_main.csv')

pfam_dict = {}   # UniProt accession -> result of get_pfam_from_interpro_dict
pfam_names = {}  # UniProt accession -> result of get_pfam_from_uniprot_dict
for index, uniprot_ac in genes['UniProt_AC'].items():
    # The same accession can appear on several rows; query it only once.
    if uniprot_ac not in pfam_dict:
        pfam_dict[uniprot_ac] = get_pfam_from_interpro_dict(uniprot_ac)
        pfam_names[uniprot_ac] = get_pfam_from_uniprot_dict(uniprot_ac)
    interpro_domains = pfam_dict[uniprot_ac]
    uniprot_domains = pfam_names[uniprot_ac]

    # Both fetchers return [] (a list) on failure and a dict on success.
    interpro_ok = isinstance(interpro_domains, dict)
    uniprot_ok = isinstance(uniprot_domains, dict)

    if interpro_ok and interpro_domains:
        # A failed UniProt lookup just means no names are known; every
        # domain then gets the generic label 'domain'.
        known_names = uniprot_domains if uniprot_ok else {}
        pfam_list = []
        for pfam_id, fragments in interpro_domains.items():
            coors_str = ' '.join(
                f"{fragment['start']}-{fragment['end']}" for fragment in fragments
            )
            pfam_name = known_names.get(pfam_id, 'domain')
            pfam_list.append(f"{pfam_name} {pfam_id} {coors_str}")
        genes.at[index, 'Domain'] = ', '.join(pfam_list)
    elif uniprot_ok:
        # InterPro failed or returned no matching entries: fall back to the
        # UniProt cross-references, which carry names but no coordinates.
        pfam_list = [
            f"{pfam_name} {pfam_id}" for pfam_id, pfam_name in uniprot_domains.items()
        ]
        genes.at[index, 'Domain'] = ', '.join(pfam_list)
    else:
        # Neither source answered; show what came back and blank the cell.
        print(interpro_domains)
        print(uniprot_domains)
        genes.at[index, 'Domain'] = ''

Error fetching data from interpro for Q6ZN18: 204
Error fetching data from interpro for Q9NXW9: 204
Error fetching data from interpro for Q9NWV8: 204
Error fetching data from interpro for Q8IXM2: 204
Error fetching data from interpro for Q9Y6B2: 204
Error fetching data from interpro for Q8N6I1: 204
Error fetching data from interpro for Q96D98: 204
Error fetching data from interpro for Q8WUU5: 204
Error fetching data from interpro for P08107: 204
Error fetching data from uniprot for P08107: 404
[]
[]
Error fetching data from interpro for P08107: 204
Error fetching data from uniprot for P08107: 404
[]
[]
Error fetching data from interpro for Q8NBZ0: 204
Error fetching data from interpro for Q9P267: 204
Error fetching data from interpro for Q96DN6: 204
Error fetching data from interpro for Q9NS73: 204
Error fetching data from interpro for O60828: 204
Error fetching data from interpro for Q96IZ7: 204
Error fetching data from interpro for Q6P1X5: 204
Error fetching data from interpro for P0

In [7]:
# Display the gene table to inspect the 'Domain' column filled in above.
genes

Unnamed: 0,Id,HGNC_symbol,Status,HGNC_ID,HGNC_name,GeneID,UniProt_AC,UniProt_ID,Domain,MGI_symbol,...,Function,Modification,PMID_function,Complex_name,Target,Specific_target,Product,UniProt_ID_target,PMID_target,Comment
0,1,A1CF,#,24086,APOBEC1 complementation factor,29974,Q9NQ94,A1CF_HUMAN,"RRM_1 PF00076 58-126 138-199 233-297, DND1_DSR...",A1cf,...,RNA modification,RNA deamination,10781591,APOB_mRNA_editosome,RNA,"mRNA, mC",U,#,10781591,ASP=A1CF has three RNA-binding domains with ho...
1,2,ACINU,New,17066,Apoptotic chromatin condensation inducer in th...,22985,Q9UKV3,ACINU_HUMAN,"SAP PF02037 72-106, RSB_motif PF16294 1171-1247",Acin1,...,RNA modification,Alternative splicing,22203037,#,RNA,mRNA,#,#,22203037,Production of the proapoptotic Bcl-x(S) splice...
2,3,ACTB,#,132,"actin, beta",60,P60709,ACTB_HUMAN,Actin PF00022 4-375,Actb,...,Chromatin remodeling cofactor,#,10966108,"BAF, nBAF, npBAF, PBAF, SWI/SNF-like EPAFB, bB...",chromatin,#,#,#,10966108,β-actin=ACTB and actin-related proteins appear...
3,4,ACTL6A,#,24124,actin-like 6A,86,O96019,ACL6A_HUMAN,Actin PF00022 10-428,Actl6a,...,Chromatin remodeling cofactor,#,9845365,"BAF, npBAF, PBAF, SWI/SNF_Brg1(I), SWI/SNF_Brg...",chromatin,#,#,#,9845365,β-actin and BAF53 =ACTL6A are required for max...
4,5,ACTL6B,#,160,actin-like 6B,51412,O94805,ACL6B_HUMAN,Actin PF00022 9-425,Actl6b,...,Chromatin remodeling cofactor,#,11726552,"BAF, nBAF, PBAF, SWI/SNF_Brg1(I), SWI/SNF_Brg1...",chromatin,#,#,#,11726552,Belongs to the chromatin remodeling brain-spec...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
796,797,ZNF687,#,29277,zinc finger protein 687,57592,Q8N1G0,ZN687_HUMAN,"zf-C2H2 PF00096 993-1016 1200-1222, zf-C2H2_11...",Zfp687,...,Histone modification erase cofactor,Histone acetylation,25123934,#,histone,#,#,#,#,A member of NuRD complex.
797,798,ZNF711,#,13128,zinc finger protein 711,7552,Q9Y462,ZN711_HUMAN,"Zfx_Zfy_act PF04704 62-356, zf-C2H2 PF00096 38...",Zfp711,...,Histone modification erase cofactor,Histone acetylation,20346720,#,histone,#,#,#,20346720,The PHD domain of PHF8 binds to H3K4me3 and co...
798,799,ZNHIT1,#,21688,"zinc finger, HIT-type containing 1",10467,O43257,ZNHI1_HUMAN,zf-HIT PF04438 113-141,Znhit1,...,"Chromatin remodeling cofactor, Histone modific...",Histone acetylation,15647280,SRCAP,histone,#,#,#,15647280,YL1 protein is also present in cells as a subu...
799,800,ZRANB3,#,25249,"zinc finger, RAN-binding domain containing 3",84083,Q5FWF4,ZRAB3_HUMAN,"SNF2-rel_dom PF00176 40-299, Helicase_C PF0027...",Zranb3,...,"Chromatin remodeling, Histone modification rea...",Histone methylation,22705370,#,histone,#,#,#,22705370,"All four proteins (HARP, HARP-like domain (HPL..."


In [15]:
# Save the annotated gene table in two formats, both without the DataFrame
# index and with a header row.

# Tab-separated text file (note: tab-delimited despite the .csv extension).
genes.to_csv(path_or_buf="EpiGenes_main.csv", sep='\t', header=True, index=False)

# Excel workbook with a single sheet named 'epigenes_main'.
genes.to_excel(
    "EpiGenes_main.xlsx",
    sheet_name='epigenes_main',
    header=True,
    index=False,
)