In [None]:
# script for download and parse pfam domains using interpro API, and secondary structure predictions using mobidb API
# https://www.ebi.ac.uk/interpro/api/protein/UniProt/O96013/entry/pfam

In [1]:
import pandas as pd
import os
import requests
import pyranges as pr
import json

In [2]:
msa_regions_pdbs = pd.read_csv('../datasets/msa_regions_pdbs.tsv', sep= '\t')
to_calculate = pd.read_csv('../datasets/to_calculate_distances.tsv', sep= '\t')

## PDBs no calculated

In [4]:
to_calculate.pdb.nunique() # 828 prepared to calculate interatomic distances

828

In [5]:
path = '../datasets/interatomic_distances_both_regions_kd_cre/'
pdbs_calculated = []
pdbs_empty = []

for file in os.listdir(path):
    if file.endswith(".csv"):
        if os.path.getsize(path + file) < 100:
            pdbs_empty.append(file)
        else:
            pdbs_calculated.append(file) # ok!

pdbs_calculated = [ i.split(".")[0] for i in pdbs_calculated]
pdbs_empty = [ i.split(".")[0] for i in pdbs_empty]

In [6]:
pdbs_no_calculated = set(to_calculate.pdb.unique().tolist()) - set(pdbs_calculated)

In [43]:
# # Save txt with pdbs no calculated
# with open('../datasets/pdbs_no_calculated.txt', 'w') as f:
#     for pdb in pdbs_no_calculated:
#         f.write("%s," % pdb)

## PDBs empty

In [7]:
# Check if they have different chains belonging to the same uniprot
pdbs_empty

['1ig1', '1jkl', '1jks', '1p4f', '2hz4', '4fif', '4fii', '4l67']

In [8]:
# SIFT pdb - Uniprot mapping
pdb_uniprot_mapping = pd.read_csv('../datasets/pdb_uniprot_mapping.tsv', sep= '\t')
pdb_uniprot_mapping

Unnamed: 0,uniprot,pdb,chain,res_beg,res_end,pdb_beg,pdb_end,sp_beg,sp_end
0,A0A003,6kv9,A,19,326,13,320,13,320
1,A0A003,6kvc,A,19,327,13,321,13,321
2,A0A009I821,7m4w,R,1,109,1,109,1,109
3,A0A009I821,7m4x,R,1,109,1,109,1,109
4,A0A009I821,7m4y,R,1,109,1,109,1,109
...,...,...,...,...,...,...,...,...,...
635147,X8CHM4,5o5l,G,30,253,205,428,205,428
635148,X8CHM4,5o5l,H,30,253,205,428,205,428
635149,X8CHM4,5o5l,I,30,253,205,428,205,428
635150,X8CHM4,5o5l,J,30,253,205,428,205,428


In [47]:
pdb_uniprot_mapping[pdb_uniprot_mapping.pdb.isin(pdbs_empty)]

Unnamed: 0,uniprot,pdb,chain,res_beg,res_end,pdb_beg,pdb_end,sp_beg,sp_end
125690,O96013,4fif,A,51,344,296,589,296,589
125691,O96013,4fif,B,52,344,297,589,297,589
125692,O96013,4fif,C,1,7,49,55,49,55
125693,O96013,4fif,D,1,7,49,55,49,55
125697,O96013,4fii,A,54,345,299,590,299,590
125698,O96013,4fii,B,1,6,49,54,49,54
125704,O96013,4l67,A,1,290,300,589,300,589
125705,O96013,4l67,B,4,23,39,58,39,58
132667,P00519,2hz4,A,7,273,234,500,234,500
132668,P00519,2hz4,B,6,271,233,498,233,498


## CREs into groups  
try to clasify CREs into the following groups: N-ter, C-ter, intradomain, other

In [4]:
all_shortest_distances_mapped = pd.read_csv('../datasets/interatomic_distances_both_regions_kd_cre/all_shortest_distances_mapped.tsv', sep= '\t')
all_shortest_distances_mapped

Unnamed: 0,chain_a,pos_kd,aa_a,atom_a,chain_b,pos_cre,aa_b,atom_b,dist,pdb,pdb_chain,pos_cre_uniprot,msa,kd_location
0,A,111,GLY,O,A,285,ASN,ND2,2.839370,1a06,1a06_A,285,Q63450_60,n-ter
1,A,111,GLY,O,A,285,ASN,ND2,2.839370,1a06,1a06_A,285,Q91YS8_60,n-ter
2,A,113,TYR,O,A,286,ILE,N,2.824408,1a06,1a06_A,286,Q63450_60,n-ter
3,A,113,TYR,O,A,286,ILE,N,2.824408,1a06,1a06_A,286,Q91YS8_60,n-ter
4,A,107,ILE,O,A,287,HIS,NE2,2.719374,1a06,1a06_A,287,Q63450_60,n-ter
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14478,A,849,GLN,NE2,A,990,PRO,O,3.034817,8d76,8d76_A,990,P00533_60,n-ter
14479,A,850,HIS,N,A,992,PRO,CG,7.311846,8d76,8d76_A,992,P00533_60,n-ter
14480,A,848,PRO,CD,A,995,SER,OG,3.173897,8d76,8d76_A,995,P00533_60,n-ter
14481,A,741,PRO,CG,A,996,ASN,O,9.931299,8d76,8d76_A,996,P00533_60,n-ter


In [5]:
all_shortest_distances_mapped[['msa', 'pdb', 'kd_location']].drop_duplicates().kd_location.value_counts()

n-ter    565
c-ter    116
Name: kd_location, dtype: int64

### Get the pfam domains from InterPro API

In [53]:
interpro_api_url = 'https://www.ebi.ac.uk/interpro/api/protein/UniProt/{}/entry/pfam'
data_list = []

for accession in msa_regions_pdbs.uniprot.unique():
    # construct the API url
    api_url = interpro_api_url.format(accession)

    # Send the API request
    response = requests.get(api_url)
    # Check if the request was successful
    if response.status_code == 200:
        # Save the json response
        filename = f'../datasets/domains/json_files/{accession}.json'
        with open(filename, 'w') as file:
            file.write(response.text)
        
        json_data = response.json()
        entry_subset = json_data.get("entry_subset", [])

        for entry in entry_subset:
            #entry["accession"] = accession

            protein_info = json_data.get("metadata", {})
            pfam_info = entry

            # Extract the desired information from the protein_info
            accession = protein_info.get("accession")
            id = protein_info.get("id")
            source_organism = protein_info.get("source_organism", {}).get("scientificName")
            taxId = protein_info.get("source_organism", {}).get("taxId")
            scientificName = protein_info.get("source_organism", {}).get("scientificName")
            name = protein_info.get("name")
            description = protein_info.get("description")
            length = protein_info.get("length")
            sequence = protein_info.get("sequence")

            # Extract the desired information from the pfam_info
            pfam_accession = entry.get("accession")
            start = entry.get("entry_protein_locations", [])[0].get("fragments", [])[0].get("start")
            end = entry.get("entry_protein_locations", [])[0].get("fragments", [])[0].get("end")
            entry_integrated = entry.get("entry_integrated")


            # Append the extracted data to the data_list
            data_list.append({
                "accession": accession,
                "id": id,
                "source_organism": source_organism,
                "taxId": taxId,
                "scientificName": scientificName,
                "name": name,
                "description": description,
                "length": length,
                "sequence": sequence,
                "pfam_accession": pfam_accession,
                "start": start,
                "end": end,
                "entry_integrated": entry_integrated
            })

        #data_list.extend(entry_subset)


    else:
        print(f'Error downloading json for {accession}. Status code: {response.status_code}')

df = pd.DataFrame(data_list)

In [44]:
df

Unnamed: 0,accession,id,source_organism,taxId,scientificName,name,description,length,sequence,pfam_accession,start,end,entry_integrated
0,A5K0N4,KGP_PLAVS,Plasmodium vivax (strain Salvador I),126793,Plasmodium vivax (strain Salvador I),cGMP-dependent protein kinase,[Serine/threonine protein kinase which acts as...,846,MRCNERNKKKAIFSNDDFSGEDTLMEDHLQLREKLSEDIEMIKASL...,PF00027,71,151,ipr000595
1,A5K0N4,KGP_PLAVS,Plasmodium vivax (strain Salvador I),126793,Plasmodium vivax (strain Salvador I),cGMP-dependent protein kinase,[Serine/threonine protein kinase which acts as...,846,MRCNERNKKKAIFSNDDFSGEDTLMEDHLQLREKLSEDIEMIKASL...,PF00069,535,791,ipr000719
2,Q8I719,KGP_PLAF7,Plasmodium falciparum (isolate 3D7),36329,Plasmodium falciparum (isolate 3D7),cGMP-dependent protein kinase,[Serine/threonine protein kinase which acts as...,853,MEEDDNLKKGNERNKKKAIFSNDDFTGEDSLMEDHLELREKLSEDI...,PF00027,78,158,ipr000595
3,Q8I719,KGP_PLAF7,Plasmodium falciparum (isolate 3D7),36329,Plasmodium falciparum (isolate 3D7),cGMP-dependent protein kinase,[Serine/threonine protein kinase which acts as...,853,MEEDDNLKKGNERNKKKAIFSNDDFTGEDSLMEDHLELREKLSEDI...,PF00069,542,798,ipr000719
4,O74536,SNF1_SCHPO,Schizosaccharomyces pombe (strain 972 / ATCC 2...,284812,Schizosaccharomyces pombe (strain 972 / ATCC 2...,SNF1-like protein kinase ssp2,[Serine/threonine protein kinase essential for...,576,MQPQEVDLMENSTMRNGARVLPPEAISKRHIGPYIIRETLGEGSFG...,PF16579,451,573,ipr032270
...,...,...,...,...,...,...,...,...,...,...,...,...,...
104,Q8IBS5,CDPK4_PLAF7,Plasmodium falciparum (isolate 3D7),36329,Plasmodium falciparum (isolate 3D7),Calcium-dependent protein kinase 4,[Calcium-dependent protein kinase which acts a...,528,MGQEVSSVNNTKNEHHKTNKKSLKGGNERHEMKESSVGISKKIVEN...,PF00069,70,328,ipr000719
105,Q8IBS5,CDPK4_PLAF7,Plasmodium falciparum (isolate 3D7),36329,Plasmodium falciparum (isolate 3D7),Calcium-dependent protein kinase 4,[Calcium-dependent protein kinase which acts a...,528,MGQEVSSVNNTKNEHHKTNKKSLKGGNERHEMKESSVGISKKIVEN...,PF13499,381,450,ipr002048
106,Q8VDF3,DAPK2_MOUSE,Mus musculus,10090,Mus musculus,Death-associated protein kinase 2,[Calcium/calmodulin-dependent serine/threonine...,370,MVQASMRSPNMETFKQQKVEDFYDIGEELGSGQFAIVKKCREKSTG...,PF00069,23,285,ipr000719
107,Q9UIK4,DAPK2_HUMAN,Homo sapiens,9606,Homo sapiens,Death-associated protein kinase 2,[Calcium/calmodulin-dependent serine/threonine...,370,MFQASMRSPNMEPFKQQKVEDFYDIGEELGSGQFAIVKKCREKSTG...,PF00069,23,285,ipr000719


In [41]:
# Add domains name
import re
import gzip

def parse_pfam_mapping(file_path):
    accession_domain_mapping = {}

    with gzip.open(file_path, 'rt') as file:
        content = file.read()

    # Use regex to extract accession and domain names
    pattern = r"#=GF ID\s+(\w+)\n#=GF AC\s+([\w\.]+)\n"
    matches = re.findall(pattern, content)

    # Create a dictionary with accession as keys and domain names as values
    for match in matches:
        accession = match[1]
        accession = accession.split(".")[0]
        domain_name = match[0]
        accession_domain_mapping[accession] = domain_name

    return accession_domain_mapping

# Specify the path to the PFAM mapping file
file_path = '../raw_data/Pfam-A.hmm.dat.gz'

# Parse the file and get the accession-domain mapping
mapping = parse_pfam_mapping(file_path)

pfam_domains = pd.DataFrame.from_dict(mapping, orient= 'index', columns= ['domain_name']).reset_index().rename(columns={'index': 'pfam'}).sort_values('pfam')
pfam_domains


Unnamed: 0,pfam,domain_name
42,PF00001,7tm_1
43,PF00002,7tm_2
44,PF00003,7tm_3
72,PF00004,AAA
155,PF00005,ABC_tran
...,...,...
7835,PF20620,DUF6805
7836,PF20621,DUF6806
1233,PF20622,Big_15
15497,PF20623,Sgo0707_N2


In [107]:
to_add = {'pfam': 'PF07714', 'domain_name': 'PK_Tyr_Ser'}
pfam_domains = pfam_domains.append(to_add, ignore_index= True)
pfam_domains

Unnamed: 0,pfam,domain_name
0,PF00001,7tm_1
1,PF00002,7tm_2
2,PF00003,7tm_3
3,PF00004,AAA
4,PF00005,ABC_tran
...,...,...
18105,PF20621,DUF6806
18106,PF20622,Big_15
18107,PF20623,Sgo0707_N2
18108,PF20624,DMRT5_DMB


In [54]:
# Format it
df = df[['accession', 'source_organism', 'name', 'pfam_accession', 'start', 'end', 'entry_integrated']].rename(
    columns= {
        'accession': 'uniprot',
        'name': 'protein_name',
        'pfam_accession': 'pfam',
        'entry_integrated': 'interpro'
    }
)

In [108]:
pfam = df[['uniprot', 'pfam', 'protein_name', 'start', 'end']].merge(pfam_domains, how= 'left')
pfam

Unnamed: 0,uniprot,pfam,protein_name,start,end,domain_name
0,A5K0N4,PF00027,cGMP-dependent protein kinase,71,151,cNMP_binding
1,A5K0N4,PF00069,cGMP-dependent protein kinase,535,791,Pkinase
2,Q8I719,PF00027,cGMP-dependent protein kinase,78,158,cNMP_binding
3,Q8I719,PF00069,cGMP-dependent protein kinase,542,798,Pkinase
4,O74536,PF16579,SNF1-like protein kinase ssp2,451,573,AdenylateSensor
...,...,...,...,...,...,...
104,Q8IBS5,PF00069,Calcium-dependent protein kinase 4,70,328,Pkinase
105,Q8IBS5,PF13499,Calcium-dependent protein kinase 4,381,450,
106,Q8VDF3,PF00069,Death-associated protein kinase 2,23,285,Pkinase
107,Q9UIK4,PF00069,Death-associated protein kinase 2,23,285,Pkinase


In [109]:
pfam[pfam.domain_name.isna()].pfam.unique() # les falta el domain name

array(['PF00757', 'PF17988', 'PF07679', 'PF13499'], dtype=object)

In [110]:
#pfam.to_csv('../datasets/pfam_domains.tsv', sep= '\t', index= False)

In [111]:
regions = msa_regions_pdbs[['uniprot', 'term_id_cre', 'start_cre', 'end_cre', 'length_cre', 'term_id_kd', 'start_kd', 'end_kd']].drop_duplicates()
pfam_cre = pfam.merge(regions)
pfam_cre

Unnamed: 0,uniprot,pfam,protein_name,start,end,domain_name,term_id_cre,start_cre,end_cre,length_cre,term_id_kd,start_kd,end_kd
0,A5K0N4,PF00027,cGMP-dependent protein kinase,71,151,cNMP_binding,cre10,1,22,22,kd10,534,791
1,A5K0N4,PF00069,cGMP-dependent protein kinase,535,791,Pkinase,cre10,1,22,22,kd10,534,791
2,Q8I719,PF00027,cGMP-dependent protein kinase,78,158,cNMP_binding,cre223,1,29,29,kd223,541,798
3,Q8I719,PF00069,cGMP-dependent protein kinase,542,798,Pkinase,cre223,1,29,29,kd223,541,798
4,O74536,PF16579,SNF1-like protein kinase ssp2,451,573,AdenylateSensor,cre21,305,351,47,kd21,34,285
...,...,...,...,...,...,...,...,...,...,...,...,...,...
109,Q8IBS5,PF00069,Calcium-dependent protein kinase 4,70,328,Pkinase,cre224,350,358,9,kd224,71,329
110,Q8IBS5,PF13499,Calcium-dependent protein kinase 4,381,450,,cre224,350,358,9,kd224,71,329
111,Q8VDF3,PF00069,Death-associated protein kinase 2,23,285,Pkinase,cre233,292,301,10,kd233,23,285
112,Q9UIK4,PF00069,Death-associated protein kinase 2,23,285,Pkinase,cre273,292,301,10,kd273,23,285


In [112]:
# Find wich CREs overlaps PFAM domains using Pyranges join method
regions_pr = pr.PyRanges(regions.rename(columns={'uniprot': 'Chromosome', 'start_cre': 'Start', 'end_cre': 'End'}))
pfam_pr = pr.PyRanges(pfam.rename(columns={'uniprot': 'Chromosome', 'start': 'Start', 'end': 'End'}))

In [113]:
overlapped = pfam_pr.join(regions_pr, slack= 1, suffix= '_cre', report_overlap= True)
overlapped = overlapped.as_df()
overlapped = overlapped.rename(columns= {'Chromosome': 'uniprot'})
overlapped.columns = overlapped.columns.str.lower()
overlapped["fraction_overlap"] = round(overlapped.overlap / overlapped.length_cre, 2)
overlapped

Unnamed: 0,uniprot,pfam,protein_name,start,end,domain_name,term_id_cre,start_cre,end_cre,length_cre,term_id_kd,start_kd,end_kd,overlap,fraction_overlap
0,O74536,PF08587,SNF1-like protein kinase ssp2,305,345,UBA_2,cre21,305,351,47,kd21,34,285,41,0.87
1,O96013,PF00786,Serine/threonine-protein kinase PAK 4,10,63,PBD,cre32,1,130,130,kd34,321,572,55,0.42
2,P00519,PF00018,Tyrosine-protein kinase ABL1,67,113,SH3_1,cre36,61,233,173,kd37,242,493,48,0.28
3,P00519,PF00017,Tyrosine-protein kinase ABL1,127,202,SH2,cre36,61,233,173,kd37,242,493,77,0.45
4,P00523,PF00018,Proto-oncogene tyrosine-protein kinase Src,87,134,SH3_1,cre37,88,248,161,kd38,267,520,47,0.29
5,P00523,PF00017,Proto-oncogene tyrosine-protein kinase Src,148,230,SH2,cre37,88,248,161,kd38,267,520,84,0.52
6,P00533,PF07714,Epidermal growth factor receptor,713,965,PK_Tyr_Ser,cre39,965,998,34,kd39,712,979,1,0.03
7,P10721,PF07714,Mast/stem cell growth factor receptor Kit,589,923,PK_Tyr_Ser,cre51,544,591,48,kd51,589,937,3,0.06
8,P28482,PF00069,Mitogen-activated protein kinase 1,25,313,Pkinase,cre66,173,196,24,kd66,25,313,23,0.96
9,P29323,PF14575,Ephrin type-B receptor 2,544,617,EphA2_TM,cre71,588,621,34,kd71,621,884,30,0.88


In [114]:
#overlapped.to_csv('../datasets/pfam_cre_overlap.tsv', sep= '\t', index= False)

In [115]:
overlapped.domain_name.value_counts()

Pkinase       7
PK_Tyr_Ser    3
KA1           2
PBD           2
SH3_1         2
SH2           2
PH            2
FERM_N_2      1
EphA2_TM      1
UBA_2         1
FERM_M        1
Name: domain_name, dtype: int64

In [118]:
overlapped.term_id_cre.nunique()

21

In [116]:
overlapped[overlapped.uniprot == 'P00519']

Unnamed: 0,uniprot,pfam,protein_name,start,end,domain_name,term_id_cre,start_cre,end_cre,length_cre,term_id_kd,start_kd,end_kd,overlap,fraction_overlap
2,P00519,PF00018,Tyrosine-protein kinase ABL1,67,113,SH3_1,cre36,61,233,173,kd37,242,493,48,0.28
3,P00519,PF00017,Tyrosine-protein kinase ABL1,127,202,SH2,cre36,61,233,173,kd37,242,493,77,0.45


## PFAM - CRE

In [None]:
pd.read_csv('../datasets/pfam_cre_overlap.tsv')

## Secondary structure scores - Fess scores

In [3]:
# Dataset with all proteins
#msa_regions_pdbs = pd.read_csv('../datasets/msa_regions_pdbs.tsv', sep= '\t')
all_msas = pd.read_csv('../datasets/all_msas.tsv', sep= '\t')
all_msas = all_msas[all_msas.msa.str.endswith("_60")]

region_to_transfer = pd.read_csv('../homology_transfer/pipeline_MSA_uniprot/data_snake/12_region_to_transfer.txt', sep= " ")
region_to_transfer = region_to_transfer[(region_to_transfer.full_identity == '60') & (region_to_transfer.region_identity == '60')]

In [4]:
set1 = set(region_to_transfer.uniprot_acc.unique()) # unique proteins in our orthologs dataset
set2 = set(all_msas.uniprot.unique())
all = set1.union(set2)
all = list(all)

In [5]:
grouped_strings = []
for i in range(0, len(all), 20):
    group = all[i:i+20]
    group_string = ",".join(group)
    grouped_strings.append(group_string)

In [11]:
# Empty df to concat the new data
# cols to keep
columns = ['acc', 'length', 'organism', 'ncbi_taxon_id', 'sequence',
       'homology_domain_pfam', 'homology_domain_merge',
       'prediction_disorder_mobidb_lite',  "prediction_helix_fess", "prediction_sheet_fess", "prediction_coil_fess"] # 'homology_disorder_disprot',

mobidb_lite = pd.DataFrame() #columns= columns

i= 0
errors = {}
# Empty list to store raw JSON data
raw_json_data = []

for chunk in grouped_strings: 
    # API download mobidb
    to_download = f'https://mobidb.bio.unipd.it/api/download?acc={chunk}&projection='
    # Request
    response = requests.get(to_download)

    if response.status_code == 200:
        try:
            # Store raw JSON data
            raw_json_data.append(response.text)

            # Replace blank spaces with a comma so json loads can read it properly
            json_data = response.text.replace("}\n{", "},{")
            j_text = "[" + json_data + "]"
            # Parse the json string
            query = json.loads(j_text)

            # To dataframe
            df = pd.DataFrame(query)
            # Keep cols of interest
            df.columns = df.columns.str.replace("-", "_")
            df = df[columns]
            
            # Add the new data
            mobidb_lite = pd.concat([mobidb_lite, df])

            i += 1
            print(f"chunk {i} downloaded and parsed. No. of entries {mobidb_lite.shape[0]}")
        
        except BaseException as e:
            print(f'an exception occured in chunk {i}: {e}')
            continue

    else:
        errors[i] = to_download
        print(f"error with {to_download}")

# Concatenate raw JSON data and save to a file
concatenated_json = "[" + ",".join(raw_json_data) + "]"
with open("../raw_data/mobidb_query_all_orthologs.json", "w") as json_file:
    json_file.write(concatenated_json)

chunk 1 downloaded and parsed. No. of entries 20
chunk 2 downloaded and parsed. No. of entries 40
chunk 3 downloaded and parsed. No. of entries 60
chunk 4 downloaded and parsed. No. of entries 80
chunk 5 downloaded and parsed. No. of entries 100
chunk 6 downloaded and parsed. No. of entries 120
chunk 7 downloaded and parsed. No. of entries 140
chunk 8 downloaded and parsed. No. of entries 160
chunk 9 downloaded and parsed. No. of entries 180
chunk 10 downloaded and parsed. No. of entries 200
chunk 11 downloaded and parsed. No. of entries 220
chunk 12 downloaded and parsed. No. of entries 240
chunk 13 downloaded and parsed. No. of entries 260
chunk 14 downloaded and parsed. No. of entries 280
chunk 15 downloaded and parsed. No. of entries 300
chunk 16 downloaded and parsed. No. of entries 320
chunk 17 downloaded and parsed. No. of entries 340
chunk 18 downloaded and parsed. No. of entries 360
chunk 19 downloaded and parsed. No. of entries 380
chunk 20 downloaded and parsed. No. of entri

In [None]:
#mobidb_lite.to_csv('../raw_data/mobidb_query_all_orthologs.tsv', sep= '\t', index= False)

In [101]:
mobidb_lite = mobidb_lite[['acc', 'length', 'prediction_helix_fess', 'prediction_sheet_fess', 'prediction_coil_fess', 'score_helix_fess']].reset_index()
mobidb_lite.drop(columns='index', inplace= True)
mobidb_lite

Unnamed: 0,acc,length,prediction_helix_fess,prediction_sheet_fess,prediction_coil_fess,score_helix_fess
0,A0A091DQI4,973,"{'scores': [0.004, 0.068, 0.114, 0.153, 0.145,...","{'scores': [0.006, 0.454, 0.59, 0.617, 0.618, ...","{'scores': [0.99, 0.478, 0.296, 0.229, 0.237, ...","[0.004, 0.068, 0.114, 0.153, 0.145, 0.125, 0.1..."
1,A0A0A2V081,315,"{'scores': [0.003, 0.028, 0.03, 0.011, 0.013, ...","{'scores': [0.002, 0.044, 0.051, 0.037, 0.016,...","{'scores': [0.995, 0.927, 0.919, 0.952, 0.97, ...","[0.003, 0.028, 0.03, 0.011, 0.013, 0.054, 0.12..."
2,A0A0D9R447,433,"{'scores': [0.003, 0.021, 0.02, 0.018, 0.054, ...","{'scores': [0.002, 0.064, 0.068, 0.107, 0.141,...","{'scores': [0.995, 0.915, 0.912, 0.874, 0.805,...","[0.003, 0.021, 0.02, 0.018, 0.054, 0.082, 0.12..."
3,A0A0D9RZF3,974,"{'scores': [0.003, 0.018, 0.019, 0.023, 0.027,...","{'scores': [0.01, 0.516, 0.633, 0.636, 0.653, ...","{'scores': [0.987, 0.467, 0.348, 0.341, 0.32, ...","[0.003, 0.018, 0.019, 0.023, 0.027, 0.03, 0.03..."
4,A0A0E0IYS8,513,"{'scores': [0.003, 0.011, 0.013, 0.01, 0.038, ...","{'scores': [0.002, 0.055, 0.064, 0.028, 0.027,...","{'scores': [0.995, 0.933, 0.923, 0.962, 0.935,...","[0.003, 0.011, 0.013, 0.01, 0.038, 0.036, 0.03..."
...,...,...,...,...,...,...
8406,D6W6N7,359,"{'scores': [0.003, 0.039, 0.119, 0.146, 0.157,...","{'scores': [0.002, 0.071, 0.197, 0.187, 0.134,...","{'scores': [0.995, 0.89, 0.684, 0.666, 0.709, ...","[0.003, 0.039, 0.119, 0.146, 0.157, 0.236, 0.2..."
8407,G5EA63,395,"{'scores': [0.003, 0.015, 0.058, 0.035, 0.052,...","{'scores': [0.002, 0.034, 0.028, 0.023, 0.034,...","{'scores': [0.995, 0.951, 0.914, 0.942, 0.914,...","[0.003, 0.015, 0.058, 0.035, 0.052, 0.05, 0.06..."
8408,H0Z3Y5,1130,"{'scores': [0.003, 0.028, 0.041, 0.063, 0.08, ...","{'scores': [0.003, 0.363, 0.592, 0.684, 0.671,...","{'scores': [0.994, 0.609, 0.367, 0.253, 0.249,...","[0.003, 0.028, 0.041, 0.063, 0.08, 0.126, 0.11..."
8409,P36507,400,"{'scores': [0.004, 0.167, 0.198, 0.199, 0.121,...","{'scores': [0.002, 0.088, 0.092, 0.107, 0.118,...","{'scores': [0.994, 0.746, 0.711, 0.694, 0.76, ...","[0.004, 0.167, 0.198, 0.199, 0.121, 0.046, 0.0..."


In [106]:
mobidb_lite['score_helix_fess'] = mobidb_lite.prediction_helix_fess.apply(lambda x: x['scores'])
mobidb_lite['score_sheet_fess'] = mobidb_lite.prediction_sheet_fess.apply(lambda x: x['scores'])
mobidb_lite['score_coil_fess'] = mobidb_lite.prediction_coil_fess.apply(lambda x: x['scores'])

In [109]:
mobidb_lite # ok!

Unnamed: 0,acc,length,prediction_helix_fess,prediction_sheet_fess,prediction_coil_fess,score_helix_fess,score_sheet_fess,score_coil_fess
0,A0A091DQI4,973,"{'scores': [0.004, 0.068, 0.114, 0.153, 0.145,...","{'scores': [0.006, 0.454, 0.59, 0.617, 0.618, ...","{'scores': [0.99, 0.478, 0.296, 0.229, 0.237, ...","[0.004, 0.068, 0.114, 0.153, 0.145, 0.125, 0.1...","[0.006, 0.454, 0.59, 0.617, 0.618, 0.535, 0.37...","[0.99, 0.478, 0.296, 0.229, 0.237, 0.341, 0.51..."
1,A0A0A2V081,315,"{'scores': [0.003, 0.028, 0.03, 0.011, 0.013, ...","{'scores': [0.002, 0.044, 0.051, 0.037, 0.016,...","{'scores': [0.995, 0.927, 0.919, 0.952, 0.97, ...","[0.003, 0.028, 0.03, 0.011, 0.013, 0.054, 0.12...","[0.002, 0.044, 0.051, 0.037, 0.016, 0.015, 0.0...","[0.995, 0.927, 0.919, 0.952, 0.97, 0.931, 0.85..."
2,A0A0D9R447,433,"{'scores': [0.003, 0.021, 0.02, 0.018, 0.054, ...","{'scores': [0.002, 0.064, 0.068, 0.107, 0.141,...","{'scores': [0.995, 0.915, 0.912, 0.874, 0.805,...","[0.003, 0.021, 0.02, 0.018, 0.054, 0.082, 0.12...","[0.002, 0.064, 0.068, 0.107, 0.141, 0.213, 0.2...","[0.995, 0.915, 0.912, 0.874, 0.805, 0.706, 0.5..."
3,A0A0D9RZF3,974,"{'scores': [0.003, 0.018, 0.019, 0.023, 0.027,...","{'scores': [0.01, 0.516, 0.633, 0.636, 0.653, ...","{'scores': [0.987, 0.467, 0.348, 0.341, 0.32, ...","[0.003, 0.018, 0.019, 0.023, 0.027, 0.03, 0.03...","[0.01, 0.516, 0.633, 0.636, 0.653, 0.61, 0.437...","[0.987, 0.467, 0.348, 0.341, 0.32, 0.359, 0.52..."
4,A0A0E0IYS8,513,"{'scores': [0.003, 0.011, 0.013, 0.01, 0.038, ...","{'scores': [0.002, 0.055, 0.064, 0.028, 0.027,...","{'scores': [0.995, 0.933, 0.923, 0.962, 0.935,...","[0.003, 0.011, 0.013, 0.01, 0.038, 0.036, 0.03...","[0.002, 0.055, 0.064, 0.028, 0.027, 0.031, 0.0...","[0.995, 0.933, 0.923, 0.962, 0.935, 0.933, 0.9..."
...,...,...,...,...,...,...,...,...
8406,D6W6N7,359,"{'scores': [0.003, 0.039, 0.119, 0.146, 0.157,...","{'scores': [0.002, 0.071, 0.197, 0.187, 0.134,...","{'scores': [0.995, 0.89, 0.684, 0.666, 0.709, ...","[0.003, 0.039, 0.119, 0.146, 0.157, 0.236, 0.2...","[0.002, 0.071, 0.197, 0.187, 0.134, 0.142, 0.1...","[0.995, 0.89, 0.684, 0.666, 0.709, 0.622, 0.58..."
8407,G5EA63,395,"{'scores': [0.003, 0.015, 0.058, 0.035, 0.052,...","{'scores': [0.002, 0.034, 0.028, 0.023, 0.034,...","{'scores': [0.995, 0.951, 0.914, 0.942, 0.914,...","[0.003, 0.015, 0.058, 0.035, 0.052, 0.05, 0.06...","[0.002, 0.034, 0.028, 0.023, 0.034, 0.064, 0.0...","[0.995, 0.951, 0.914, 0.942, 0.914, 0.887, 0.8..."
8408,H0Z3Y5,1130,"{'scores': [0.003, 0.028, 0.041, 0.063, 0.08, ...","{'scores': [0.003, 0.363, 0.592, 0.684, 0.671,...","{'scores': [0.994, 0.609, 0.367, 0.253, 0.249,...","[0.003, 0.028, 0.041, 0.063, 0.08, 0.126, 0.11...","[0.003, 0.363, 0.592, 0.684, 0.671, 0.577, 0.4...","[0.994, 0.609, 0.367, 0.253, 0.249, 0.297, 0.4..."
8409,P36507,400,"{'scores': [0.004, 0.167, 0.198, 0.199, 0.121,...","{'scores': [0.002, 0.088, 0.092, 0.107, 0.118,...","{'scores': [0.994, 0.746, 0.711, 0.694, 0.76, ...","[0.004, 0.167, 0.198, 0.199, 0.121, 0.046, 0.0...","[0.002, 0.088, 0.092, 0.107, 0.118, 0.08, 0.09...","[0.994, 0.746, 0.711, 0.694, 0.76, 0.873, 0.86..."


In [113]:
# # Control - the length of the score lists must be equal to the protein length (one score per position)
# (mobidb_lite.score_helix_fess.apply(lambda x: len(x)) == mobidb_lite.length).value_counts() # ok!
# (mobidb_lite.score_sheet_fess.apply(lambda x: len(x)) == mobidb_lite.length).value_counts() # ok!
# (mobidb_lite.score_coil_fess.apply(lambda x: len(x)) == mobidb_lite.length).value_counts() # Ok!!

In [117]:
mobidb_lite.rename(columns= {'acc': 'uniprot_acc'}, inplace= True)

In [120]:
mobidb_sec_scores = mobidb_lite[['uniprot_acc', 'length', 'score_helix_fess', 'score_sheet_fess', 'score_coil_fess']]

In [121]:
#mobidb_sec_scores.to_csv('../datasets/mobidb_secondary_scores.tsv', sep= '\t', index= False)