# EXTRACTING UNIPROT FEATURE ANNOTATIONS TO CLASSIFY SITES BASED ON FUNCTION

In this notebook, we automatically extract UniProt feature annotations via the UniProt Proteins API.

### IMPORTING NECESSARY PACKAGES

In [1]:
from fragsys_analysis import *

### READING INPUT DATA

In [2]:
# Base directory, given relative to the current working directory.
main_dir = "./../"
# "results" sub-directory of main_dir; the pickles read and written in this notebook live here.
results_dir = os.path.join(main_dir, "results")

In [3]:
# Load the two pickled input tables from the results directory, in order:
# the per-protein table and the binding-site residue table.
# NOTE(review): unpickling executes arbitrary code; presumably these files are
# produced by an earlier step of this same pipeline -- confirm before reusing elsewhere.
prots_df, all_bs_ress = (
    pd.read_pickle(os.path.join(results_dir, pkl_name))
    for pkl_name in ("all_prots.pkl", "all_bs_ress.pkl")
)

In [7]:
# Preview the first five rows of the protein table.
prots_df.head(5)

Unnamed: 0,acc,group,n_strucs,n_ligs,n_un_ligs,n_bs,n_seqs,n_human_seqs,n_var_seqs,n_vars,n_human_res,vars_per_seq,vars_per_res
0,H0Y4R8,0,1,1,1,1,596,143,138,3566,10864,24.94,0.33
1,O43809,0,10,19,10,4,10,1,1,206,390,206.0,0.53
2,Q5T0W9,0,11,24,10,6,415,16,16,1398,2254,87.38,0.62
3,Q460N5,0,18,18,18,1,223,12,10,837,1821,69.75,0.46
4,Q12830,0,8,8,8,1,225,57,53,1711,5221,30.02,0.33


In [5]:
# (rows, columns) of the binding-site residue table.
print(all_bs_ress.shape)

(14172, 62)


In [6]:
# Distinct values of the "acc" column, in order of first appearance.
unique_accs = prots_df["acc"].unique()
accs = unique_accs.tolist()
print(len(accs))

35


In [6]:
# Request the DOMAINS_AND_SITES feature category from the Proteins API once per
# accession and parse each JSON response into its own data frame.
features_url = "https://www.ebi.ac.uk/proteins/api/features/{}?categories=DOMAINS_AND_SITES"
acc_feat_dfs = [pd.read_json(features_url.format(acc)) for acc in accs]

In [7]:
# Stack the per-accession frames into one table with a fresh 0..n-1 index.
feats_df = pd.concat(acc_feat_dfs, ignore_index = True)

In [8]:
# Preview the combined feature table; each row carries one feature dict in "features".
feats_df.head(5)

Unnamed: 0,accession,entryName,sequence,sequenceChecksum,taxid,features
0,H0Y4R8,H0Y4R8_HUMAN,XTLYKERFKQGTSKKCIQSEDKKWFTPREFEIEGDRGASKNWKLSI...,99E406A0DA7ECC75,9606,"{'type': 'DOMAIN', 'category': 'DOMAINS_AND_SI..."
1,H0Y4R8,H0Y4R8_HUMAN,XTLYKERFKQGTSKKCIQSEDKKWFTPREFEIEGDRGASKNWKLSI...,99E406A0DA7ECC75,9606,"{'type': 'DOMAIN', 'category': 'DOMAINS_AND_SI..."
2,H0Y4R8,H0Y4R8_HUMAN,XTLYKERFKQGTSKKCIQSEDKKWFTPREFEIEGDRGASKNWKLSI...,99E406A0DA7ECC75,9606,"{'type': 'BINDING', 'category': 'DOMAINS_AND_S..."
3,H0Y4R8,H0Y4R8_HUMAN,XTLYKERFKQGTSKKCIQSEDKKWFTPREFEIEGDRGASKNWKLSI...,99E406A0DA7ECC75,9606,"{'type': 'BINDING', 'category': 'DOMAINS_AND_S..."
4,H0Y4R8,H0Y4R8_HUMAN,XTLYKERFKQGTSKKCIQSEDKKWFTPREFEIEGDRGASKNWKLSI...,99E406A0DA7ECC75,9606,"{'type': 'BINDING', 'category': 'DOMAINS_AND_S..."


In [9]:
# (rows, columns) of the combined feature table.
feats_df.shape

(607, 6)

In [10]:
# Save the combined (not yet expanded) feature table to the results directory.
# dump_pickle is provided by the fragsys_analysis star import -- presumably it
# pickles the object to the given path; TODO confirm.
dump_pickle(feats_df, os.path.join(results_dir, "up_func_anns.pkl"))

In [11]:
# Accessions that have at least one row in the feature table.
accs_in_df = feats_df.accession.unique().tolist()

In [12]:
# Queried accessions with no rows in the feature table.
[acc for acc in accs if acc not in accs_in_df] # not in data frame as it is unreviewed

['Q8WS26']

In [13]:
# Turn every feature dict into a row of columns (one column per dict key),
# attach those columns to the table and drop the original dict column.
# NOTE: this rebinds feats_df, so the cell cannot be re-run on its own output.
feature_fields = feats_df["features"].apply(pd.Series)
feats_df = feats_df.join(feature_fields).drop(columns = "features")

In [14]:
# Columns selected when previewing the expanded feature table.
cc = [
    "accession",
    "type", "description",
    "begin", "end",
    "molecule", "evidences",
    "ligand", "ligandPart",
]

In [15]:
# Preview the selected columns of the expanded feature table.
feats_df[cc].head(3)

Unnamed: 0,accession,type,description,begin,end,molecule,evidences,ligand,ligandPart
0,H0Y4R8,DOMAIN,SAND,1,58,,"[{'code': 'ECO:0000259', 'source': {'name': 'P...",,
1,H0Y4R8,DOMAIN,PHD-type,76,122,,"[{'code': 'ECO:0000259', 'source': {'name': 'P...",,
2,H0Y4R8,BINDING,,79,79,,"[{'code': 'ECO:0007829', 'source': {'name': 'P...","{'name': 'Zn(2+)', 'dbReference': {'name': 'Ch...",


In [16]:
# Number of features per feature type.
feats_df.type.value_counts()

BINDING     227
REGION      123
DOMAIN      114
SITE         49
ACT_SITE     36
MOTIF        17
REPEAT       17
ZN_FING      13
COILED       10
DNA_BIND      1
Name: type, dtype: int64

In [17]:
# Feature types of interest.
# NOT: REGION, DOMAIN, MOTIF, REPEAT, ZN_FING, COILED, DNA_BIND
int_feats = [
    "BINDING",
    "SITE",
    "ACT_SITE",
]

In [18]:
# Keep only rows whose feature type is one of int_feats, as an independent
# copy with a fresh 0..n-1 index, then show the per-type counts that remain.
is_int_feat = feats_df["type"].isin(int_feats)
feats_df_filt = feats_df[is_int_feat].copy().reset_index(drop = True)
feats_df_filt["type"].value_counts()

BINDING     227
SITE         49
ACT_SITE     36
Name: type, dtype: int64

In [19]:
# Cast the feature boundary columns to integers so they can be used in range().
for pos_col in ("end", "begin"):
    feats_df_filt[pos_col] = feats_df_filt[pos_col].astype(int)

## GETTING FUNCTIONAL RESIDUES FOR EACH PROTEIN

In [20]:
# Map each accession to the sorted, de-duplicated list of residue numbers
# covered by any of its filtered features (begin..end, both ends inclusive).
func_ress = {}
for acc, acc_df in feats_df_filt.groupby("accession"):
    covered = set()
    for beg, end in zip(acc_df["begin"], acc_df["end"]):
        # A single-residue feature (beg == end) contributes just that residue.
        covered.update(range(beg, end + 1))
    func_ress[acc] = sorted(covered)

In [21]:
# Number of accessions with at least one feature of the selected types.
print(len(func_ress)) # 24/35 proteins have functional feature annotations

24


In [22]:
# Names of the binding-site indicator columns: "BS0" through "BS23".
bs_cols = ["BS{}".format(bs_idx) for bs_idx in range(24)]

## CLASSIFYING EACH SITE AS KF (OVERLAP WITH FUNCTIONAL RESIDUES) OR UF

In [23]:
# For every (protein, group, binding-site column) combination that has at least
# one row flagged with 1, record:
#   bs_ress_membership[site_id]     -> unique UniProt residue numbers of the site
#   binding_site_func_dict[site_id] -> "KF" if any of those residues is among the
#                                      protein's functional residues, else "UF"
#                                      (also "UF" when the protein has no entry
#                                      in func_ress).
bs_ress_membership = {}
binding_site_func_dict = {}
for prot, prot_rows in all_bs_ress.groupby("protein"):
    # Functional residues of this protein; empty when it has no annotations.
    prot_func_ress = set(func_ress.get(prot, []))
    for grp, grp_rows in prot_rows.groupby("group"):
        for bs_col in bs_cols:
            site_rows = grp_rows[grp_rows[bs_col] == 1]
            if len(site_rows) == 0:
                continue  # this site column is not used in this group
            site_id = "{}_{}_{}".format(prot, str(grp), bs_col)
            site_ress = site_rows.UniProt_ResNum.unique().tolist()
            bs_ress_membership[site_id] = site_ress
            binding_site_func_dict[site_id] = (
                "UF" if prot_func_ress.isdisjoint(site_ress) else "KF"
            )


In [24]:
# Count the binding sites labelled "KF".
print(sum(1 for label in binding_site_func_dict.values() if label == "KF")) # 29 KF sites

29


In [25]:
# Show the first five site IDs with their labels.
for site_id, label in list(binding_site_func_dict.items())[:5]:
    print(site_id, label)

H0Y4R8_0_BS0 UF
O15178_0_BS0 UF
O15178_0_BS1 UF
O15178_0_BS2 UF
O15178_0_BS3 UF


In [26]:
# Save the site ID -> "KF"/"UF" mapping to the results directory.
# dump_pickle is provided by the fragsys_analysis star import -- presumably it
# pickles the object to the given path; TODO confirm.
dump_pickle(binding_site_func_dict, os.path.join(results_dir, "prot_func_dict_auto.pkl")) 