# EXTRACTING UNIPROT FEATURE ANNOTATIONS TO CLASSIFY SITES BASED ON FUNCTION

### IMPORTING NECESSARY PACKAGES

In [2]:
from fragsys_analysis import *

### READING INPUT DATA

In [3]:
# Root of the FRAGSYS working directory. Set the FRAGSYS_MAIN_DIR environment
# variable to run the notebook on another machine; the original location is
# kept as the fallback, so existing set-ups behave exactly as before.
main_dir = os.environ.get(
    "FRAGSYS_MAIN_DIR",
    "/Users/2394007/Documents/PHD/LOCAL/FRAGSYS_DEF/",
)
results_dir = os.path.join(main_dir, "results")

# Protein table: its `acc` column supplies the accessions queried below.
prots_df = pd.read_csv(os.path.join(results_dir, "all_proteins.csv"))

# Residue table: the `protein`, `group`, `UniProt_ResNum` and `BS<n>` columns
# are used further down to assemble the binding sites.
binding_ress = pd.read_csv(os.path.join(results_dir, "all_binding_site_residues_wgroup.csv"))

In [4]:
# Dimensions (rows, columns) of the binding-site residue table.
print(binding_ress.shape)

(14172, 62)


In [5]:
# Unique accessions from the protein table, in order of first appearance.
accs = prots_df["acc"].drop_duplicates().tolist()
print(len(accs))

35


In [6]:
# Query the EBI Proteins API once per accession, restricted to the
# DOMAINS_AND_SITES feature category, and keep one data frame per protein.
features_url = "https://www.ebi.ac.uk/proteins/api/features/{}?categories=DOMAINS_AND_SITES"
acc_feat_dfs = [pd.read_json(features_url.format(acc)) for acc in accs]

In [7]:
# Stack the per-protein frames into one table with a fresh 0..n-1 index.
feats_df = pd.concat(acc_feat_dfs, ignore_index=True)

In [8]:
# Preview: one row per feature, with the feature itself still nested in `features`.
feats_df.head(5)

Unnamed: 0,accession,entryName,sequence,sequenceChecksum,taxid,features
0,Q9UGL1,KDM5B_HUMAN,MEAATTLHPGPRPALPLGGPGPLGEFLPPPECPVFEPSWEEFADPF...,70A0738D9A709F61,9606,"{'type': 'DOMAIN', 'category': 'DOMAINS_AND_SI..."
1,Q9UGL1,KDM5B_HUMAN,MEAATTLHPGPRPALPLGGPGPLGEFLPPPECPVFEPSWEEFADPF...,70A0738D9A709F61,9606,"{'type': 'DOMAIN', 'category': 'DOMAINS_AND_SI..."
2,Q9UGL1,KDM5B_HUMAN,MEAATTLHPGPRPALPLGGPGPLGEFLPPPECPVFEPSWEEFADPF...,70A0738D9A709F61,9606,"{'type': 'DOMAIN', 'category': 'DOMAINS_AND_SI..."
3,Q9UGL1,KDM5B_HUMAN,MEAATTLHPGPRPALPLGGPGPLGEFLPPPECPVFEPSWEEFADPF...,70A0738D9A709F61,9606,"{'type': 'ZN_FING', 'category': 'DOMAINS_AND_S..."
4,Q9UGL1,KDM5B_HUMAN,MEAATTLHPGPRPALPLGGPGPLGEFLPPPECPVFEPSWEEFADPF...,70A0738D9A709F61,9606,"{'type': 'ZN_FING', 'category': 'DOMAINS_AND_S..."


In [9]:
# (number of feature rows, number of columns) across all queried proteins.
feats_df.shape

(607, 6)

In [10]:
# Save the feature table as retrieved, i.e. before the `features` column is
# expanded further down.
# NOTE(review): dump_pickle is not defined in this notebook; presumably it comes
# from the fragsys_analysis star import and pickles the object to the given path — confirm.
dump_pickle(feats_df, os.path.join(results_dir, "uniprot_func_anns.pkl"))

In [11]:
# Accessions for which the API returned at least one feature row.
accs_in_df = feats_df["accession"].drop_duplicates().tolist()

In [12]:
annotated_accs = set(accs_in_df)
[acc for acc in accs if acc not in annotated_accs] # not in data frame as it is unreviewed

['Q8WS26']

In [13]:
# Expand the per-row `features` dict into one column per key (type, description,
# begin, end, ...). Building the frame straight from the list of dicts avoids the
# slow row-wise `apply(pd.Series)`. The guard makes the cell safe to re-run:
# `feats_df` is rebound without its `features` column on the first run, so an
# unguarded second run would fail.
if "features" in feats_df.columns:
    feat_fields = pd.DataFrame(feats_df["features"].tolist(), index=feats_df.index)
    feats_df = feats_df.drop(columns="features").join(feat_fields)

In [14]:
# Columns shown when previewing the expanded feature table.
cc = [
    "accession",
    "type", "description",
    "begin", "end",
    "molecule", "evidences",
    "ligand", "ligandPart",
]

In [15]:
# First three features, restricted to the columns listed in `cc`.
feats_df[cc].head(3)

Unnamed: 0,accession,type,description,begin,end,molecule,evidences,ligand,ligandPart
0,Q9UGL1,DOMAIN,JmjN,32,73,,"[{'code': 'ECO:0000255', 'source': {'name': 'P...",,
1,Q9UGL1,DOMAIN,ARID,97,187,,"[{'code': 'ECO:0000255', 'source': {'name': 'P...",,
2,Q9UGL1,DOMAIN,JmjC,453,619,,"[{'code': 'ECO:0000255', 'source': {'name': 'P...",,


In [16]:
# Number of annotations per feature type, across all proteins.
feats_df.type.value_counts()

BINDING     227
REGION      123
DOMAIN      114
SITE         49
ACT_SITE     36
MOTIF        17
REPEAT       17
ZN_FING      13
COILED       10
DNA_BIND      1
Name: type, dtype: int64

In [17]:
# Feature types of interest.
# NOT: REGION, DOMAIN, MOTIF, REPEAT, ZN_FING, COILED, DNA_BIND
int_feats = [
    "BINDING",
    "SITE",
    "ACT_SITE",
]

In [18]:
# Keep only the feature types of interest, then check what survived the filter.
is_int_feat = feats_df["type"].isin(int_feats)
feats_df_filt = feats_df[is_int_feat].copy().reset_index(drop=True)
feats_df_filt["type"].value_counts()

BINDING     227
SITE         49
ACT_SITE     36
Name: type, dtype: int64

In [19]:
# Cast both range bounds to int so they can be fed to range() when the
# functional residues are enumerated.
for pos_col in ("end", "begin"):
    feats_df_filt[pos_col] = feats_df_filt[pos_col].astype(int)

## GETTING FUNCTIONAL RESIDUES FOR EACH PROTEIN

In [20]:
# For every accession, gather all residue positions covered by any retained
# feature (begin..end inclusive) and store them as a sorted list without
# duplicates. A single-residue feature (begin == end) yields just that position.
func_ress = {}
for acc, acc_df in feats_df_filt.groupby("accession"):
    covered = set()
    for beg, end in zip(acc_df.begin, acc_df.end):
        covered.update(range(beg, end + 1))
    func_ress[acc] = sorted(covered)

In [21]:
# Number of accessions that have at least one retained functional feature.
print(len(func_ress)) # 24/35 proteins have functional feature annotations

24


In [22]:
# One line per protein: running index, accession, number of functional residues.
for rank, (prot_acc, ress) in enumerate(func_ress.items(), start=1):
    print(rank, prot_acc, len(ress))

1 H0Y4R8 8
2 O43809 2
3 P01584 10
4 P06873 8
5 P0C024 3
6 P0DTD1 130
7 P11838 2
8 P15379 4
9 P18031 9
10 P22557 12
11 P47811 11
12 Q12830 4
13 Q15047 21
14 Q32ZE1 78
15 Q460N5 34
16 Q6B0I6 11
17 Q6PJP8 4
18 Q6PL18 8
19 Q7LBC6 3
20 Q8IU60 2
21 Q9BRQ3 10
22 Q9UGL1 7
23 Q9UJM8 20
24 Q9UKK9 11


In [23]:
# Names of the binding-site membership columns: BS0 ... BS23.
bs_cols = ["BS{}".format(bs_idx) for bs_idx in range(24)]

## CLASSIFYING EACH SITE AS KF (KNOWN FUNCTION: OVERLAPS WITH FUNCTIONAL RESIDUES) OR UF (UNKNOWN FUNCTION)

In [24]:
# Walk every protein / group / binding-site column combination. A site exists
# when at least one residue row has the column set to 1; its id is
# "<protein>_<group>_<BS column>".
#   bs_ress_membership     : site id -> UniProt residue numbers in the site
#   binding_site_func_dict : site id -> "KF" if the site shares at least one
#                            residue with the protein's functional residues,
#                            otherwise "UF" (also when the protein has none).
bs_ress_membership = {}
binding_site_func_dict = {}
for prot, prot_rows in binding_ress.groupby("protein"):
    # Proteins without annotations get an empty set, so all their sites are "UF".
    known_func_ress = set(func_ress.get(prot, []))
    for grp, grp_rows in prot_rows.groupby("group"):
        for bs_col in bs_cols:
            bs_rows = grp_rows[grp_rows[bs_col] == 1]
            if not len(bs_rows):
                continue  # this site does not exist for this protein/group
            bs_id = "{}_{}_{}".format(prot, str(grp), bs_col)
            bs_ress = bs_rows.UniProt_ResNum.unique().tolist()
            bs_ress_membership[bs_id] = bs_ress
            binding_site_func_dict[bs_id] = "UF" if known_func_ress.isdisjoint(bs_ress) else "KF"


In [25]:
print(sum(site_label == "KF" for site_label in binding_site_func_dict.values())) # 29 KF sites

29


In [26]:
# Peek at the first five site labels.
for site_id, site_label in list(binding_site_func_dict.items())[:5]:
    print(site_id, site_label)

H0Y4R8_0_BS0 UF
O15178_0_BS0 UF
O15178_0_BS1 UF
O15178_0_BS2 UF
O15178_0_BS3 UF


In [27]:
# Save the site id -> "KF"/"UF" mapping to the results directory.
# NOTE(review): dump_pickle is not defined in this notebook; presumably it comes
# from the fragsys_analysis star import and pickles the object to the given path — confirm.
dump_pickle(binding_site_func_dict, os.path.join(results_dir, "prot_func_dict_auto.pkl")) 