## Uniprot Annotations

Gets Uniprot annotations for human proteins and aligns them to domains of interest.

### Requirements:
1. List of domains
2. List of genes for each domain
3. List of canonical proteins for each gene
4. Match data for each canonical protein (start position, end position, and target sequence)
5. Uniprot annotation data can be downloaded from: ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/genome_annotation_tracks/UP000005640_9606_beds/
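6. UniProt protein ID to Ensembl ID mapping for human (`HUMAN_9606_idmapping.dat`) can be downloaded from: ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/
7. UniProt/Swiss-Prot flat file (`uniprot_sprot.dat`), which cell 3 reads to get the protein sequences used for re-alignment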

### Notes:
Descriptions for uniprot annotations can be found at: ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/genome_annotation_tracks/README

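See cell 6 for the list of annotation categories used (disulfide bonds, binding sites, calcium binding, metal binding, DNA binding, nucleotide binding, and cross-links).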

### Running:
Run cells in order.

### Output:
The output is a dictionary that holds the UniProt annotations for each position, per gene, per domain; it is pickled to `uniprot_annotations_dict.pik` in the notebook's directory. Positions are indices into the target sequence (0 indicates the first amino acid, etc.).

In [1]:
import numpy as np
import pandas as pd
import cPickle as pickle
from collections import defaultdict
curr_dir = !pwd

#Reading the list of filtered domains
with open(curr_dir[0]+"/../../5.domains_stats/filtered10_list.pik", 'rb') as handle:
    filtered_domains_list10 = pickle.load(handle)
filtered_domains_list10.sort()

### Create a mapping from ensembl genes of interest to uniprot protein IDs

In [2]:
# Get list of genes
# Collects every gene ID (text before the first '.', i.e. without the version
# suffix) that appears in any filtered domain, without duplicates and in
# first-seen order.
gene_list = []
# Companion set gives an O(1) duplicate check; gene_list alone would need a
# linear scan for every row of every domain table.
seen_genes = set()

# The input directory is the same for every domain, so build it once
in_path = curr_dir[0]+"/../../3.parse_HMMER/hmm_domains/pfam-v30/"

for domain_name in filtered_domains_list10:
    filename = domain_name+".csv"
    domain_data = pd.read_csv(in_path+filename, sep='\t', index_col=0, dtype={"chrom_num": str})

    #Sort the domain data (this fixes the order in which genes are appended)
    sorted_domain_data = domain_data.sort_values(by=["chrom_num", "gene", "TargetStart"])
    sorted_domain_data = sorted_domain_data.reset_index(drop=True)

    # The canonic-protein pickle is not needed to build the gene list, so it
    # is no longer loaded in this cell.
    for gene in sorted_domain_data.loc[:,'gene']:
        gene_id = gene.split('.')[0]
        if gene_id in seen_genes:
            continue
        seen_genes.add(gene_id)
        gene_list.append(gene_id)
    print("Finished domain "+domain_name)

Finished domain 2OG-FeII_Oxy_3
Finished domain 7TM_GPCR_Srsx
Finished domain 7tm_1
Finished domain 7tm_2
Finished domain 7tm_3
Finished domain 7tm_4
Finished domain A2M
Finished domain A2M_N
Finished domain A2M_N_2
Finished domain A2M_comp
Finished domain A2M_recep
Finished domain AAA
Finished domain AAA_11
Finished domain AAA_12
Finished domain AAA_17
Finished domain AAA_18
Finished domain AAA_33
Finished domain AAA_5
Finished domain AAA_6
Finished domain AAA_7
Finished domain AAA_8
Finished domain AAA_9
Finished domain AA_permease
Finished domain AA_permease_2
Finished domain ABC2_membrane_3
Finished domain ABC_membrane
Finished domain ABC_tran
Finished domain ADAM_CR
Finished domain ADAM_spacer1
Finished domain ADH_N
Finished domain ADH_zinc_N
Finished domain ADK
Finished domain AMP-binding
Finished domain AMP-binding_C
Finished domain ANAPC3
Finished domain ANAPC4_WD40
Finished domain ANATO
Finished domain ANF_receptor
Finished domain APC_r
Finished domain APOBEC_C
Finished domain 

Finished domain LCE
Finished domain LIM
Finished domain LRRC37
Finished domain LRRCT
Finished domain LRRNT
Finished domain LRR_4
Finished domain LRR_5
Finished domain LRR_6
Finished domain LRR_8
Finished domain LRR_9
Finished domain LSM
Finished domain Laminin_B
Finished domain Laminin_EGF
Finished domain Laminin_G_1
Finished domain Laminin_G_2
Finished domain Laminin_G_3
Finished domain Laminin_N
Finished domain Ldl_recept_a
Finished domain Ldl_recept_b
Finished domain Lectin_C
Finished domain Lig_chan
Finished domain Lig_chan-Glu_bd
Finished domain Linker_histone
Finished domain Lipocalin
Finished domain Lys
Finished domain MAGE
Finished domain MAGE_N
Finished domain MAM
Finished domain MARVEL
Finished domain MBOAT
Finished domain MBT
Finished domain MFS_1
Finished domain MH1
Finished domain MHC_I
Finished domain MHC_II_alpha
Finished domain MHC_II_beta
Finished domain MHC_I_C
Finished domain MIF4G
Finished domain MIP
Finished domain MIT
Finished domain MMR_HSR1
Finished domain MORN


Finished domain zf-C3HC4_3
Finished domain zf-C3HC4_4
Finished domain zf-C4
Finished domain zf-CCCH
Finished domain zf-CCHC
Finished domain zf-CXXC
Finished domain zf-DHHC
Finished domain zf-FCS
Finished domain zf-H2C2_2
Finished domain zf-H2C2_5
Finished domain zf-HC5HC2H
Finished domain zf-HC5HC2H_2
Finished domain zf-MYND
Finished domain zf-NF-X1
Finished domain zf-RING_11
Finished domain zf-RING_2
Finished domain zf-RING_5
Finished domain zf-RING_UBOX
Finished domain zf-RanBP
Finished domain zf-TRAF
Finished domain zf-UBP
Finished domain zf-met
Finished domain zf-rbx1


In [3]:
# Extract uniprot manually curated list with sequences for help with alignment
# Result: uniprot_map maps every accession listed on an entry's AC line(s) to
# that entry's amino-acid sequence (spaces removed).
uniprot_map = defaultdict(str)
seq_flag = False   # True while reading the lines that follow an SQ line
seq_chunks = []    # sequence lines of the current entry, joined at '//'
prots = []         # accessions of the current entry

with open("uniprot_sprot.dat") as f:
    # Iterate the file lazily: the flat file is large and readlines() would
    # hold all of it in memory at once.
    for line in f:
        tokens = line.split()
        # Blank lines carry no line code; skip them instead of failing on tokens[0]
        if not tokens:
            continue
        line_code = tokens[0]

        # Save protein and sequence data for entry; reset variables
        if line_code == '//':
            seq = "".join(seq_chunks).replace(' ','')
            for prot_id in prots:
                uniprot_map[prot_id] = seq
            seq_flag = False
            seq_chunks = []
            prots = []

        # Build sequence line by line. This is tested before the AC/SQ line
        # codes so that a short residue chunk such as "AC" or "SQ" inside the
        # sequence is never mistaken for a line code.
        elif seq_flag:
            seq_chunks.append(line.strip())

        # Get protein IDs in entry (each ID is followed by ';')
        elif line_code == 'AC':
            prots.extend(token.rstrip(';') for token in tokens[1:])

        # Mark first line of sequence data for subsequent lines
        elif line_code == 'SQ':
            seq_flag = True

In [4]:
# Create dict to map ensembl to uniprot protein id
# Result: ensembl2uniprot maps an Ensembl ID from gene_list to the list of
# UniProt IDs that (a) are mapped to it with ID type "Ensembl" and (b) have a
# sequence in uniprot_map.
ensembl2uniprot = defaultdict(list)

# Set lookup is O(1); testing against the gene_list list would scan it for
# every line of the mapping file.
genes_of_interest = set(gene_list)

with open("HUMAN_9606_idmapping.dat") as f:
    # Stream the file instead of loading every line with readlines()
    for line in f:
        tokens = line.split()
        # Expected columns: UniProt ID, ID type, mapped ID; skip short/blank lines
        if len(tokens) < 3:
            continue
        uniprot_id, id_type, mapped_id = tokens[0], tokens[1], tokens[2]
        if id_type != "Ensembl":
            continue
        if mapped_id not in genes_of_interest or uniprot_id not in uniprot_map:
            continue
        ensembl2uniprot[mapped_id].append(uniprot_id)

### Extract Uniprot annotations

In [5]:
# Reads uniprot annotation files, extracts position data, and saves to dict
def read_uniprot_annotations(filename,feature_name,out_dict):
    """Parse one UniProt annotation BED file into ``out_dict``.

    Lines that do not start with 'chr' are skipped. On every other line the
    protein ID is the 4th tab-separated column and the residues are read from
    the last column: the text before the first ';' is split on ',' and each
    item is either ``<aa><pos>`` or a range ``<aa><pos>-<aa><pos>``.

    Args:
        filename: path of the BED file to read.
        feature_name: label stored for the annotated positions. The value
            'disulfide' selects the disulfide handling described below.
        out_dict: mapping of protein ID -> {position: [labels..., amino acid]}.
            It is updated in place; a plain dict or a defaultdict both work.

    For 'disulfide' only the two end points of each item are labelled, with
    'disulfide_t' when the last column contains 'Interchain' and
    'disulfide_f' otherwise. For every other feature each position from start
    to end (inclusive) is labelled with ``feature_name``. In both cases the
    one-letter amino acid of each end point is stored as well; add_dict keeps
    such one-character entries at the end of a position's list.

    Raises:
        ValueError: if the position part of a residue item is not an integer.
    """
    with open(filename) as f:
        # Stream the file rather than loading every line with readlines()
        for line in f:
            # Skip header
            if not line.startswith('chr'):
                continue

            # Extract relevant info
            tokens = line.split('\t')
            prot_id = tokens[3]
            description = tokens[-1]
            aa_list = description.split(';')[0].split(',')

            # setdefault removes the hidden requirement that out_dict be a
            # defaultdict(dict)
            prot_annotations = out_dict.setdefault(prot_id, {})

            # Disulfide bridges are formatted differently; the label depends
            # only on the line, so work it out once per line
            is_disulfide = feature_name == 'disulfide'
            if is_disulfide:
                interchain = 't' if 'Interchain' in description else 'f'
                disulfide_label = 'disulfide_'+interchain

            for aa in aa_list:
                # If range, get start and end aa (a single residue is its own end)
                aa_tokens = aa.split('-')
                start = aa_tokens[0]
                end = aa_tokens[-1]
                aa_start = start[0:1]
                pos_start = int(start[1:])
                aa_end = end[0:1]
                pos_end = int(end[1:])

                if is_disulfide:
                    add_dict(pos_start,disulfide_label,prot_annotations)
                    add_dict(pos_start,aa_start,prot_annotations)
                    if start != end:
                        add_dict(pos_end,disulfide_label,prot_annotations)
                        add_dict(pos_end,aa_end,prot_annotations)
                else:
                    add_dict(pos_start,aa_start,prot_annotations)
                    add_dict(pos_end,aa_end,prot_annotations)
                    for pos in range(pos_start,pos_end+1):
                        # Save to dict
                        add_dict(pos,feature_name,prot_annotations)

# Add val to the list stored under key, creating the list when key is new and
# ignoring values that are already present
def add_dict(key,val,out_dict):
    """Record ``val`` under ``key`` in ``out_dict`` without duplicates.

    One-character entries are treated as amino acids: when the current last
    entry of the list is one character long, the new value is placed just
    before it, otherwise it is appended.
    """
    if key not in out_dict:
        out_dict[key] = [val]
        return

    entries = out_dict[key]
    if val in entries:
        return

    # Keep amino acid entries at end of lists
    if len(entries[-1]) == 1:
        entries.insert(len(entries)-1,val)
    else:
        entries.append(val)

In [6]:
uniprot_dict = defaultdict(dict)

# Annotation categories to load, as (BED file, feature label), in load order
annotation_tracks = [
    ("UP000005640_9606_disulfid.bed", "disulfide"),   # Disulfides
    ("UP000005640_9606_binding.bed", "binding"),      # Binding
    ("UP000005640_9606_ca_bind.bed", "calcium"),      # Calcium binding
    ("UP000005640_9606_metal.bed", "metal"),          # Metal binding
    ("UP000005640_9606_dna_bind.bed", "dna"),         # DNA binding
    ("UP000005640_9606_np_bind.bed", "nucleotide"),   # Nucleotide binding
    ("UP000005640_9606_crosslnk.bed", "crosslink"),   # Cross-link
]

# Every track is merged into the same protein -> position -> labels dict
for bed_file, feature_label in annotation_tracks:
    read_uniprot_annotations(bed_file,feature_label,uniprot_dict)

### Combine the pieces above to complete the mapping from domain and ensembl ID to Uniprot annotation

In [16]:
# Map domains to features by position
# Result: domain_uniprot_dict[domain][gene][index] is the list of annotation
# labels (and amino acid, when known) at that index of the gene's target
# sequence, index 0 being the first residue of the target sequence.
domain_uniprot_dict = defaultdict(dict)

# The input directory is the same for every domain, so build it once
in_path = curr_dir[0]+"/../../3.parse_HMMER/hmm_domains/pfam-v30/"

for domain in filtered_domains_list10:
    domain_uniprot_dict[domain] = {}

    filename = domain+".csv"
    domain_data = pd.read_csv(in_path+filename, sep='\t', index_col=0, dtype={"chrom_num": str})

    # Sort the domain data
    sorted_domain_data = domain_data.sort_values(by=["chrom_num", "gene", "TargetStart"])
    sorted_domain_data = sorted_domain_data.reset_index(drop=True)

    # Get the canonic protein id of each gene for this domain
    with open(curr_dir[0]+"/../../4.parse_Uniprot/domains_canonic_prot/pfam-v30/"+domain+"_canonic_prot.pik", 'rb') as handle:
        canonic_protein = pickle.load(handle)

    for gene in canonic_protein.keys():
        # Convert ensembl to uniprot
        ensembl_id = gene[0:15]
        if ensembl_id not in ensembl2uniprot:
            continue
        prot_list = ensembl2uniprot[ensembl_id]

        # Find location of canonic protein for gene
        index = sorted_domain_data.loc[:,'prot'] == canonic_protein[gene]
        if not index.any():
            continue

        # Only uniprot IDs with annotation data can contribute anything
        annotated_prots = [prot_id for prot_id in prot_list if prot_id in uniprot_dict]
        if not annotated_prots:
            continue

        # Match data of the first matching row; it is the same for every
        # uniprot ID of this gene, so read it once
        start = sorted_domain_data.loc[index,'TargetStart'].values[0]
        end = sorted_domain_data.loc[index,'TargetEnd'].values[0]
        target_seq = sorted_domain_data.loc[index,'Target_Seq'].values[0].replace('-','').upper()

        gene_annotations = {}

        # Iterate through uniprot IDs
        for prot_id in annotated_prots:
            prot_annotations = uniprot_dict[prot_id]

            # Initialize alignment variables
            misaligned = False
            offset = -1

            # Check alignment with amino acid name
            for pos in prot_annotations:
                # Check that position is in the domain
                if pos < start or pos > end:
                    continue

                # Find amino acid name and check match
                aa = prot_annotations[pos][-1]
                # Not all positions have an aa
                if len(aa) > 1:
                    continue

                if aa != target_seq[int(pos-start)]:
                    # Try to align with sequences if amino acid check fails.
                    # The search result does not depend on pos, so the first
                    # mismatch settles the outcome and the check can stop.
                    align = uniprot_map[prot_id].find(target_seq)
                    if align == -1:
                        misaligned = True
                    # If new alignment possible, adjust offset
                    else:
                        offset = align+1
                    break

            # Skip misaligned proteins
            if misaligned:
                continue

            # Domain boundaries in the coordinates of the uniprot annotations
            if offset > 0:
                # Realigned: the domain spans exactly the target sequence,
                # starting at offset. 'end' is in the original coordinates
                # and must not be used as the upper bound here.
                first_pos = offset
                last_pos = offset+len(target_seq)-1
            else:
                first_pos = start
                last_pos = end

            # Add aligned annotations
            for pos in prot_annotations:
                # Exclude positions outside of domain
                if pos < first_pos or pos > last_pos:
                    continue
                for feature in prot_annotations[pos]:
                    add_dict(pos-first_pos,feature,gene_annotations)

        # Keep only genes with at least one aligned position
        if gene_annotations:
            domain_uniprot_dict[domain][gene] = gene_annotations

# Save to file
with open(curr_dir[0]+'/uniprot_annotations_dict.pik', 'wb') as handle:
    pickle.dump(domain_uniprot_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)