In [44]:
import pandas as pd
import requests
import os
import numpy as np
from ratelimit import limits, sleep_and_retry
from Bio import SeqIO
from multiprocessing import Pool

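This notebook calculates a "lineage score" for every UniRef cluster (and UniProt entry) found in the ColabFold cluster alignments: the number of taxa that its lineage shares with the lineage of the query protein's organism.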

In [45]:
# url = "https://rest.uniprot.org/uniref/stream?format=json&query=%28%28id%3AUniRef100_A0A2G8JTM8%29%29"
# response = requests.get(url).json()
# print(response['results'].keys())
# # df = pd.json_normalize(response['results'])
# # taxon = response['results'][0]['commonTaxon']
# # print(taxon)
# # df = pd.json_normalize(taxon)
# # df.head()
# # tax = df.iloc[0]['taxonId']
# # print(tax)

In [46]:
# Some helper functions
def divide_chunks(l, n):
    '''
    Lazily split a sliceable object into consecutive pieces of at most n items
    '''

    # Slice starts are 0, n, 2n, ...; the final piece may hold fewer than n items
    offsets = range(0, len(l), n)
    yield from (l[start:start + n] for start in offsets)

def _percent_complete(seq):
    '''
    Return the percentage of positions in an aligned sequence that are not gaps ("-")
    '''

    total_len = len(seq)

    # A record without any sequence has nothing aligned; report 0 instead of dividing by zero
    if total_len == 0:
        return 0.0

    num_missing = seq.count("-")
    return 100 - ((num_missing / total_len) * 100)

def get_unirefs(fp):
    '''
    Read the UniRef and UniProt IDs from the a3m file

    fp is handed to SeqIO.parse using the "fasta" format.

    Returns two dicts, (unirefs, uniprots), each mapping a record ID to the
    percent completeness (share of non-gap positions) of that record's sequence.
    Records whose ID contains "UniRef" go to the first dict; other records whose
    ID is exactly 6 or 10 characters long are treated as UniProt IDs and go to
    the second dict. All remaining records are ignored.
    '''

    unirefs = {}
    uniprots = {}

    for record in SeqIO.parse(fp, "fasta"):
        if "UniRef" in record.id:
            unirefs[record.id] = _percent_complete(record.seq)

        elif len(record.id) in (6, 10):
            uniprots[record.id] = _percent_complete(record.seq)

    return unirefs, uniprots

# @sleep_and_retry
# @limits(calls=1, period=timedelta(seconds=1).total_seconds())
def get_uniref_taxonId(uniref, timeout=30):
    '''
    Get the common taxon ID (the 'commonTaxon' field) for a given UniRef cluster

    uniref is the UniRef cluster ID to search for; timeout is the number of
    seconds to wait for the server before requests raises a timeout error.

    Returns the taxon ID, or None when the search returns no results or the
    response does not carry the expected fields.
    '''

    # UniProt REST API URL
    url = f"https://rest.uniprot.org/uniref/stream?format=json&query=%28%28id%3A{uniref}%29%29"

    # Without a timeout a stalled connection would block the caller indefinitely
    response = requests.get(url, timeout=timeout).json()

    # A payload without a 'results' key is handled like an empty search
    results = response.get('results') or []
    if len(results) == 0:
        return None

    # The payload is already a plain dict, so the ID can be read directly
    return results[0].get('commonTaxon', {}).get('taxonId')

# @sleep_and_retry
# @limits(calls=1, period=timedelta(seconds=1).total_seconds())
def get_uniprot_taxonId(uniprot, timeout=30):
    '''
    Get the taxonId for a given UniProt ID

    uniprot is the UniProt ID to look up; timeout is the number of seconds to
    wait for the server before requests raises a timeout error.

    Returns the organism's taxon ID, or np.nan when the entry is marked
    'Inactive' or the response does not contain an organism taxon ID.
    '''

    # UniProt REST API URL
    url = f'https://rest.uniprot.org/uniprotkb/{uniprot}.json'

    # Without a timeout a stalled connection would block the caller indefinitely
    response = requests.get(url, timeout=timeout).json()

    if response.get('entryType') == 'Inactive':
        return np.nan

    # Payloads lacking 'organism' or 'taxonId' yield the same missing-value marker
    # as inactive entries instead of raising a KeyError
    return response.get('organism', {}).get('taxonId', np.nan)

# @sleep_and_retry
# @limits(calls=1, period=timedelta(seconds=1).total_seconds())
def get_lineage(taxonId, timeout=30):
    '''
    Get the lineage for a given taxonID

    taxonId is the taxon ID to look up; timeout is the number of seconds to
    wait for the server before requests raises a timeout error.

    Returns the list of 'taxonId' values found in the response's 'lineage'
    field (an empty list when that field is empty). When the response has no
    'lineage' field, or taxonId is None/NaN, returns [taxonId].
    '''

    # A missing ID cannot be looked up; give the same fallback as a response
    # without a lineage, but skip the request
    if pd.isna(taxonId):
        return [taxonId]

    url = f"https://rest.uniprot.org/taxonomy/{taxonId}.json"

    # Without a timeout a stalled connection would block the caller indefinitely
    response = requests.get(url, timeout=timeout).json()
    if 'lineage' not in response:
        return [taxonId]

    # Read the IDs straight from the list of dicts; this also copes with an empty lineage
    return [entry['taxonId'] for entry in response['lineage']]

# Column order of the table produced by cluster_lineages
_LINEAGE_COLUMNS = ['uniprot', 'cluster', 'cf_filename', 'uniref',
                    'taxonId', 'uniprot_lineage', 'cluster_lineage', 'percent_complete']

def _lineage_row(uniprot, cluster, fn, member, taxon, uniprot_lineage, cluster_lineage, percent_complete):
    '''
    Build the one-row DataFrame describing a single sequence of a cluster file
    '''

    return pd.DataFrame([[uniprot, cluster, fn, member,
                          taxon, uniprot_lineage, cluster_lineage, percent_complete]],
                        columns=_LINEAGE_COLUMNS)

def cluster_lineages(df, path):
    '''
    Collect the lineages needed to calculate the 'lineage score' for each cluster

    df must have 'uniprot' and 'cluster' columns; path is the directory that
    holds one folder per UniProt ID with that protein's cluster files.

    For every row of df, the cluster file <path>/<uniprot>/<uniprot>_<cluster>.a3m
    is read and one output row is produced for each UniRef or UniProt ID found in
    it, holding the ID (column 'uniref'), its taxon ID, the lineage of the row's
    UniProt ID, the lineage of the ID's taxon (an empty list when no taxon ID was
    found) and the percent completeness of its sequence.

    Returns a DataFrame with a fresh 0..n-1 index; it is empty (but has all
    columns) when no IDs were found.
    '''

    # Keep track of which UniProt ID is being queried to avoid redundant queries
    current_uniprot = ''
    current_lineage = []

    # One-row DataFrames, concatenated once at the end
    lineage_df_list = []

    # Iterate over the rows in the DataFrame
    for index, row in df.iterrows():

        uniprot = row['uniprot']
        cluster = row['cluster']

        print(f'UniProt: {uniprot}, Cluster: {cluster}')

        # If the UniProt ID is different from the last one queried, get the lineage
        if uniprot != current_uniprot:
            current_uniprot = uniprot

            # Get the lineage for the UniProt ID
            uniprot_taxon = get_uniprot_taxonId(uniprot)
            current_lineage = get_lineage(uniprot_taxon)

        # Open the cluster file to get the UniRef IDs
        fn = f'{uniprot}_{cluster}.a3m'
        fp = os.path.join(path, uniprot, fn) # File path looks like data/O08967/O08967_000.a3m
        unirefs, sub_uniprots = get_unirefs(fp)

        # UniRef IDs are processed first, then UniProt IDs; the two kinds differ
        # only in the log label and in the function that resolves the taxon ID
        for label, members, lookup_taxon in (('UniRef', unirefs, get_uniref_taxonId),
                                             ('UniProt', sub_uniprots, get_uniprot_taxonId)):
            for member, percent_complete in members.items():
                print(f'UniProt: {uniprot}, Cluster: {cluster}, {label}: {member}')
                cluster_taxon = lookup_taxon(member)

                # get_uniref_taxonId reports a missing taxon as None, get_uniprot_taxonId
                # as NaN; pd.isna covers both so neither is sent to the taxonomy lookup
                if pd.isna(cluster_taxon):
                    cluster_lineage = []
                else:
                    print(f'Taxon: {cluster_taxon}')
                    cluster_lineage = get_lineage(cluster_taxon)

                lineage_df_list.append(_lineage_row(uniprot, cluster, fn, member,
                                                    cluster_taxon, current_lineage,
                                                    cluster_lineage, percent_complete))

    # pd.concat raises on an empty list, so return an empty table with the same columns
    if not lineage_df_list:
        return pd.DataFrame(columns=_LINEAGE_COLUMNS)

    # Concatenate the lineage_df_list into a single DataFrame
    lineage_df = pd.concat(lineage_df_list).reset_index(drop=True)

    return lineage_df

def calculate_lineage_scores(df):
    '''
    Add the lineage score and the lineage lengths to the DataFrame made in cluster_lineages

    df must have 'uniprot_lineage' and 'cluster_lineage' columns whose values are
    lists of taxon IDs.

    Three columns are written onto df itself: 'lineage_score' (number of distinct
    taxa present in both lineages), 'uniprot_lineage_length' and
    'cluster_lineage_length'. The same df object is returned.
    '''

    lineage_scores = []
    uniprot_lineage_lengths = []
    cluster_lineage_lengths = []

    for uniprot_lineage, cluster_lineage in zip(df['uniprot_lineage'], df['cluster_lineage']):
        # Calculate the lineage score
        lineage_scores.append(len(set(uniprot_lineage).intersection(cluster_lineage)))

        # Calculate the lineage lengths
        uniprot_lineage_lengths.append(len(uniprot_lineage))
        cluster_lineage_lengths.append(len(cluster_lineage))

    # Assign whole columns by position rather than per index label, so rows that
    # share an index label cannot overwrite each other. Values are stored as
    # floats, which is the dtype the per-row .loc assignment used to produce.
    df['lineage_score'] = np.array(lineage_scores, dtype=float)
    df['uniprot_lineage_length'] = np.array(uniprot_lineage_lengths, dtype=float)
    df['cluster_lineage_length'] = np.array(cluster_lineage_lengths, dtype=float)

    return df

In [47]:
# List the entries of the autoinhibited and multi-domain ColabFold input directories.
# The entry names are later matched against the 'uniprot' column of the cluster tables,
# so only proteins that have an entry here are processed.
path1 = './project_pipeline/data/input/Colabfold_cif/autoinhibited'
path2 = './project_pipeline/data/input/Colabfold_cif/multi_domain'

autoinhibited = os.listdir(path1)
multi_domain = os.listdir(path2)

In [48]:
# Load the autoinhibited cluster table and keep only the proteins listed under path1
df1 = (
    pd.read_csv('./project_pipeline/data/ai_full-depth_cluster_compared.tsv', sep='\t')
    .loc[lambda table: table['uniprot'].isin(autoinhibited)]
)

# Pieces of up to 500 rows; each piece becomes one task for the worker pool
df1_chunks = divide_chunks(df1, 500)

with Pool() as pool:
    ai_results = pool.starmap(cluster_lineages, ((chunk, path1) for chunk in df1_chunks))

# Stack the per-piece lineage tables into one table with a fresh index
ai_lineage = pd.concat(ai_results, ignore_index=True)

UniProt: P62826, Cluster: 013UniProt: P07038, Cluster: U100-001UniProt: P29350, Cluster: 030
UniProt: P53042, Cluster: 002




UniProt: P53042, Cluster: 002, UniRef: UniRef100_A0A0D6R3I1
UniProt: P29350, Cluster: 030, UniRef: UniRef100_A0A3M7RTJ6
UniProt: P62826, Cluster: 013, UniRef: UniRef100_M3UQQ7
UniProt: P07038, Cluster: U100-001, UniRef: UniRef100_A0A6A7YAK2
Taxon: 56994Taxon: 10195

UniProt: P53042, Cluster: 002, UniRef: UniRef100_A0A2G9HV15UniProt: P29350, Cluster: 030, UniRef: UniRef100_A0A813MA17

UniProt: P62826, Cluster: 013, UniRef: UniRef100_B0ESG1
Taxon: 2608987
UniProt: P07038, Cluster: U100-001, UniRef: UniRef100_A0A0E3PDB5
Taxon: 429701
UniProt: P53042, Cluster: 002, UniRef: UniRef100_A0A3N7FYU6
Taxon: 38027
UniProt: P07038, Cluster: U100-001, UniRef: UniRef100_W1NSW6
Taxon: 370354
UniProt: P62826, Cluster: 013, UniProt: K2GI49
Taxon: 104777
Taxon: 1076696
UniProt: P29350, Cluster: 021
UniProt: P29350, Cluster: 021, UniRef: UniRef100_UPI000678BC28
UniProt: P62826, Cluster: 071
UniProt: P62826, Cluster: 071, UniRef: UniRef100_A1C5M7
Taxon: 3694
UniProt: P53042, Cluster: 002, UniRef: UniRef100

In [49]:
# Load the multi-domain cluster table and keep only the proteins listed under path2
df2 = (
    pd.read_csv('./project_pipeline/data/md_full-depth_cluster_compared.tsv', sep='\t')
    .loc[lambda table: table['uniprot'].isin(multi_domain)]
)

# Pieces of up to 500 rows; each piece becomes one task for the worker pool
df2_chunks = divide_chunks(df2, 500)

with Pool() as pool:
    md_results = pool.starmap(cluster_lineages, ((chunk, path2) for chunk in df2_chunks))

# Stack the per-piece lineage tables into one table with a fresh index
md_lineage = pd.concat(md_results, ignore_index=True)

UniProt: D9N168, Cluster: U10-006UniProt: F2Z6J5, Cluster: 075UniProt: E1K248, Cluster: 251UniProt: O16025, Cluster: U100-005


UniProt: H6SG27, Cluster: 300

UniProt: D9N168, Cluster: U10-006, UniRef: UniRef100_A0A8T8IB48
UniProt: E1K248, Cluster: 251, UniRef: UniRef100_A0A934GJV4
UniProt: F2Z6J5, Cluster: 075, UniRef: UniRef100_A0A520X7E3UniProt: H6SG27, Cluster: 300, UniRef: UniRef100_A0A538TVP4

UniProt: O16025, Cluster: U100-005, UniRef: UniRef100_F9U5X3
Taxon: 1891241
Taxon: 2821731
UniProt: E1K248, Cluster: 251, UniRef: UniRef100_A0A1G0EBG0
UniProt: D9N168, Cluster: U10-006, UniRef: UniRef100_A0A2H0YTG5
Taxon: 2212470
UniProt: H6SG27, Cluster: 053
UniProt: H6SG27, Cluster: 053, UniRef: UniRef100_A0A951JQ33
Taxon: 1798257
UniProt: E1K248, Cluster: 251, UniRef: UniRef100_A0A916D482
Taxon: 131567
UniProt: D9N168, Cluster: U10-006, UniRef: UniRef100_A0A2V8K8B2
Taxon: 768671
UniProt: O16025, Cluster: U100-005, UniRef: UniRef100_UPI0022E43BDF
Taxon: 1978231
UniProt: D9N168, Cluster: U

In [51]:
# Add the lineage_score, uniprot_lineage_length and cluster_lineage_length columns.
# calculate_lineage_scores writes them onto the frame it is given and returns that
# same frame, so ai_lineage2 / md_lineage2 refer to the same objects as ai_lineage / md_lineage.
ai_lineage2 = calculate_lineage_scores(ai_lineage)
md_lineage2 = calculate_lineage_scores(md_lineage)

In [52]:
# Save the scored tables. Writing the frames returned by calculate_lineage_scores
# (rather than its inputs) keeps the saved files correct without depending on the
# inputs having been modified in place.
ai_lineage2.to_csv('./project_pipeline/data/ai_lineage_scores.csv', index=False)
md_lineage2.to_csv('./project_pipeline/data/md_lineage_scores.csv', index=False)