In [1]:
import json
import math
import numpy as np
import string

In [2]:
with open("export.json", "r") as read_file:
    interpro_data = json.load(read_file)


In [3]:
## header creation
import pandas as pd
header = ['target_name','t-len','query_name','q-len','e-value','score','bias','#','off','ci-e-value','i-e-value','score','bias',
         'hmm_from','hmm_to','ali_from','ali_to','env_from','env_to','acc']
# open hmm results
with open('hmmsearch.hmmer_domtblout', 'r') as f2:
    data = f2.read()



# dataframe creation
lines = []
for line in data.split('\n'):
    if line != None:
        if len(line) > 1:
            if line[0] != '#':
                lines.append(line.translate(str.maketrans('', '', string.punctuation)).split()[:20])

                

data = pd.DataFrame(lines, columns = header)
target_ids = [d[2:8] for d in data['target_name']] #id of all proteins
data['target_name'] = target_ids
data.head()

Unnamed: 0,target_name,t-len,query_name,q-len,e-value,score,bias,#,off,ci-e-value,i-e-value,score.1,bias.1,hmm_from,hmm_to,ali_from,ali_to,env_from,env_to,acc
0,O75970,2070,msacutu90,91,1.5e+16,571,237,1,13,19,72.0,68,1,36,89,167,224,154,226,72
1,O75970,2070,msacutu90,91,1.5e+16,571,237,2,13,35,13.0,92,0,34,88,279,336,274,339,81
2,O75970,2070,msacutu90,91,1.5e+16,571,237,3,13,32,12.0,61,1,46,89,419,463,401,465,81
3,O75970,2070,msacutu90,91,1.5e+16,571,237,4,13,17,67.0,69,0,35,62,577,606,568,622,83
4,O75970,2070,msacutu90,91,1.5e+16,571,237,5,13,93,3600.0,14,0,32,63,726,759,719,770,82


In [11]:
def get_interpro_positions(accession, interpro_data):
    coordinates = []
    for protein in interpro_data:
        if protein['metadata']['accession'] == accession:
            for location in protein['entries'][0]['entry_protein_locations']:
                start = location['fragments'][0]['start']
                end = location['fragments'][0]['end']
                coordinates.append((start,end))
    return coordinates
        

def get_original_positions(accession, original_data):
    
    
    accession_data = original_data[original_data['target_name'] == accession]
    starts = accession_data['ali_from'].values
    ends = accession_data['ali_to'].values
    original_coordinates = [(int(starts[i]),int(ends[i])) for i in range(len(accession_data))]
    return original_coordinates
        

def compare_coordinates(coord_set1, coord_set2, length):
    set_1 = [np.arange(x[0],x[1]) for x in coord_set1]
    set_1 = set([i for c in set_1 for i in c])
    set_2 = [np.arange(x[0],x[1]) for x in coord_set2]
    set_2 = set([i for c in set_2 for i in c])
    tp = len(set_1 & set_2)
    fp = len(set_1) - tp
    fn = len(set_2) - tp
    tn = length - tp - fp - fn
    
    recall = tp/(tp+fn) #or sensitivity
    acc = (tp+tn)/(length) # accuracy
    if tp+fp != 0:
        precision = tp/(tp+fp) 
        mcc = (tp*tn - fp*fn)/math.sqrt((tp+fp)*(tp+fn)*(tn+fp)*(tn+fn))
    else:
        precision = None
        mcc = None
    specificity = tn/(tn+fp)
    
    return acc,precision,recall,specificity,mcc

def get_seq_length(accession, original_data):
    return int(original_data[original_data['target_name'] == accession]['t-len'].unique())

def compute_metrics(original_data, interpro_data):
    metrics = {}
    accessions = original_data['target_name'].unique()
    for accession in accessions:
        set1 = get_interpro_positions(accession,interpro_data)
        set2 = get_original_positions(accession, original_data)
        l = get_seq_length(accession,data)
        metrics[accession] = compare_coordinates(set1, set2, l)
    return metrics

In [22]:
def compute_avg_metrics(metrics_dict):
    keys = metrics_dict.keys()
    useful = [metrics_dict[key] for key in keys if None not in metrics_dict[key]]
    avg_metrics = np.sum(useful, axis = 0)/len(useful)
    return {'avg_acc':avg_metrics[0],'avg_precision':avg_metrics[1],'avg_recall':avg_metrics[2],
           'avg_specificity':avg_metrics[3],'avg_mcc':avg_metrics[4]}

In [25]:
# average metrics for all proteins 
metrics = compute_metrics(data, interpro_data)
compute_avg_metrics(metrics)

{'avg_acc': 0.861058329229154,
 'avg_precision': 0.4828664087187212,
 'avg_recall': 0.9070334644993584,
 'avg_specificity': 0.8516722326214426,
 'avg_mcc': 0.5868485588641982}