In [84]:
def get_interpro_positions(accession, interpro_data):
    coordinates = []
    for protein in interpro_data:
        if protein['metadata']['accession'] == accession:
            for location in protein['entries'][0]['entry_protein_locations']:
                start = location['fragments'][0]['start']
                end = location['fragments'][0]['end']
                coordinates.append((start,end))
    return coordinates
        

def get_original_positions(accession, original_data):
    
    
    accession_data = original_data[original_data['target_name'] == accession]
    starts = accession_data['ali_from'].values
    ends = accession_data['ali_to'].values
    original_coordinates = [(int(starts[i]),int(ends[i])) for i in range(len(accession_data))]
    return original_coordinates
        

def compare_coordinates(coord_set1, coord_set2, length):
    set_1 = [np.arange(x[0],x[1]+1) for x in coord_set1]
    set_1 = set([i for c in set_1 for i in c])
    set_2 = [np.arange(x[0],x[1]+1) for x in coord_set2]
    set_2 = set([i for c in set_2 for i in c])
    tp = len(set_1 & set_2)
    fp = len(set_1) - tp
    fn = len(set_2) - tp
    tn = length - tp - fp - fn
    
    recall = tp/(tp+fn) #or sensitivity
    acc = (tp+tn)/(length) # accuracy
    if tp+fp != 0:
        precision = tp/(tp+fp) 
        mcc = (tp*tn - fp*fn)/math.sqrt((tp+fp)*(tp+fn)*(tn+fp)*(tn+fn))
    else:
        precision = None
        mcc = None
    specificity = tn/(tn+fp)
    
    return acc,precision,recall,specificity,mcc

def get_seq_length(accession, original_data):
    return int(original_data[original_data['target_name'] == accession]['t-len'].unique())

def compute_metrics(original_data, interpro_data):
    metrics = {}
    accessions = original_data['target_name'].unique()
    for accession in accessions:
        set1 = get_interpro_positions(accession,interpro_data)
        set2 = get_original_positions(accession, original_data)
        l = get_seq_length(accession,original_data)
        metrics[accession] = compare_coordinates(set1, set2, l)
    return metrics

In [1]:
import json
import math
import numpy as np
import string

In [2]:
with open("export.json", "r") as read_file:
    interpro_data = json.load(read_file)


## HMM

In [76]:

## header creation
import pandas as pd
hmm_header = ['target_name','t-len','query_name','q-len','e-value','score','bias','#','off','ci-e-value','i-e-value','score','bias',
         'hmm_from','hmm_to','ali_from','ali_to','env_from','env_to','acc']
# open hmm results
with open('hmmsearch.hmmer_domtblout', 'r') as f2:
    hmm_data = f2.read()



# dataframe creation
lines = []
for line in hmm_data.split('\n'):
    if line != None:
        if len(line) > 1:
            if line[0] != '#':
                lines.append(line.translate(str.maketrans('', '', string.punctuation)).split()[:20])

                
hmm_data = pd.DataFrame(lines, columns = hmm_header)

target_ids = [d[2:8] for d in hmm_data['target_name']] #id of all proteins
hmm_data['target_name'] = target_ids
hmm_data

Unnamed: 0,target_name,t-len,query_name,q-len,e-value,score,bias,#,off,ci-e-value,i-e-value,score.1,bias.1,hmm_from,hmm_to,ali_from,ali_to,env_from,env_to,acc
0,O75970,2070,msacutu90,91,15e15,571,237,1,13,0019,72,68,01,36,89,167,224,154,226,072
1,O75970,2070,msacutu90,91,15e15,571,237,2,13,00035,13,92,00,34,88,279,336,274,339,081
2,O75970,2070,msacutu90,91,15e15,571,237,3,13,0032,12,61,01,46,89,419,463,401,465,081
3,O75970,2070,msacutu90,91,15e15,571,237,4,13,0017,67,69,00,35,62,577,606,568,622,083
4,O75970,2070,msacutu90,91,15e15,571,237,5,13,093,36e02,14,00,32,63,726,759,719,770,082
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
136,Q9HAP6,207,msacutu90,91,011,126,00,1,1,000057,022,117,00,34,63,117,148,110,171,084
137,Q6IN97,320,msacutu90,91,011,126,01,1,1,00015,058,103,00,31,65,117,153,96,176,078
138,O14745,358,msacutu90,91,013,125,01,1,2,00087,33,79,00,38,65,41,69,37,76,084
139,O14745,358,msacutu90,91,013,125,01,2,2,092,35e02,14,00,37,63,180,207,177,233,085


## PSSM

In [44]:
with open('pssm_orig_ds.txt', 'r') as f2:
    PSSM_data = f2.read()
# dataframe creation
pssm_lines = PSSM_data.split('\n')
# proteins extraction
pssm_proteins = set([line[:6] for line in pssm_lines])

In [83]:
rows = []

for line in pssm_lines[131:]:
    if line[:1] == '>':
        protein = line[1:7]
        i = 0
    if line[:7] == 'Length=':
        length = line[7:]
    if line[:5] == 'Sbjct':
        elements = line.split(' ')
        rows.append([protein,length, i,elements[2],elements[-1]])
        i += 1
    
pssm_header = ['target_name','t-len','#','ali_from','ali_to']
pssm_data = pd.DataFrame(rows, columns = pssm_header)
pssm_data.to_csv('pssm_pos_data')

In [49]:
def compute_avg_metrics(metrics_dict):
    keys = metrics_dict.keys()
    useful = [metrics_dict[key] for key in keys if None not in metrics_dict[key]]
    avg_metrics = np.sum(useful, axis = 0)/len(useful)
    return {'avg_acc':avg_metrics[0],'avg_precision':avg_metrics[1],'avg_recall':avg_metrics[2],
           'avg_specificity':avg_metrics[3],'avg_mcc':avg_metrics[4]}

In [81]:
# average metrics for all proteins 
hmm_metrics = compute_metrics(hmm_data, interpro_data)
compute_avg_metrics(hmm_metrics)

{'avg_acc': 0.8607334008735124,
 'avg_precision': 0.4895617508589425,
 'avg_recall': 0.9077856073071492,
 'avg_specificity': 0.8507664224856565,
 'avg_mcc': 0.5908108816929698}

In [86]:
pssm_metrics = compute_metrics(pssm_data, interpro_data)
compute_avg_metrics(pssm_metrics)

{'avg_acc': 0.8529094250109757,
 'avg_precision': 0.35396004951483967,
 'avg_recall': 0.9246017230095943,
 'avg_specificity': 0.847497141343868,
 'avg_mcc': 0.4983336095290782}