<a href="https://colab.research.google.com/github/ann-las/DL_project/blob/main/calculate_deepTMHMM_metrics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Metrics calculations: F1 scores for DeepTMHMM on test partition



AUTHORS: Anna Schrøder Lassen (s173461), Astrid Brix Saksager (abrisa) &  Puck Quarles van Ufford (puqu)

DATE: December 2023

DESCRIPTION: This Colab calculates multiclass F1 scores on the partition of the DeepTMHMM dataset which is used for testing in our project. To run the script, a Google drive must be mounted. Paths to the files our_test.json and predictions_deeptmhmm.txt must be put in the field below.

In [13]:
path_to_json_file = '/content/drive/My Drive/DL/our_test.json'                            # Insert path to our_test.json
deeptmhmm_file = '/content/drive/My Drive/DL/predictions_deeptmhmm.txt'    # Insert path to predictions_deeptmhmm.txt

## 1) Mount the drive

In [1]:
# Mount the drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
#!ls drive/'My Drive'/'DL'/

final_runs_predictions_targets	prediction_Felix_metrics_test.txt  target_Felix_metrics_test.txt
last_231123.ckpt		predictions_deeptmhmm.txt	   target.txt
our_test.json			prediction.txt			   test_predictions_deeptmhmm.txt


## 2) Code

In [3]:
!pip install torchmetrics

Collecting torchmetrics
  Downloading torchmetrics-1.2.1-py3-none-any.whl (806 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m806.1/806.1 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
Collecting lightning-utilities>=0.8.0 (from torchmetrics)
  Downloading lightning_utilities-0.10.0-py3-none-any.whl (24 kB)
Installing collected packages: lightning-utilities, torchmetrics
Successfully installed lightning-utilities-0.10.0 torchmetrics-1.2.1


In [4]:
from torch import tensor
import torch
from torchmetrics.classification import MulticlassF1Score
import sys
import json


In [14]:
# Define our uniprots (pilot)
# our_uniprots = ['P09391', 'D7A6E5', 'P06030']

# Define our uniprots (test)
#our_test_file = open('/content/drive/My Drive/DL/our_test.json')
our_test_file = open(path_to_json_file)
test_dict = json.load(our_test_file)
our_test_file.close()

our_uniprots = list(test_dict.keys())
print(our_uniprots)
print(len(our_uniprots))



['Q59514', 'Q8K4G5', 'P93045', 'P32051', 'P9WP57', 'P48612', 'Q8TD19', 'P30659', 'P03934', 'P05146', 'P40032', 'P50528', 'Q9NZN9', 'Q476F7', 'Q8IZQ1', 'O08648', 'A2VBC4', 'Q8NFK1', 'P47924', 'Q8LKS5', 'P02879', 'Q08119', 'P52554', 'P22266', 'Q9LT15', 'Q8NJU8', 'Q88NC7', 'O01510', 'Q6L545', 'O42924', 'P35874', 'P0AD59', 'Q54WR4', 'Q8NBZ0', 'P70060', 'Q5BGU9', 'P00424', 'Q9USH2', 'O21043', 'Q8L6J3', 'P13727', 'Q9SU05', 'A5W3Z9', 'G5EC68', 'Q95Y36', 'Q59675', 'Q9UTR8', 'P0AFF0', 'P53867', 'Q8RLX0', 'Q9UBS5', 'Q4G0N4', 'Q4KL25', 'P21524', 'Q588Z1', 'Q9ST62', 'P00144', 'Q8NFH5', 'O60055', 'Q7Z6K1', 'Q9I4Z4', 'Q9LTB0', 'P05743', 'B6TTV8', 'P64428', 'P0AFM2', 'O88093', 'Q9NRD9', 'O14773', 'Q93Z81', 'Q15814', 'Q91VR5', 'Q00196', 'P54107', 'P55197', 'Q91V10', 'Q87RN6', 'O36023', 'Q10NX8', 'P40099', 'Q9VSL7', 'O60565', 'E5FYH0', 'P58557', 'Q99593', 'P00652', 'O00408', 'D6WJ77', 'P21128', 'Q04659', 'Q9CAH0', 'Q963A9', 'P34247', 'Q3E7C1', 'Q87TN7', 'P24330', 'Q9X404', 'A6XB80', 'P97302', 'Q9SJN0',

In [15]:
# Name of the deepTMHMM file
#deeptmhmm_file = '/content/drive/My Drive/DL/predictions_deeptmhmm.txt'

# Dict of labels
#LABELS: Dict[str,int] = {'I': 0, 'O':1, 'P': 2, 'S': 3, 'M':4, 'B': 5}
labels_trans = str.maketrans("IOPSMB","012345")


In [16]:
# Lists of sequences
deep_tmhmmm_uniprots_our_uniprots = list()
deep_tmhmmm_uniprots_not_our_uniprots = list()
sequences = list()
ground_truth = list()
tmhmm_prediction = list()

read_lines = False
count = 0

# Run through file and extract sequences
with open(deeptmhmm_file, 'r') as file:
    for line in file:

        # Initiate reading at correct sequences
        if line.startswith('>'):
            protein_uniprot = line[1:-1]

            if protein_uniprot in our_uniprots:
                read_lines = True
                deep_tmhmmm_uniprots_our_uniprots.append(protein_uniprot)

            else:
                read_lines = False
                deep_tmhmmm_uniprots_not_our_uniprots.append(protein_uniprot)


        if not line.startswith('>') and read_lines == True and count == 0:
            sequences.append(line[:-1])
            count += 1

        elif not line.startswith('>') and read_lines == True and count == 1:
            ground_truth.append(line[:-1])
            count += 1

        elif not line.startswith('>') and read_lines == True and count == 2:
            tmhmm_prediction.append(line[:-1])
            count = 0
            readlines = False

In [18]:
### Perform checks ###

# Number of uniprots
print(len(our_uniprots))
print(len(deep_tmhmmm_uniprots_our_uniprots))
print(len(deep_tmhmmm_uniprots_not_our_uniprots))

# Lenghts of sequences
print(len(sequences))
print(len(ground_truth))
print(len(tmhmm_prediction))


print(ground_truth)
print(tmhmm_prediction)




714
714
2860
714
714
714
['IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII', 'SSSSSSSSSSSSSSSSSSSOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOO', 'IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII

In [19]:
# Collect sequences
if len(ground_truth) != len(tmhmm_prediction):
    print('Not the same number of sequences.')
    sys.exit(1)

total_ground_truth = ''
total_tmhmmp_prediction = ''


for i in range(len(deep_tmhmmm_uniprots_our_uniprots)):
    if len(ground_truth[i]) != len(tmhmm_prediction[i]):
      print(deep_tmhmmm_uniprots_our_uniprots[i])
      print(ground_truth[i])
      print(tmhmm_prediction[i])
    elif len(ground_truth[i]) == len(tmhmm_prediction[i]):
      total_ground_truth += ground_truth[i]
      total_tmhmmp_prediction += tmhmm_prediction[i]


Q61420
IIIIIIIIIMMMMMMMMMMMMMMMMMMMMMOOOOOOOOOOOOOOOMMMMMMMMMMMMMMMMMMMIIIIIIIIIIIIIIIIIIIIIIIMMMMMMMMMMMMMMMMMMMMMOOOOOOMMMMMMMMMMMMMMMMMMMMMIIIIIIMMMMMMMMMMMMMMMMMMMOOOOOOOOOOOOOOOMMMMMMMMMMMMMMMMMMMMMIIIIIIIIIIIIIMMMMMMMMMMMMMMMMMMMOOOOOOOOOOOOOOOMMMMMMMMMMMMMMMMMMMIIIIIIIMMMMMMMMMMMMMMMMMMMOOOOOOOOMMMMMMMMMMMMMMMMMMMIIIIIIIIIIIIIIIIIIIII
IIIIIIIIIIIIIMMMMMMMMMMMMMMMMMMOOOOOOOOOOOOOOMMMMMMMMMMMMMMMMMMMIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIMMMMMMMMMMOOOOOOOOOOMMMMMMMMMMMMMMMMMMMMMIIIIIIIIMMMMMMMMMMMMMMOOOOOOOOOOOOOOOOMMMMMMMMMMMMMMMMMMMMMIIIIIIIIIIIIIIIMMMMMMMMMMMMMMMMMOOOOOOOOOOOOOOOOMMMMMMMMMMMMMMMMMMMMMIIIIIIIIIMMMMMMMMMMMMMMMMMMOOOOOMMMMMMMMMMMMMMMMMMMIIIIIIIIIIIIIIIIIIII


In [20]:
# Translate sequences and turn into integer lists
target = list(map(int, total_ground_truth.translate(labels_trans)))
preds = list(map(int, total_tmhmmp_prediction.translate(labels_trans)))

torch_target = tensor(target)
torch_preds = tensor(preds)

print(len(torch_target))
print(len(torch_preds))


363523
363523


In [21]:
# Define metric
mcf_macro = MulticlassF1Score(num_classes = 6, average = 'macro')
mcf_micro = MulticlassF1Score(num_classes = 6, average = 'micro')
mcf_none = MulticlassF1Score(num_classes = 6, average = 'none')
mcf_weighted = MulticlassF1Score(num_classes = 6, average = 'weighted')
#mcf_multidim = MulticlassF1Score(num_classes = 3, average = 'none', multidim_average = 'samplewise')

# Print metric
print('Macro')
print(mcf_macro(torch_target, torch_preds))
print('')
print('Micro')
print(mcf_micro(torch_target, torch_preds))
print('')
print('None')
print('I O P S M B')
print(mcf_none(torch_target, torch_preds))
print('')
print('Weighted')
print(mcf_weighted(torch_target, torch_preds))
#print(mcf_multidim(tensor([[1,2,0],[1,1,2]]), tensor([[1,0,2],[1,1,2]])))

Macro
tensor(0.8870)

Micro
tensor(0.9685)

None
I O P S M B
tensor([0.9869, 0.9486, 0.7274, 0.9690, 0.8714, 0.8190])

Weighted
tensor(0.9681)
