In [1]:
# Record a high-resolution start timestamp; the final cell subtracts it from
# its own timestamp to report how long the whole notebook took to run.
import time
notebook_start_time = time.perf_counter()

# NetOGlyc4 data: get training/validation/test set indices

## Imports

### Built-in imports

In [2]:
import gzip
import math
import os
import pickle
import re
import warnings
from itertools import chain
from pathlib import Path

### Shared library imports

### External imports

In [3]:
import numpy as np
import pandas as pd
import h5py
from tqdm.auto import tqdm
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

## Paths & Constants

In [4]:
#BASE_DIR = Path("/mnt/g/My Drive/CloudVault/Masters/Data")
BASE_DIR = Path("/home/jakob/Cloudvault_new/Data")

# Paths of imported embeddings file
EMBEDDING_FILE = BASE_DIR/'NetOGlyc5 data'/'GalNAc data'/'05-embedding'/'netoglyc4_protein_embeddings_netsurfp_output_glyc_labels_max.h5'

# Path of directory to get Graphpart output
GRAPHPART_DIR = BASE_DIR/'NetOGlyc5 data'/'GalNAc data'/'06-partitioning'/'netoglyc4_graphpart_partitions'

# Maximum sequence length of proteins for valid/test sets
MAX_SEQ_LENGTH = 1022

In [5]:
# CSV file inside the Graphpart directory that is parsed further down
GRAPHPART_OUTPUT_FILE = GRAPHPART_DIR.joinpath('graphpart_output.csv')

## Fetch embedding indices

In [6]:
# Load every protein identifier and the length of every sequence up front so
# later cells can map identifiers to row indices without reopening the file.
with h5py.File(EMBEDDING_FILE, 'r') as embedding_file:
    identifiers_as_str = embedding_file['identifiers'].asstr()
    sequences_as_str = embedding_file['sequences'].asstr()
    embedding_identifiers_list = identifiers_as_str[:].tolist()
    embedding_seq_lengths = list(map(len, sequences_as_str[:]))

In [19]:
# Longest sequence stored in the embeddings file (compare with MAX_SEQ_LENGTH)
max_embedding_seq_length = max(embedding_seq_lengths)
print(f"Max seq length: {max_embedding_seq_length}")

Max seq length: 22152


## Fetch validation/testing partition IDs

In [7]:
# Parse the Graphpart output CSV into one set of protein accessions per
# partition. Each data row is read positionally as four comma-separated
# fields: accession, priority, glycosylated, cluster. Only the accession and
# the cluster number are needed here.
# Further down, partition 1 is used as the validation set and partition 2 as
# the test set.
clusters = [set(), set(), set()]
with open(GRAPHPART_OUTPUT_FILE, 'r') as f:
    # First line is the column header; tolerate a completely empty file.
    header = next(f, '').strip().split(',')
    for line_number, line in enumerate(f, start=2):
        line = line.strip()
        if not line:
            # Skip blank lines (e.g. a trailing newline at the end of the file)
            # instead of failing on the tuple unpacking below.
            continue
        AC, _priority, _glycosylated, cluster = line.split(',')
        # The cluster field may be written as a float (e.g. "1.0").
        cluster = int(float(cluster))
        # Reject out-of-range ids explicitly: a negative id would otherwise be
        # silently accepted through Python's negative indexing.
        if not 0 <= cluster < len(clusters):
            raise ValueError(
                f"Unexpected cluster {cluster} for {AC} on line {line_number} of {GRAPHPART_OUTPUT_FILE}"
            )
        clusters[cluster].add(AC)

In [8]:
# Number of proteins assigned to each partition
list(map(len, clusters))

[291, 36, 36]

In [9]:
# Partition 1 is the validation set, partition 2 the test set
validation_ids, testing_ids = clusters[1:3]

In [10]:
def count_partition_sites(gly_dataset, protein_ids, max_seq_length=MAX_SEQ_LENGTH):
    """Count labelled and glycosylated sites for the proteins of one partition.

    Parameters
    ----------
    gly_dataset
        Two-dimensional label dataset indexed as ``[protein_index, position]``.
    protein_ids
        Iterable of identifiers that occur in ``embedding_identifiers_list``.
    max_seq_length
        Only the first ``max_seq_length`` positions of each protein are
        counted. The same limit is applied to every partition passed in.

    Returns
    -------
    tuple of int
        ``(n_glyc_proteins, n_sites, n_glyc_sites)``. A site is a position
        whose label is ``>= 0``; a glycosylated site is a position whose label
        is ``> 0``; a glycosylated protein has at least one glycosylated site.
        (Negative labels presumably mark positions without annotation --
        TODO confirm against the embedding-file producer.)

    Raises
    ------
    ValueError
        If an identifier is not present in ``embedding_identifiers_list``.
    """
    n_glyc_proteins = 0
    n_sites = 0
    n_glyc_sites = 0
    for protein_id in protein_ids:
        embedding_idx = embedding_identifiers_list.index(protein_id)
        seq_length = min(embedding_seq_lengths[embedding_idx], max_seq_length)
        gly = gly_dataset[embedding_idx, :seq_length]
        n_protein_glyc_sites = int((gly > 0).sum())
        n_sites += int((gly >= 0).sum())
        n_glyc_sites += n_protein_glyc_sites
        if n_protein_glyc_sites > 0:
            n_glyc_proteins += 1
    return n_glyc_proteins, n_sites, n_glyc_sites


def percentage(part, whole):
    """Return ``part / whole`` as a percentage, or NaN when ``whole`` is zero."""
    return part / whole * 100 if whole else float('nan')


# Both partitions are truncated to MAX_SEQ_LENGTH. Previously only the
# validation loop applied the limit, although the constant is documented as
# applying to the valid/test sets.
with h5py.File(EMBEDDING_FILE, 'r') as embedding_file:
    gly_labels = embedding_file['labels']['gly']
    n_val_glyc_proteins, n_val_sites, n_val_glyc_sites = count_partition_sites(gly_labels, validation_ids)
    n_test_glyc_proteins, n_test_sites, n_test_glyc_sites = count_partition_sites(gly_labels, testing_ids)

print(f"Validation partition glycosylated proteins: {n_val_glyc_proteins} ({percentage(n_val_glyc_proteins, len(validation_ids)):.2f}%)")
print(f"Validation partition sites: {n_val_sites}")
print(f"Validation partition glycosylated sites: {n_val_glyc_sites} ({percentage(n_val_glyc_sites, n_val_sites):.2f}%)")
print(f"Test partition glycosylated proteins: {n_test_glyc_proteins} ({percentage(n_test_glyc_proteins, len(testing_ids)):.2f}%)")
print(f"Test partition sites: {n_test_sites}")
print(f"Test partition glycosylated sites: {n_test_glyc_sites} ({percentage(n_test_glyc_sites, n_test_sites):.2f}%)")

Validation partition glycosylated proteins: 10 (27.78%)
Validation partition sites: 284
Validation partition glycosylated sites: 22 (7.75%)
Test partition glycosylated proteins: 11 (30.56%)
Test partition sites: 387
Test partition glycosylated sites: 39 (10.08%)


## Print testing/validation indices

In [11]:
# Translate the partition accessions into ascending row indices of the
# embeddings file (list.index raises ValueError for an unknown accession).
validation_indices = sorted(map(embedding_identifiers_list.index, validation_ids))
testing_indices = sorted(map(embedding_identifiers_list.index, testing_ids))

In [12]:
# Size of the validation set followed by its embedding row indices
print(len(validation_indices), validation_indices, sep="\n")

36
[33, 52, 74, 99, 123, 158, 181, 183, 185, 210, 255, 272, 292, 318, 333, 361, 363, 395, 426, 456, 476, 495, 519, 554, 577, 594, 604, 619, 633, 648, 670, 693, 708, 732, 749, 773]


In [13]:
# Size of the test set followed by its embedding row indices
print(len(testing_indices), testing_indices, sep="\n")

36
[31, 51, 69, 98, 121, 141, 174, 223, 238, 248, 269, 290, 316, 362, 365, 378, 393, 424, 449, 473, 491, 493, 520, 553, 575, 593, 603, 617, 632, 647, 671, 692, 709, 729, 748, 768]


## Print all training indices

In [14]:
# Training set = every embedding row that is in neither the validation nor the
# test set. Excluding by membership (rather than deleting list positions one by
# one) stays correct even if an index were to appear in both held-out lists.
held_out_indices = set(chain(validation_indices, testing_indices))
training_indices = [
    idx
    for idx in range(len(embedding_identifiers_list))
    if idx not in held_out_indices
]

In [15]:
# Size of the training set followed by its embedding row indices
print(len(training_indices), training_indices, sep="\n")

714
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 32, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 70, 71, 72, 73, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 122, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 175, 176, 177, 178, 179, 180, 182, 184, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 23

## Print only training indices for glycosylated proteins

In [16]:
# Keep only the training indices whose protein has at least one position with
# a positive 'gly' label within its sequence length.
training_glyconly_proteins = []

with h5py.File(EMBEDDING_FILE, 'r') as embedding_file:
    sequences = embedding_file['sequences'].asstr()
    gly_labels = embedding_file['labels']['gly']
    for train_idx in training_indices:
        n_residues = len(sequences[train_idx])
        has_glyc_site = (gly_labels[train_idx, :n_residues] > 0).any()
        if not has_glyc_site:
            continue
        training_glyconly_proteins.append(train_idx)

In [17]:
print(len(training_glyconly_proteins))
print(training_glyconly_proteins)

454
[0, 1, 3, 4, 5, 6, 7, 8, 9, 11, 13, 14, 15, 16, 17, 18, 19, 20, 21, 24, 25, 27, 28, 29, 30, 32, 34, 35, 36, 38, 40, 41, 42, 44, 47, 48, 49, 50, 53, 54, 55, 58, 59, 60, 62, 63, 65, 66, 67, 70, 71, 72, 73, 75, 77, 78, 79, 81, 82, 85, 86, 87, 88, 90, 92, 93, 94, 95, 96, 100, 102, 104, 105, 106, 108, 109, 110, 111, 112, 113, 114, 116, 117, 118, 120, 122, 124, 126, 127, 128, 129, 131, 133, 134, 135, 136, 137, 138, 139, 142, 143, 144, 146, 149, 150, 151, 152, 154, 155, 156, 157, 160, 163, 165, 167, 168, 171, 172, 175, 176, 177, 178, 179, 180, 182, 184, 186, 187, 189, 190, 191, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 209, 213, 215, 217, 218, 219, 222, 224, 225, 226, 228, 230, 231, 232, 233, 234, 235, 236, 237, 241, 242, 245, 250, 251, 252, 253, 254, 258, 260, 262, 264, 265, 266, 268, 271, 274, 275, 276, 279, 280, 282, 283, 285, 286, 287, 289, 291, 294, 296, 297, 298, 303, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 315, 319, 320, 322, 323, 324, 325, 328, 329, 331, 334

In [18]:
# Report the elapsed time since `notebook_start_time` was recorded in the
# first cell (both timestamps come from time.perf_counter()).
notebook_end_time = time.perf_counter()
print(f"Notebook took {notebook_end_time-notebook_start_time} seconds to run")

Notebook took 0.5274732580874115 seconds to run
