In [1]:
import time
# Reference timestamp taken before any other work; the last cell subtracts it
# from a second perf_counter() reading to report the total notebook runtime.
notebook_start_time = time.perf_counter()

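# NetOGlyc4 data label encoding

Loads the pickled NetOGlyc4 protein annotations and writes an HDF5 file containing the protein identifiers, the sequences, and the `gly` and `seen` label arrays (one row per protein, padded to the longest sequence).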

## Imports

### Built-in imports

In [2]:
import gzip
import os
import pickle
import re
from pathlib import Path

### Shared library imports

In [3]:
import glyc_processing.annotation

### External imports

In [4]:
import numpy as np
import pandas as pd
import h5py

## Paths & Constants

In [5]:
# Root of the data tree.
# Set the NETOGLYC_DATA_DIR environment variable to run against another copy of
# the data (e.g. "/mnt/g/My Drive/CloudVault/Masters/Data") without editing this
# cell; when the variable is unset or empty the original default is used.
BASE_DIR = Path(os.environ.get("NETOGLYC_DATA_DIR") or "/home/jakob/Cloudvault_new/Data")

# Folder shared by the input and output files of this notebook
GALNAC_DATA_DIR = BASE_DIR/'NetOGlyc5 data'/'GalNAc data'

# Path of imported annotations file
ANNOTATIONS_FILE = GALNAC_DATA_DIR/'04-annotation'/'netoglyc4_protein_annotations.pkl.gz'

# Path of exported label files
MAX_LABELS_FILE = GALNAC_DATA_DIR/'05-embedding'/'netoglyc4_protein_glyc_labels_max.h5'

## Fetch protein annotations

In [6]:
# Read the gzip-compressed pickle holding the protein annotations.
# NOTE: unpickling executes code stored in the file, so ANNOTATIONS_FILE must
# come from a trusted source (here: the output of the annotation step).
with gzip.open(ANNOTATIONS_FILE, mode='rb') as annotations_handle:
    proteins = pickle.load(annotations_handle)

## Encode labels to files

In [7]:
# Number of proteins; becomes the first dimension of every exported dataset.
n_seqs = len(proteins)

# Protein ids ordered from the longest to the shortest sequence.
seq_length_sorted_protein_ids = sorted(
    proteins,
    key=lambda pid: len(proteins[pid].protein_seq),
    reverse=True,
)

# The first id now refers to the longest sequence, which sets the padded length.
longest_protein_id = seq_length_sorted_protein_ids[0]
max_seq_length = len(proteins[longest_protein_id].protein_seq)

In [8]:
# Select the scoring function on the annotations container.
# NOTE(review): presumably this determines what get_glycosylation_labels()
# returns below — confirm in glyc_processing.annotation.
proteins.scoring_function = glyc_processing.annotation.max_score

# The export folder belongs to a later pipeline stage than the input folder, so
# it may not exist yet; h5py does not create missing parent directories and
# would fail to open the file for writing.
MAX_LABELS_FILE.parent.mkdir(parents=True, exist_ok=True)

# Write one row per protein, in descending sequence-length order.
with h5py.File(MAX_LABELS_FILE, 'w') as file:
    # Variable-length strings; resizable along the protein axis.
    identifiers_dataset = file.create_dataset("identifiers", (n_seqs,), h5py.string_dtype(), maxshape=(None,))
    sequences_dataset = file.create_dataset("sequences", (n_seqs,), h5py.string_dtype(), maxshape=(None,))

    labels_group = file.create_group('labels')

    # Label arrays are padded to the longest sequence: (protein, position, 1).
    gly_dataset = labels_group.create_dataset("gly", (n_seqs, max_seq_length, 1), dtype='f4')
    gly_dataset.attrs['cast_type'] = 'f4'

    seen_dataset = labels_group.create_dataset("seen", (n_seqs, max_seq_length, 1), dtype='f4')
    seen_dataset.attrs['cast_type'] = 'f4'

    for idx, protein_id in enumerate(seq_length_sorted_protein_ids):
        protein = proteins[protein_id]
        seq_length = len(protein.protein_seq)

        identifiers_dataset[idx] = protein_id
        sequences_dataset[idx] = protein.protein_seq

        # Only the first seq_length positions are written; the remaining padding
        # keeps the dataset's fill value (h5py's default is 0 unless overridden).
        # `[..., None]` appends the trailing length-1 axis the dataset expects
        # (assumes one label value per sequence position — TODO confirm).
        gly_dataset[idx, :seq_length] = np.array(protein.get_glycosylation_labels())[..., None]

        seen_dataset[idx, :seq_length] = np.array(protein.seq_idx_seen_count)[..., None]

In [9]:
# Second timer reading; the difference to the start reading is the total runtime.
notebook_end_time = time.perf_counter()
elapsed_seconds = notebook_end_time - notebook_start_time
print(f"Notebook took {elapsed_seconds} seconds to run")

Notebook took 0.9410926349228248 seconds to run
