In [1]:
import time
# Reference timestamp taken in the very first cell; the last cell of the
# notebook takes a second perf_counter() reading and prints the difference.
notebook_start_time = time.perf_counter()

# GalNAc data label encoding

## Imports

### Built-in imports

In [2]:
import gzip
import os
import pickle
import re
from pathlib import Path

### Shared library imports

In [3]:
import glyc_processing.annotation

### External imports

In [4]:
import numpy as np
import pandas as pd
import h5py

## Paths & Constants

In [5]:
# Root of the data tree. Defaults to the location used so far, but can be
# redirected without editing the notebook by setting the GLYC_DATA_DIR
# environment variable before starting the kernel, e.g.
#   GLYC_DATA_DIR="/mnt/g/My Drive/CloudVault/Masters/Data"
# An unset or empty variable falls back to the default.
BASE_DIR = Path(os.environ.get("GLYC_DATA_DIR") or "/home/jakob/Cloudvault_new/Data")

# Every path used by this notebook lives below the GalNAc data directory.
GALNAC_DIR = BASE_DIR/'NetOGlyc5 data'/'GalNAc data'

# Path of imported annotations file
ANNOTATIONS_FILE = GALNAC_DIR/'04-annotation'/'protein_annotations.pkl.gz'

# Path of exported label files
MEAN_LABELS_FILE = GALNAC_DIR/'05-embedding'/'protein_glyc_labels_mean.h5'
MAX_LABELS_FILE = GALNAC_DIR/'05-embedding'/'protein_glyc_labels_max.h5'

## Fetch protein annotations

In [6]:
# Load the gzip-compressed pickle holding the protein annotations.
# NOTE(review): unpickling executes code embedded in the file, so this must
# only ever be pointed at annotation files produced by our own pipeline.
# Presumably the pickled objects are glyc_processing types, which would be why
# that package is imported above - confirm.
with gzip.open(ANNOTATIONS_FILE, 'rb') as annotations_handle:
    proteins = pickle.load(annotations_handle)

## Encode site compositions

In [7]:
# Column order of the encoded composition vectors: element i of an encoding
# tells whether the sugar named at position i of this list is present.
sugar_encoding_order = [
    'HexNAc',
    'HexHexNAc',
    'Hex',
    'Sia',
    'dHex',
    'NeuAc',
    'NeuGc',
]

# All-False vector with one entry per sugar in sugar_encoding_order.
default_sugar_encoding = np.zeros(len(sugar_encoding_order), dtype='bool')

def encode_site_composition(site_composition):
    """Encode a composition string as a presence vector over the known sugars.

    The string is scanned for components of the form ``<count>x<name>``
    (a positive integer without leading zero, a literal ``x``, then ASCII
    letters), e.g. ``'1xHex1xHexNAc'``. Only the presence of a name matters;
    the count is not part of the result, and names that are not listed in
    ``sugar_encoding_order`` are ignored.

    Parameters
    ----------
    site_composition : str
        Composition string to scan.

    Returns
    -------
    numpy.ndarray
        One boolean per entry of ``sugar_encoding_order``, in that order.
    """
    present_sugars = set()
    for component in re.findall(r'[1-9][0-9]*x[A-Za-z]+', site_composition):
        # The count is digits only, so the first 'x' is always the separator,
        # even for sugar names that themselves contain an 'x' (e.g. 'Hex').
        _count, _separator, sugar_name = component.partition('x')
        present_sugars.add(sugar_name)
    return np.array([name in present_sugars for name in sugar_encoding_order])

In [8]:
# protein_id -> integer matrix with one row per sequence position and one
# column per sugar in sugar_encoding_order (1 = sugar reported at that position).
# Proteins without any site composition annotation get no entry at all.
proteins_sites_compositions = {}
for protein in proteins.values():
    # Step 1: collect the distinct composition strings seen at each position.
    # A position stays None until a site there carries a non-missing
    # 'site_composition' annotation.
    compositions_by_position = [None] * len(protein.protein_seq)
    for position, position_sites in enumerate(protein.seq_sites):
        if position_sites is None or len(position_sites) == 0:
            continue
        for site in position_sites:
            annotations = site.site_annotations
            if 'site_composition' not in annotations:
                continue
            composition = annotations['site_composition']
            if not pd.notna(composition):
                continue
            if compositions_by_position[position] is None:
                compositions_by_position[position] = set()
            compositions_by_position[position].add(composition)

    # Step 2: skip proteins for which no position has a composition.
    if all(entry is None for entry in compositions_by_position):
        continue

    # Step 3: start every position from the default encoding and OR in the
    # encoding of each composition string observed at that position.
    encoded_rows = np.stack([default_sugar_encoding] * len(compositions_by_position))
    for position, compositions in enumerate(compositions_by_position):
        if compositions is None:
            continue
        for composition in compositions:
            encoded_rows[position] |= encode_site_composition(composition)
    proteins_sites_compositions[protein.protein_id] = encoded_rows.astype(int)

## Encode labels to files

In [9]:
# Number of proteins; used as the first dimension of every exported dataset.
n_seqs = len(proteins)

# Protein ids ordered from longest to shortest sequence; this is the row
# order used when the label files are written.
seq_length_sorted_protein_ids = sorted(
    proteins,
    key=lambda protein_id: len(proteins[protein_id].protein_seq),
    reverse=True,
)

# The first id belongs to a longest sequence, whose length defines the padded
# second dimension of the label datasets.
longest_protein_id = seq_length_sorted_protein_ids[0]
max_seq_length = len(proteins[longest_protein_id].protein_seq)

In [10]:
# Select the mean-count scoring function for this export.
# NOTE(review): presumably this is what get_glycosylation_labels() consults
# below - confirm in glyc_processing.annotation.
proteins.scoring_function = glyc_processing.annotation.mean_counts_score

# Opening an HDF5 file for writing does not create missing parent directories,
# so make sure the output directory exists (e.g. on a fresh data checkout).
MEAN_LABELS_FILE.parent.mkdir(parents=True, exist_ok=True)

# NOTE(review): the max-score export cell below repeats this cell with a
# different scoring function and output path; consider factoring both into a
# single function taking (path, scoring_function).
with h5py.File(MEAN_LABELS_FILE, 'w') as file:
    # One identifier and one sequence string per protein, in row order.
    identifiers_dataset = file.create_dataset("identifiers", (n_seqs,), h5py.string_dtype(), maxshape=(None,))
    sequences_dataset = file.create_dataset("sequences", (n_seqs,), h5py.string_dtype(), maxshape=(None,))

    labels_group = file.create_group('labels')

    # Per-position label datasets, padded to the longest sequence. Positions
    # past a protein's own length are never written and keep the dataset's
    # default fill value.
    gly_dataset = labels_group.create_dataset("gly", (n_seqs, max_seq_length, 1), dtype='f4')
    gly_dataset.attrs['cast_type'] = 'f4'

    seen_dataset = labels_group.create_dataset("seen", (n_seqs, max_seq_length, 1), dtype='f4')
    seen_dataset.attrs['cast_type'] = 'f4'

    com_dataset = labels_group.create_dataset("com", (n_seqs, max_seq_length, len(sugar_encoding_order)), dtype='f4')
    com_dataset.attrs['cast_type'] = 'f4'

    # Rows follow seq_length_sorted_protein_ids (longest sequence first).
    for idx, protein_id in enumerate(seq_length_sorted_protein_ids):
        protein = proteins[protein_id]
        seq_length = len(protein.protein_seq)

        identifiers_dataset[idx] = protein_id
        sequences_dataset[idx] = protein.protein_seq

        # [..., None] appends the trailing axis of size 1 the datasets expect.
        gly_dataset[idx, :seq_length] = np.array(protein.get_glycosylation_labels())[..., None]

        seen_dataset[idx, :seq_length] = np.array(protein.seq_idx_seen_count)[..., None]

        # Proteins without composition annotations have no entry and keep an
        # unwritten "com" row.
        if protein_id in proteins_sites_compositions:
            com_dataset[idx, :seq_length] = proteins_sites_compositions[protein_id]

In [11]:
# Select the max scoring function for this export.
# NOTE(review): presumably this is what get_glycosylation_labels() consults
# below - confirm in glyc_processing.annotation.
proteins.scoring_function = glyc_processing.annotation.max_score

# Opening an HDF5 file for writing does not create missing parent directories,
# so make sure the output directory exists (e.g. on a fresh data checkout).
MAX_LABELS_FILE.parent.mkdir(parents=True, exist_ok=True)

# NOTE(review): apart from the scoring function and output path this cell
# repeats the mean-score export cell above; consider factoring both into a
# single function taking (path, scoring_function).
with h5py.File(MAX_LABELS_FILE, 'w') as file:
    # One identifier and one sequence string per protein, in row order.
    identifiers_dataset = file.create_dataset("identifiers", (n_seqs,), h5py.string_dtype(), maxshape=(None,))
    sequences_dataset = file.create_dataset("sequences", (n_seqs,), h5py.string_dtype(), maxshape=(None,))

    labels_group = file.create_group('labels')

    # Per-position label datasets, padded to the longest sequence. Positions
    # past a protein's own length are never written and keep the dataset's
    # default fill value.
    gly_dataset = labels_group.create_dataset("gly", (n_seqs, max_seq_length, 1), dtype='f4')
    gly_dataset.attrs['cast_type'] = 'f4'

    seen_dataset = labels_group.create_dataset("seen", (n_seqs, max_seq_length, 1), dtype='f4')
    seen_dataset.attrs['cast_type'] = 'f4'

    com_dataset = labels_group.create_dataset("com", (n_seqs, max_seq_length, len(sugar_encoding_order)), dtype='f4')
    com_dataset.attrs['cast_type'] = 'f4'

    # Rows follow seq_length_sorted_protein_ids (longest sequence first).
    for idx, protein_id in enumerate(seq_length_sorted_protein_ids):
        protein = proteins[protein_id]
        seq_length = len(protein.protein_seq)

        identifiers_dataset[idx] = protein_id
        sequences_dataset[idx] = protein.protein_seq

        # [..., None] appends the trailing axis of size 1 the datasets expect.
        gly_dataset[idx, :seq_length] = np.array(protein.get_glycosylation_labels())[..., None]

        seen_dataset[idx, :seq_length] = np.array(protein.seq_idx_seen_count)[..., None]

        # Proteins without composition annotations have no entry and keep an
        # unwritten "com" row.
        if protein_id in proteins_sites_compositions:
            com_dataset[idx, :seq_length] = proteins_sites_compositions[protein_id]

In [12]:
# Second perf_counter() reading; subtracting the reading stored in
# notebook_start_time by the first cell gives the elapsed time in seconds
# between the first and the last cell.
notebook_end_time = time.perf_counter()
print(f"Notebook took {notebook_end_time-notebook_start_time} seconds to run")

Notebook took 15.355394323007204 seconds to run
