In [1]:
import time
# Start of the wall-clock measurement; the last cell subtracts this value to report the notebook's total run time
notebook_start_time = time.perf_counter()

# NetOGlyc4 data processing

## Imports

### Built-in imports

In [2]:
import gzip
import math
import os
import pickle
import re
import uuid
import warnings
from pathlib import Path

### Shared library imports

In [3]:
from glyc_processing import cf
from glyc_processing.data_formats.galnac.config import GalNAcConfig
from glyc_processing.data_formats.common.validation import consistent_id_common_info
from glyc_processing.uniprot import get_entry_isoforms_dicts, get_uniprot_entries, get_uniprot_isoforms, \
    get_entry_isoforms_dicts
from glyc_processing.annotation import ProteinSet, AnnotationError, max_score

### External imports

In [4]:
from IPython.display import display
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from tqdm.auto import tqdm
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

## Paths & Constants

In [5]:
# Root data directory. The default is the original author's local location; set the
# NETOGLYC_DATA_DIR environment variable to point the notebook at another copy of the data
# (e.g. "/mnt/g/My Drive/CloudVault/Masters/Data") without editing this cell.
# An unset or empty variable falls back to the default.
BASE_DIR = Path(os.environ.get("NETOGLYC_DATA_DIR") or "/home/jakob/Cloudvault_new/Data")

# Directories shared by several of the paths below
_TRAINING_DATA_DIR = BASE_DIR/'NetOGlyc5 data'/'Training NetOGlyc4'
_ANNOTATION_DIR = BASE_DIR/'NetOGlyc5 data'/'GalNAc data'/'04-annotation'

# Path of positive glycosite and protein sequence file
POSITIVE_GLYCOSITES_FILE = _TRAINING_DATA_DIR/'SimpleCell All Glycosites 12_10_26.tsv'

# Path of negative peptide and protein sequence files
NEGATIVE_PEPTIDES_FILE = _TRAINING_DATA_DIR/'SimpleCell Negatives Peptides 12_07_20.tsv'
NEGATIVE_PROTEIN_SEQUENCES_FILE = _TRAINING_DATA_DIR/'SimpleCell Negatives Sites 12_07_20.tsv'

# Path of exported annotations file
ANNOTATIONS_FILE = _ANNOTATION_DIR/'netoglyc4_protein_annotations.pkl.gz'

# Path of exported sequences fasta files (full-length and truncated to MAX_SEQ_LENGTH)
SEQUENCES_FILE = _ANNOTATION_DIR/'netoglyc4_protein_sequences.fasta'
TRUNCATED_SEQUENCES_FILE = _ANNOTATION_DIR/'netoglyc4_protein_sequences_truncated.fasta'

# Maximum sequence length that can be embedded in one go
# 1022 for BERT/ALBERT (ESM-1b / ProtBert wo. start/end tokens), No limit for XLNet
MAX_SEQ_LENGTH = 1022

In [6]:
# Hand the GalNAc configuration class to the shared `cf` object.
# NOTE(review): the cf.* attributes assigned in the next cell presumably build on this call — confirm the ordering requirement in glyc_processing
cf.use_config(GalNAcConfig)

In [7]:
# The temp folder is used to organize data-specific temporary files
cf.TEMP_DIR = BASE_DIR/'uniprot'/'NetOGlyc4'

# UniProt release downloads directory path (make sure you have a few GB of free space)
cf.UNIPROT_DOWNLOADS_DIR = BASE_DIR/'uniprot'

# The amino acids that are allowed to be glycosylated
cf.ALLOWED_AA = ('S','T','Y')

# The UniProt release can be 'latest' for the current release or any of those with format (YYYY_MM) found here: https://ftp.uniprot.org/pub/databases/uniprot/previous_releases/
# Warning: UniProt only keeps previous releases other than the first of the year for 2 years, so only use the first yearly release (2015_01, 2021_01 etc.) for reproducibility!
cf.UNIPROT_RELEASE = '2021_01'

# If True, existing data-specific temp files are ignored and recreated from scratch
# This should be used if the data or script has changed
cf.IGNORE_EXISTING_FILES = True

In [8]:
# Report the release exposed as cf.TRUE_UNIPROT_RELEASE
# NOTE(review): presumably the concrete release that cf.UNIPROT_RELEASE (e.g. 'latest') resolves to — confirm in glyc_processing
print(f"Using Uniprot Release {cf.TRUE_UNIPROT_RELEASE}")

Using Uniprot Release 2021_01


## Map data to standard annotation format

### Read positive & negative data

In [9]:
# Tab-separated table of positive glycosites; the first line of the file supplies the column names
positive_glycosites_df = pd.read_csv(
    POSITIVE_GLYCOSITES_FILE,
    sep="\t",
    header=0,
)
print(f"Positive glycosite rows: {positive_glycosites_df.shape[0]}")

Positive glycosite rows: 2112


In [10]:
# Tab-separated peptide table read without a header line, so the three column names are assigned afterwards
negative_peptides_df = pd.read_csv(
    NEGATIVE_PEPTIDES_FILE,
    sep="\t",
    header=None,
)
negative_peptides_df.columns = ['uniprot_id', 'peptide_start', 'peptide_end']
print(f"Negative peptide rows: {negative_peptides_df.shape[0]}")

Negative peptide rows: 1466


In [11]:
# Tab-separated protein table for the negative data; the first line of the file supplies the column names
negative_proteins_df = pd.read_csv(
    NEGATIVE_PROTEIN_SEQUENCES_FILE,
    sep="\t",
    header=0,
)
print(f"Negative protein rows: {negative_proteins_df.shape[0]}")

Negative protein rows: 405


In [12]:
# Attach the protein-level columns (gene, site, sequence) to every negative peptide via its UniProt ID.
# validate='many_to_one' makes pandas raise a MergeError if a UniProt ID occurs more than once in
# negative_proteins_df, which would otherwise silently duplicate peptide rows.
# NOTE: this cell overwrites negative_peptides_df, so re-run the cell that reads the peptides before re-running it.
n_negative_peptides_read = len(negative_peptides_df)
negative_peptides_df = negative_peptides_df.merge(negative_proteins_df, on='uniprot_id', validate='many_to_one')

# The default inner join drops peptides without a matching protein row; make that visible instead of silent
n_negative_peptides_dropped = n_negative_peptides_read - len(negative_peptides_df)
if n_negative_peptides_dropped:
    warnings.warn(f"{n_negative_peptides_dropped} negative peptide rows had no matching protein row and were dropped by the merge")

### Check for any inconsistent rows:

In [13]:
# Positive rows whose 'gene' value differs from their 'uniprot_id'; an empty frame means no such rows exist
positive_gene_id_mismatch = positive_glycosites_df['gene'] != positive_glycosites_df['uniprot_id']
positive_glycosites_df.loc[positive_gene_id_mismatch]

Unnamed: 0,gene,site,uniprot_id,sequence


In [14]:
# Positive rows flagged by the glyc_processing consistency helper, checked on the gene / uniprot_id / sequence columns.
# NOTE(review): judging by its use as a row filter, the helper returns a boolean mask per row — confirm in glyc_processing
positive_info_columns = positive_glycosites_df[['gene', 'uniprot_id', 'sequence']]
positive_rows_consistent = consistent_id_common_info('uniprot_id', positive_info_columns)
positive_glycosites_df[~positive_rows_consistent]

Unnamed: 0,gene,site,uniprot_id,sequence


In [15]:
# Negative rows whose 'gene' value differs from their 'uniprot_id'; an empty frame means no such rows exist
negative_gene_id_mismatch = negative_peptides_df['gene'] != negative_peptides_df['uniprot_id']
negative_peptides_df.loc[negative_gene_id_mismatch]

Unnamed: 0,uniprot_id,peptide_start,peptide_end,gene,site,sequence


In [16]:
# Negative rows flagged by the glyc_processing consistency helper, checked on the gene / uniprot_id / sequence columns.
# NOTE(review): judging by its use as a row filter, the helper returns a boolean mask per row — confirm in glyc_processing
negative_info_columns = negative_peptides_df[['gene', 'uniprot_id', 'sequence']]
negative_rows_consistent = consistent_id_common_info('uniprot_id', negative_info_columns)
negative_peptides_df[~negative_rows_consistent]

Unnamed: 0,uniprot_id,peptide_start,peptide_end,gene,site,sequence


### Map positive data

In [17]:
# Container that receives every annotation created in this notebook (positive sites first, negative peptides afterwards).
# NOTE(review): index_start=1 / end_exclusive=False presumably declare 1-based positions with inclusive peptide ends — confirm against ProteinSet
proteins = ProteinSet(
    allowed_aa=cf.ALLOWED_AA,
    index_start=1,
    end_exclusive=False,
    scoring_function=max_score,
)

In [18]:
# Add every positive glycosite row to `proteins` as a single-site annotation.
# Only the three columns that are needed are iterated, instead of DataFrame.iterrows(), which builds a
# Series per row, does not preserve column dtypes and yielded an index that was never used.
positive_site_rows = zip(
    positive_glycosites_df['uniprot_id'],
    positive_glycosites_df['sequence'],
    positive_glycosites_df['site'],
)
for uniprot_id, protein_seq, site in tqdm(iterable=positive_site_rows, total=len(positive_glycosites_df), desc="Annotating positive data"):
    try:
        proteins.add(
            protein_id = uniprot_id,
            protein_seq = protein_seq,
            # Each row gets a fresh random UUID as its peptide ID
            peptide_id = str(uuid.uuid4()),
            # The first character of 'site' is dropped and the remainder parsed as the position
            # (presumably a residue letter followed by the number, e.g. "S123" — verify against the TSV)
            single_site_or_unclear_start = int(site[1:]),
        )
    except AnnotationError as e:
        # Rows rejected by ProteinSet are reported and skipped; the remaining rows are still annotated
        print(e)

Annotating positive data:   0%|          | 0/2112 [00:00<?, ?it/s]

protein_id=Q8WXI7 self.peptide_id=23ca06c9-bebd-4dd2-b43b-1b20a757dcb7: idx_single_site_or_unclear_start=3213 is not an allowed AA
protein_id=Q8WXI7 self.peptide_id=3d307d2c-2faa-4b4b-9ca4-da6fa09dd472: idx_single_site_or_unclear_start=914 is not an allowed AA


### Map negative data

In [19]:
# Add every negative peptide row to `proteins` as a peptide range without a site.
# Only the four columns that are needed are iterated, instead of DataFrame.iterrows(), which builds a
# Series per row, does not preserve column dtypes and yielded an index that was never used.
negative_peptide_rows = zip(
    negative_peptides_df['uniprot_id'],
    negative_peptides_df['sequence'],
    negative_peptides_df['peptide_start'],
    negative_peptides_df['peptide_end'],
)
for uniprot_id, protein_seq, peptide_start, peptide_end in tqdm(iterable=negative_peptide_rows, total=len(negative_peptides_df), desc="Annotating negative data"):
    try:
        proteins.add(
            protein_id = uniprot_id,
            protein_seq = protein_seq,
            # Each row gets a fresh random UUID as its peptide ID
            peptide_id = str(uuid.uuid4()),
            peptide_start = peptide_start,
            peptide_end = peptide_end,
        )
    except AnnotationError as e:
        # Rows rejected by ProteinSet are reported and skipped; the remaining rows are still annotated
        print(e)

Annotating negative data:   0%|          | 0/1466 [00:00<?, ?it/s]

## Save annotations & protein sequences

In [20]:
# Create the output directory on first use; without it gzip.open fails with FileNotFoundError on a fresh data tree
ANNOTATIONS_FILE.parent.mkdir(parents=True, exist_ok=True)

# Store the complete ProteinSet as a gzip-compressed pickle (protocol 4)
with gzip.open(ANNOTATIONS_FILE, 'wb') as annotations_handle:
    pickle.dump(proteins, annotations_handle, protocol=4)

In [21]:
# Protein IDs ordered from the longest to the shortest sequence; the truncated export cell reuses this order
seq_length_sorted_protein_ids = sorted(
    proteins,
    key=lambda pid: len(proteins[pid].protein_seq),
    reverse=True,
)

# One FASTA record per protein: the ID as header, name and description left empty
full_length_records = (
    SeqRecord(Seq(proteins[pid].protein_seq), id=proteins[pid].protein_id, name='', description='')
    for pid in seq_length_sorted_protein_ids
)
with open(SEQUENCES_FILE, 'w') as fasta_handle:
    SeqIO.write(full_length_records, fasta_handle, "fasta")

In [22]:
# Same records and order as the full-length export, but every sequence is cut to its first MAX_SEQ_LENGTH residues
# (slicing past the end of a shorter sequence simply returns the whole sequence)
truncated_records = (
    SeqRecord(Seq(proteins[pid].protein_seq[:MAX_SEQ_LENGTH]), id=proteins[pid].protein_id, name='', description='')
    for pid in seq_length_sorted_protein_ids
)
with open(TRUNCATED_SEQUENCES_FILE, 'w') as fasta_handle:
    SeqIO.write(truncated_records, fasta_handle, "fasta")

In [23]:
# Wall-clock time elapsed since notebook_start_time was taken in the first cell
notebook_end_time = time.perf_counter()
elapsed_seconds = notebook_end_time - notebook_start_time
print(f"Notebook took {elapsed_seconds} seconds to run")

Notebook took 3.150583280948922 seconds to run
