In [1]:
# Record a start timestamp; the last cell subtracts it from an end timestamp to report total runtime
import time
notebook_start_time = time.perf_counter()

# GalNAc data processing

## Imports

### Built-in imports

In [2]:
import gzip
import math
import os
import pickle
import re
import warnings
from pathlib import Path

### Shared library imports

In [3]:
from glyc_processing import cf
from glyc_processing.data_formats.galnac.config import GalNAcConfig

# UniProt helpers used by the "Fetch UniProt sequences" cell (each name imported exactly once)
from glyc_processing.uniprot import get_entry_isoforms_dicts, get_uniprot_entries, get_uniprot_isoforms
from glyc_processing.annotation import ProteinSet, AnnotationError, mean_counts_score

### External imports

In [4]:
from IPython.display import display
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from tqdm.auto import tqdm
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

## Paths & Constants

In [25]:
# Root data directory. Defaults to the path this notebook was developed with; set the
# GALNAC_BASE_DIR environment variable to point at another location
# (e.g. "/mnt/g/My Drive/CloudVault/Masters/Data") without editing this cell.
# An unset or empty variable falls back to the default.
BASE_DIR = Path(os.environ.get("GALNAC_BASE_DIR") or "/home/jakob/Cloudvault_new/Data")

# Shared parent directory of every GalNAc input/output file below
GALNAC_DATA_DIR = BASE_DIR/'NetOGlyc5 data'/'GalNAc data'

# Path of cleaned data file
CLEAN_POSITIVE_DATA_FILE = GALNAC_DATA_DIR/'02-GalNAC_processing'/'clean_data.xlsx'

# Path of cleaned negative data file
CLEAN_NEGATIVE_DATA_FILE = GALNAC_DATA_DIR/'03-PRIDE_processing'/'clean_data.xlsx'

# Path of exported annotations file
ANNOTATIONS_FILE = GALNAC_DATA_DIR/'04-annotation'/'protein_annotations.pkl.gz'

# Paths of exported sequences fasta files (full-length and truncated to MAX_SEQ_LENGTH)
SEQUENCES_FILE = GALNAC_DATA_DIR/'04-annotation'/'protein_sequences.fasta'
TRUNCATED_SEQUENCES_FILE = GALNAC_DATA_DIR/'04-annotation'/'protein_sequences_truncated.fasta'

# Maximum sequence length that can be embedded in one go
# 1022 for BERT/ALBERT (ESM-1b / ProtBert wo. start/end tokens), No limit for XLNet
MAX_SEQ_LENGTH = 1022

In [6]:
# Hand GalNAcConfig to glyc_processing's shared `cf` object
# NOTE(review): presumably this must run before the cf.* overrides in the next cell — confirm
cf.use_config(GalNAcConfig)

In [7]:
# The temp folder is used to organize data-specific temporary files
cf.TEMP_DIR = BASE_DIR/'uniprot'/'GalNAc_PRIDE'

# UniProt release downloads directory path (make sure you have a few GB of free space)
cf.UNIPROT_DOWNLOADS_DIR = BASE_DIR/'uniprot'

# The amino acids that are allowed to be glycosylated
cf.ALLOWED_AA = ('S','T','Y')

# The UniProt release can be 'latest' for the current release, or any release in YYYY_MM format listed at:
# https://ftp.uniprot.org/pub/databases/uniprot/previous_releases/
# Warning: UniProt only keeps previous releases other than the first of each year for 2 years,
# so only use the first yearly release (2015_01, 2021_01 etc.) for reproducibility!
cf.UNIPROT_RELEASE = '2021_01'

# If True, existing data-specific temp files are ignored and recreated from scratch
# This should be used if the data or script has changed
cf.IGNORE_EXISTING_FILES = True

In [8]:
# Show the release identifier exposed as cf.TRUE_UNIPROT_RELEASE
# NOTE(review): presumably the concrete release that cf.UNIPROT_RELEASE resolves to (relevant for 'latest') — confirm
print(f"Using Uniprot Release {cf.TRUE_UNIPROT_RELEASE}")

Using Uniprot Release 2021_01


## Map data to standard annotation format

### Read positive & negative data

In [9]:
# Load the cleaned positive table; row 0 of the sheet supplies the column names
clean_positive_df = pd.read_excel(
    io=CLEAN_POSITIVE_DATA_FILE,
    header=0,
)
print(f"Positive data rows: {len(clean_positive_df)}")

Positive data rows: 288257


In [10]:
# Load the cleaned negative table; row 0 of the sheet supplies the column names
clean_negative_df = pd.read_excel(
    io=CLEAN_NEGATIVE_DATA_FILE,
    header=0,
)
print(f"Negative data rows: {len(clean_negative_df)}")

Negative data rows: 12682


### Fetch UniProt sequences

In [11]:
# Union of the 'uniprot' identifiers referenced by the positive and the negative table
positive_uniprot_ids = set(clean_positive_df['uniprot'].unique())
negative_uniprot_ids = set(clean_negative_df['uniprot'].unique())
unique_uniprot_ids = positive_uniprot_ids | negative_uniprot_ids

# Extract entries and isoforms for those identifiers, then fetch the two lookups
# (judging by the names: entry -> isoforms and isoform -> sequence)
get_uniprot_entries(unique_uniprot_ids)
get_uniprot_isoforms(unique_uniprot_ids)
entry_isoforms, isoform_seqs = get_entry_isoforms_dicts()
print(f"Uniprot sequences - Number of entries: {len(entry_isoforms)}, Number of isoforms: {len(isoform_seqs)}")

Extracting data-specific Uniprot entries from /home/jakob/Cloudvault_new/Data/uniprot/uniprot_sprot2021_01.dat…

Extracting data-specific Uniprot entry isoforms from /home/jakob/Cloudvault_new/Data/uniprot/uniprot_sprot_var…

Uniprot sequences - Number of entries: 4995, Number of isoforms: 12293


### Check whether any proteins contain uncommon amino acids, and replace them with X

In [12]:
# Peptides from the clean datasets are assumed to contain only X (unknown) as an uncommon residue.
# Any B/J/O/U/Z in a protein sequence is therefore rewritten to X, so peptide and protein agree.
uncommon_aa_pattern = re.compile(r"[BJOUZ]")

# First pass: collect the isoforms whose sequence contains at least one uncommon residue
uncommon_aa_isoforms = set()
for isoform in isoform_seqs:
    if uncommon_aa_pattern.search(isoform_seqs[isoform]):
        uncommon_aa_isoforms.add(isoform)

# Second pass: rewrite only the affected sequences in place
for isoform in uncommon_aa_isoforms:
    isoform_seqs[isoform] = uncommon_aa_pattern.sub("X", isoform_seqs[isoform])

print("Replaced uncommon amino acids in the following isoforms with X:")
print(uncommon_aa_isoforms)

Replaced uncommon amino acids in the following isoforms with X:
{'P49908', 'Q9NNW7-4', 'Q9NNW7', 'Q9NNW7-2', 'Q9NNW7-3', 'Q9C0D9', 'Q8WWX9', 'Q9BQE4'}


### Map positive data

In [13]:
# Container that both annotation loops below add to.
# Built with index_start=1 and end_exclusive=False, restricted to cf.ALLOWED_AA,
# and using mean_counts_score as its scoring function.
proteins = ProteinSet(
    allowed_aa=cf.ALLOWED_AA,
    index_start=1,
    end_exclusive=False,
    scoring_function=mean_counts_score,
)

In [14]:
# Add every positive (GalNAc) row to `proteins`. A row that raises AnnotationError is
# printed and skipped; any other exception still propagates.
positive_rows = tqdm(iterable=clean_positive_df.iterrows(), total=len(clean_positive_df), desc=f"Annotating GalNAc data")
for i, row in positive_rows:
    try:
        positive_annotation = dict(
            protein_id=row['uniprot'],
            protein_seq=isoform_seqs[row['uniprot']],
            peptide_id=row['peptide_id'],
            # A known single site wins; otherwise fall back to the start of the unclear range
            single_site_or_unclear_start=(
                row['single_site'] if pd.notna(row['single_site']) else row['unclear_site_start']
            ),
            unclear_site_end=row['unclear_site_end'],
            protein_annotations=None,
            peptide_start=row['peptide_start'],
            peptide_end=row['peptide_end'],
            peptide_annotations={'source': row['source'], 'dataset': row['dataset']},
            site_annotations={'site_composition': row['site_composition']},
        )
        proteins.add(**positive_annotation)
    except AnnotationError as e:
        print(e)

Annotating GalNAc data:   0%|          | 0/288257 [00:00<?, ?it/s]

### Map negative data

In [15]:
# Add every negative (PRIDE) row to `proteins`; these rows carry peptide-level counts only,
# no site information. A row that raises AnnotationError is printed and skipped.
negative_rows = tqdm(iterable=clean_negative_df.iterrows(), total=len(clean_negative_df), desc=f"Annotating PRIDE data")
for i, row in negative_rows:
    try:
        negative_annotation = dict(
            protein_id=row['uniprot'],
            protein_seq=isoform_seqs[row['uniprot']],
            peptide_id=row['peptide_id'],
            peptide_start=row['peptide_start'],
            peptide_end=row['peptide_end'],
            peptide_annotations={
                'psmsCount': row['psmsCount'],
                'projectAccessionsCount': row['projectAccessionsCount'],
            },
        )
        proteins.add(**negative_annotation)
    except AnnotationError as e:
        print(e)

Annotating PRIDE data:   0%|          | 0/12682 [00:00<?, ?it/s]

## Save annotations & protein sequences

In [16]:
# Serialize the whole ProteinSet as a gzip-compressed pickle (protocol 4)
with gzip.open(ANNOTATIONS_FILE, mode='wb') as annotations_handle:
    pickle.Pickler(annotations_handle, protocol=4).dump(proteins)

In [17]:
# Protein IDs ordered from longest to shortest sequence (stable for equal lengths);
# the truncated export cell reuses this ordering.
seq_length_sorted_protein_ids = list(proteins)
seq_length_sorted_protein_ids.sort(
    key=lambda protein_id: len(proteins[protein_id].protein_seq),
    reverse=True,
)

# Write one FASTA record per protein: ID as the header, empty name and description
with open(SEQUENCES_FILE, 'w') as fasta_handle:
    sorted_proteins = (proteins[protein_id] for protein_id in seq_length_sorted_protein_ids)
    full_length_records = (
        SeqRecord(Seq(protein.protein_seq), protein.protein_id, '', '')
        for protein in sorted_proteins
    )
    SeqIO.write(full_length_records, fasta_handle, "fasta")

In [26]:
# Same export as the full-length FASTA, but each sequence is cut to its first
# MAX_SEQ_LENGTH residues (slicing leaves shorter sequences unchanged)
with open(TRUNCATED_SEQUENCES_FILE, 'w') as truncated_handle:
    sorted_proteins = (proteins[protein_id] for protein_id in seq_length_sorted_protein_ids)
    truncated_records = (
        SeqRecord(Seq(protein.protein_seq[:MAX_SEQ_LENGTH]), protein.protein_id, '', '')
        for protein in sorted_proteins
    )
    SeqIO.write(truncated_records, truncated_handle, "fasta")

In [18]:
# Take the end timestamp and report wall-clock seconds elapsed since notebook_start_time (set in the first cell)
notebook_end_time = time.perf_counter()
print(f"Notebook took {notebook_end_time-notebook_start_time} seconds to run")

Notebook took 197.1336052720003 seconds to run
