### DeepLoc EDA
In this file we explore and preprocess the datasets published by HTU for the different version of DeepLoc.

In [1]:
# Library imports
from collections import defaultdict
import pandas as pd
import blosum as bl
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader

#### DeepLoc 1.0 Dataset
We explore the dataset published on the [HTU website](https://services.healthtech.dtu.dk/services/DeepLoc-1.0/). The dataset in the GitHub repo is incomplete, with ~40% less data than the dataset mentioned in their paper.

#### DeepLoc 1.0 EDA

In [2]:
# Data file for the original DeepLoc 1.0 dataset
fasta_file_path = './data/DeepLoc1.0/deeploc1.0.fasta'

# File paths for the processed csv used for training and testing
train_path = './data/DeepLoc1.0/deeploc1.0-train.csv'
test_path = './data/DeepLoc1.0/deeploc1.0-test.csv'

In [3]:
# Define the localizations as found in the fasta file and their desired mapping.
location_categories = {
    "Nucleus": ["Nucleus"],
    "Cytoplasm": ["Cytoplasm"],
    "Extracellular": ["Extracellular"],
    "Mitochondrion": ["Mitochondrion"],
    "Cell membrane": ["Cell.membrane"],
    "Endoplasmic reticulum": ["Endoplasmic.reticulum"],
    "Plastid": ["Plastid"],
    "Golgi apparatus": ["Golgi.apparatus"],
    "Lysosome/Vacuole": ["Lysosome/Vacuole"],
    "Peroxisome": ["Peroxisome"]
}

In [4]:
# Find the locations for the sequence
def normalize_location(header_line):
    header_line = header_line.lower()
    matches = set()
    for category, keywords in location_categories.items():
        for keyword in keywords:
            if keyword.lower() in header_line: # Normalize, just in case.
                matches.add(category)
    return matches

# Count the number of sequences for each location
def count_locations(fasta_file_path):
    location_counts = defaultdict(int)
    with open(fasta_file_path, 'r') as fasta_file:
        for line in fasta_file:
            if line.startswith('>'):
                matched_categories = normalize_location(line)
                for category in matched_categories:
                    location_counts[category] += 1
    return location_counts

Note: This still allows one sequence to have multiple locations.

In [5]:
location_counts = count_locations(fasta_file_path)

# Print the counts for each location
print("Location counts:")
for location in location_categories:
    print(f"{location}: {location_counts[location]}")

Location counts:
Nucleus: 4189
Cytoplasm: 2688
Extracellular: 1973
Mitochondrion: 1510
Cell membrane: 1340
Endoplasmic reticulum: 862
Plastid: 757
Golgi apparatus: 356
Lysosome/Vacuole: 321
Peroxisome: 154


We now count only the sequences which have at most one localization in the dataset.
> Note: All the sequences with more than one localization had the following localizations: Nucleus and Cytoplasm 

In [6]:
# Count the number of sequences that only have one location
def count_single_location_only(fasta_file_path):
    location_counts = defaultdict(int)
    with open(fasta_file_path, 'r') as fasta_file:
        for line in fasta_file:
            if line.startswith('>'):
                matched_categories = normalize_location(line)
                if len(matched_categories) == 1:
                    category = next(iter(matched_categories))
                    location_counts[category] += 1
    return location_counts

In [7]:
location_counts = count_single_location_only(fasta_file_path)
total_count = 0
# Print the counts for each location
print("Location counts:")
for location in location_categories:
    total_count += location_counts[location]
    print(f"{location}: {location_counts[location]}")
print(f"Total: {total_count}")

Location counts:
Nucleus: 4043
Cytoplasm: 2542
Extracellular: 1973
Mitochondrion: 1510
Cell membrane: 1340
Endoplasmic reticulum: 862
Plastid: 757
Golgi apparatus: 356
Lysosome/Vacuole: 321
Peroxisome: 154
Total: 13858


> We see that we end up with a total of 13858 sequences, this is exactly the same amount as in DeepLoc 1.0 (As per Section 2.3.1)

In [8]:
# This function will convert the fasta file to a pandas dataframe
# The dataframe will have the following columns:
# - sequence: the protein sequence
# - Membrane: 1 if the protein is a membrane protein, 0 otherwise
# - test: 1 if the protein is a test protein, 0 otherwise
# - location: the location of the protein (e.g. Nucleus, Cytoplasm, etc.) in one-hot encoding

def fasta_to_dataframe(fasta_file_path):
    rows = []
    current_header = ""
    current_seq = ""

    with open(fasta_file_path, 'r') as f:
        for line in f:
            line = line.strip()
            if line.startswith('>'):
                if current_header and current_seq:
                    row = parse_fasta_entry(current_header, current_seq)
                    if row:
                        rows.append(row)
                current_header = line
                current_seq = ""
            else:
                current_seq += line

        # Save last entry
        if current_header and current_seq:
            row = parse_fasta_entry(current_header, current_seq)
            if row:
                rows.append(row)

    df = pd.DataFrame(rows)
    return df


def parse_fasta_entry(header, sequence):
    """ Parse a single FASTA entry and return a dictionary with relevant information per entry if the location is valid. """
    # Check if the header contains a valid location
    location = extract_single_location(header)
    if location is None:
        return None  # Skip if no valid or ambiguous location

    is_membrane = "-M" in header
    is_test = "test" in header.lower()

    row = {
        "Sequence": sequence,
        "Membrane": int(is_membrane),
        "test": is_test
    }

    # One-hot encoding of location
    for cat in location_categories.keys():
        row[cat] = int(cat == location)

    return row


def extract_single_location(header):
    """ Extract a single location from the header. If more than one location is found, return None."""
    matched = []

    for category, substrings in location_categories.items():
        for s in substrings:
            if s in header:
                matched.append(category)
                break  # Only match once per category

    if len(matched) == 1:
        return matched[0]  # Return only if there's exactly one match
    return None


In [9]:
# Examine the first few rows of the newly created dataframe
df = fasta_to_dataframe(fasta_file_path)
print(df.head())


                                            Sequence  Membrane   test  \
0  MGLPVSWAPPALWVLGCCALLLSLWALCTACRRPEDAVAPRKRARR...         1   True   
1  MEVLEEPAPGPGGADAAERRGLRRLLLSGFQEELRALLVLAGPAFL...         1  False   
2  MMKTLSSGNCTLNVPAKNSYRMVVLGASRVGKSSIVSRFLNGRFED...         1  False   
3  MAKRTFSNLETFLIFLLVMMSAITVALLSLLFITSGTIENHKDLGG...         1  False   
4  MGNCQAGHNLHLCLAHHPPLVCATLILLLLGLSGLGLGSFLLTHRT...         1  False   

   Nucleus  Cytoplasm  Extracellular  Mitochondrion  Cell membrane  \
0        0          0              0              0              1   
1        0          0              0              0              1   
2        0          0              0              0              1   
3        0          0              0              0              1   
4        0          0              0              0              1   

   Endoplasmic reticulum  Plastid  Golgi apparatus  Lysosome/Vacuole  \
0                      0        0                0                 0

In [10]:
# We verify that the dataframe has the correct number of rows
total_sequences = df.shape[0]
print(f"Total sequences: {total_sequences}")

Total sequences: 13858


> Note: In the paper, the authors mention that 9,98% of the sequences had length longer than 1000 and had to be truncated. We found that this is most likely a typo as the actual number is 9,907%
> 
> The authors mention nothing of the ~90% sequences that are shorter than 1000, yet the plots in their paper do not show any sequences shorter than 1000. We follow their approach and pad the sequences from the middle, and generate masks for the attention mechanism. The authors only mention this padding in Figure 3 and nowhere else.

In [11]:
# 90 % of sequences are shorter than 1000
# 9,907 sequences are longer than 1000
def count_short_sequences(df, max_len=1000):
    return df['Sequence'].str.len().lt(max_len).sum()
short_count = count_short_sequences(df)
print(f"Number of sequences shorter than 1000: {short_count}")

Number of sequences shorter than 1000: 12481


In [12]:
MAX_LEN = 1000
PAD_CHAR = '-'

# Remove the center of the sequence, keeping the first and last 500.
def truncate_sequence(seq, max_len=1000):
    if len(seq) <= max_len:
        return seq
    half = max_len // 2
    return seq[:half] + seq[-half:]

# Pad the sequence in the center with a specified character
def pad_middle(seq, max_len=MAX_LEN, pad_char=PAD_CHAR):
    pad_total = max_len - len(seq)
    half_idx = len(seq) // 2
    return seq[:half_idx] + (pad_char * pad_total) + seq[half_idx:] 

# Prepare the sequence for the the processed training dataset
def prepare_sequence(seq, max_len=MAX_LEN, pad_char=PAD_CHAR):
    if len(seq) > max_len:
        return truncate_sequence(seq, max_len)
    elif len(seq) < max_len:
        return pad_middle(seq, max_len, pad_char)
    else:
        return seq  # already length 1000

# Generate a mask for the sequence
def generate_mask(seq, pad_char=PAD_CHAR):
    """ The mask is a string of 0s and 1s, where 1 indicates the presence of an amino acid and 0 indicates a padding character """
    return ''.join(['0' if aa == pad_char else '1' for aa in seq])

In [13]:
# Create a new column with truncated sequences
df['PaddedSequence'] = df['Sequence'].apply(prepare_sequence)
df['Mask'] = df['PaddedSequence'].apply(generate_mask)

In [14]:
print(df[df['PaddedSequence'].str.len() == 1000].shape)

(13858, 15)


In [15]:
df.head()

Unnamed: 0,Sequence,Membrane,test,Nucleus,Cytoplasm,Extracellular,Mitochondrion,Cell membrane,Endoplasmic reticulum,Plastid,Golgi apparatus,Lysosome/Vacuole,Peroxisome,PaddedSequence,Mask
0,MGLPVSWAPPALWVLGCCALLLSLWALCTACRRPEDAVAPRKRARR...,1,True,0,0,0,0,1,0,0,0,0,0,MGLPVSWAPPALWVLGCCALLLSLWALCTACRRPEDAVAPRKRARR...,1111111111111111111111111111111111111111111111...
1,MEVLEEPAPGPGGADAAERRGLRRLLLSGFQEELRALLVLAGPAFL...,1,False,0,0,0,0,1,0,0,0,0,0,MEVLEEPAPGPGGADAAERRGLRRLLLSGFQEELRALLVLAGPAFL...,1111111111111111111111111111111111111111111111...
2,MMKTLSSGNCTLNVPAKNSYRMVVLGASRVGKSSIVSRFLNGRFED...,1,False,0,0,0,0,1,0,0,0,0,0,MMKTLSSGNCTLNVPAKNSYRMVVLGASRVGKSSIVSRFLNGRFED...,1111111111111111111111111111111111111111111111...
3,MAKRTFSNLETFLIFLLVMMSAITVALLSLLFITSGTIENHKDLGG...,1,False,0,0,0,0,1,0,0,0,0,0,MAKRTFSNLETFLIFLLVMMSAITVALLSLLFITSGTIENHKDLGG...,1111111111111111111111111111111111111111111111...
4,MGNCQAGHNLHLCLAHHPPLVCATLILLLLGLSGLGLGSFLLTHRT...,1,False,0,0,0,0,1,0,0,0,0,0,MGNCQAGHNLHLCLAHHPPLVCATLILLLLGLSGLGLGSFLLTHRT...,1111111111111111111111111111111111111111111111...


In [16]:
df.describe(include="all")


Unnamed: 0,Sequence,Membrane,test,Nucleus,Cytoplasm,Extracellular,Mitochondrion,Cell membrane,Endoplasmic reticulum,Plastid,Golgi apparatus,Lysosome/Vacuole,Peroxisome,PaddedSequence,Mask
count,13858,13858.0,13858,13858.0,13858.0,13858.0,13858.0,13858.0,13858.0,13858.0,13858.0,13858.0,13858.0,13858,13858
unique,13804,,2,,,,,,,,,,,13804,957
top,MRLPVTVKATKPSFLVIWIRYSSAASSPTVSLNPSGRLQQTLAGSV...,,False,,,,,,,,,,,MRLPVTVKATKPSFLVIWIRYSSAASSPTVSLNPSGRLQQTLAGSV...,1111111111111111111111111111111111111111111111...
freq,5,,11085,,,,,,,,,,,5,1377
mean,,0.256386,,0.291745,0.183432,0.142373,0.108962,0.096695,0.062202,0.054625,0.025689,0.023164,0.011113,,
std,,0.436653,,0.454582,0.387034,0.349445,0.311603,0.295553,0.241531,0.227256,0.158212,0.150428,0.104833,,
min,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,
25%,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,
50%,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,
75%,,1.0,,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,


> Note: We see that various sequences in the dataset are actually repeated, with a sequence even having 5 repetitions in the dataset. We do not do anything about this to stay as close as possible to the original paper. 

In [17]:
# Split based on the 'test' column
train_df = df[df["test"] == False]
test_df = df[df["test"] == True]

# Save to CSV files
train_df.to_csv(train_path, index=False)
test_df.to_csv(test_path, index=False)

print(f"Saved {len(train_df)} training sequences to deeploc1.0-train.csv")
print(f"Saved {len(test_df)} test sequences to deeploc1.0-test.csv")


Saved 11085 training sequences to deeploc1.0-train.csv
Saved 2773 test sequences to deeploc1.0-test.csv


#### DeepLoc 2.0/2.1 EDA

To see how the model performs with more (current) data, we also used the dataset that DeepLoc 2.0/2.1 offers. This data was sourced from the [HTU website](https://services.healthtech.dtu.dk/services/DeepLoc-2.0/) We only used the training/validation set and used the validation set as our training set as we will not perform extensive hyperparameter tuning.

> Note: We follow a similar processing approach as in DeepLoc 1.0 to stay as true as possible to the original paper with this new dataset.

In [18]:
csv_path = './data/DeepLoc2.0/deeploc_data_2.1.csv'
train_path = './data/DeepLoc2.0/deeploc2.1_training_processed.csv'
test_path = './data/DeepLoc2.0/deeploc2.1_test_processed.csv'

In [19]:
# Count the number of sequences for each location in the CSV file
df = pd.read_csv(csv_path, delimiter=',')

# Columns corresponding to subcellular locations/columns in the CSV file
location_columns = [
        "Cytoplasm",
        "Nucleus",
        "Extracellular",
        "Cell membrane",
        "Mitochondrion",
        "Plastid",
        "Endoplasmic reticulum",
        "Lysosome/Vacuole",
        "Golgi apparatus",
        "Peroxisome"
        ]

def count_locations(df):
    """ Count the number of sequences for each location in the CSV file. """
    # Count occurrences
    location_counts = df[location_columns].sum().astype(int)

    print("Location counts:")
    for location, count in location_counts.items():
        print(f"{location}: {count}")
    total = location_counts.sum()
    print(f"Total Number of sequences in dataset: {total}")


count_locations(df)


Location counts:
Cytoplasm: 9870
Nucleus: 9720
Extracellular: 3301
Cell membrane: 4187
Mitochondrion: 2590
Plastid: 1047
Endoplasmic reticulum: 2180
Lysosome/Vacuole: 1496
Golgi apparatus: 1279
Peroxisome: 304
Total Number of sequences in dataset: 35974


> Note: We end up with 35974 sequences, this is the same number of sequences as the authors in Deeploc 2.0 as mentioned in their Supplementary Table S2. Unlike DeepLoc 1.0, DeepLoc 2.0 performs multi-label localization prediction. Therefore, we filter out any sequences with more than one localization for our specific use-case.

In [20]:
# Filter out sequences with multiple locations
def count_locations_single_label(df):

    # Filter: keep only rows with exactly 1 location
    df_single_label = df[df[location_columns].sum(axis=1) == 1]

    # Count occurrences in single-label data
    location_counts = df_single_label[location_columns].sum().astype(int)

    print("Location counts (single-label only):")
    for location, count in location_counts.items():
        print(f"{location}: {count}")
    print(f"Total: {location_counts.sum()}")

    return df_single_label

# Returns a filtered DataFrame with sequences that have only one location
df_filtered = count_locations_single_label(df)

Location counts (single-label only):
Cytoplasm: 4582
Nucleus: 5462
Extracellular: 2944
Cell membrane: 2770
Mitochondrion: 2062
Plastid: 932
Endoplasmic reticulum: 1331
Lysosome/Vacuole: 761
Golgi apparatus: 556
Peroxisome: 219
Total: 21619


In [21]:
counts = df_filtered['Membrane'].value_counts()
# Print the counts for each Membrane type
print("Membrane type counts:")
for location, count in counts.items():
    print(f"{location}: {count}")

Membrane type counts:
0.0: 15261
1.0: 6358


> Note: This dataset is used for k-fold cross-validation. Meaning that over K iterations, K-1 are used for training and 1 is used for validation. Again, as we do not perform any extensive hyperparameter tuning, we will simply use their validation set as our test set. This is a valid approach as the authors have already perform the homology-based partioning across different folds, ensuring <30% homology.

In [22]:
# Ensure a maximum protein length of 1000
df[df['Sequence'].str.len() > 1000].shape

(3186, 16)

We see that 3186 samples have to be truncated. We do this as they suggest in DeepLoc 1.0, by taking out amino-acids from the center of the sequence.

Finally, amino acids shorter than length 1000 must be padded to length 1000. Additionally, a mask must be generated to ensure the attention mechanism ignores padding.

In [23]:
MAX_LEN = 1000
PAD_CHAR = '-'

# Remove the center of the sequence, keeping the first and last 500.
def truncate_sequence(seq, max_len=1000):
    if len(seq) <= max_len:
        return seq
    half = max_len // 2
    return seq[:half] + seq[-half:]

# Pad the sequence in the center with a specified character
def pad_middle(seq, max_len=MAX_LEN, pad_char=PAD_CHAR):
    pad_total = max_len - len(seq)
    half_idx = len(seq) // 2
    return seq[:half_idx] + (pad_char * pad_total) + seq[half_idx:] 

# Prepare the sequence for the the processed training dataset
def prepare_sequence(seq, max_len=MAX_LEN, pad_char=PAD_CHAR):
    if len(seq) > max_len:
        return truncate_sequence(seq, max_len)
    elif len(seq) < max_len:
        return pad_middle(seq, max_len, pad_char)
    else:
        return seq  # already length 1000

# Generate a mask for the sequence
def generate_mask(seq, pad_char=PAD_CHAR):
    """ The mask is a string of 0s and 1s, where 1 indicates the presence of an amino acid and 0 indicates a padding character """
    return ''.join(['0' if aa == pad_char else '1' for aa in seq])

In [24]:
# Create a new column with truncated sequences
df_filtered['PaddedSequence'] = df_filtered['Sequence'].apply(prepare_sequence)
df_filtered['Mask'] = df_filtered['PaddedSequence'].apply(generate_mask)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['PaddedSequence'] = df_filtered['Sequence'].apply(prepare_sequence)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['Mask'] = df_filtered['PaddedSequence'].apply(generate_mask)


In [25]:
# Save to a new CSV
df_filtered.to_csv(train_path, index=False)
# Filter by partition
df_train = df_filtered[df_filtered['Partition'] != 4]  # Partitions 0–3 for training/validation
df_test = df_filtered[df_filtered['Partition'] == 4]       # Partition 4 for testing

# Save to separate CSV files
df_train.to_csv(train_path, index=False)
df_test.to_csv(test_path, index=False)

In [26]:
# Examine the newly generated CSV files
df_processed = pd.read_csv(train_path, delimiter=',')
print(df_processed[df_processed['Sequence'].str.len() == 1000].shape)
print(df_processed[df_processed['PaddedSequence'].str.len() == 1000].shape)
print(df_processed[df_processed['Mask'].str.len() == 1000].shape)
print(df_processed.shape)

(4, 18)
(17351, 18)
(17351, 18)
(17351, 18)


##### Converting the sequence to a feature vector using BLOSUM-62

In [27]:
matrix = bl.BLOSUM(62, default=0)
print(matrix)

# Get all the keys in the dictionary and convert them to a list
matrix_keys = list(matrix.keys())

print(matrix_keys)
print(len(matrix_keys))

BLOSUM(62, default=0, {'A': defaultdict(<function BLOSUM.__init__.<locals>.<lambda> at 0x000001B72E13EA70>, {'A': 4.0, 'R': -1.0, 'N': -2.0, 'D': -2.0, 'C': 0.0, 'Q': -1.0, 'E': -1.0, 'G': 0.0, 'H': -2.0, 'I': -1.0, 'L': -1.0, 'K': -1.0, 'M': -1.0, 'F': -2.0, 'P': -1.0, 'S': 1.0, 'T': 0.0, 'W': -3.0, 'Y': -2.0, 'V': 0.0, 'B': -2.0, 'J': -1.0, 'Z': -1.0, 'X': -1.0, '*': -4.0}), 'R': defaultdict(<function BLOSUM.__init__.<locals>.<lambda> at 0x000001B72E13D510>, {'A': -1.0, 'R': 5.0, 'N': 0.0, 'D': -2.0, 'C': -3.0, 'Q': 1.0, 'E': 0.0, 'G': -2.0, 'H': 0.0, 'I': -3.0, 'L': -2.0, 'K': 2.0, 'M': -1.0, 'F': -3.0, 'P': -2.0, 'S': -1.0, 'T': -1.0, 'W': -3.0, 'Y': -2.0, 'V': -3.0, 'B': -1.0, 'J': -2.0, 'Z': 0.0, 'X': -1.0, '*': -4.0}), 'N': defaultdict(<function BLOSUM.__init__.<locals>.<lambda> at 0x000001B730CB9480>, {'A': -2.0, 'R': 0.0, 'N': 6.0, 'D': 1.0, 'C': -3.0, 'Q': 0.0, 'E': 0.0, 'G': 0.0, 'H': 1.0, 'I': -3.0, 'L': -3.0, 'K': 0.0, 'M': -2.0, 'F': -3.0, 'P': -2.0, 'S': 1.0, 'T': 0.0, '

In [28]:
# Remove B, Z, X, J, * from the keys list as they are not standard amino acids and DeepLoc 1.0 only has 20 features.
matrix_keys = [key for key in matrix_keys if key not in ['B', 'Z', 'X', 'J', '*']]
print(len(matrix_keys))
print(matrix_keys)

20
['A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I', 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V']


In [29]:
def encode_sequence_with_blosum(seq):
    """
    Encode a sequence using BLOSUM62, applying the mask.
    Returns: (max_len x 20) numpy array
    """
    encoded = np.zeros((MAX_LEN, len(matrix_keys)), dtype=np.float32)
    for i, aa in enumerate(seq):
        encoded[i] = [matrix[aa][other_aa] for other_aa in matrix_keys]
        # We ensured the default was 0, so no need to use the mask.
    return encoded

In [30]:
# Test the encoding function
features = encode_sequence_with_blosum("ACDXYZ--")
print(features)

[[ 4. -1. -2. ... -3. -2.  0.]
 [ 0. -3. -3. ... -2. -2. -1.]
 [-2. -2.  1. ... -4. -3. -3.]
 ...
 [ 0.  0.  0. ...  0.  0.  0.]
 [ 0.  0.  0. ...  0.  0.  0.]
 [ 0.  0.  0. ...  0.  0.  0.]]


##### Creating a dataset which encodes sequences & returns batches for training

In [31]:
# Custom dataset class for DeepLoc
class DeepLocDataset(Dataset):
    def __init__(self, df, label_columns, matrix):
        self.sequences = df['PaddedSequence'].values # Fetch the processed sequences
        self.masks = df['Mask'].values # Fetch the masks
        self.labels = df[label_columns].values.astype(np.float32) # Convert labels to float32
        self.matrix = matrix # BLOSUM matrix
        self.max_len = MAX_LEN
        self.matrix_keys = list(matrix.keys())
        self.matrix_keys = [key for key in self.matrix_keys if key not in ['B', 'Z', 'X', 'J', '*']]
        self.num_features = len(self.matrix_keys)

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        seq = self.sequences[idx]
        mask = self.masks[idx]
        label = self.labels[idx]
        encoded_seq = encode_sequence_with_blosum(seq)
        mask_tensor = torch.tensor([int(m) for m in mask], dtype=torch.float32)
        return torch.tensor(encoded_seq), torch.tensor(label), mask_tensor # Return the encoded sequence, label, and mask tensor

In [32]:
# Create Dataset and DataLoader
dataset = DeepLocDataset(df_filtered, location_columns, matrix)
loader = DataLoader(dataset, batch_size=32, shuffle=True)

In [33]:
# Test the DataLoader
for x_batch, y_batch, mask_batch in loader:
    print("x_batch shape:", x_batch.shape) # x_batch: (B, 1000, 20)
    print("y_batch shape:", y_batch.shape) # y_batch: (B, 10)
    print("mask_batch shape:", mask_batch.shape) # mask_batch: (B, 1000)
    break


x_batch shape: torch.Size([32, 1000, 20])
y_batch shape: torch.Size([32, 10])
mask_batch shape: torch.Size([32, 1000])
