In [1]:
import os

# Input spectra file (MGF) that every later cell reads from.
mgf_file = "/mnt/data/cdens/casanovo-scaling/massivekb_data/massiveKB_3cac0386.mgf"

# Directory passed as `cache_dir` to the later cells; created up front so it
# exists on a fresh checkout (a no-op when it is already there).
cache_dir = "massiveKB_3cac0386_charge"
os.makedirs(name=cache_dir, exist_ok=True)

In [16]:
from analysis import create_sub_mgf

# Create the sub file and set the mgf file to the sub mgf
# mgf_file = create_sub_mgf(mgf_file, cache_dir, num_spectra=10000)
# Also change the cache dir so we don't overwrite the full MGF cache
# cache_dir = "massiveKB_3cac0386_sub"


In [2]:
from analysis import create_sequence_index

# Build (or presumably load from `cache_dir` — confirm in analysis.py) the
# per-spectrum sequence index; `total` is the expected spectrum count (66 million).
df = create_sequence_index(
    mgf_file,
    cache_dir,
    total=66_000_000,
)

# Distinct unmodified peptide sequences, in order of first appearance.
unique_sequences = [*df["unmodified_sequence"].unique()]

In [4]:
from dist_matrix import square_dist_matrix
import random
import numpy as np

# Number of unique sequences sampled for the DataSAIL split; also part of the
# cache file name, so changing it creates a separate cache entry.
n_datasail = 100000
# Fixed seed so that a fresh (uncached) run always draws the same subset.
sample_seed = 42

random_seqs_cache_file = os.path.join(cache_dir, f"sequences_{n_datasail}.npy")
if os.path.exists(random_seqs_cache_file):
    # NOTE(security): allow_pickle=True can execute arbitrary code when loading
    # an untrusted file. It is kept so existing caches stay loadable; only load
    # cache files that were written by this notebook.
    # np.load returns an ndarray whereas the sampling branch yields a list;
    # convert to a list so downstream cells see the same type on both paths.
    random_seqs = np.load(random_seqs_cache_file, allow_pickle=True).tolist()
else:
    # Seeded, private RNG: reproducible and independent of the global `random` state.
    random_seqs = random.Random(sample_seed).sample(unique_sequences, n_datasail)
    np.save(random_seqs_cache_file, random_seqs, allow_pickle=True)

# Square distance matrix over the sampled sequences (one row/column per sequence).
dist_matrix = square_dist_matrix(random_seqs, cache_dir, n_threads=64)
dist_matrix

100%|██████████| 99999/99999 [03:32<00:00, 471.35it/s] 


array([[ 0, 87, 77, ..., 82, 87, 75],
       [87,  0, 86, ..., 82, 91, 83],
       [77, 86,  0, ..., 86, 81, 83],
       ...,
       [82, 82, 86, ...,  0, 91, 87],
       [87, 91, 81, ..., 91,  0, 75],
       [75, 83, 83, ..., 87, 75,  0]], dtype=uint8)

In [5]:
from analysis import split_datasail, get_train_val_test

# Run DataSAIL on the sampled sequences with a 90/5/5 train/val/test target.
split_df = split_datasail(
    random_seqs,
    dist_matrix,
    [0.9, 0.05, 0.05],
    e_clusters=200,
    epsilon=0.1,
    threads=64,
    cache_dir=cache_dir,
)
train, val, test = get_train_val_test(split_df)

# Report the absolute and relative size of each split.
total = len(random_seqs)
print(f"\nDatasail produced following splits: (total={total})")
for label, part in (("TRAIN:", train), ("VAL:  ", val), ("TEST: ", test)):
    print(f"{label} {len(part)} ({len(part) * 100 / total:.2f}%)")

2025-10-13 09:53:27,149 Validating arguments
2025-10-13 09:53:27,150 Read data
2025-10-13 09:53:27,171 Cluster first set of entities.
2025-10-13 09:53:27,472 Cluster 100000 items based on distances
2025-10-13 09:54:32,692 Clustering based on distances. Distances above 75.52977484446 cannot end up in same cluster.
2025-10-13 10:02:12,964 Clustering finished
2025-10-13 10:47:54,748 Reduced number of clusters to 200.


KeyboardInterrupt: 

In [None]:
import matplotlib.pyplot as plt

# One sequence-length histogram per split of the DataSAIL sample. Each figure is
# titled and labelled so the three plots can be told apart when skimming.
for split_name, split_part in (("train", train), ("val", val), ("test", test)):
    fig, ax = plt.subplots()
    ax.hist(split_part['sequence'].str.len())
    ax.set(
        title=f"Sequence lengths, DataSAIL sample ({split_name})",
        xlabel="Sequence length",
        ylabel="Count",
    )
    plt.show()

In [20]:
from analysis import add_all_to_split

# Extend the DataSAIL split of the sample to every unique unmodified sequence.
full_split_df = add_all_to_split(
    split_df,
    unique_sequences,
    n_threads=16,
    cache_dir=cache_dir,
)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6927/6927 [00:00<00:00, 15373.91it/s]


In [None]:
import matplotlib.pyplot as plt

# Re-derive the three splits, now covering every unique unmodified sequence.
train, val, test = get_train_val_test(full_split_df)
total = len(full_split_df)
print(f"\nAdding all unmodified sequences produced following splits: (total={total})")
print(f"TRAIN: {len(train)} ({len(train) * 100 / total:.2f}%)")
print(f"VAL:   {len(val)} ({len(val) * 100 / total:.2f}%)")
print(f"TEST:  {len(test)} ({len(test) * 100 / total:.2f}%)")

# One sequence-length histogram per split. Each figure is titled and labelled so
# the three plots can be told apart when skimming.
for split_name, split_part in (("train", train), ("val", val), ("test", test)):
    fig, ax = plt.subplots()
    ax.hist(split_part['sequence'].str.len())
    ax.set(
        title=f"Sequence lengths, all unmodified sequences ({split_name})",
        xlabel="Sequence length",
        ylabel="Count",
    )
    plt.show()

In [23]:
from analysis import create_val_test_traini

# From now on, no more caching

output_dir = os.path.join(cache_dir, "subsets")

# First, iterate over the full mgf again and create val.mgf and test.mgf, also create the train index.
# `total` is passed as an int: with the float 1e4 the progress output read "10000/10000.0".
# NOTE(review): 10_000 matches a 10k-spectra sub-MGF, whereas create_sequence_index is given
# total=int(66e6) for the full file — confirm which value is intended for `mgf_file` here.
train_spectra, train_index = create_val_test_traini(mgf_file, full_split_df, output_dir, total=int(1e4))


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10000/10000.0 [00:01<00:00, 6445.30it/s]


In [34]:
from analysis import create_train_subsets

# Grid of training subsets: every combination of spectra-per-peptide and
# number-of-peptides below is requested from create_train_subsets.
n_train_spectra = [
    2,
    5,
]
n_train_peps = [
    100,
    1000,
]
create_train_subsets(
    train_spectra,
    train_index,
    n_train_spectra,
    n_train_peps,
    output_dir,
)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
There are 1548 peptides with at least 2 spectra
Getting 2 spectra for 100 peptides
Sufficient peptides with sufficient spectra, number of spectra selected: 200

Getting 2 spectra for 1000 peptides
Sufficient peptides with sufficient spectra, number of spectra selected: 2000

There are 40 peptides with at least 5 spectra
Getting 5 spectra for 100 peptides
Got 200 spectra from peptides with enough spectra
Added all spectra from random peptides, selected 276 spectra in total now

Getting 5 spectra for 1000 peptides
Got 200 spectra from peptides with enough spectra
Added all spectra from random peptides, selected 1400 spectra in total now

