### Processing the JSON files

In [85]:
import json
from pyteomics import mgf

# Load the spectra: every non-blank line of the file is parsed as its own
# JSON document and collected into the `spectra` list.
spectra = []

# The context manager guarantees the file handle is closed, even if a line
# fails to parse part-way through the file.
with open('13911806/20241003_enamdisc_neg_ms2.json','r') as file1:
    for line in file1:
        # A whitespace-only line (e.g. a trailing newline at the end of the
        # file) is not valid JSON and would make json.loads raise; skip it.
        if line.strip():
            spectra.append(json.loads(line))

# Quick look at the peak list of the first spectrum.
print(spectra[0]['peaks'])


# Working with the MGF files (doesn't quite work yet)
# file2 = mgf.read(file1)
# for line in file2:
#     print(line)

[[41.998386, 41255500.0], [47.012128, 313500.0], [50.003493, 660000.0], [52.019158, 441500.0], [63.011478, 228000.0], [64.01921, 687500.0], [64.774982, 290000.0], [65.026867, 1121500.0], [67.006332, 225500.0], [68.014162, 346500.0], [72.032089, 380800.0], [72.036254, 467520.0], [78.034905, 1959500.0], [81.02201, 194000.0], [88.019321, 348500.0], [89.027164, 395500.0], [90.034944, 2892500.0], [92.014269, 538000.0], [93.02206, 612500.0], [103.030235, 489000.0], [104.038093, 1230500.0], [105.022153, 492000.0], [105.045982, 231500.0], [106.029966, 510500.0], [107.025025, 1024000.0], [115.030261, 274000.0], [117.045955, 2354500.0], [118.029975, 252500.0], [119.025188, 423500.0], [131.025187, 6713500.0], [132.032991, 142546500.0], [132.03688, 2738500.0], [132.039262, 1442500.0], [132.043243, 679600.0], [133.02658, 213000.0], [133.029937, 248500.0], [133.033248, 721500.0], [133.036481, 1982500.0], [133.038665, 1936000.0], [133.040724, 307099000.0], [133.044614, 5324000.0], [133.048598, 148050

### Byte-Pair Encoding Algorithm

In [75]:
def encodeSpectraDict(spectra, peak_bin = 3, int_bin = 0):
    """Build the encoding and decoding vocabularies for a list of spectra.

    Every peak contributes two values: its first element rounded to
    ``peak_bin`` decimals and its second element rounded to ``int_bin``
    decimals. Each rounded value is turned into a string, and every distinct
    string receives the next unused token id (``'0'``, ``'1'``, ...) in order
    of first appearance. Both kinds of value share one vocabulary, so a peak
    value and an intensity with the same string form get the same token.

    Args:
        spectra: Iterable of mappings, each with a ``'peaks'`` entry holding
            two-element (at least) sequences of numbers.
        peak_bin: Number of decimals kept for the first element of a peak.
        int_bin: Number of decimals kept for the second element of a peak.

    Returns:
        A tuple ``(encode_dict, decode_dict)`` where ``encode_dict`` maps the
        rounded value string to its token string and ``decode_dict`` is the
        inverse mapping.
    """
    value_to_token = {}
    token_to_value = {}

    for spectrum in spectra:
        for peak in spectrum['peaks']:
            # Round and stringify both components before touching the
            # vocabulary, position value first and intensity second.
            binned_values = (
                str(round(peak[0], peak_bin)),
                str(round(peak[1], int_bin)),
            )
            for value in binned_values:
                if value in value_to_token:
                    continue
                # The vocabulary size is always the next free token id.
                token = str(len(value_to_token))
                value_to_token[value] = token
                token_to_value[token] = value

    return value_to_token, token_to_value

In [76]:
# Smoke test: build both vocabularies from the loaded spectra and spot-check them.
# bpe_dicts[0] is the encoding dictionary, bpe_dicts[1] the decoding dictionary.

bpe_dicts = encodeSpectraDict(spectra=spectra)

# Value stored for token '0', followed by the size of the encoding vocabulary.
print(bpe_dicts[1]['0'])
print(len(bpe_dicts[0]))

# Token assigned to the value '41.998', then token '0' decoded again.
print(bpe_dicts[0]['41.998'])
print(bpe_dicts[1]['0'])

41.998
274323
0
41.998


In [77]:
def encodeSpectra(spectra, encode_dict, peak_bin = 3, int_bin = 0):
    """Translate every peak of every spectrum into a pair of tokens.

    For each peak the first element is rounded to ``peak_bin`` decimals and
    the second to ``int_bin`` decimals; the string form of each rounded value
    is then looked up in ``encode_dict``.

    Args:
        spectra: Iterable of mappings, each with a ``'peaks'`` entry holding
            two-element (at least) sequences of numbers.
        encode_dict: Mapping from rounded value string to token.
        peak_bin: Number of decimals kept for the first element of a peak.
        int_bin: Number of decimals kept for the second element of a peak.

    Returns:
        A list with one entry per spectrum; each entry is a list of
        ``[peak_token, intensity_token]`` pairs in the original peak order.

    Raises:
        KeyError: If a rounded value is not present in ``encode_dict``.
    """
    return [
        [
            [
                encode_dict[str(round(peak[0], peak_bin))],
                encode_dict[str(round(peak[1], int_bin))],
            ]
            for peak in spectrum['peaks']
        ]
        for spectrum in spectra
    ]

In [78]:
# Smoke test: encode the loaded spectra with the encoding dictionary
# (first element of bpe_dicts) and display the result.

encoded = encodeSpectra(spectra=spectra, encode_dict=bpe_dicts[0])
encoded

[[['0', '1'],
  ['2', '3'],
  ['4', '5'],
  ['6', '7'],
  ['8', '9'],
  ['10', '11'],
  ['12', '13'],
  ['14', '15'],
  ['16', '17'],
  ['18', '19'],
  ['20', '21'],
  ['22', '23'],
  ['24', '25'],
  ['26', '27'],
  ['28', '29'],
  ['30', '31'],
  ['32', '33'],
  ['34', '35'],
  ['36', '37'],
  ['38', '39'],
  ['40', '41'],
  ['42', '43'],
  ['44', '45'],
  ['46', '47'],
  ['48', '49'],
  ['50', '51'],
  ['52', '53'],
  ['54', '55'],
  ['56', '57'],
  ['58', '59'],
  ['60', '61'],
  ['62', '63'],
  ['64', '65'],
  ['66', '67'],
  ['68', '69'],
  ['70', '71'],
  ['72', '73'],
  ['74', '75'],
  ['76', '77'],
  ['78', '79'],
  ['80', '81'],
  ['82', '83'],
  ['84', '85'],
  ['86', '87'],
  ['88', '89'],
  ['90', '91'],
  ['92', '93'],
  ['94', '95'],
  ['96', '97'],
  ['98', '99'],
  ['100', '101'],
  ['102', '103'],
  ['104', '105'],
  ['106', '107'],
  ['108', '109'],
  ['110', '111'],
  ['112', '113'],
  ['114', '115'],
  ['116', '117'],
  ['118', '119'],
  ['120', '121'],
  ['122', '1

In [82]:
def decodeSpectra(spectra, decode_dict, peak_bin = 3, int_bin = 10):
    """Translate encoded spectra back into their rounded value strings.

    Each encoded peak is a pair of tokens; both tokens are converted with
    ``str`` and looked up in ``decode_dict``.

    Args:
        spectra: Iterable of encoded spectra, each an iterable of
            two-element (at least) token sequences.
        decode_dict: Mapping from token string to rounded value string.
        peak_bin: Not used by this function.
        int_bin: Not used by this function.

    Returns:
        A list with one entry per spectrum; each entry is a list of
        ``[peak_value, intensity_value]`` string pairs in the original order.

    Raises:
        KeyError: If a token is not present in ``decode_dict``.
    """
    return [
        [
            [decode_dict[str(pair[0])], decode_dict[str(pair[1])]]
            for pair in encoded_spectrum
        ]
        for encoded_spectrum in spectra
    ]

In [83]:
# Smoke test: decode the encoded spectra with the decoding dictionary
# (second element of bpe_dicts) and display the result.

decoded = decodeSpectra(spectra=encoded, decode_dict=bpe_dicts[1])
decoded

[[['41.998', '41255500.0'],
  ['47.012', '313500.0'],
  ['50.003', '660000.0'],
  ['52.019', '441500.0'],
  ['63.011', '228000.0'],
  ['64.019', '687500.0'],
  ['64.775', '290000.0'],
  ['65.027', '1121500.0'],
  ['67.006', '225500.0'],
  ['68.014', '346500.0'],
  ['72.032', '380800.0'],
  ['72.036', '467520.0'],
  ['78.035', '1959500.0'],
  ['81.022', '194000.0'],
  ['88.019', '348500.0'],
  ['89.027', '395500.0'],
  ['90.035', '2892500.0'],
  ['92.014', '538000.0'],
  ['93.022', '612500.0'],
  ['103.03', '489000.0'],
  ['104.038', '1230500.0'],
  ['105.022', '492000.0'],
  ['105.046', '231500.0'],
  ['106.03', '510500.0'],
  ['107.025', '1024000.0'],
  ['115.03', '274000.0'],
  ['117.046', '2354500.0'],
  ['118.03', '252500.0'],
  ['119.025', '423500.0'],
  ['131.025', '6713500.0'],
  ['132.033', '142546500.0'],
  ['132.037', '2738500.0'],
  ['132.039', '1442500.0'],
  ['132.043', '679600.0'],
  ['133.027', '213000.0'],
  ['133.03', '248500.0'],
  ['133.033', '721500.0'],
  ['133.036