In [48]:
from pyteomics import mgf
from tqdm import tqdm
import pandas as pd

# Source library: the MassIVE-KB spectra in MGF format.
massivekb_file = "/mnt/data/cdens/casanovo-scaling/massivekb_data/massivekb_82c0124b.mgf"

# One accumulator per output column, filled in the order spectra are read.
seqs, rts, names, pepmass, charge = [], [], [], [], []

# Reader flags, unchanged from the original call. Only the header parameters of
# each spectrum are consumed below; see the pyteomics docs for what each flag
# does to the peak data (NOTE(review): convert_arrays=0 presumably leaves the
# peak arrays as plain lists - confirm).
reader_options = dict(use_index=False, convert_arrays=0, read_charges=False, read_ions=False)

with mgf.read(massivekb_file, **reader_options) as massivekb:
    # `total` is only a rough size hint for the progress bar; the recorded run
    # iterated over 30,504,897 spectra.
    for spectrum in tqdm(massivekb, total=3e7):
        params = spectrum['params']
        seqs.append(params['seq'])
        rts.append(params['rtinseconds'])
        names.append(params['title'])
        # pepmass and charge are indexable; only their first element is kept.
        pepmass.append(float(params['pepmass'][0]))
        charge.append(int(params['charge'][0]))

# The retention times are raw seconds, not iRT yet; the column is named 'iRT'
# anyway so it lines up with the chronologer table during calibration.
df = pd.DataFrame(
    {
        'modified_sequence': seqs,
        'iRT': rts,
        'title': names,
        'pepmass': pepmass,
        'charge': charge,
    }
)
df.to_csv("massivekb_dl.csv")



30504897it [53:47, 9450.67it/s]                                                                                                                                                         


In [49]:
import pandas as pd

def get_overlap(df1, df2, on="modified_sequence"):
    """Return the inner join of ``df1`` and ``df2`` on the ``on`` column(s).

    Only rows whose key appears in both frames are kept. Non-key columns that
    exist in both inputs receive pandas' default ``_x`` (from ``df1``) and
    ``_y`` (from ``df2``) suffixes.
    """
    shared_rows = pd.merge(left=df1, right=df2, how="inner", on=on)
    return shared_rows


def calibrate_datasets(dfs, to=0, on="modified_sequence", label="HI"):
    """Linearly rescale the ``label`` column of each dataset onto a reference dataset.

    For every frame other than ``dfs[to]``, the rows it shares with the
    reference (matched on ``on``) are used to fit
    ``reference_label = slope * dataset_label + intercept`` by least squares,
    and that line is then applied to the dataset's whole ``label`` column.

    Parameters
    ----------
    dfs : sequence of pandas.DataFrame
        Datasets to align. Each must contain the ``on`` column(s) and ``label``.
    to : int, default 0
        Position in ``dfs`` of the reference dataset.
    on : str or list of str, default "modified_sequence"
        Column(s) used to match rows between a dataset and the reference.
    label : default "HI"
        Name of the numeric column to calibrate.

    Returns
    -------
    list of pandas.DataFrame
        Same length and order as ``dfs``. The reference is returned as the same
        object; every other entry is a new frame, so the inputs are not
        modified. A dataset whose overlap with the reference has fewer than two
        distinct ``label`` values cannot be fitted and gets ``label`` set to
        NaN throughout.
    """
    key_cols = [on] if isinstance(on, str) else list(on)
    fit_cols = key_cols + [label]
    # Both sides of the merge carry `label`, so pandas suffixes them _x / _y.
    x_col, y_col = f"{label}_x", f"{label}_y"

    # Only the key(s) and the label take part in the fit; slicing the reference
    # once keeps every merge from dragging its other columns along.
    reference = dfs[to][fit_cols]

    calibrated_dfs = []
    for i, df in tqdm(enumerate(dfs), total=len(dfs)):
        if i == to:
            calibrated_dfs.append(df)
            continue

        overlap = get_overlap(df[fit_cols], reference, on=on)
        # A single NaN pair would turn the whole fit into NaN.
        overlap = overlap.dropna(subset=[x_col, y_col])

        # Work on a copy so the caller's frame (often a groupby slice) is untouched.
        calibrated = df.copy()
        if overlap[x_col].nunique() < 2:
            # No line can be fitted through fewer than two distinct x values;
            # linregress raises or returns NaN depending on the scipy version.
            calibrated[label] = float("nan")
        else:
            fit = linregress(overlap[x_col], overlap[y_col])
            calibrated[label] = fit.slope * df[label] + fit.intercept
        calibrated_dfs.append(calibrated)
    return calibrated_dfs

In [None]:
import re

import pandas as pd
from scipy.stats import linregress

# Mass-shift tokens as they appear in the MassIVE-KB sequences, mapped to
# bracketed modification names (presumably the notation used in the
# chronologer table loaded below - verify against all_data.csv).
ptm_dict = {
    "+57.021": "[carbamidomethylation]",
    "+15.995": "[oxidation]",
    "+42.011": "[acetylation]",
    "+0.984": "[deamidation]",
    "+43.006": "[carbamylation]",
    "-17.027": "[ammonialoss]",
}
# The keys are applied as regular expressions, so escape them to make '+' and
# '.' match literally.
ptm_dict = {re.escape(k): v for k, v in ptm_dict.items()}

df = pd.read_csv("massivekb_dl.csv", index_col=0)
df["modified_sequence"] = df["modified_sequence"].replace(ptm_dict, regex=True)

# Sanity check: any sequence still containing '+' or '-' has a mass shift that
# ptm_dict did not cover. Both prints are expected to show an empty frame.
print(df[df['modified_sequence'].str.contains(r'\+')])
print(df[df['modified_sequence'].str.contains(r'\-')])

# Reference table every file is calibrated against.
chron_df = pd.read_csv("/mnt/data/cdens/casanovo-scaling/massivekb_data/all_data.csv")

# Titles look like "<file>:scan:<n>"; the part before ":scan:" identifies the
# source file, and each file is calibrated separately.
df["filename"] = df["title"].str.split(":scan:").str[0]

groups = [g for f, g in df.groupby("filename")]

# The reference goes first (to=0 by default) and is dropped again from the
# result with [1:], leaving only the per-file calibrated frames.
calibrated_datasets = calibrate_datasets([chron_df] + groups, label="iRT")[1:]
calibrated_df = pd.concat(calibrated_datasets).reset_index(drop=True)
calibrated_df.to_csv("massivekb_dl_calibrated.csv")


    

Empty DataFrame
Columns: [modified_sequence, iRT, title, pepmass, charge]
Index: []
Empty DataFrame
Columns: [modified_sequence, iRT, title, pepmass, charge]
Index: []


  1%|▉                                                                                                                                            | 191/27903 [01:48<4:25:38,  1.74it/s]

In [2]:
# Reload the calibrated table and keep only rows that ended up with an iRT
# value (NOTE(review): missing values presumably come from files whose
# calibration could not be fitted - confirm), then save the filtered copy.
df = pd.read_csv("massivekb_dl_calibrated.csv", index_col=0)
has_irt = df['iRT'].notna()
df = df.loc[has_irt]
df.to_csv("massivekb_dl_calibrated_notna.csv")

In [3]:
import pandas as pd
# Read the filtered table back from disk; the bare expression on the last line
# is what the notebook renders as this cell's output.
calibrated_notna_path = "massivekb_dl_calibrated_notna.csv"
pd.read_csv(calibrated_notna_path, index_col=0)


Unnamed: 0,modified_sequence,iRT,title,pepmass,charge,filename
0,PQGPPQQ[deamidation]GGHPPPPQGR,-3.618004,00576_A01_P004283_B0A_A00_R1.mzXML:scan:1890,578.290710,3,00576_A01_P004283_B0A_A00_R1.mzXML
1,SKDLAEVGEGGGHSQAR,-3.419825,00576_A01_P004283_B0A_A00_R1.mzXML:scan:1908,425.210571,4,00576_A01_P004283_B0A_A00_R1.mzXML
2,TPM[oxidation]C[carbamidomethylation]AHC[carba...,-1.406863,00576_A01_P004283_B0A_A00_R1.mzXML:scan:2089,501.560028,3,00576_A01_P004283_B0A_A00_R1.mzXML
3,IEVDKN[deamidation]GKDLK,0.500409,00576_A01_P004283_B0A_A00_R1.mzXML:scan:2259,420.566132,3,00576_A01_P004283_B0A_A00_R1.mzXML
4,QDYNPKPKPSNEITR,0.777531,00576_A01_P004283_B0A_A00_R1.mzXML:scan:2285,596.306091,3,00576_A01_P004283_B0A_A00_R1.mzXML
...,...,...,...,...,...,...
30504892,[acetylation]MFLVNSFLK,169.695627,wildtype_0d.mzML:scan:80439,570.811401,2,wildtype_0d.mzML
30504893,[ammonialoss]QAVENILVSPVVVASSLGLVSLGGK,169.735483,wildtype_0d.mzML:scan:80463,807.134094,3,wildtype_0d.mzML
30504894,MPDGPVALEESYSAVMGIVSEVEQYVK,169.738293,wildtype_0d.mzML:scan:80465,976.475830,3,wildtype_0d.mzML
30504895,VPGPVQQALQSAEMSLDEIEQVILVGGATR,169.739697,wildtype_0d.mzML:scan:80466,1045.885986,3,wildtype_0d.mzML
