<a href="https://colab.research.google.com/github/benjaminnigjeh/keyProteoforms/blob/main/PTM_Databank.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import re
import pandas as pd
from tqdm import tqdm

def ID_import(tdportals, databank, cast_path):
    """Annotate databank scans with TDPortal identifications and pickle the result.

    The TDPortal reports are concatenated, every integer found in a report
    row's 'Fragment Scans' cell is treated as a scan number of that row's
    'File Name', and each databank row whose ('sample_name', 'scan') pair
    matches gets the identification of the first matching report row.
    Rows without a match are filled with the string 'None'.

    Args:
        tdportals: list of DataFrames, each one a parsed TDPortal report.
            Columns read: 'File Name', 'Fragment Scans', 'Sequence',
            'Average Mass', 'Uniprot Id', 'Accession', 'n_term', 'c_term',
            'int_mod'.
        databank: dictionary or DataFrame with 'scan' and 'sample_name' keys.
            A DataFrame is modified in place (the columns 'sequence', 'MASS',
            'Uniprot ID', 'Accession', 'n_term', 'c_term' and 'int_mod' are
            added or overwritten); a dictionary is converted to a new
            DataFrame first.
        cast_path: path the annotated databank is written to with
            ``DataFrame.to_pickle``.

    Returns:
        None. The result is available in the pickle file (and in the passed
        DataFrame, when one was given).
    """
    # The progress bar is purely cosmetic, so tqdm is treated as optional:
    # fall back to plain iteration when it is not installed.
    try:
        from tqdm import tqdm as progress
    except ImportError:
        progress = None

    # Merge multiple TDPortal DataFrames; ignore_index gives unique row labels.
    tdportal = pd.concat(tdportals, ignore_index=True)

    # (file name, scan number) -> label of the first report row listing that
    # scan. setdefault keeps the earliest row, and the dict makes each lookup
    # O(1) instead of scanning a flattened list per databank row.
    hit_row = {}
    for row_label, file_name, fragment_scans in zip(
        tdportal.index, tdportal['File Name'], tdportal['Fragment Scans']
    ):
        for digits in re.findall(r'\d+', str(fragment_scans)):
            hit_row.setdefault((file_name, int(digits)), row_label)

    # Ensure databank is a DataFrame
    if isinstance(databank, dict):
        databank = pd.DataFrame(databank)

    # Output column -> TDPortal column, copied as-is.
    plain_fields = (
        ('sequence', 'Sequence'),
        ('MASS', 'Average Mass'),
        ('Uniprot ID', 'Uniprot Id'),
        ('Accession', 'Accession'),
    )
    # Modification columns keep their name and are stringified; nulls -> 'None'.
    mod_fields = ('n_term', 'c_term', 'int_mod')

    annotations = {out_name: [] for out_name, _ in plain_fields}
    annotations.update({name: [] for name in mod_fields})

    # Iterate column values positionally so a databank whose index is not
    # 0..n-1 (e.g. after filtering) is handled correctly.
    pairs = zip(databank['sample_name'], databank['scan'])
    if progress is not None:
        pairs = progress(
            pairs, total=len(databank), desc="Processing scans", ncols=100
        )

    for sample, scan in pairs:
        row_label = hit_row.get((sample, scan))
        if row_label is None:  # 0 is a valid label, so compare against None
            for values in annotations.values():
                values.append('None')
            continue
        for out_name, source_name in plain_fields:
            annotations[out_name].append(tdportal.at[row_label, source_name])
        for name in mod_fields:
            value = tdportal.at[row_label, name]
            annotations[name].append(str(value) if pd.notnull(value) else 'None')

    for name, values in annotations.items():
        databank[name] = values

    databank.to_pickle(cast_path)


import pandas as pd

# TDPortal report CSVs, read in this order into td1..td5.
tdportal_csvs = (
    'E:/raw/tdportal/Heart.csv',
    'E:/raw/tdportal/Lung.csv',
    'E:/raw/tdportal/Kidney.csv',
    'E:/raw/tdportal/SmInt.csv',
    'E:/raw/tdportal/Spleen.csv',
)
td1, td2, td3, td4, td5 = (pd.read_csv(csv_path) for csv_path in tdportal_csvs)

# Point this at wherever the databank pickle is stored.
databank_pickle = 'E:/raw/databank/databank.pkl'
tdbank = pd.read_pickle(databank_pickle)

# Annotate the databank with all five reports and write the result to a new pickle.
ID_import(
    [td1, td2, td3, td4, td5],
    tdbank,
    'E:/raw/databank/databank_learner2.pkl',
)