<a href="https://colab.research.google.com/github/benjaminnigjeh/keyProteoforms/blob/main/databankGeneration.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Install fisher-py module
This module parses Thermo raw files. Because its runtime is pythonnet, it requires a .NET runtime to be installed (Mono is installed below).

In [None]:
# Refresh the apt package index before installing system packages.
!apt-get update
# Mono supplies the .NET runtime that pythonnet (the fisher-py runtime) requires.
!apt-get install -y mono-complete
# Install the Thermo raw-file parser itself.
!pip install fisher-py

#Import external libraries

In [None]:
from fisher_py.data.business import Scan
from fisher_py import RawFile
import re
import pandas as pd
import numpy as np
from tqdm import tqdm
import os
import pickle

#Helper Function to build Databank

In [None]:
def _first_matching_key(name, substring_dict):
    """Return the first key whose substring list has an entry contained in *name*.

    Keys are tried in dictionary order; ``None`` is returned when nothing matches.
    """
    for key, substrings in substring_dict.items():
        if any(substring in name for substring in substrings):
            return key
    return None


def _bin_peaks(masses, intensities, to_index, low, high, size):
    """Sum peak intensities into a list of *size* bins.

    A peak is stored in bin ``to_index(mass) - low`` and is kept only when
    ``low < to_index(mass) < high`` (both bounds exclusive, so bin 0 stays 0).
    """
    bins = [0] * size
    for mass, intensity in zip(masses, intensities):
        index = to_index(mass)
        if low < index < high:
            bins[index - low] += intensity
    return bins


def wholeCasting(folder_path, cast_path):
    """Cast every MS1/MS2 scan of the raw files in *folder_path* into a pickled dict.

    Each file in the folder is opened with ``RawFile``; every scan whose filter
    string reads ``Full ms`` or ``Full ms2`` becomes one row of the databank:

    * MS1: intensities summed into 13690 bins of 0.1 m/z (bin = m/z*10 - 6000),
      stored as a plain list.
    * MS2: intensities summed into 1600 bins of 1 m/z (bin = m/z - 400),
      scaled by the largest bin and stored as a ``float16`` array.

    The sample group ("Disease" / "Normal Healthy") and preparation
    ("Pellet" / "Soluble") are derived from substrings of the file name.

    Args:
        folder_path: Directory holding the raw files. The process working
            directory is changed to it, as in the original implementation.
        cast_path: Destination of the pickled dictionary. A relative path is
            resolved against *folder_path* because of the directory change.

    Returns:
        An empty tuple (kept for backward compatibility).
    """
    # Kept on purpose: raw files are opened by bare name and a relative
    # cast_path has always been resolved against folder_path.
    os.chdir(folder_path)

    scan_kind_pattern = re.compile(r"Full\s+(\w+)")
    decimal_pattern = re.compile(r'[\d]*[.][\d]+')

    # List the (now current) directory so a relative folder_path is not
    # resolved a second time after the chdir above.
    files = [f for f in os.listdir(".") if os.path.isfile(f)]
    substring_dict_sample = {"Disease": ["PDAD", "AD", "PD"], "Normal Healthy": ["NC"]}
    substring_dict_prep = {"Pellet": ["Pellet"], "Soluble": ["Soluble"]}

    file_name = []
    sample_group = []
    prep_group = []

    scan_type = []
    scan_number = []
    retention_time = []
    cast_spectra = []

    mz_value = []

    for raw_name in files:
        sample = _first_matching_key(raw_name, substring_dict_sample)
        prep = _first_matching_key(raw_name, substring_dict_prep)
        if sample is None or prep is None:
            # Previously this raised IndexError on the first scan; a stray
            # file in the folder should not abort the whole run.
            print(f"Skipping {raw_name}: cannot determine sample/prep group from the file name")
            continue

        raw = RawFile(raw_name)
        print(raw_name)
        # Scan numbers are 1-based, so the upper bound must be inclusive or the
        # last scan of every file is dropped.
        for i in tqdm(range(1, raw.number_of_scans + 1), desc="Processing scans", ncols=100):
            raw_scan = Scan.from_file(raw._raw_file_access, scan_number=i)
            filter_text = raw_scan.scan_type
            kind_match = scan_kind_pattern.search(filter_text)
            kind = kind_match.group(1) if kind_match else None

            if kind == 'ms':
                label = 'MS1'
                precursor = ''
                spectrum = _bin_peaks(
                    raw_scan.preferred_masses,
                    raw_scan.preferred_intensities,
                    lambda mass: int(round(mass, 2) * 10),
                    6000, 19360, 13690,
                )
            elif kind == 'ms2':
                label = 'MS2'
                # NOTE(review): takes the second decimal number of the filter
                # string, presumably the precursor m/z -- confirm against the
                # instrument's filter format.
                precursor = float(decimal_pattern.findall(filter_text)[1])
                binned = np.array(_bin_peaks(
                    raw_scan.preferred_masses,
                    raw_scan.preferred_intensities,
                    round,
                    400, 2000, 1600,
                ))
                max_value = np.max(binned)
                # An MS2 scan with no peak inside the window would otherwise
                # give 0/0 = NaN for every bin; keep it as all zeros instead.
                scaled = binned / max_value if max_value > 0 else binned
                spectrum = scaled.astype(np.float16)
            else:
                # Any other scan type is not recorded at all, so that every
                # column of the databank keeps the same length.
                continue

            number = raw_scan.scan_statistics.scan_number
            file_name.append(raw_name)
            sample_group.append(sample)
            prep_group.append(prep)
            scan_type.append(label)
            scan_number.append(number)
            retention_time.append(raw.get_retention_time_from_scan_number(number))
            mz_value.append(precursor)
            cast_spectra.append(spectrum)

    # Key spellings (including 'retntion time') are part of the stored format.
    scan_dict = {'sample_name': file_name, 'group_name': sample_group, 'sample_prep': prep_group, 'scan': scan_number,'scan_type': scan_type, 'retntion time': retention_time, 'm/z': mz_value, 'cast spectra': cast_spectra}

    with open(cast_path, "wb") as f:
        pickle.dump(scan_dict, f)

    return()

#Helper Function to incorporate tdportal report into Databank

In [None]:
def ID_import(tdportal, databank, cast_path):
    """Annotate databank scans with identifications from a tdportal report.

    Every integer found in a report row's 'Fragment Scans' cell is treated as
    a scan number of the file named in 'File Name'. A databank row whose
    ('sample_name', 'scan') pair matches gets that report row's 'Sequence',
    'Average Mass', 'Uniprot Id' and 'Accession'; when several report rows
    list the same scan, the first one wins. Unmatched rows get the string
    'None' in all four columns.

    Args:
        tdportal: Report table (e.g. a DataFrame) with the columns
            'File Name', 'Fragment Scans', 'Sequence', 'Average Mass',
            'Uniprot Id' and 'Accession'. Rows are read positionally.
        databank: Mapping of column name to sequence with at least 'scan' and
            'sample_name'. It is modified in place: the keys 'sequence',
            'MASS', 'Uniprot ID' and 'Accession' are added.
        cast_path: Path of the HDF5 file written under the key "databank".

    Returns:
        An empty tuple (kept for backward compatibility).
    """
    # Materialise the columns once so rows are addressed by position even if
    # the report carries a non-default index.
    file_names = list(tdportal['File Name'])
    fragment_scans = list(tdportal['Fragment Scans'])
    sequences = list(tdportal['Sequence'])
    masses = list(tdportal['Average Mass'])
    uniprot_ids = list(tdportal['Uniprot Id'])
    accessions = list(tdportal['Accession'])

    # file name -> {scan number -> first report row mentioning that scan}.
    # This replaces a linear search per databank row with a dictionary lookup.
    digits = re.compile(r'\d+')
    row_of_scan = {}
    for row, (name, scans) in enumerate(zip(file_names, fragment_scans)):
        per_file = row_of_scan.setdefault(name, {})
        for number in digits.findall(str(scans)):
            per_file.setdefault(int(number), row)

    sequence = []
    MASS = []
    Uniprot_ID = []
    Accession = []

    sample_names = databank['sample_name']
    scan_numbers = databank['scan']
    pairs = zip(sample_names, scan_numbers)
    for sample, scan in tqdm(pairs, total=len(scan_numbers), desc="Processing scans", ncols=100):
        # A sample missing from the report simply has no identifications;
        # it used to raise KeyError here.
        row = row_of_scan.get(sample, {}).get(scan)
        if row is None:
            sequence.append('None')
            MASS.append('None')
            Uniprot_ID.append('None')
            Accession.append('None')
        else:
            sequence.append(sequences[row])
            MASS.append(masses[row])
            Uniprot_ID.append(uniprot_ids[row])
            Accession.append(accessions[row])

    databank['sequence'] = sequence
    databank['MASS'] = MASS
    databank['Uniprot ID'] = Uniprot_ID
    databank['Accession'] = Accession

    frame = pd.DataFrame(databank)

    frame.to_hdf(cast_path, key="databank", mode="w")

    return()


#Data pipeline to build Databank

In [None]:
# Cast every raw file found in D:/samples/ and pickle the result to D:/final/databank.
wholeCasting(folder_path="D:/samples/", cast_path="D:/final/databank")

#Data pipeline to incorporate tdportal report into Databank

In [None]:
# Output location of the annotated databank (HDF5).
cast_path = 'D:/final/databank_updated'

# Identification report exported as CSV.
tdportal = pd.read_csv('D:/final/tdreport.csv')

# Reload the pickled databank produced by the casting step.
# NOTE: pickle.load executes code embedded in the file; only load databanks you created yourself.
with open("D:/final/databank", "rb") as f:
    databank = pickle.load(f)

ID_import(tdportal, databank, cast_path)

