<a href="https://colab.research.google.com/github/benjaminnigjeh/automated_data_mining/blob/main/automated_datamining.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install the fisher-py module
fisher-py is a module for parsing Thermo raw files. Because its runtime is pythonnet, it requires a .NET runtime to be installed (Mono is installed below).

In [1]:
# Refresh the apt package index before installing system packages.
!apt-get update
# Install Mono, which provides the .NET runtime needed by fisher-py (via pythonnet).
!apt-get install -y mono-complete
# Install the Thermo raw-file parser itself.
!pip install fisher-py

Get:1 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Get:2 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,626 B]
Hit:3 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:4 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Get:5 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Get:6 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Hit:7 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Get:8 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]
Hit:9 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:10 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Get:11 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  Packages [1,108 kB]
Get:12 https://r2u.stat.illinois.edu/ubuntu jammy/main amd64 Packages [2,615 kB]
Get:13 http://security.ubuntu.com/ubuntu jammy-s

# Import external libraries

In [2]:
import os
import pickle
import re

import numpy as np
import pandas as pd
from fisher_py import RawFile
from fisher_py.data.business import Scan
from tqdm import tqdm



# Helper functions to build the databank

In [3]:
def helper_regex(text):
    """Return the word that follows ``Full`` in ``text``, or ``None``.

    At least one whitespace character must separate ``Full`` from the
    captured run of word characters; if no such occurrence exists the
    function returns ``None``.
    """
    found = re.search(r"Full\s+(\w+)", text)
    return found.group(1) if found else None

def cast_func_simple(scan_number):
    """Bin one scan's peaks onto a fixed 1600-slot, unit-m/z grid.

    Reads scan ``scan_number`` from the module-level ``raw`` file, rounds each
    preferred mass to the nearest integer and adds the matching preferred
    intensity to slot ``rounded_mass - 400``. The grid therefore covers
    rounded m/z 400..1999; peaks outside that window are dropped.

    Args:
        scan_number: Scan number passed to ``Scan.from_file``.

    Returns:
        list: 1600 summed intensities; slots that received no peak stay ``0``.
    """
    data_intensities = [0] * 1600
    raw_scan = Scan.from_file(raw._raw_file_access, scan_number=scan_number)

    for mass, intensity in zip(raw_scan.preferred_masses, raw_scan.preferred_intensities):
        index = round(mass)
        # The lower bound is inclusive: the previous `index > 400` test left
        # slot 0 (rounded m/z 400) permanently empty.
        if 400 <= index < 2000:
            data_intensities[index - 400] += intensity

    return data_intensities

def casting(cast_path):
    """Collect every ms2 scan of the module-level ``raw`` file and pickle it.

    For each scan whose type (the word after ``Full`` in ``scan_type``) is
    ``ms2`` the binned spectrum, precursor m/z, TIC, scan number and retention
    time are recorded. The resulting dict of equal-length lists is written to
    ``cast_path`` with ``pickle``.

    Args:
        cast_path: Path of the pickle file to create (overwritten if present).

    Returns:
        None. The result is only available through the written file.
    """
    cast_spectra = []
    mz_value = []
    tic = []
    scan_number = []
    retention_time = []
    sample_name = []

    # NOTE(review): the upper bound is exclusive, so scan number
    # `raw.number_of_scans` itself is never visited - confirm whether scan
    # numbers run 1..number_of_scans inclusive.
    for i in tqdm(range(1, raw.number_of_scans), desc="Processing scans", ncols=100):
        raw_scan = Scan.from_file(raw._raw_file_access, scan_number=i)
        if str(helper_regex(raw_scan.scan_type)) != 'ms2':
            continue

        stats = raw_scan.scan_statistics
        cast_spectra.append(cast_func_simple(i))
        # The m/z is taken as the SECOND decimal number in the scan-type string.
        # NOTE(review): presumably another decimal parameter always precedes
        # it - confirm against the instrument's scan-type format.
        mz_value.append(float(re.findall(r'[\d]*[.][\d]+', raw_scan.scan_type)[1]))
        tic.append(stats.tic)
        scan_number.append(stats.scan_number)
        retention_time.append(raw.get_retention_time_from_scan_number(stats.scan_number))
        # One placeholder per scan keeps this column the same length as the
        # others (previously the list was replaced by the bare string 'None').
        sample_name.append('None')
    print("\nScan processing complete!")
    scan_dict = {'scan': scan_number, 'retntion time': retention_time, 'm/z': mz_value, 'TIC': tic, 'cast spectra': cast_spectra, 'sample name' : sample_name}

    with open(cast_path, 'wb') as f:
        pickle.dump(scan_dict, f)
    return None

def casting_normalized(cast_path):
    """Collect every ms2 scan of ``raw`` (raw and max-normalised) and pickle it.

    Same extraction as ``casting``, plus a ``'Normalized cast spectra'`` column
    holding each binned spectrum divided by its own maximum, stored as a
    ``float32`` NumPy array. The ``'sample name'`` column is the file name
    component of the module-level ``file_path``.

    Args:
        cast_path: Path of the pickle file to create (overwritten if present).

    Returns:
        None. The result is only available through the written file.
    """
    cast_spectra = []
    mz_value = []
    tic = []
    scan_number = []
    retention_time = []
    sample_name = []
    normalized_cast = []

    # Strip the directory with the standard library instead of a hard-coded
    # 23-character prefix, so any location of the raw file works.
    current_sample = os.path.basename(file_path)

    # NOTE(review): the upper bound is exclusive, so scan number
    # `raw.number_of_scans` itself is never visited - confirm whether scan
    # numbers run 1..number_of_scans inclusive.
    for i in tqdm(range(1, raw.number_of_scans), desc="Processing scans", ncols=100):
        raw_scan = Scan.from_file(raw._raw_file_access, scan_number=i)
        if str(helper_regex(raw_scan.scan_type)) != 'ms2':
            continue

        cast = cast_func_simple(i)
        cast_spectra.append(cast)

        data_intensities = np.array(cast)
        max_value = np.max(data_intensities)
        if max_value > 0:
            data_intensities_norm = (data_intensities / max_value).astype(np.float32)
        else:
            # An empty spectrum would otherwise divide by zero and store NaNs.
            data_intensities_norm = np.zeros(data_intensities.shape, dtype=np.float32)
        # Stored as a float32 array; the former `.tolist()` call discarded its
        # result and has been removed.
        normalized_cast.append(data_intensities_norm)

        stats = raw_scan.scan_statistics
        # The m/z is taken as the SECOND decimal number in the scan-type string.
        # NOTE(review): presumably another decimal parameter always precedes
        # it - confirm against the instrument's scan-type format.
        mz_value.append(float(re.findall(r'[\d]*[.][\d]+', raw_scan.scan_type)[1]))
        tic.append(stats.tic)
        scan_number.append(stats.scan_number)
        retention_time.append(raw.get_retention_time_from_scan_number(stats.scan_number))
        sample_name.append(current_sample)
    print("\nScan processing complete!")
    scan_dict = {'scan': scan_number, 'retntion time': retention_time, 'm/z': mz_value, 'TIC': tic, 'cast spectra': cast_spectra, 'Normalized cast spectra': normalized_cast, 'sample name' : sample_name}

    with open(cast_path, 'wb') as f:
        pickle.dump(scan_dict, f)
    return None

# Helper functions to incorporate the TDPortal report

In [None]:
def ID_import(tdportal, databank):
    """Annotate ``databank`` in place with identifications from ``tdportal``.

    Every integer found in a report row's ``'Fragment Scans'`` cell is treated
    as a scan number belonging to that row's ``'File Name'``. Each databank
    entry is matched on ``(sample name, scan)``; when several report rows list
    the same scan for the same file, the first row wins. Entries without a
    match, including entries whose sample name does not occur in the report at
    all, receive the string ``'None'`` in every added column.

    Args:
        tdportal: Table with the columns ``'File Name'``, ``'Fragment Scans'``,
            ``'Sequence'``, ``'PFR'``, ``'Uniprot Id'`` and ``'Accession'``.
            Rows are read by position, so the index does not matter.
        databank: Dict with equal-length ``'scan'`` and ``'sample name'``
            sequences. The keys ``'sequence'``, ``'PFR'``, ``'Uniprot ID'``
            and ``'Accession'`` are added or overwritten.

    Returns:
        tuple: Always the empty tuple (kept for backward compatibility);
        the result is the mutation of ``databank``.
    """
    file_names = list(tdportal['File Name'])
    fragment_scans = list(tdportal['Fragment Scans'])

    # (file name, scan number) -> position of the first report row listing it.
    # A dict replaces the former per-scan list search and avoids a KeyError
    # for sample names that are missing from the report.
    row_of_scan = {}
    for row, (name, scans) in enumerate(zip(file_names, fragment_scans)):
        for digits in re.findall(r'\d+', str(scans)):
            row_of_scan.setdefault((name, int(digits)), row)

    # databank key -> report column values, in the order the keys are added.
    report_columns = {
        'sequence': list(tdportal['Sequence']),
        'PFR': list(tdportal['PFR']),
        'Uniprot ID': list(tdportal['Uniprot Id']),
        'Accession': list(tdportal['Accession']),
    }
    annotations = {key: [] for key in report_columns}

    scans = databank['scan']
    samples = databank['sample name']
    for i in tqdm(range(0, len(scans)), desc="Processing scans", ncols=100):
        row = row_of_scan.get((samples[i], scans[i]))
        for key, values in report_columns.items():
            annotations[key].append('None' if row is None else values[row])

    for key, values in annotations.items():
        databank[key] = values
    return ()

# Data pipeline to build the databank

In [16]:
# Thermo .raw file to process (hard-coded Google Drive location).
file_path = '/content/drive/MyDrive/20231221_msz1718_MAbPac-02_Neo_ExtraValveT_ACN30S1_TreatmentA_biorep01_techrep01.raw'
# Destination of the pickled databank.
cast_path = '/content/databank'
# `raw` and `file_path` are read as globals by the helper functions defined
# earlier in the notebook, so these names must stay as they are.
raw = RawFile(file_path)
# Extract all ms2 scans and write them (raw + normalised) to `cast_path`.
casting_normalized(cast_path)

Processing scans: 100%|█████████████████████████████████████████| 3269/3269 [01:00<00:00, 54.26it/s]



Scan processing complete!


# Data pipeline to incorporate the TDPortal report into the databank

In [17]:
# Reload the pickled databank from disk.
# NOTE: pickle.load can execute arbitrary code - only load files that this
# notebook produced itself.
with open('/content/databank', 'rb') as f:
    databank = pickle.load(f)

# Identification report exported as CSV.
tdportal = pd.read_csv('/content/drive/MyDrive/tdportal.csv')

# Adds the sequence / PFR / Uniprot ID / Accession columns to `databank` in place.
ID_import(tdportal, databank)



Processing scans: 100%|█████████████████████████████████████| 1779/1779 [00:00<00:00, 101300.14it/s]


()

In [18]:
# Tabular view of the annotated databank (one row per ms2 scan).
pd.DataFrame(databank)

Unnamed: 0,scan,retntion time,m/z,TIC,cast spectra,Normalized cast spectra,sample name,sequence,PFR,Uniprot ID,Accession
0,3,0.020602,532.6573,44326.285156,"[0, 0, 92.89836120605469, 174.21189880371094, ...","[0.0, 0.0, 0.029539598, 0.05539548, 0.0, 0.0, ...",20231221_msz1718_MAbPac-02_Neo_ExtraValveT_ACN...,DEAGSEADHEGTHSTKRGHAKSRPV,7464718,FIBA_HUMAN,P02671-1
1,32,0.284152,533.2592,48184.800781,"[0, 0, 0, 0, 105.70592498779297, 0, 0, 0, 0, 0...","[0.0, 0.0, 0.0, 0.0, 0.037231993, 0.0, 0.0, 0....",20231221_msz1718_MAbPac-02_Neo_ExtraValveT_ACN...,DEAGSEADHEGTHSTKRGHAKSRPV,7464718,FIBA_HUMAN,P02671-1
2,145,1.285471,532.8577,43661.265625,"[0, 0, 104.3642807006836, 0, 0, 0, 0, 0, 0, 0,...","[0.0, 0.0, 0.037137017, 0.0, 0.0, 0.0, 0.0, 0....",20231221_msz1718_MAbPac-02_Neo_ExtraValveT_ACN...,DEAGSEADHEGTHSTKRGHAKSRPV,7464718,FIBA_HUMAN,P02671-1
3,259,2.295235,532.8579,48168.136719,"[0, 124.89591217041016, 0, 109.54644775390625,...","[0.0, 0.06310464, 0.0, 0.0553492, 0.0, 0.0, 0....",20231221_msz1718_MAbPac-02_Neo_ExtraValveT_ACN...,DEAGSEADHEGTHSTKRGHAKSRPV,7464718,FIBA_HUMAN,P02671-1
4,372,3.296624,532.6570,54236.125000,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 612.34...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",20231221_msz1718_MAbPac-02_Neo_ExtraValveT_ACN...,DEAGSEADHEGTHSTKRGHAKSRPV,7464718,FIBA_HUMAN,P02671-1
...,...,...,...,...,...,...,...,...,...,...,...
1774,3151,28.889778,739.8889,58567.109375,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",20231221_msz1718_MAbPac-02_Neo_ExtraValveT_ACN...,DEPPQSPWDRVKDLATVYVDVLKDSGRDYVSQFEGSALGKQLNLKL...,53599,APOA1_HUMAN,P02647
1775,3157,28.951688,1687.7987,1335.449219,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",20231221_msz1718_MAbPac-02_Neo_ExtraValveT_ACN...,,,,
1776,3205,29.386061,1816.3376,1464.566406,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",20231221_msz1718_MAbPac-02_Neo_ExtraValveT_ACN...,,,,
1777,3241,29.713241,830.5879,7015.328125,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",20231221_msz1718_MAbPac-02_Neo_ExtraValveT_ACN...,,,,
