<a href="https://colab.research.google.com/github/benjaminnigjeh/keyProteoforms/blob/main/quantitativeDiscovery.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Import external libraries

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

#Import databank

In [None]:
# Mount Google Drive into the Colab runtime so that files stored there are
# reachable under /content/drive. This must run before the read below.
from google.colab import drive
drive.mount('/content/drive')

# Load the object stored under the "databank" key of the HDF5 file. Later
# cells use it as a DataFrame (column selection and boolean filtering).
df = pd.read_hdf("/content/drive/MyDrive/databank_updated", key="databank")

#Generate extracted ion chromatograms

In [None]:
# Keep only rows that are MS1 scans, come from "Pellet" preparations and
# belong to the "Normal Healthy" group (all three conditions must hold).
is_selected = (
    (df["scan_type"] == "MS1")
    & (df["sample_prep"] == "Pellet")
    & (df["group_name"] == "Normal Healthy")
)
df = df[is_selected]

# For every retention-time window [start, start + 1] with start = 0..39, add
# up the "cast spectra" entries position by position. Both window edges are
# inclusive, so a scan lying exactly on an integer boundary contributes to two
# neighbouring windows. A window without scans yields an empty list.
extracted_ions = []
for window_start in range(40):
    retention = df["retntion time"]
    in_window = (retention >= window_start) & (retention <= window_start + 1)
    spectra_in_window = df[in_window]["cast spectra"]
    extracted_ions.append([sum(column) for column in zip(*spectra_in_window)])

#Helper functions

In [None]:
def profile(extracted_ions, mass):
    """Sum the intensity around one m/z value in every summed spectrum.

    The target value is mapped to a bin index with ``round((mass - 600) * 10)``
    (this implies 0.1-wide bins starting at m/z 600 -- TODO confirm against the
    code that produces the "cast spectra" column). The bins from five below to
    five above that index (11 bins) are added together.

    Args:
        extracted_ions: Sequence of spectra; each spectrum is a sliceable
            sequence of numeric intensities.
        mass: Target m/z value.

    Returns:
        A list with one summed intensity per spectrum, in the same order as
        ``extracted_ions``. Bins that fall outside a spectrum are ignored, so
        a target entirely outside the spectrum contributes 0.
    """
    # round() rather than int(): (813.3 - 600) * 10 evaluates to
    # 2132.9999999999995, which int() would truncate to the wrong bin (2132).
    center = int(round((mass - 600) * 10))
    half_width = 5

    totals = []
    for spectrum in extracted_ions:
        # Clamp both ends to [0, len(spectrum)]; a negative bound must never
        # reach the slice, because Python would count it from the end.
        low = max(center - half_width, 0)
        high = min(max(center + half_width + 1, 0), len(spectrum))
        totals.append(sum(spectrum[low:high]))
    return totals


def plot_multiple_y(x_values, y_values_list, labels=None, ylim=(0, 2.2e9)):
    """Plot several y-series against one shared x-axis and show the figure.

    Args:
        x_values: X coordinates shared by every series.
        y_values_list: Sequence of y-series, each as long as ``x_values``.
        labels: Legend entry for each series. Defaults to "Series 1",
            "Series 2", ... Series and labels are paired with ``zip``, so
            if fewer labels than series are given the extra series are not
            drawn.
        ylim: ``(bottom, top)`` limits for the y-axis. The default keeps the
            previously hard-coded range; pass ``None`` to let matplotlib
            choose the limits from the data.
    """
    plt.figure(figsize=(8, 5))

    # If no labels are provided, generate default labels
    if labels is None:
        labels = [f"Series {i+1}" for i in range(len(y_values_list))]

    # Plot each y-values list
    for y_values, label in zip(y_values_list, labels):
        plt.plot(x_values, y_values, marker="o", linestyle="-", label=label)

    # Labels and title
    plt.xlabel("Retention time / minute")
    plt.ylabel("Intensity")
    plt.title("")
    if ylim is not None:
        plt.ylim(ylim)
    plt.legend()
    plt.grid(True)

    # Show plot
    plt.show()



#Generate charge state profiles

In [None]:
from itertools import zip_longest

# One x position per retention-time window: 1..40.
x_values = list(range(1, 41))

# Intensity profile across the windows for each target m/z, in plotting order.
target_masses = (813.3, 861, 915, 975.8, 1045.6, 1125.9)
y1, y2, y3, y4, y5, y6 = (profile(extracted_ions, m) for m in target_masses)

# Point-wise total of the six profiles; zip_longest would pad a shorter
# profile with 0 instead of truncating the total.
lists = [y1, y2, y3, y4, y5, y6]
sums = [sum(values) for values in zip_longest(*lists, fillvalue=0)]

# NOTE(review): the numeric legend labels are not a monotonic sequence
# (24, 23, 21, 10, 19, 17) -- confirm they are the intended series labels.
plot_multiple_y(x_values, lists + [sums], labels=[24, 23, 21, 10, 19, 17, 'sums'])


In [None]:
def identification(df, mz, retention):
    """Collect the rows of ``df`` that lie close to an m/z / retention target.

    A row matches when its "m/z" value is strictly within 1 of ``mz`` and its
    "retntion time" value is strictly within 2 of ``retention``. "m/z" values
    that cannot be parsed as numbers never match.

    The rows are selected with a positional boolean mask, so the function
    works for any index (for example after earlier boolean filtering, where
    the labels are no longer 0..n-1). ``df`` itself is not modified.

    Args:
        df: DataFrame with the columns "m/z", "retntion time", "sequence",
            "sample_name", "scan", "Uniprot ID" and "MASS".
        mz: Target m/z value.
        retention: Target retention time, in the units of "retntion time".

    Returns:
        A dict of equally long lists, one entry per matching row, with the
        keys 'mz' and 'retention' (the targets, repeated), 'sample', 'scan',
        'uniprot', 'MW' and 'seq'.
    """
    # Coerce into a temporary Series instead of overwriting the caller's column.
    mz_values = pd.to_numeric(df["m/z"], errors="coerce")
    rt_values = df["retntion time"]

    in_window = (
        (mz_values > mz - 1)
        & (mz_values < mz + 1)
        & (rt_values > retention - 2)
        & (rt_values < retention + 2)
    )
    # A plain boolean array selects by position, independent of index labels.
    hits = df.loc[in_window.to_numpy()]
    count = len(hits)

    return {
        'mz': [mz] * count,
        'retention': [retention] * count,
        'sample': hits['sample_name'].tolist(),
        'scan': hits['scan'].tolist(),
        'uniprot': hits['Uniprot ID'].tolist(),
        'MW': hits['MASS'].tolist(),
        'seq': hits['sequence'].tolist(),
    }


In [None]:
# Two sets of m/z values; only `negative` is searched in this cell.
positive = [813.3, 861, 915, 975.8, 1045.6, 1125.9]
negative = [874, 932.2, 998.6, 1075.5, 1397.7, 1571.2]

# One table of matches per target m/z, each searched around retention 25.
# The DataFrame constructor never returns None, so every table is kept.
df_list = [pd.DataFrame(identification(df, target, 25)) for target in negative]

# Stack the per-target tables into one, renumbering the rows from 0.
df_combined = pd.concat(df_list, ignore_index=True)

# NOTE(review): Windows-style drive path, while the databank is read from a
# Colab Drive mount -- confirm this output location exists where this runs.
df_combined.to_csv('D:/negative.csv')