From 2806b0b99cd2cb03119c27a5234d1037ecde5ffa Mon Sep 17 00:00:00 2001
From: WangHong007 <88552471+WangHong007@users.noreply.github.com>
Date: Mon, 27 Mar 2023 21:18:22 +0800
Subject: [PATCH 1/4] keep peptide raw intensity

---
 bin/peptide_normalization.py | 31 +++++++++++++++++--------------
 1 file changed, 17 insertions(+), 14 deletions(-)

diff --git a/bin/peptide_normalization.py b/bin/peptide_normalization.py
index 64f18c9..2f2bbeb 100644
--- a/bin/peptide_normalization.py
+++ b/bin/peptide_normalization.py
@@ -118,14 +118,18 @@ def get_peptidoform_normalize_intensities(dataset: DataFrame, higher_intensity:
     return dataset
 
 
-def sum_peptidoform_intensities(dataset: DataFrame) -> DataFrame:
+def sum_peptidoform_intensities(dataset: DataFrame, smethod: str) -> DataFrame:
     """
-    Sum the peptidoform intensities for all peptidofrom across replicates of the same sample.
+    Summary the peptidoform intensities for all peptidofrom across replicates of the same sample.
     :param dataset: Dataframe to be analyzed
+    :param smethod: Method to summary peptidoform intensities per peptide
     :return: dataframe with the intensities
     """
     dataset = dataset[dataset[NORM_INTENSITY].notna()]
-    normalize_df = dataset.groupby([PEPTIDE_CANONICAL, SAMPLE_ID, BIOREPLICATE, CONDITION])[NORM_INTENSITY].sum()
+    if smethod == "sum":
+        normalize_df = dataset.groupby([PEPTIDE_CANONICAL, SAMPLE_ID, BIOREPLICATE, CONDITION])[NORM_INTENSITY].sum()
+    elif smethod == "median":
+        normalize_df = dataset.groupby([PEPTIDE_CANONICAL, SAMPLE_ID, BIOREPLICATE, CONDITION])[NORM_INTENSITY].median()
     normalize_df = normalize_df.reset_index()
     normalize_df = pd.merge(normalize_df,
                             dataset[[PROTEIN_NAME, PEPTIDE_CANONICAL, SAMPLE_ID, BIOREPLICATE, CONDITION]], how='left',
@@ -248,25 +252,26 @@ def impute_peptide_intensities(dataset_df, field, class_field):
 @click.option("--output", help="Peptide intensity file including other all properties for normalization")
 @click.option('--nmethod', help="Normalization method used to normalize intensities for all samples (options: qnorm)",
               default="qnorm")
+@click.option("--smethod", help="Summary method used to calculate the intensity for peptides with multiple peptidoforms (options: sum)",
+              default="sum")
 @click.option("--impute", help="Impute the missing values using MissForest", is_flag=True)
 @click.option("--pnormalization", help="Normalize the peptide intensities using different methods (options: qnorm)",
               is_flag=True)
 @click.option("--compress", help="Read the input peptides file in compress gzip file", is_flag=True)
-@click.option("--log2", help="Transform to log2 the peptide intensity values before normalization", is_flag=True)
 @click.option("--violin", help="Use violin plot instead of boxplot for distribution representations", is_flag=True)
 @click.option("--verbose",
               help="Print addition information about the distributions of the intensities, number of peptides remove "
                    "after normalization, etc.",
               is_flag=True)
-def peptide_normalization(peptides: str, contaminants: str, output: str, nmethod: str, impute: bool,
-                          pnormalization: bool, compress: bool, log2: bool,
-                          violin: bool, verbose: bool) -> None:
+def peptide_normalization(peptides: str, contaminants: str, output: str, nmethod: str, smethod: str, impute: bool,
+                          pnormalization: bool, compress: bool, violin: bool, verbose: bool) -> None:
     """
     Normalize the peptide intensities using different methods.
     :param peptides:
     :param contaminants:
     :param output:
     :param nmethod:
+    :param smethod:
     :param impute:
     :param pnormalization:
     :param compress:
@@ -289,9 +294,7 @@ def peptide_normalization(peptides: str, contaminants: str, output: str, nmethod
         dataset_df = pd.read_csv(peptides, sep=",")
     print_dataset_size(dataset_df, "Number of peptides: ", verbose)
 
-    print("Logarithmic if specified..")
-    dataset_df.loc[dataset_df.Intensity == 0, INTENSITY] = 1
-    dataset_df[NORM_INTENSITY] = np.log2(dataset_df[INTENSITY]) if log2 else dataset_df[INTENSITY]
+    dataset_df[NORM_INTENSITY] = dataset_df[INTENSITY]
 
     # Print the distribution of the original peptide intensities from quantms analysis
     if verbose:
@@ -324,10 +327,10 @@ def peptide_normalization(peptides: str, contaminants: str, output: str, nmethod
     print("Add Canonical peptides to the dataframe...")
     dataset_df[PEPTIDE_CANONICAL] = dataset_df[PEPTIDE_SEQUENCE].apply(lambda x: get_canonical_peptide(x))
 
-    print("Sum all peptidoforms per Sample...")
-    print("Number of peptides before sum selection: " + str(len(dataset_df.index)))
-    dataset_df = sum_peptidoform_intensities(dataset_df)
-    print("Number of peptides after sum: " + str(len(dataset_df.index)))
+    print("Summary all peptidoforms per Sample...")
+    print("Number of peptides before summary selection: " + str(len(dataset_df.index)))
+    dataset_df = sum_peptidoform_intensities(dataset_df, smethod)
+    print("Number of peptides after summary: " + str(len(dataset_df.index)))
 
     print("Average all peptidoforms per Peptide/Sample...")
     print("Number of peptides before average: " + str(len(dataset_df.index)))

From 6362b642a183a31aa8a62c360876edd8e37365ca Mon Sep 17 00:00:00 2001
From: WangHong007 <88552471+WangHong007@users.noreply.github.com>
Date: Wed, 19 Apr 2023 13:34:51 +0800
Subject: [PATCH 2/4] Revert "keep peptide raw intensity"

This reverts commit 2806b0b99cd2cb03119c27a5234d1037ecde5ffa.
---
 bin/peptide_normalization.py | 31 ++++++++++++++-----------------
 1 file changed, 14 insertions(+), 17 deletions(-)

diff --git a/bin/peptide_normalization.py b/bin/peptide_normalization.py
index 2f2bbeb..64f18c9 100644
--- a/bin/peptide_normalization.py
+++ b/bin/peptide_normalization.py
@@ -118,18 +118,14 @@ def get_peptidoform_normalize_intensities(dataset: DataFrame, higher_intensity:
     return dataset
 
 
-def sum_peptidoform_intensities(dataset: DataFrame, smethod: str) -> DataFrame:
+def sum_peptidoform_intensities(dataset: DataFrame) -> DataFrame:
     """
-    Summary the peptidoform intensities for all peptidofrom across replicates of the same sample.
+    Sum the peptidoform intensities for all peptidofrom across replicates of the same sample.
     :param dataset: Dataframe to be analyzed
-    :param smethod: Method to summary peptidoform intensities per peptide
     :return: dataframe with the intensities
     """
     dataset = dataset[dataset[NORM_INTENSITY].notna()]
-    if smethod == "sum":
-        normalize_df = dataset.groupby([PEPTIDE_CANONICAL, SAMPLE_ID, BIOREPLICATE, CONDITION])[NORM_INTENSITY].sum()
-    elif smethod == "median":
-        normalize_df = dataset.groupby([PEPTIDE_CANONICAL, SAMPLE_ID, BIOREPLICATE, CONDITION])[NORM_INTENSITY].median()
+    normalize_df = dataset.groupby([PEPTIDE_CANONICAL, SAMPLE_ID, BIOREPLICATE, CONDITION])[NORM_INTENSITY].sum()
     normalize_df = normalize_df.reset_index()
     normalize_df = pd.merge(normalize_df,
                             dataset[[PROTEIN_NAME, PEPTIDE_CANONICAL, SAMPLE_ID, BIOREPLICATE, CONDITION]], how='left',
@@ -252,26 +248,25 @@ def impute_peptide_intensities(dataset_df, field, class_field):
 @click.option("--output", help="Peptide intensity file including other all properties for normalization")
 @click.option('--nmethod', help="Normalization method used to normalize intensities for all samples (options: qnorm)",
               default="qnorm")
-@click.option("--smethod", help="Summary method used to calculate the intensity for peptides with multiple peptidoforms (options: sum)",
-              default="sum")
 @click.option("--impute", help="Impute the missing values using MissForest", is_flag=True)
 @click.option("--pnormalization", help="Normalize the peptide intensities using different methods (options: qnorm)",
               is_flag=True)
 @click.option("--compress", help="Read the input peptides file in compress gzip file", is_flag=True)
+@click.option("--log2", help="Transform to log2 the peptide intensity values before normalization", is_flag=True)
 @click.option("--violin", help="Use violin plot instead of boxplot for distribution representations", is_flag=True)
 @click.option("--verbose",
               help="Print addition information about the distributions of the intensities, number of peptides remove "
                    "after normalization, etc.",
               is_flag=True)
-def peptide_normalization(peptides: str, contaminants: str, output: str, nmethod: str, smethod: str, impute: bool,
-                          pnormalization: bool, compress: bool, violin: bool, verbose: bool) -> None:
+def peptide_normalization(peptides: str, contaminants: str, output: str, nmethod: str, impute: bool,
+                          pnormalization: bool, compress: bool, log2: bool,
+                          violin: bool, verbose: bool) -> None:
     """
     Normalize the peptide intensities using different methods.
     :param peptides:
     :param contaminants:
     :param output:
     :param nmethod:
-    :param smethod:
     :param impute:
     :param pnormalization:
     :param compress:
@@ -294,7 +289,9 @@ def peptide_normalization(peptides: str, contaminants: str, output: str, nmethod
         dataset_df = pd.read_csv(peptides, sep=",")
     print_dataset_size(dataset_df, "Number of peptides: ", verbose)
 
-    dataset_df[NORM_INTENSITY] = dataset_df[INTENSITY]
+    print("Logarithmic if specified..")
+    dataset_df.loc[dataset_df.Intensity == 0, INTENSITY] = 1
+    dataset_df[NORM_INTENSITY] = np.log2(dataset_df[INTENSITY]) if log2 else dataset_df[INTENSITY]
 
     # Print the distribution of the original peptide intensities from quantms analysis
     if verbose:
@@ -327,10 +324,10 @@ def peptide_normalization(peptides: str, contaminants: str, output: str, nmethod
     print("Add Canonical peptides to the dataframe...")
     dataset_df[PEPTIDE_CANONICAL] = dataset_df[PEPTIDE_SEQUENCE].apply(lambda x: get_canonical_peptide(x))
 
-    print("Summary all peptidoforms per Sample...")
-    print("Number of peptides before summary selection: " + str(len(dataset_df.index)))
-    dataset_df = sum_peptidoform_intensities(dataset_df, smethod)
-    print("Number of peptides after summary: " + str(len(dataset_df.index)))
+    print("Sum all peptidoforms per Sample...")
+    print("Number of peptides before sum selection: " + str(len(dataset_df.index)))
+    dataset_df = sum_peptidoform_intensities(dataset_df)
+    print("Number of peptides after sum: " + str(len(dataset_df.index)))
 
     print("Average all peptidoforms per Peptide/Sample...")
     print("Number of peptides before average: " + str(len(dataset_df.index)))

From 9a473fe60a367e12298063038d9a1e3df1f9aacc Mon Sep 17 00:00:00 2001
From: WangHong007 <88552471+WangHong007@users.noreply.github.com>
Date: Fri, 21 Apr 2023 20:10:23 +0800
Subject: [PATCH 3/4] Compute TPA and concentration

---
 bin/compute_tpa.py | 105 +++++++++
 bin/histones.json  | 526 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 631 insertions(+)
 create mode 100644 bin/compute_tpa.py
 create mode 100644 bin/histones.json

diff --git a/bin/compute_tpa.py b/bin/compute_tpa.py
new file mode 100644
index 0000000..3d6d87a
--- /dev/null
+++ b/bin/compute_tpa.py
@@ -0,0 +1,105 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import click
+import pandas as pd
+from pandas import DataFrame, Series
+from pyopenms import *
+
+from bin.compute_ibaq import print_help_msg, parse_uniprot_name
+from ibaq.ibaqpy_commons import PROTEIN_NAME, NORM_INTENSITY, SAMPLE_ID, CONDITION
+from ibaq.ibaqpy_commons import plot_distributions, plot_box_plot
+import numpy as np
+import os
+
+@click.command()
+@click.option("-f", "--fasta", help="Protein database")
+@click.option("-p", "--peptides", help="Peptide identifications with intensities following the peptide intensity output")
+@click.option("-r", "--ruler", help="Whether to use proteomicRuler", default=True)
+@click.option("-n", "--ploidy", help="ploidy number", default=2)
+@click.option("-c", "--cpc", help="cellular protein concentration(g/L)", default=200)
+@click.option("-o", "--output", help="Output file with the proteins and ibaq values")
+def tpa_compute(fasta: str, peptides: str, ruler: bool, ploidy: int, cpc: float, output: str) -> None:
+    """
+    This command computes the protein copies and concentrations according to a file output of peptides with the
+    format described in peptide_contaminants_file_generation.py.
+    :param fasta: Fasta file used to perform the peptide identification
+    :param peptides: Peptide intensity file.
+    :param ruler: Whether to compute protein copies, weight and concentration.
+    :param ploidy: ploidy number.
+    :param cpc: cellular protein concentration(g/L).
+    :param output: output format containing the ibaq values.
+    :return:
+    """
+    if peptides is None or fasta is None:
+        print_help_msg(ibaq_compute)
+        exit(1)
+
+    data = pd.read_csv(peptides, sep=",")
+    print(data.head())
+
+    res = pd.DataFrame(data.groupby([PROTEIN_NAME, SAMPLE_ID, CONDITION])[NORM_INTENSITY].sum())
+    res = res.reset_index()
+    proteins = res["ProteinName"].unique().tolist()
+    proteins = sum([i.split(";") for i in proteins], [])
+
+    # calculate molecular weight of quantified proteins
+    mw_dict = dict()
+    fasta_proteins = list()  # type: list[FASTAEntry]
+    FASTAFile().load(fasta, fasta_proteins)
+    for entry in fasta_proteins:
+        accession, name = entry.identifier.split("|")[1:]
+        if name in proteins:
+            mw = AASequence().fromString(entry.sequence).getMonoWeight()
+            mw_dict[name] = mw
+    
+    # calculate TPA for every protein group
+    def get_protein_group_mw(group: str) -> float:
+        mw_list = [mw_dict[i] for i in group.split(";")]
+        return sum(mw_list)
+
+    res["MolecularWeight"] = res.apply(lambda x: get_protein_group_mw(x["ProteinName"]), axis=1)
+    res["MolecularWeight"] = res["MolecularWeight"].fillna(1)
+    res["MolecularWeight"] = res["MolecularWeight"].replace(0, 1)
+    res["TPA"] = res["NormIntensity"] / res["MolecularWeight"]
+    plot_distributions(res, "TPA", SAMPLE_ID, log2=True)
+    plot_box_plot(res, "TPA", SAMPLE_ID, log2=True,
+                    title="TPA Distribution", violin=False)
+
+    # calculate protein weight(ng) and concentration(nM)
+    if ruler:
+        avogadro = 6.02214129e23
+        average_base_pair_mass = 617.96 # 615.8771
+
+        organism = res.loc[0, "ProteinName"].split("_")[1].lower()
+        histone_df = pd.read_json(open(os.path.split(__file__)[0] + os.sep + "histones.json", "rb")).T
+        target_histones = histone_df[histone_df["name"] == organism.lower()]
+        genome_size = target_histones["genome_size"].values[0]
+        histones_list = target_histones["histone_entries"].values[0]
+                                                                                           
+        dna_mass = ploidy * genome_size * average_base_pair_mass / avogadro
+
+        def calculate(protein_intensity, histone_intensity, mw):
+            copy = (protein_intensity / histone_intensity) * dna_mass * avogadro / mw
+            # The number of moles is equal to the number of particles divided by Avogadro's constant
+            moles = copy * 1e9 / avogadro # unit nmol
+            weight = moles * mw # unit ng
+            return tuple([copy, moles, weight])
+
+        def proteomicRuler(df):
+            histone_intensity = df[df["ProteinName"].isin(histones_list)]["NormIntensity"].sum()
+            histone_intensity = histone_intensity if histone_intensity > 0 else 1
+            df[["Copy", "Moles[nmol]", "Weight[ng]"]] = df.apply(lambda x: calculate(x["NormIntensity"], histone_intensity, x["MolecularWeight"]), axis = 1, result_type="expand")
+            volume = df["Weight[ng]"].sum() * 1e-9 / cpc # unit L
+            df["Concentration[nM]"] = df["Moles[nmol]"] / volume # unit nM
+            return df
+
+        res = res.groupby(["Condition"]).apply(proteomicRuler)
+
+        plot_distributions(res, "Concentration[nM]", SAMPLE_ID, log2=True)
+        plot_box_plot(res, "Concentration[nM]", SAMPLE_ID, log2=True,
+                        title="Concentration[nM] Distribution", violin=False)
+        res.to_csv(output, index=False)
+
+if __name__ == '__main__':
+    tpa_compute()
diff --git a/bin/histones.json b/bin/histones.json
new file mode 100644
index 0000000..4f22566
--- /dev/null
+++ b/bin/histones.json
@@ -0,0 +1,526 @@
+{
+  "HUMAN": {
+    "name": "human",
+    "genome_size": 3220000000,
+    "histone_proteins": [
+      "P07305",
+      "Q8IZA3",
+      "Q92522",
+      "P0C5Y9",
+      "P0C5Z0",
+      "H0YFX9",
+      "Q9BTM1",
+      "A8MQC5",
+      "C9J0D1",
+      "C9J386",
+      "E5RJU1",
+      "Q71UI9",
+      "P16104",
+      "B4DJC3",
+      "D6RCF2",
+      "O75367",
+      "Q5SQT3",
+      "Q9P0M6",
+      "P0C0S5",
+      "P0C1H6",
+      "A9UJN3",
+      "P57053",
+      "Q7Z2G1",
+      "B4DEB1",
+      "P84243",
+      "B2R4P9",
+      "K7EMV3",
+      "K7ES00",
+      "K7EK07",
+      "K7EP01",
+      "Q6NXT2",
+      "Q02539",
+      "P16401",
+      "P16403",
+      "P16402",
+      "Q4VB24",
+      "P10412",
+      "A3R0T8",
+      "A1L407",
+      "P22492",
+      "Q96QV6",
+      "P04908",
+      "Q08AJ9",
+      "Q93077",
+      "P20671",
+      "P0C0S8",
+      "A3KPC7",
+      "Q96KK5",
+      "Q99878",
+      "A4FTV9",
+      "Q92646",
+      "Q96A08",
+      "P33778",
+      "P62807",
+      "P58876",
+      "B2R4S9",
+      "Q93079",
+      "P06899",
+      "O60814",
+      "Q99880",
+      "I6L9F7",
+      "Q99879",
+      "Q99877",
+      "P23527",
+      "P68431",
+      "P62805",
+      "Q99525",
+      "Q0VAS5",
+      "B2R4R0",
+      "Q6FI13",
+      "Q8IUE6",
+      "Q16777",
+      "Q16778",
+      "B4DR52",
+      "Q5QNW6",
+      "Q71DI3",
+      "Q5TEC6",
+      "Q7L7L0",
+      "Q8N257",
+      "Q16695",
+      "Q6TXQ4",
+      "Q14463",
+      "B4E0B3",
+      "B2R5B6",
+      "A2RUA4",
+      "B2R5B3",
+      "Q9HA11",
+      "A8K9J7",
+      "B2R6Y1",
+      "B4E380",
+      "A8K4Y7",
+      "Q6B823",
+      "Q6LBZ2",
+      "A3R0T7"
+    ],
+    "histone_entries": [
+      "H2AW_HUMAN",
+      "Q9HA11_HUMAN",
+      "H2AJ_HUMAN",
+      "H2B1L_HUMAN",
+      "H2B1M_HUMAN",
+      "H2A1J_HUMAN",
+      "H2B1N_HUMAN",
+      "H4G_HUMAN",
+      "H2A1A_HUMAN",
+      "H2A1H_HUMAN",
+      "H2B1A_HUMAN",
+      "H2B1H_HUMAN",
+      "H2A1C_HUMAN",
+      "Q92646_HUMAN",
+      "H1X_HUMAN",
+      "H2B3B_HUMAN",
+      "H18_HUMAN",
+      "H2A2B_HUMAN",
+      "H2BWT_HUMAN",
+      "H2A3_HUMAN",
+      "H2AV_HUMAN",
+      "H2AV_HUMAN",
+      "H32_HUMAN",
+      "Q6TXQ4_HUMAN",
+      "H3C_HUMAN",
+      "Q6LBZ2_HUMAN",
+      "H2A2A_HUMAN",
+      "Q6B823_HUMAN",
+      "H37_HUMAN",
+      "Q5SQT3_HUMAN",
+      "H2B2F_HUMAN",
+      "Q4VB24_HUMAN",
+      "H2B2E_HUMAN",
+      "H2A2C_HUMAN",
+      "H31T_HUMAN",
+      "Q14463_HUMAN",
+      "Q0VAS5_HUMAN",
+      "Q08AJ9_HUMAN",
+      "H11_HUMAN",
+      "H33_HUMAN",
+      "H31_HUMAN",
+      "H2B1C_HUMAN",
+      "H4_HUMAN",
+      "H2B1D_HUMAN",
+      "H2BFS_HUMAN",
+      "H2B1B_HUMAN",
+      "H2B1O_HUMAN",
+      "H1T_HUMAN",
+      "H2A1D_HUMAN",
+      "H12_HUMAN",
+      "H13_HUMAN",
+      "H15_HUMAN",
+      "H2AX_HUMAN",
+      "H14_HUMAN",
+      "H2AB2_HUMAN",
+      "H2AB1_HUMAN",
+      "H2BFM_HUMAN",
+      "H2A1_HUMAN",
+      "H2AZ_HUMAN",
+      "H10_HUMAN",
+      "H2B1J_HUMAN",
+      "H2A1B_HUMAN",
+      "H2AY_HUMAN",
+      "H2B1K_HUMAN",
+      "K7ES00_HUMAN",
+      "K7EP01_HUMAN",
+      "K7EMV3_HUMAN",
+      "K7EK07_HUMAN",
+      "I6L9F7_HUMAN",
+      "H0YFX9_HUMAN",
+      "E5RJU1_HUMAN",
+      "D6RCF2_HUMAN",
+      "C9J386_HUMAN",
+      "C9J0D1_HUMAN",
+      "B4E380_HUMAN",
+      "B4E0B3_HUMAN",
+      "B4DR52_HUMAN",
+      "B4DJC3_HUMAN",
+      "B4DEB1_HUMAN",
+      "B2R6Y1_HUMAN",
+      "B2R5B6_HUMAN",
+      "B2R5B3_HUMAN",
+      "B2R4S9_HUMAN",
+      "B2R4R0_HUMAN",
+      "B2R4P9_HUMAN",
+      "A9UJN3_HUMAN",
+      "A8K9J7_HUMAN",
+      "A8K4Y7_HUMAN",
+      "A4FTV9_HUMAN",
+      "A3R0T8_HUMAN",
+      "A3R0T7_HUMAN",
+      "A3KPC7_HUMAN",
+      "A2RUA4_HUMAN",
+      "A1L407_HUMAN"
+    ]
+  },
+  "MOUSE": {
+    "name": "mouse",
+    "genome_size": 2800000000,
+    "histone_proteins": [
+      "Q9DAD9",
+      "B2RTM0",
+      "Q8CBB6",
+      "Q921L4",
+      "Q5M8Q2",
+      "Q810S6",
+      "B1AV31",
+      "Q497L1",
+      "A9Z055",
+      "Q8CGP9",
+      "P10922",
+      "Q8CJI4",
+      "E0CZ52",
+      "E0CYL2",
+      "Q8VIK3",
+      "Q80ZM5",
+      "Q9CQ70",
+      "Q8R1M2",
+      "Q3THW5",
+      "Q8R029",
+      "B2RVP5",
+      "P27661",
+      "Q9QZQ8",
+      "Q8CA90",
+      "Q8BP16",
+      "Q9CTR1",
+      "Q8CCK0",
+      "Q9D3V6",
+      "Q9D3U7",
+      "Q3UA95",
+      "Q3TFU6",
+      "G3UWL7",
+      "G3UX40",
+      "P0C0S6",
+      "F8WI35",
+      "E0CZ27",
+      "E0CYN1",
+      "E0CYR7",
+      "P84244",
+      "P02301",
+      "Q9QYL0",
+      "P43275",
+      "P43276",
+      "P15864",
+      "Q5SZA3",
+      "P43277",
+      "Q149Z9",
+      "P43274",
+      "Q07133",
+      "I7HFT9",
+      "Q8CGP4",
+      "P22752",
+      "B2RVF0",
+      "Q61668",
+      "Q8CGP5",
+      "A0AUV1",
+      "Q8CGP6",
+      "A3KPD0",
+      "Q8CGP7",
+      "F8WIX8",
+      "A0JNS9",
+      "P70696",
+      "Q64475",
+      "Q6ZWY9",
+      "P10853",
+      "Q64478",
+      "A0JLV3",
+      "Q8CGP1",
+      "B2RVD5",
+      "P10854",
+      "B2RTK3",
+      "Q8CGP2",
+      "P68433",
+      "P84228",
+      "A1L0U3",
+      "A1L0V4",
+      "P62806",
+      "B2RWH3",
+      "Q6GSS7",
+      "Q64522",
+      "Q64523",
+      "Q149V4",
+      "Q64525",
+      "G3X9D5",
+      "Q64524",
+      "B9EI85",
+      "Q61667",
+      "Q8BFU2",
+      "A2AB79",
+      "Q9D2U9",
+      "Q8CGP0",
+      "Q6B822",
+      "P07978",
+      "Q9D9Z7"
+    ]
+  },
+  "DROME": {
+    "name": "drome",
+    "genome_size": 144000000,
+    "histone_proteins": [
+      "Q6TXQ1",
+      "P02255",
+      "Q4AB54",
+      "Q4ABE3",
+      "Q4ABD8",
+      "Q4AB94",
+      "P84051",
+      "Q4AB57",
+      "P08985",
+      "P02283",
+      "P02299",
+      "E2QCP0",
+      "P84249",
+      "P84040"
+    ],
+    "histone_entries": [
+      "Q9DAD9_MOUSE",
+      "B2RTM0_MOUSE",
+      "Q8CBB6_MOUSE",
+      "Q921L4_MOUSE",
+      "H2AL1_MOUSE",
+      "Q810S6_MOUSE",
+      "Q9DAD9_MOUSE",
+      "Q497L1_MOUSE",
+      "A9Z055_MOUSE",
+      "Q8CGP9_MOUSE",
+      "H10_MOUSE",
+      "H1FNT_MOUSE",
+      "E0CZ52_MOUSE",
+      "E0CYL2_MOUSE",
+      "H18_MOUSE",
+      "Q80ZM5_MOUSE",
+      "H2AB1_MOUSE",
+      "H2AJ_MOUSE",
+      "H2AV_MOUSE",
+      "Q8R029_MOUSE",
+      "B2RVP5_MOUSE",
+      "H2AX_MOUSE",
+      "H2AY_MOUSE",
+      "Q8CA90_MOUSE",
+      "Q8BP16_MOUSE",
+      "Q9CTR1_MOUSE",
+      "H2AW_MOUSE",
+      "Q9D3V6_MOUSE",
+      "Q9D3U7_MOUSE",
+      "Q3UA95_MOUSE",
+      "Q3TFU6_MOUSE",
+      "G3UWL7_MOUSE",
+      "G3UX40_MOUSE",
+      "H2AZ_MOUSE",
+      "F8WI35_MOUSE",
+      "E0CZ27_MOUSE",
+      "E0CYN1_MOUSE",
+      "E0CYR7_MOUSE",
+      "H33_MOUSE",
+      "H3C_MOUSE",
+      "HILS1_MOUSE",
+      "H11_MOUSE",
+      "H15_MOUSE",
+      "H12_MOUSE",
+      "Q5SZA3_MOUSE",
+      "H13_MOUSE",
+      "Q149Z9_MOUSE",
+      "H14_MOUSE",
+      "H1T_MOUSE",
+      "I7HFT9_MOUSE",
+      "Q8CGP4_MOUSE",
+      "H2A1P_MOUSE",
+      "H2A1B_MOUSE",
+      "H2A1C_MOUSE",
+      "H2A1D_MOUSE",
+      "H2A1E_MOUSE",
+      "H2A1G_MOUSE",
+      "H2A1I_MOUSE",
+      "H2A1N_MOUSE",
+      "H2A1O_MOUSE",
+      "B2RVF0_MOUSE",
+      "Q61668_MOUSE",
+      "H2A1F_MOUSE",
+      "A0AUV1_MOUSE",
+      "H2A1H_MOUSE",
+      "A3KPD0_MOUSE",
+      "H2A1K_MOUSE",
+      "A0JNS9_MOUSE",
+      "H2B1A_MOUSE",
+      "H2B1B_MOUSE",
+      "H2B1C_MOUSE",
+      "H2B1F_MOUSE",
+      "H2B1H_MOUSE",
+      "A0JLV3_MOUSE",
+      "H2B1K_MOUSE",
+      "B2RVD5_MOUSE",
+      "H2B1M_MOUSE",
+      "B2RTK3_MOUSE",
+      "H2B1P_MOUSE",
+      "H31_MOUSE",
+      "H32_MOUSE",
+      "A1L0U3_MOUSE",
+      "A1L0V4_MOUSE",
+      "H4_MOUSE",
+      "B2RWH3_MOUSE",
+      "H2A2A_MOUSE",
+      "H2A2B_MOUSE",
+      "H2A2C_MOUSE",
+      "Q149V4_MOUSE",
+      "H2B2B_MOUSE",
+      "H2B2E_MOUSE",
+      "B9EI85_MOUSE",
+      "Q61667_MOUSE",
+      "H2A3_MOUSE",
+      "A2AB79_MOUSE",
+      "H2B3A_MOUSE",
+      "H2B3B_MOUSE",
+      "Q6B822_MOUSE",
+      "PRM2_MOUSE",
+      "H2BL1_MOUSE"
+    ]
+  },
+  "CAEEL": {
+    "name": "caeel",
+    "genome_size": 104000000,
+    "histone_proteins": [
+      "P10771",
+      "P15796",
+      "Q19743",
+      "O17536",
+      "O01833",
+      "Q9U3W3",
+      "Q18336",
+      "P09588",
+      "J7S164",
+      "J7SA65",
+      "Q27485",
+      "Q23429",
+      "Q27511",
+      "P04255",
+      "Q27894",
+      "P08898",
+      "K7ZUH9",
+      "Q10453",
+      "Q9U281",
+      "Q27490",
+      "Q27532",
+      "P62784",
+      "Q27484",
+      "Q27876",
+      "O16277",
+      "Q27489"
+    ],
+    "histone_entries": [
+      "H24_CAEEL",
+      "H12_CAEEL",
+      "H13_CAEEL",
+      "H14_CAEEL",
+      "H15_CAEEL",
+      "Q9U3W3_CAEEL",
+      "H1X_CAEEL",
+      "H2A_CAEEL",
+      "J7S164_CAEEL",
+      "J7SA65_CAEEL",
+      "Q27485_CAEEL",
+      "Q23429_CAEEL",
+      "H2AV_CAEEL",
+      "H2B1_CAEEL",
+      "H2B2_CAEEL",
+      "H3_CAEEL",
+      "K7ZUH9_CAEEL",
+      "H331_CAEEL",
+      "H332_CAEEL",
+      "H33L1_CAEEL",
+      "H33L2_CAEEL",
+      "H4_CAEEL",
+      "H2B3_CAEEL",
+      "H2B4_CAEEL",
+      "H16_CAEEL",
+      "H33L3_CAEEL"
+    ]
+  },
+  "YEAST": {
+    "name": "yeast",
+    "genome_size": 12100000,
+    "histone_proteins": [
+      "P53551",
+      "P04911",
+      "P04912",
+      "Q12692",
+      "P02293",
+      "P02294",
+      "P61830",
+      "P02309"
+    ],
+    "histone_entries": [
+      "H1_YEAST",
+      "H2A1_YEAST",
+      "H2A2_YEAST",
+      "H2AZ_YEAST",
+      "H2B1_YEAST",
+      "H2B2_YEAST",
+      "H3_YEAST",
+      "H4_YEAST"
+    ]
+  },
+  "SCHPO": {
+    "name": "schpo",
+    "genome_size": 14100000,
+    "histone_proteins": [
+      "P48003",
+      "P04909",
+      "P04910",
+      "P04913",
+      "P09988",
+      "P10651",
+      "P09322"
+    ],
+    "histone_entries": [
+      "H2AZ_SCHPO",
+      "H2A1_SCHPO",
+      "H2A2_SCHPO",
+      "H2B1_SCHPO",
+      "H31_SCHPO",
+      "H33_SCHPO",
+      "H4_SCHPO"
+    ]
+  }
+}
\ No newline at end of file

From d1498446b2bd22f5f15e20252e54d12514b18120 Mon Sep 17 00:00:00 2001
From: WangHong007 <88552471+WangHong007@users.noreply.github.com>
Date: Fri, 21 Apr 2023 20:38:49 +0800
Subject: [PATCH 4/4] Update compute_tpa.py

---
 bin/compute_tpa.py | 25 +++++++++++--------------
 1 file changed, 11 insertions(+), 14 deletions(-)

diff --git a/bin/compute_tpa.py b/bin/compute_tpa.py
index 3d6d87a..795fa76 100644
--- a/bin/compute_tpa.py
+++ b/bin/compute_tpa.py
@@ -3,7 +3,6 @@
 
 import click
 import pandas as pd
-from pandas import DataFrame, Series
 from pyopenms import *
 
 from bin.compute_ibaq import print_help_msg, parse_uniprot_name
@@ -32,7 +31,7 @@ def tpa_compute(fasta: str, peptides: str, ruler: bool, ploidy: int, cpc: float,
     :return:
     """
     if peptides is None or fasta is None:
-        print_help_msg(ibaq_compute)
+        print_help_msg(tpa_compute)
         exit(1)
 
     data = pd.read_csv(peptides, sep=",")
@@ -52,7 +51,7 @@ def tpa_compute(fasta: str, peptides: str, ruler: bool, ploidy: int, cpc: float,
         if name in proteins:
             mw = AASequence().fromString(entry.sequence).getMonoWeight()
             mw_dict[name] = mw
-    
+
     # calculate TPA for every protein group
     def get_protein_group_mw(group: str) -> float:
         mw_list = [mw_dict[i] for i in group.split(";")]
@@ -63,43 +62,41 @@ def get_protein_group_mw(group: str) -> float:
     res["MolecularWeight"] = res["MolecularWeight"].replace(0, 1)
     res["TPA"] = res["NormIntensity"] / res["MolecularWeight"]
     plot_distributions(res, "TPA", SAMPLE_ID, log2=True)
-    plot_box_plot(res, "TPA", SAMPLE_ID, log2=True,
-                    title="TPA Distribution", violin=False)
+    plot_box_plot(res, "TPA", SAMPLE_ID, log2=True, title="TPA Distribution", violin=False)
 
     # calculate protein weight(ng) and concentration(nM)
     if ruler:
         avogadro = 6.02214129e23
-        average_base_pair_mass = 617.96 # 615.8771
+        average_base_pair_mass = 617.96  # 615.8771
 
         organism = res.loc[0, "ProteinName"].split("_")[1].lower()
         histone_df = pd.read_json(open(os.path.split(__file__)[0] + os.sep + "histones.json", "rb")).T
         target_histones = histone_df[histone_df["name"] == organism.lower()]
         genome_size = target_histones["genome_size"].values[0]
-        histones_list = target_histones["histone_entries"].values[0]
-                                                                                           
+        histones_list = target_histones["histone_entries"].values[0]                                                                          
         dna_mass = ploidy * genome_size * average_base_pair_mass / avogadro
 
         def calculate(protein_intensity, histone_intensity, mw):
             copy = (protein_intensity / histone_intensity) * dna_mass * avogadro / mw
             # The number of moles is equal to the number of particles divided by Avogadro's constant
-            moles = copy * 1e9 / avogadro # unit nmol
-            weight = moles * mw # unit ng
+            moles = copy * 1e9 / avogadro  # unit nmol
+            weight = moles * mw  # unit ng
             return tuple([copy, moles, weight])
 
         def proteomicRuler(df):
             histone_intensity = df[df["ProteinName"].isin(histones_list)]["NormIntensity"].sum()
             histone_intensity = histone_intensity if histone_intensity > 0 else 1
             df[["Copy", "Moles[nmol]", "Weight[ng]"]] = df.apply(lambda x: calculate(x["NormIntensity"], histone_intensity, x["MolecularWeight"]), axis = 1, result_type="expand")
-            volume = df["Weight[ng]"].sum() * 1e-9 / cpc # unit L
-            df["Concentration[nM]"] = df["Moles[nmol]"] / volume # unit nM
+            volume = df["Weight[ng]"].sum() * 1e-9 / cpc  # unit L
+            df["Concentration[nM]"] = df["Moles[nmol]"] / volume  # unit nM
             return df
 
         res = res.groupby(["Condition"]).apply(proteomicRuler)
 
         plot_distributions(res, "Concentration[nM]", SAMPLE_ID, log2=True)
-        plot_box_plot(res, "Concentration[nM]", SAMPLE_ID, log2=True,
-                        title="Concentration[nM] Distribution", violin=False)
+        plot_box_plot(res, "Concentration[nM]", SAMPLE_ID, log2=True, title="Concentration[nM] Distribution", violin=False)
         res.to_csv(output, index=False)
 
+
 if __name__ == '__main__':
     tpa_compute()