From 2806b0b99cd2cb03119c27a5234d1037ecde5ffa Mon Sep 17 00:00:00 2001 From: WangHong007 <88552471+WangHong007@users.noreply.github.com> Date: Mon, 27 Mar 2023 21:18:22 +0800 Subject: [PATCH 1/4] keep peptide raw intensity --- bin/peptide_normalization.py | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/bin/peptide_normalization.py b/bin/peptide_normalization.py index 64f18c9..2f2bbeb 100644 --- a/bin/peptide_normalization.py +++ b/bin/peptide_normalization.py @@ -118,14 +118,18 @@ def get_peptidoform_normalize_intensities(dataset: DataFrame, higher_intensity: return dataset -def sum_peptidoform_intensities(dataset: DataFrame) -> DataFrame: +def sum_peptidoform_intensities(dataset: DataFrame, smethod: str) -> DataFrame: """ - Sum the peptidoform intensities for all peptidofrom across replicates of the same sample. + Summary the peptidoform intensities for all peptidofrom across replicates of the same sample. :param dataset: Dataframe to be analyzed + :param smethod: Method to summary peptidoform intensities per peptide :return: dataframe with the intensities """ dataset = dataset[dataset[NORM_INTENSITY].notna()] - normalize_df = dataset.groupby([PEPTIDE_CANONICAL, SAMPLE_ID, BIOREPLICATE, CONDITION])[NORM_INTENSITY].sum() + if smethod == "sum": + normalize_df = dataset.groupby([PEPTIDE_CANONICAL, SAMPLE_ID, BIOREPLICATE, CONDITION])[NORM_INTENSITY].sum() + elif smethod == "median": + normalize_df = dataset.groupby([PEPTIDE_CANONICAL, SAMPLE_ID, BIOREPLICATE, CONDITION])[NORM_INTENSITY].median() normalize_df = normalize_df.reset_index() normalize_df = pd.merge(normalize_df, dataset[[PROTEIN_NAME, PEPTIDE_CANONICAL, SAMPLE_ID, BIOREPLICATE, CONDITION]], how='left', @@ -248,25 +252,26 @@ def impute_peptide_intensities(dataset_df, field, class_field): @click.option("--output", help="Peptide intensity file including other all properties for normalization") @click.option('--nmethod', help="Normalization method used to normalize intensities for all samples (options: qnorm)", default="qnorm") +@click.option("--smethod", help="Summary method used to calculate the intensity for peptides with multiple peptidoforms (options: sum)", + default="sum") @click.option("--impute", help="Impute the missing values using MissForest", is_flag=True) @click.option("--pnormalization", help="Normalize the peptide intensities using different methods (options: qnorm)", is_flag=True) @click.option("--compress", help="Read the input peptides file in compress gzip file", is_flag=True) -@click.option("--log2", help="Transform to log2 the peptide intensity values before normalization", is_flag=True) @click.option("--violin", help="Use violin plot instead of boxplot for distribution representations", is_flag=True) @click.option("--verbose", help="Print addition information about the distributions of the intensities, number of peptides remove " "after normalization, etc.", is_flag=True) -def peptide_normalization(peptides: str, contaminants: str, output: str, nmethod: str, impute: bool, - pnormalization: bool, compress: bool, log2: bool, - violin: bool, verbose: bool) -> None: +def peptide_normalization(peptides: str, contaminants: str, output: str, nmethod: str, smethod: str, impute: bool, + pnormalization: bool, compress: bool, violin: bool, verbose: bool) -> None: """ Normalize the peptide intensities using different methods. :param peptides: :param contaminants: :param output: :param nmethod: + :param smethod: :param impute: :param pnormalization: :param compress: @@ -289,9 +294,7 @@ def peptide_normalization(peptides: str, contaminants: str, output: str, nmethod dataset_df = pd.read_csv(peptides, sep=",") print_dataset_size(dataset_df, "Number of peptides: ", verbose) - print("Logarithmic if specified..") - dataset_df.loc[dataset_df.Intensity == 0, INTENSITY] = 1 - dataset_df[NORM_INTENSITY] = np.log2(dataset_df[INTENSITY]) if log2 else dataset_df[INTENSITY] + dataset_df[NORM_INTENSITY] = dataset_df[INTENSITY] # Print the distribution of the original peptide intensities from quantms analysis if verbose: @@ -324,10 +327,10 @@ def peptide_normalization(peptides: str, contaminants: str, output: str, nmethod print("Add Canonical peptides to the dataframe...") dataset_df[PEPTIDE_CANONICAL] = dataset_df[PEPTIDE_SEQUENCE].apply(lambda x: get_canonical_peptide(x)) - print("Sum all peptidoforms per Sample...") - print("Number of peptides before sum selection: " + str(len(dataset_df.index))) - dataset_df = sum_peptidoform_intensities(dataset_df) - print("Number of peptides after sum: " + str(len(dataset_df.index))) + print("Summary all peptidoforms per Sample...") + print("Number of peptides before summary selection: " + str(len(dataset_df.index))) + dataset_df = sum_peptidoform_intensities(dataset_df, smethod) + print("Number of peptides after summary: " + str(len(dataset_df.index))) print("Average all peptidoforms per Peptide/Sample...") print("Number of peptides before average: " + str(len(dataset_df.index))) From 6362b642a183a31aa8a62c360876edd8e37365ca Mon Sep 17 00:00:00 2001 From: WangHong007 <88552471+WangHong007@users.noreply.github.com> Date: Wed, 19 Apr 2023 13:34:51 +0800 Subject: [PATCH 2/4] Revert "keep peptide raw intensity" This reverts commit 2806b0b99cd2cb03119c27a5234d1037ecde5ffa. --- bin/peptide_normalization.py | 31 ++++++++++++++----------------- 1 file changed, 14 insertions(+), 17 deletions(-) diff --git a/bin/peptide_normalization.py b/bin/peptide_normalization.py index 2f2bbeb..64f18c9 100644 --- a/bin/peptide_normalization.py +++ b/bin/peptide_normalization.py @@ -118,18 +118,14 @@ def get_peptidoform_normalize_intensities(dataset: DataFrame, higher_intensity: return dataset -def sum_peptidoform_intensities(dataset: DataFrame, smethod: str) -> DataFrame: +def sum_peptidoform_intensities(dataset: DataFrame) -> DataFrame: """ - Summary the peptidoform intensities for all peptidofrom across replicates of the same sample. + Sum the peptidoform intensities for all peptidofrom across replicates of the same sample. :param dataset: Dataframe to be analyzed - :param smethod: Method to summary peptidoform intensities per peptide :return: dataframe with the intensities """ dataset = dataset[dataset[NORM_INTENSITY].notna()] - if smethod == "sum": - normalize_df = dataset.groupby([PEPTIDE_CANONICAL, SAMPLE_ID, BIOREPLICATE, CONDITION])[NORM_INTENSITY].sum() - elif smethod == "median": - normalize_df = dataset.groupby([PEPTIDE_CANONICAL, SAMPLE_ID, BIOREPLICATE, CONDITION])[NORM_INTENSITY].median() + normalize_df = dataset.groupby([PEPTIDE_CANONICAL, SAMPLE_ID, BIOREPLICATE, CONDITION])[NORM_INTENSITY].sum() normalize_df = normalize_df.reset_index() normalize_df = pd.merge(normalize_df, dataset[[PROTEIN_NAME, PEPTIDE_CANONICAL, SAMPLE_ID, BIOREPLICATE, CONDITION]], how='left', @@ -252,26 +248,25 @@ def impute_peptide_intensities(dataset_df, field, class_field): @click.option("--output", help="Peptide intensity file including other all properties for normalization") @click.option('--nmethod', help="Normalization method used to normalize intensities for all samples (options: qnorm)", default="qnorm") -@click.option("--smethod", help="Summary method used to calculate the intensity for peptides with multiple peptidoforms (options: sum)", - default="sum") @click.option("--impute", help="Impute the missing values using MissForest", is_flag=True) @click.option("--pnormalization", help="Normalize the peptide intensities using different methods (options: qnorm)", is_flag=True) @click.option("--compress", help="Read the input peptides file in compress gzip file", is_flag=True) +@click.option("--log2", help="Transform to log2 the peptide intensity values before normalization", is_flag=True) @click.option("--violin", help="Use violin plot instead of boxplot for distribution representations", is_flag=True) @click.option("--verbose", help="Print addition information about the distributions of the intensities, number of peptides remove " "after normalization, etc.", is_flag=True) -def peptide_normalization(peptides: str, contaminants: str, output: str, nmethod: str, smethod: str, impute: bool, - pnormalization: bool, compress: bool, violin: bool, verbose: bool) -> None: +def peptide_normalization(peptides: str, contaminants: str, output: str, nmethod: str, impute: bool, + pnormalization: bool, compress: bool, log2: bool, + violin: bool, verbose: bool) -> None: """ Normalize the peptide intensities using different methods. :param peptides: :param contaminants: :param output: :param nmethod: - :param smethod: :param impute: :param pnormalization: :param compress: @@ -294,7 +289,9 @@ def peptide_normalization(peptides: str, contaminants: str, output: str, nmethod dataset_df = pd.read_csv(peptides, sep=",") print_dataset_size(dataset_df, "Number of peptides: ", verbose) - dataset_df[NORM_INTENSITY] = dataset_df[INTENSITY] + print("Logarithmic if specified..") + dataset_df.loc[dataset_df.Intensity == 0, INTENSITY] = 1 + dataset_df[NORM_INTENSITY] = np.log2(dataset_df[INTENSITY]) if log2 else dataset_df[INTENSITY] # Print the distribution of the original peptide intensities from quantms analysis if verbose: @@ -327,10 +324,10 @@ def peptide_normalization(peptides: str, contaminants: str, output: str, nmethod print("Add Canonical peptides to the dataframe...") dataset_df[PEPTIDE_CANONICAL] = dataset_df[PEPTIDE_SEQUENCE].apply(lambda x: get_canonical_peptide(x)) - print("Summary all peptidoforms per Sample...") - print("Number of peptides before summary selection: " + str(len(dataset_df.index))) - dataset_df = sum_peptidoform_intensities(dataset_df, smethod) - print("Number of peptides after summary: " + str(len(dataset_df.index))) + print("Sum all peptidoforms per Sample...") + print("Number of peptides before sum selection: " + str(len(dataset_df.index))) + dataset_df = sum_peptidoform_intensities(dataset_df) + print("Number of peptides after sum: " + str(len(dataset_df.index))) print("Average all peptidoforms per Peptide/Sample...") print("Number of peptides before average: " + str(len(dataset_df.index))) From 9a473fe60a367e12298063038d9a1e3df1f9aacc Mon Sep 17 00:00:00 2001 From: WangHong007 <88552471+WangHong007@users.noreply.github.com> Date: Fri, 21 Apr 2023 20:10:23 +0800 Subject: [PATCH 3/4] Compute TPA and concentration --- bin/compute_tpa.py | 105 +++++++++ bin/histones.json | 526 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 631 insertions(+) create mode 100644 bin/compute_tpa.py create mode 100644 bin/histones.json diff --git a/bin/compute_tpa.py b/bin/compute_tpa.py new file mode 100644 index 0000000..3d6d87a --- /dev/null +++ b/bin/compute_tpa.py @@ -0,0 +1,105 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import click +import pandas as pd +from pandas import DataFrame, Series +from pyopenms import * + +from bin.compute_ibaq import print_help_msg, parse_uniprot_name +from ibaq.ibaqpy_commons import PROTEIN_NAME, NORM_INTENSITY, SAMPLE_ID, CONDITION +from ibaq.ibaqpy_commons import plot_distributions, plot_box_plot +import numpy as np +import os + +@click.command() +@click.option("-f", "--fasta", help="Protein database") +@click.option("-p", "--peptides", help="Peptide identifications with intensities following the peptide intensity output") +@click.option("-r", "--ruler", help="Whether to use proteomicRuler", default=True) +@click.option("-n", "--ploidy", help="ploidy number", default=2) +@click.option("-c", "--cpc", help="cellular protein concentration(g/L)", default=200) +@click.option("-o", "--output", help="Output file with the proteins and ibaq values") +def tpa_compute(fasta: str, peptides: str, ruler: bool, ploidy: int, cpc: float, output: str) -> None: + """ + This command computes the protein copies and concentrations according to a file output of peptides with the + format described in peptide_contaminants_file_generation.py. + :param fasta: Fasta file used to perform the peptide identification + :param peptides: Peptide intensity file. + :param ruler: Whether to compute protein copies, weight and concentration. + :param ploidy: ploidy number. + :param cpc: cellular protein concentration(g/L). + :param output: output format containing the ibaq values. + :return: + """ + if peptides is None or fasta is None: + print_help_msg(ibaq_compute) + exit(1) + + data = pd.read_csv(peptides, sep=",") + print(data.head()) + + res = pd.DataFrame(data.groupby([PROTEIN_NAME, SAMPLE_ID, CONDITION])[NORM_INTENSITY].sum()) + res = res.reset_index() + proteins = res["ProteinName"].unique().tolist() + proteins = sum([i.split(";") for i in proteins], []) + + # calculate molecular weight of quantified proteins + mw_dict = dict() + fasta_proteins = list() # type: list[FASTAEntry] + FASTAFile().load(fasta, fasta_proteins) + for entry in fasta_proteins: + accession, name = entry.identifier.split("|")[1:] + if name in proteins: + mw = AASequence().fromString(entry.sequence).getMonoWeight() + mw_dict[name] = mw + + # calculate TPA for every protein group + def get_protein_group_mw(group: str) -> float: + mw_list = [mw_dict[i] for i in group.split(";")] + return sum(mw_list) + + res["MolecularWeight"] = res.apply(lambda x: get_protein_group_mw(x["ProteinName"]), axis=1) + res["MolecularWeight"] = res["MolecularWeight"].fillna(1) + res["MolecularWeight"] = res["MolecularWeight"].replace(0, 1) + res["TPA"] = res["NormIntensity"] / res["MolecularWeight"] + plot_distributions(res, "TPA", SAMPLE_ID, log2=True) + plot_box_plot(res, "TPA", SAMPLE_ID, log2=True, + title="TPA Distribution", violin=False) + + # calculate protein weight(ng) and concentration(nM) + if ruler: + avogadro = 6.02214129e23 + average_base_pair_mass = 617.96 # 615.8771 + + organism = res.loc[0, "ProteinName"].split("_")[1].lower() + histone_df = pd.read_json(open(os.path.split(__file__)[0] + os.sep + "histones.json", "rb")).T + target_histones = histone_df[histone_df["name"] == organism.lower()] + genome_size = target_histones["genome_size"].values[0] + histones_list = target_histones["histone_entries"].values[0] + + dna_mass = ploidy * genome_size * average_base_pair_mass / avogadro + + def calculate(protein_intensity, histone_intensity, mw): + copy = (protein_intensity / histone_intensity) * dna_mass * avogadro / mw + # The number of moles is equal to the number of particles divided by Avogadro's constant + moles = copy * 1e9 / avogadro # unit nmol + weight = moles * mw # unit ng + return tuple([copy, moles, weight]) + + def proteomicRuler(df): + histone_intensity = df[df["ProteinName"].isin(histones_list)]["NormIntensity"].sum() + histone_intensity = histone_intensity if histone_intensity > 0 else 1 + df[["Copy", "Moles[nmol]", "Weight[ng]"]] = df.apply(lambda x: calculate(x["NormIntensity"], histone_intensity, x["MolecularWeight"]), axis = 1, result_type="expand") + volume = df["Weight[ng]"].sum() * 1e-9 / cpc # unit L + df["Concentration[nM]"] = df["Moles[nmol]"] / volume # unit nM + return df + + res = res.groupby(["Condition"]).apply(proteomicRuler) + + plot_distributions(res, "Concentration[nM]", SAMPLE_ID, log2=True) + plot_box_plot(res, "Concentration[nM]", SAMPLE_ID, log2=True, + title="Concentration[nM] Distribution", violin=False) + res.to_csv(output, index=False) + +if __name__ == '__main__': + tpa_compute() diff --git a/bin/histones.json b/bin/histones.json new file mode 100644 index 0000000..4f22566 --- /dev/null +++ b/bin/histones.json @@ -0,0 +1,526 @@ +{ + "HUMAN": { + "name": "human", + "genome_size": 3220000000, + "histone_proteins": [ + "P07305", + "Q8IZA3", + "Q92522", + "P0C5Y9", + "P0C5Z0", + "H0YFX9", + "Q9BTM1", + "A8MQC5", + "C9J0D1", + "C9J386", + "E5RJU1", + "Q71UI9", + "P16104", + "B4DJC3", + "D6RCF2", + "O75367", + "Q5SQT3", + "Q9P0M6", + "P0C0S5", + "P0C1H6", + "A9UJN3", + "P57053", + "Q7Z2G1", + "B4DEB1", + "P84243", + "B2R4P9", + "K7EMV3", + "K7ES00", + "K7EK07", + "K7EP01", + "Q6NXT2", + "Q02539", + "P16401", + "P16403", + "P16402", + "Q4VB24", + "P10412", + "A3R0T8", + "A1L407", + "P22492", + "Q96QV6", + "P04908", + "Q08AJ9", + "Q93077", + "P20671", + "P0C0S8", + "A3KPC7", + "Q96KK5", + "Q99878", + "A4FTV9", + "Q92646", + "Q96A08", + "P33778", + "P62807", + "P58876", + "B2R4S9", + "Q93079", + "P06899", + "O60814", + "Q99880", + "I6L9F7", + "Q99879", + "Q99877", + "P23527", + "P68431", + "P62805", + "Q99525", + "Q0VAS5", + "B2R4R0", + "Q6FI13", + "Q8IUE6", + "Q16777", + "Q16778", + "B4DR52", + "Q5QNW6", + "Q71DI3", + "Q5TEC6", + "Q7L7L0", + "Q8N257", + "Q16695", + "Q6TXQ4", + "Q14463", + "B4E0B3", + "B2R5B6", + "A2RUA4", + "B2R5B3", + "Q9HA11", + "A8K9J7", + "B2R6Y1", + "B4E380", + "A8K4Y7", + "Q6B823", + "Q6LBZ2", + "A3R0T7" + ], + "histone_entries": [ + "H2AW_HUMAN", + "Q9HA11_HUMAN", + "H2AJ_HUMAN", + "H2B1L_HUMAN", + "H2B1M_HUMAN", + "H2A1J_HUMAN", + "H2B1N_HUMAN", + "H4G_HUMAN", + "H2A1A_HUMAN", + "H2A1H_HUMAN", + "H2B1A_HUMAN", + "H2B1H_HUMAN", + "H2A1C_HUMAN", + "Q92646_HUMAN", + "H1X_HUMAN", + "H2B3B_HUMAN", + "H18_HUMAN", + "H2A2B_HUMAN", + "H2BWT_HUMAN", + "H2A3_HUMAN", + "H2AV_HUMAN", + "H2AV_HUMAN", + "H32_HUMAN", + "Q6TXQ4_HUMAN", + "H3C_HUMAN", + "Q6LBZ2_HUMAN", + "H2A2A_HUMAN", + "Q6B823_HUMAN", + "H37_HUMAN", + "Q5SQT3_HUMAN", + "H2B2F_HUMAN", + "Q4VB24_HUMAN", + "H2B2E_HUMAN", + "H2A2C_HUMAN", + "H31T_HUMAN", + "Q14463_HUMAN", + "Q0VAS5_HUMAN", + "Q08AJ9_HUMAN", + "H11_HUMAN", + "H33_HUMAN", + "H31_HUMAN", + "H2B1C_HUMAN", + "H4_HUMAN", + "H2B1D_HUMAN", + "H2BFS_HUMAN", + "H2B1B_HUMAN", + "H2B1O_HUMAN", + "H1T_HUMAN", + "H2A1D_HUMAN", + "H12_HUMAN", + "H13_HUMAN", + "H15_HUMAN", + "H2AX_HUMAN", + "H14_HUMAN", + "H2AB2_HUMAN", + "H2AB1_HUMAN", + "H2BFM_HUMAN", + "H2A1_HUMAN", + "H2AZ_HUMAN", + "H10_HUMAN", + "H2B1J_HUMAN", + "H2A1B_HUMAN", + "H2AY_HUMAN", + "H2B1K_HUMAN", + "K7ES00_HUMAN", + "K7EP01_HUMAN", + "K7EMV3_HUMAN", + "K7EK07_HUMAN", + "I6L9F7_HUMAN", + "H0YFX9_HUMAN", + "E5RJU1_HUMAN", + "D6RCF2_HUMAN", + "C9J386_HUMAN", + "C9J0D1_HUMAN", + "B4E380_HUMAN", + "B4E0B3_HUMAN", + "B4DR52_HUMAN", + "B4DJC3_HUMAN", + "B4DEB1_HUMAN", + "B2R6Y1_HUMAN", + "B2R5B6_HUMAN", + "B2R5B3_HUMAN", + "B2R4S9_HUMAN", + "B2R4R0_HUMAN", + "B2R4P9_HUMAN", + "A9UJN3_HUMAN", + "A8K9J7_HUMAN", + "A8K4Y7_HUMAN", + "A4FTV9_HUMAN", + "A3R0T8_HUMAN", + "A3R0T7_HUMAN", + "A3KPC7_HUMAN", + "A2RUA4_HUMAN", + "A1L407_HUMAN" + ] + }, + "MOUSE": { + "name": "mouse", + "genome_size": 2800000000, + "histone_proteins": [ + "Q9DAD9", + "B2RTM0", + "Q8CBB6", + "Q921L4", + "Q5M8Q2", + "Q810S6", + "B1AV31", + "Q497L1", + "A9Z055", + "Q8CGP9", + "P10922", + "Q8CJI4", + "E0CZ52", + "E0CYL2", + "Q8VIK3", + "Q80ZM5", + "Q9CQ70", + "Q8R1M2", + "Q3THW5", + "Q8R029", + "B2RVP5", + "P27661", + "Q9QZQ8", + "Q8CA90", + "Q8BP16", + "Q9CTR1", + "Q8CCK0", + "Q9D3V6", + "Q9D3U7", + "Q3UA95", + "Q3TFU6", + "G3UWL7", + "G3UX40", + "P0C0S6", + "F8WI35", + "E0CZ27", + "E0CYN1", + "E0CYR7", + "P84244", + "P02301", + "Q9QYL0", + "P43275", + "P43276", + "P15864", + "Q5SZA3", + "P43277", + "Q149Z9", + "P43274", + "Q07133", + "I7HFT9", + "Q8CGP4", + "P22752", + "B2RVF0", + "Q61668", + "Q8CGP5", + "A0AUV1", + "Q8CGP6", + "A3KPD0", + "Q8CGP7", + "F8WIX8", + "A0JNS9", + "P70696", + "Q64475", + "Q6ZWY9", + "P10853", + "Q64478", + "A0JLV3", + "Q8CGP1", + "B2RVD5", + "P10854", + "B2RTK3", + "Q8CGP2", + "P68433", + "P84228", + "A1L0U3", + "A1L0V4", + "P62806", + "B2RWH3", + "Q6GSS7", + "Q64522", + "Q64523", + "Q149V4", + "Q64525", + "G3X9D5", + "Q64524", + "B9EI85", + "Q61667", + "Q8BFU2", + "A2AB79", + "Q9D2U9", + "Q8CGP0", + "Q6B822", + "P07978", + "Q9D9Z7" + ] + }, + "DROME": { + "name": "drome", + "genome_size": 144000000, + "histone_proteins": [ + "Q6TXQ1", + "P02255", + "Q4AB54", + "Q4ABE3", + "Q4ABD8", + "Q4AB94", + "P84051", + "Q4AB57", + "P08985", + "P02283", + "P02299", + "E2QCP0", + "P84249", + "P84040" + ], + "histone_entries": [ + "Q9DAD9_MOUSE", + "B2RTM0_MOUSE", + "Q8CBB6_MOUSE", + "Q921L4_MOUSE", + "H2AL1_MOUSE", + "Q810S6_MOUSE", + "Q9DAD9_MOUSE", + "Q497L1_MOUSE", + "A9Z055_MOUSE", + "Q8CGP9_MOUSE", + "H10_MOUSE", + "H1FNT_MOUSE", + "E0CZ52_MOUSE", + "E0CYL2_MOUSE", + "H18_MOUSE", + "Q80ZM5_MOUSE", + "H2AB1_MOUSE", + "H2AJ_MOUSE", + "H2AV_MOUSE", + "Q8R029_MOUSE", + "B2RVP5_MOUSE", + "H2AX_MOUSE", + "H2AY_MOUSE", + "Q8CA90_MOUSE", + "Q8BP16_MOUSE", + "Q9CTR1_MOUSE", + "H2AW_MOUSE", + "Q9D3V6_MOUSE", + "Q9D3U7_MOUSE", + "Q3UA95_MOUSE", + "Q3TFU6_MOUSE", + "G3UWL7_MOUSE", + "G3UX40_MOUSE", + "H2AZ_MOUSE", + "F8WI35_MOUSE", + "E0CZ27_MOUSE", + "E0CYN1_MOUSE", + "E0CYR7_MOUSE", + "H33_MOUSE", + "H3C_MOUSE", + "HILS1_MOUSE", + "H11_MOUSE", + "H15_MOUSE", + "H12_MOUSE", + "Q5SZA3_MOUSE", + "H13_MOUSE", + "Q149Z9_MOUSE", + "H14_MOUSE", + "H1T_MOUSE", + "I7HFT9_MOUSE", + "Q8CGP4_MOUSE", + "H2A1P_MOUSE", + "H2A1B_MOUSE", + "H2A1C_MOUSE", + "H2A1D_MOUSE", + "H2A1E_MOUSE", + "H2A1G_MOUSE", + "H2A1I_MOUSE", + "H2A1N_MOUSE", + "H2A1O_MOUSE", + "B2RVF0_MOUSE", + "Q61668_MOUSE", + "H2A1F_MOUSE", + "A0AUV1_MOUSE", + "H2A1H_MOUSE", + "A3KPD0_MOUSE", + "H2A1K_MOUSE", + "A0JNS9_MOUSE", + "H2B1A_MOUSE", + "H2B1B_MOUSE", + "H2B1C_MOUSE", + "H2B1F_MOUSE", + "H2B1H_MOUSE", + "A0JLV3_MOUSE", + "H2B1K_MOUSE", + "B2RVD5_MOUSE", + "H2B1M_MOUSE", + "B2RTK3_MOUSE", + "H2B1P_MOUSE", + "H31_MOUSE", + "H32_MOUSE", + "A1L0U3_MOUSE", + "A1L0V4_MOUSE", + "H4_MOUSE", + "B2RWH3_MOUSE", + "H2A2A_MOUSE", + "H2A2B_MOUSE", + "H2A2C_MOUSE", + "Q149V4_MOUSE", + "H2B2B_MOUSE", + "H2B2E_MOUSE", + "B9EI85_MOUSE", + "Q61667_MOUSE", + "H2A3_MOUSE", + "A2AB79_MOUSE", + "H2B3A_MOUSE", + "H2B3B_MOUSE", + "Q6B822_MOUSE", + "PRM2_MOUSE", + "H2BL1_MOUSE" + ] + }, + "CAEEL": { + "name": "caeel", + "genome_size": 104000000, + "histone_proteins": [ + "P10771", + "P15796", + "Q19743", + "O17536", + "O01833", + "Q9U3W3", + "Q18336", + "P09588", + "J7S164", + "J7SA65", + "Q27485", + "Q23429", + "Q27511", + "P04255", + "Q27894", + "P08898", + "K7ZUH9", + "Q10453", + "Q9U281", + "Q27490", + "Q27532", + "P62784", + "Q27484", + "Q27876", + "O16277", + "Q27489" + ], + "histone_entries": [ + "H24_CAEEL", + "H12_CAEEL", + "H13_CAEEL", + "H14_CAEEL", + "H15_CAEEL", + "Q9U3W3_CAEEL", + "H1X_CAEEL", + "H2A_CAEEL", + "J7S164_CAEEL", + "J7SA65_CAEEL", + "Q27485_CAEEL", + "Q23429_CAEEL", + "H2AV_CAEEL", + "H2B1_CAEEL", + "H2B2_CAEEL", + "H3_CAEEL", + "K7ZUH9_CAEEL", + "H331_CAEEL", + "H332_CAEEL", + "H33L1_CAEEL", + "H33L2_CAEEL", + "H4_CAEEL", + "H2B3_CAEEL", + "H2B4_CAEEL", + "H16_CAEEL", + "H33L3_CAEEL" + ] + }, + "YEAST": { + "name": "yeast", + "genome_size": 12100000, + "histone_proteins": [ + "P53551", + "P04911", + "P04912", + "Q12692", + "P02293", + "P02294", + "P61830", + "P02309" + ], + "histone_entries": [ + "H1_YEAST", + "H2A1_YEAST", + "H2A2_YEAST", + "H2AZ_YEAST", + "H2B1_YEAST", + "H2B2_YEAST", + "H3_YEAST", + "H4_YEAST" + ] + }, + "SCHPO": { + "name": "schpo", + "genome_size": 14100000, + "histone_proteins": [ + "P48003", + "P04909", + "P04910", + "P04913", + "P09988", + "P10651", + "P09322" + ], + "histone_entries": [ + "H2AZ_SCHPO", + "H2A1_SCHPO", + "H2A2_SCHPO", + "H2B1_SCHPO", + "H31_SCHPO", + "H33_SCHPO", + "H4_SCHPO" + ] + } +} \ No newline at end of file From d1498446b2bd22f5f15e20252e54d12514b18120 Mon Sep 17 00:00:00 2001 From: WangHong007 <88552471+WangHong007@users.noreply.github.com> Date: Fri, 21 Apr 2023 20:38:49 +0800 Subject: [PATCH 4/4] Update compute_tpa.py --- bin/compute_tpa.py | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-) diff --git a/bin/compute_tpa.py b/bin/compute_tpa.py index 3d6d87a..795fa76 100644 --- a/bin/compute_tpa.py +++ b/bin/compute_tpa.py @@ -3,7 +3,6 @@ import click import pandas as pd -from pandas import DataFrame, Series from pyopenms import * from bin.compute_ibaq import print_help_msg, parse_uniprot_name @@ -32,7 +31,7 @@ def tpa_compute(fasta: str, peptides: str, ruler: bool, ploidy: int, cpc: float, :return: """ if peptides is None or fasta is None: - print_help_msg(ibaq_compute) + print_help_msg(tpa_compute) exit(1) data = pd.read_csv(peptides, sep=",") @@ -52,7 +51,7 @@ def tpa_compute(fasta: str, peptides: str, ruler: bool, ploidy: int, cpc: float, if name in proteins: mw = AASequence().fromString(entry.sequence).getMonoWeight() mw_dict[name] = mw - + # calculate TPA for every protein group def get_protein_group_mw(group: str) -> float: mw_list = [mw_dict[i] for i in group.split(";")] @@ -63,43 +62,41 @@ def get_protein_group_mw(group: str) -> float: res["MolecularWeight"] = res["MolecularWeight"].replace(0, 1) res["TPA"] = res["NormIntensity"] / res["MolecularWeight"] plot_distributions(res, "TPA", SAMPLE_ID, log2=True) - plot_box_plot(res, "TPA", SAMPLE_ID, log2=True, - title="TPA Distribution", violin=False) + plot_box_plot(res, "TPA", SAMPLE_ID, log2=True, title="TPA Distribution", violin=False) # calculate protein weight(ng) and concentration(nM) if ruler: avogadro = 6.02214129e23 - average_base_pair_mass = 617.96 # 615.8771 + average_base_pair_mass = 617.96 # 615.8771 organism = res.loc[0, "ProteinName"].split("_")[1].lower() histone_df = pd.read_json(open(os.path.split(__file__)[0] + os.sep + "histones.json", "rb")).T target_histones = histone_df[histone_df["name"] == organism.lower()] genome_size = target_histones["genome_size"].values[0] - histones_list = target_histones["histone_entries"].values[0] - + histones_list = target_histones["histone_entries"].values[0] dna_mass = ploidy * genome_size * average_base_pair_mass / avogadro def calculate(protein_intensity, histone_intensity, mw): copy = (protein_intensity / histone_intensity) * dna_mass * avogadro / mw # The number of moles is equal to the number of particles divided by Avogadro's constant - moles = copy * 1e9 / avogadro # unit nmol - weight = moles * mw # unit ng + moles = copy * 1e9 / avogadro # unit nmol + weight = moles * mw # unit ng return tuple([copy, moles, weight]) def proteomicRuler(df): histone_intensity = df[df["ProteinName"].isin(histones_list)]["NormIntensity"].sum() histone_intensity = histone_intensity if histone_intensity > 0 else 1 df[["Copy", "Moles[nmol]", "Weight[ng]"]] = df.apply(lambda x: calculate(x["NormIntensity"], histone_intensity, x["MolecularWeight"]), axis = 1, result_type="expand") - volume = df["Weight[ng]"].sum() * 1e-9 / cpc # unit L - df["Concentration[nM]"] = df["Moles[nmol]"] / volume # unit nM + volume = df["Weight[ng]"].sum() * 1e-9 / cpc # unit L + df["Concentration[nM]"] = df["Moles[nmol]"] / volume # unit nM return df res = res.groupby(["Condition"]).apply(proteomicRuler) plot_distributions(res, "Concentration[nM]", SAMPLE_ID, log2=True) - plot_box_plot(res, "Concentration[nM]", SAMPLE_ID, log2=True, - title="Concentration[nM] Distribution", violin=False) + plot_box_plot(res, "Concentration[nM]", SAMPLE_ID, log2=True, title="Concentration[nM] Distribution", violin=False) res.to_csv(output, index=False) + if __name__ == '__main__': tpa_compute()