From 544fc851a55d6901586187ba82b0a25ceb3f76ec Mon Sep 17 00:00:00 2001
From: WangHong007 <88552471+WangHong007@users.noreply.github.com>
Date: Fri, 24 Nov 2023 21:30:33 +0800
Subject: [PATCH 1/7] Combine streaming normalization into main code

---
 README.md                           |    4 +-
 bin/compute_tpa.py                  |    3 +-
 bin/peptide_normalization.py        | 1090 +++++++++++++++++++--------
 bin/peptide_normalization_stream.py |  702 -----------------
 ibaq/ibaqpy_commons.py              |   47 +-
 5 files changed, 788 insertions(+), 1058 deletions(-)
 delete mode 100644 bin/peptide_normalization_stream.py

diff --git a/README.md b/README.md
index 1f5ead4..2b36b1c 100644
--- a/README.md
+++ b/README.md
@@ -65,7 +65,7 @@ E.g. http://ftp.pride.ebi.ac.uk/pub/databases/pride/resources/proteomes/absolute
 python peptide_normalization.py --msstats PXD003947.sdrf_openms_design_msstats_in.csv --sdrf PXD003947.sdrf.tsv --remove_ids data/contaminants_ids.tsv --remove_decoy_contaminants --remove_low_frequency_peptides --output PXD003947-peptides-norm.csv
 ```
 
-The command provides an additional `flag` for skip_normalization, pnormalization, compress, log2, violin, verbose.
+The command provides additional flags: `skip_normalization`, `pnormalization`, `compress`, `log2`, `violin` and `verbose`. If you use a feature parquet file as input, the `--sdrf` option can be omitted.
 
 ```asciidoc
 Usage: peptide_normalization.py [OPTIONS]
 
@@ -74,6 +74,8 @@ Options:
   -m, --msstats TEXT              MsStats file import generated by quantms
   -p, --parquet TEXT              Parquet file import generated by quantmsio
   -s, --sdrf TEXT                 SDRF file import generated by quantms
+  --stream                        Process the input in chunks to reduce memory usage
+  --chunksize INTEGER             Number of rows to read per chunk when streaming the MSstats or parquet input
   --min_aa INTEGER                Minimum number of amino acids to filter
                                   peptides
   --min_unique INTEGER            Minimum number of unique peptides to filter

diff --git a/bin/compute_tpa.py b/bin/compute_tpa.py
index df50051..2c78d45 100644
--- a/bin/compute_tpa.py
+++ b/bin/compute_tpa.py
@@ -9,9 +9,8 @@
 from matplotlib.backends.backend_pdf import PdfPages
 from pyopenms import *
 
-from bin.compute_ibaq import print_help_msg
 from ibaq.ibaqpy_commons import (CONDITION, NORM_INTENSITY, PROTEIN_NAME, SAMPLE_ID,
-                                 plot_box_plot, plot_distributions,
+                                 plot_box_plot, plot_distributions, print_help_msg,
                                  remove_contaminants_decoys, get_accession)
 
diff --git a/bin/peptide_normalization.py b/bin/peptide_normalization.py
index 5da12fa..c5b4d34 100644
--- a/bin/peptide_normalization.py
+++ b/bin/peptide_normalization.py
@@ -5,14 +5,18 @@
 import numpy as np
 import pandas as pd
 import qnorm
+import os
+import random
+import uuid
 from matplotlib.backends.backend_pdf import PdfPages
 from pandas import DataFrame
+import pyarrow.parquet as pq
 
 from ibaq.ibaqpy_commons import (
     BIOREPLICATE,
     CHANNEL,
     CONDITION,
-    FEATURE_COLUMNS,
+    PARQUET_COLUMNS,
     FRACTION,
     FRAGMENT_ION,
     INTENSITY,
@@ -27,12 +31,12 @@
     SAMPLE_ID,
     SEARCH_ENGINE,
     STUDY_ID,
+    TMT16plex,
+    TMT11plex,
+    TMT10plex,
+    TMT6plex,
     ITRAQ4plex,
     ITRAQ8plex,
-    TMT6plex,
-    TMT10plex,
-    TMT11plex,
-    TMT16plex,
     get_canonical_peptide,
     get_spectrum_prefix,
     get_study_accession,
@@ -49,35 +53,228 @@
 )
 
 
-# TODO: The following two func are useless.
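+# Helper functions shared by the batch and stream code paths: SDRF parsing,
+# parquet inspection, label-type detection, and per-chunk dataframe processing.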
-def remove_outliers_iqr(dataset: DataFrame):
-    """
-    This method removes outliers from the dataframe inplace, the variable used for the outlier removal is Intensity
-    :param dataset: Peptide dataframe
-    :return: None
-    """
-    q1 = dataset[INTENSITY].quantile(0.25)
-    q3 = dataset[INTENSITY].quantile(0.75)
-    iqr = q3 - q1
-
-    dataset.query("(@q1 - 1.5 * @iqr) <= Intensity <= (@q3 + 1.5 * @iqr)", inplace=True)
-
-
-def remove_missing_values(normalize_df: DataFrame, ratio: float = 0.3) -> DataFrame:
-    """
-    Remove missing values if the peptide do not have values in the most of the samples
-    :param normalize_df: data frame with the data
-    :param ratio: ratio of samples without intensity values.
-    :return:
-    """
-    n_samples = len(normalize_df.columns)
-    normalize_df = normalize_df.dropna(thresh=round(n_samples * ratio))
-    return normalize_df
+def print_dataset_size(dataset: DataFrame, message: str, verbose: bool) -> None:
+    if verbose:
+        print(message + str(len(dataset.index)))
+
+
+def analyse_sdrf(sdrf_path: str, compression: bool) -> tuple:
+    """
+    Parse the SDRF file and return four objects:
+    1. sdrf_df: A dataframe with channels and references annotated.
+    2. label: Label type of the experiment. LFQ, TMT or iTRAQ.
+    3. sample_names: A list containing all sample names.
+    4. choice: A dictionary mapping channel names to channel numbers.
+    :param sdrf_path: File path of SDRF.
+    :param compression: Whether the file is compressed.
+    :return: Tuple of (sdrf_df, label, sample_names, choice).
+    """
+    sdrf_df = pd.read_csv(sdrf_path, sep="\t", compression=compression)
+    sdrf_df[REFERENCE] = sdrf_df["comment[data file]"].apply(get_spectrum_prefix)
+
+    labels = set(sdrf_df["comment[label]"])
+    # Determine label type
+    label, choice = get_label(labels)
+    if label in ("TMT", "ITRAQ"):
+        choice_df = (
+            pd.DataFrame.from_dict(choice, orient="index", columns=[CHANNEL])
+            .reset_index()
+            .rename(columns={"index": "comment[label]"})
+        )
+        sdrf_df = sdrf_df.merge(choice_df, on="comment[label]", how="left")
+    sample_names = sdrf_df["source name"].unique().tolist()
+
+    return sdrf_df, label, sample_names, choice
+
+
+def analyse_feature_parquet(parquet_path: str, batch_size: int = 100000) -> tuple:
+    """Return label type, sample names and choice dict by iterating parquet.
+
+    :param parquet_path: Feature parquet path.
+    :param batch_size: Batch size used for iteration, defaults to 100000.
+    :return: Label type, sample names and choice dict.
+    """
+    parquet_chunks = read_large_parquet(parquet_path, batch_size)
+    labels, samples = list(), list()
+    for chunk in parquet_chunks:
+        samples.extend(chunk["sample_accession"].unique().tolist())
+        labels.extend(chunk["isotope_label_type"].unique().tolist())
+    samples = list(set(samples))
+    labels = list(set(labels))
+    # Determine label type
+    label, choice = get_label(labels)
+
+    return label, samples, choice
+
+
+def read_large_parquet(parquet_path: str, batch_size: int = 100000):
+    parquet_file = pq.ParquetFile(parquet_path)
+    for batch in parquet_file.iter_batches(batch_size=batch_size):
+        batch_df = batch.to_pandas()
+        yield batch_df
+
+
+def get_label(labels: list) -> tuple:
+    """Return label type and choice dict according to the labels list.
+
+    :param labels: Labels from SDRF.
+    :return: Tuple containing label type and choice dict.
+    
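Example (illustrative):
+        >>> label, choice = get_label(["TMT126", "TMT127N", "TMT131C"])
+        >>> label
+        'TMT'
+    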
""" - n_samples = len(normalize_df.columns) - normalize_df = normalize_df.dropna(thresh=round(n_samples * ratio)) - return normalize_df + choice = None + if len(labels) == 1: + label = "LFQ" + elif "TMT" in ",".join(labels) or "tmt" in ",".join(labels): + if ( + len(labels) > 11 + or "TMT134N" in labels + or "TMT133C" in labels + or "TMT133N" in labels + or "TMT132C" in labels + or "TMT132N" in labels + ): + choice = TMT16plex + elif len(labels) == 11 or "TMT131C" in labels: + choice = TMT11plex + elif len(labels) > 6: + choice = TMT10plex + else: + choice = TMT6plex + label = "TMT" + elif "ITRAQ" in ",".join(labels) or "itraq" in ",".join(labels): + if len(labels) > 4: + choice = ITRAQ8plex + else: + choice = ITRAQ4plex + label = "ITRAQ" + else: + exit("Warning: Only support label free, TMT and ITRAQ experiment!") + return label, choice -def print_dataset_size(dataset: DataFrame, message: str, verbose: bool) -> None: - if verbose: - print(message + str(len(dataset.index))) +def msstats_common_process(data_df: pd.DataFrame) -> pd.DataFrame: + """Apply common process on data. + + :param data_df: Feature data in dataframe. + :return: Processed data. + """ + data_df.rename( + columns={ + "ProteinName": PROTEIN_NAME, + "PeptideSequence": PEPTIDE_SEQUENCE, + "PrecursorCharge": PEPTIDE_CHARGE, + "Run": RUN, + "Condition": CONDITION, + "Intensity": INTENSITY, + }, + inplace=True, + ) + data_df[REFERENCE] = data_df[REFERENCE].apply(get_spectrum_prefix) + + return data_df + + +def parquet_common_process( + data_df: pd.DataFrame, label: str, choice: dict +) -> pd.DataFrame: + """Apply common process on data. + + :param data_df: Feature data in dataframe. + :return: Processed data. + """ + data_df = data_df.rename(columns=parquet_map) + data_df[PROTEIN_NAME] = data_df.apply(lambda x: ",".join(x[PROTEIN_NAME]), axis=1) + if label == "LFQ": + data_df.drop(CHANNEL, inplace=True, axis=1) + else: + data_df[CHANNEL] = data_df[CHANNEL].map(choice) + + return data_df + + +def merge_sdrf( + label: str, sdrf_df: pd.DataFrame, data_df: pd.DataFrame +) -> pd.DataFrame: + if label == "LFQ": + result_df = pd.merge( + data_df, + sdrf_df[["source name", REFERENCE]], + how="left", + on=[REFERENCE], + ) + elif label == "TMT": + result_df = pd.merge( + data_df, + sdrf_df[["source name", REFERENCE, CHANNEL]], + how="left", + on=[REFERENCE, CHANNEL], + ) + elif label == "ITRAQ": + result_df = pd.merge( + data_df, + sdrf_df[["source name", REFERENCE, CHANNEL]], + how="left", + on=[REFERENCE, CHANNEL], + ) + result_df.rename(columns={"source name": SAMPLE_ID}, inplace=True) + result_df = result_df[result_df["Condition"] != "Empty"] + + return result_df + + +def data_common_process(data_df: pd.DataFrame, min_aa: int) -> pd.DataFrame: + # Remove 0 intensity signals from the data + data_df = data_df[data_df[INTENSITY] > 0] + data_df = data_df[data_df["Condition"] != "Empty"] + + def map_canonical_seq(data_df: pd.DataFrame) -> (pd.DataFrame, dict): + modified_seqs = data_df[PEPTIDE_SEQUENCE].unique().tolist() + canonical_seqs = [get_canonical_peptide(i) for i in modified_seqs] + inner_canonical_dict = dict(zip(modified_seqs, canonical_seqs)) + data_df[PEPTIDE_CANONICAL] = data_df[PEPTIDE_SEQUENCE].map(inner_canonical_dict) + + return data_df, inner_canonical_dict + + if PEPTIDE_CANONICAL not in data_df.columns: + data_df, inner_canonical_dict = map_canonical_seq(data_df) + data_df[PEPTIDE_CANONICAL] = data_df[PEPTIDE_SEQUENCE].map(inner_canonical_dict) + # Filter peptides with less amino acids than min_aa (default: 7) + 
data_df = data_df[
+        data_df.apply(lambda x: len(x[PEPTIDE_CANONICAL]) >= min_aa, axis=1)
+    ]
+    data_df[PROTEIN_NAME] = data_df[PROTEIN_NAME].apply(parse_uniprot_accession)
+    data_df[STUDY_ID] = data_df[SAMPLE_ID].apply(get_study_accession)
+    if FRACTION not in data_df.columns:
+        data_df[FRACTION] = 1
+    data_df = data_df[
+        [
+            PROTEIN_NAME,
+            PEPTIDE_SEQUENCE,
+            PEPTIDE_CANONICAL,
+            PEPTIDE_CHARGE,
+            INTENSITY,
+            REFERENCE,
+            CONDITION,
+            # SAMPLE_ID must stay in the selection: it is converted to
+            # categorical below and used throughout the pipeline
+            SAMPLE_ID,
+            RUN,
+            BIOREPLICATE,
+            FRACTION,
+            FRAGMENT_ION,
+            ISOTOPE_LABEL_TYPE,
+            STUDY_ID,
+        ]
+    ]
+    data_df[CONDITION] = pd.Categorical(data_df[CONDITION])
+    data_df[STUDY_ID] = pd.Categorical(data_df[STUDY_ID])
+    data_df[SAMPLE_ID] = pd.Categorical(data_df[SAMPLE_ID])
+
+    return data_df
 
 
 def intensity_normalization(
@@ -291,6 +488,12 @@ def impute_peptide_intensities(dataset_df, field, class_field):
 @click.option(
     "-s", "--sdrf", help="SDRF file import generated by quantms", default=None
 )
+@click.option(
+    "--stream", help="Process the input in chunks to reduce memory usage", is_flag=True
+)
+@click.option(
+    "--chunksize",
+    help="Number of rows to read per chunk when streaming the MSstats or parquet input",
+    default=1000000,
+)
 @click.option(
     "--min_aa", help="Minimum number of amino acids to filter peptides", default=7
 )
@@ -362,6 +565,8 @@ def peptide_normalization(
     msstats: str,
     parquet: str,
     sdrf: str,
+    stream: bool,
+    chunksize: int,
     min_aa: int,
     min_unique: int,
     remove_ids: str,
@@ -385,209 +590,393 @@ def peptide_normalization(
         print_help_msg(peptide_normalization)
         exit(1)
 
+    pd.set_option("display.max_columns", None)
     compression_method = "gzip" if compress else None
+    print("Loading data..")
 
-    if parquet is None:
-        # Read the msstats file
-        feature_df = pd.read_csv(
-            msstats,
-            sep=",",
-            compression=compression_method,
-            dtype={CONDITION: "category", ISOTOPE_LABEL_TYPE: "category"},
-        )
+    if not stream:
+        if parquet is None:
+            # Read the msstats file
+            feature_df = pd.read_csv(
+                msstats,
+                sep=",",
+                compression=compression_method,
+                dtype={CONDITION: "category", ISOTOPE_LABEL_TYPE: "category"},
+            )
+
+            # Read the sdrf file
+            sdrf_df, label, sample_names, choice = analyse_sdrf(
+                sdrf, compression_method
+            )
+            print(sdrf_df)
+
+            # Merge the SDRF with the MSstats file
+            dataset_df = msstats_common_process(feature_df)
+            dataset_df = merge_sdrf(label, sdrf_df, dataset_df)
+            # Remove the intermediate variables and free the memory
+            del feature_df, sdrf_df
+            gc.collect()
+        else:
+            # The label type and channel map are required by parquet_common_process
+            label, sample_names, choice = analyse_feature_parquet(parquet)
+            dataset_df = pd.read_parquet(parquet)[PARQUET_COLUMNS]
+            dataset_df = parquet_common_process(dataset_df, label, choice)
+
+        dataset_df = data_common_process(dataset_df, min_aa)
+        # Only proteins with at least min_unique (default: 2) unique peptides are retained
+        unique_peptides = set(
+            dataset_df.groupby(PEPTIDE_CANONICAL)
+            .filter(lambda x: len(set(x[PROTEIN_NAME])) == 1)[PEPTIDE_CANONICAL]
+            .tolist()
+        )
+        strong_proteins = set(
+            dataset_df[dataset_df[PEPTIDE_CANONICAL].isin(unique_peptides)]
+            .groupby(PROTEIN_NAME)
+            .filter(lambda x: len(set(x[PEPTIDE_CANONICAL])) >= min_unique)[
+                PROTEIN_NAME
+            ]
+            .tolist()
+        )
+        dataset_df = dataset_df[dataset_df[PROTEIN_NAME].isin(strong_proteins)]
+
+        print_dataset_size(dataset_df, "Number of peptides: ", verbose)
 
-        feature_df.rename(
-            columns={
-                "ProteinName": PROTEIN_NAME,
-                "PeptideSequence": PEPTIDE_SEQUENCE,
-                "PrecursorCharge": PEPTIDE_CHARGE,
-                "Run": RUN,
-                "Condition": CONDITION,
-                "Intensity": INTENSITY,
-            },
-            inplace=True,
-        )
+        print("Logarithmic if specified..")
+        dataset_df.loc[dataset_df.Intensity == 0, INTENSITY] = 1
+        
dataset_df[NORM_INTENSITY] = ( + np.log2(dataset_df[INTENSITY]) if log2 else dataset_df[INTENSITY] ) + dataset_df.drop(INTENSITY, axis=1, inplace=True) + + # Print the distribution of the original peptide intensities from quantms analysis + if verbose: + sample_names = set(dataset_df[SAMPLE_ID]) + plot_width = len(sample_names) * 0.5 + 10 + pdf = PdfPages(qc_report) + density = plot_distributions( + dataset_df, + NORM_INTENSITY, + SAMPLE_ID, + log2=not log2, + width=plot_width, + title="Original peptidoform intensity distribution (no normalization)", + ) + plt.show() + pdf.savefig(density) + box = plot_box_plot( + dataset_df, + NORM_INTENSITY, + SAMPLE_ID, + log2=not log2, + width=plot_width, + title="Original peptidoform intensity distribution (no normalization)", + violin=violin, + ) + plt.show() + pdf.savefig(box) + + # Remove high abundant and contaminants proteins and the outliers + if remove_ids is not None: + print("Remove proteins from file...") + dataset_df = remove_protein_by_ids(dataset_df, remove_ids) + if remove_decoy_contaminants: + print("Remove decoy and contaminants...") + dataset_df = remove_contaminants_decoys(dataset_df) + + print_dataset_size(dataset_df, "Peptides after contaminants removal: ", verbose) - feature_df[PROTEIN_NAME] = feature_df[PROTEIN_NAME].apply( - parse_uniprot_accession + print("Normalize intensities.. ") + # dataset_df = dataset_df.dropna(how="any") + if not skip_normalization: + dataset_df = intensity_normalization( + dataset_df, + field=NORM_INTENSITY, + class_field=SAMPLE_ID, + scaling_method=nmethod, + ) + if verbose: + log_after_norm = ( + nmethod == "msstats" + or nmethod == "qnorm" + or ((nmethod == "quantile" or nmethod == "robust") and not log2) + ) + density = plot_distributions( + dataset_df, + NORM_INTENSITY, + SAMPLE_ID, + log2=log_after_norm, + width=plot_width, + title="Peptidoform intensity distribution after normalization, method: " + + nmethod, + ) + plt.show() + pdf.savefig(density) + box = plot_box_plot( + dataset_df, + NORM_INTENSITY, + SAMPLE_ID, + log2=log_after_norm, + width=plot_width, + title="Peptidoform intensity distribution after normalization, method: " + + nmethod, + violin=violin, + ) + plt.show() + pdf.savefig(box) + + print("Select the best peptidoform across fractions...") + print( + "Number of peptides before peptidofrom selection: " + + str(len(dataset_df.index)) + ) + dataset_df = get_peptidoform_normalize_intensities(dataset_df) + print( + "Number of peptides after peptidofrom selection: " + + str(len(dataset_df.index)) ) - # Read the sdrf file - sdrf_df = pd.read_csv(sdrf, sep="\t", compression=compression_method) - sdrf_df[REFERENCE] = sdrf_df["comment[data file]"].apply(get_spectrum_prefix) - print(sdrf_df) - - if FRACTION not in feature_df.columns: - feature_df[FRACTION] = 1 - feature_df = feature_df[ - [ - PROTEIN_NAME, - PEPTIDE_SEQUENCE, - PEPTIDE_CHARGE, - INTENSITY, - REFERENCE, - CONDITION, - RUN, - BIOREPLICATE, - FRACTION, - FRAGMENT_ION, - ISOTOPE_LABEL_TYPE, - ] - ] + # Add the peptide sequence canonical without the modifications + if PEPTIDE_CANONICAL not in dataset_df.columns: + print("Add Canonical peptides to the dataframe...") + dataset_df[PEPTIDE_CANONICAL] = dataset_df[PEPTIDE_SEQUENCE].apply( + lambda x: get_canonical_peptide(x) + ) - # Merged the SDRF with the Resulted file - labels = set(sdrf_df["comment[label]"]) - if CHANNEL not in feature_df.columns: - feature_df[REFERENCE] = feature_df[REFERENCE].apply(get_spectrum_prefix) - dataset_df = pd.merge( - feature_df, - sdrf_df[["source 
name", REFERENCE]], - how="left", - on=[REFERENCE], - ) - elif "TMT" in ",".join(labels) or "tmt" in ",".join(labels): - if ( - len(labels) > 11 - or "TMT134N" in labels - or "TMT133C" in labels - or "TMT133N" in labels - or "TMT132C" in labels - or "TMT132N" in labels - ): - choice = TMT16plex - elif len(labels) == 11 or "TMT131C" in labels: - choice = TMT11plex - elif len(labels) > 6: - choice = TMT10plex - else: - choice = TMT6plex - choice = ( - pd.DataFrame.from_dict(choice, orient="index", columns=[CHANNEL]) - .reset_index() - .rename(columns={"index": "comment[label]"}) - ) - sdrf_df = sdrf_df.merge(choice, on="comment[label]", how="left") - feature_df[REFERENCE] = feature_df[REFERENCE].apply(get_spectrum_prefix) - dataset_df = pd.merge( - feature_df, - sdrf_df[["source name", REFERENCE, CHANNEL]], - how="left", - on=[REFERENCE, CHANNEL], - ) - # result_df.drop(CHANNEL, axis=1, inplace=True) - dataset_df = dataset_df[dataset_df["Condition"] != "Empty"] - dataset_df.rename(columns={"Charge": PEPTIDE_CHARGE}, inplace=True) - elif "ITRAQ" in ",".join(labels) or "itraq" in ",".join(labels): - if len(labels) > 4: - choice = ITRAQ8plex - else: - choice = ITRAQ4plex - choice = ( - pd.DataFrame.from_dict(choice, orient="index", columns=[CHANNEL]) - .reset_index() - .rename(columns={"index": "comment[label]"}) - ) - sdrf_df = sdrf_df.merge(choice, on="comment[label]", how="left") - feature_df[REFERENCE] = feature_df[REFERENCE].apply(get_spectrum_prefix) - dataset_df = pd.merge( - feature_df, - sdrf_df[["source name", REFERENCE, CHANNEL]], - how="left", - on=[REFERENCE, CHANNEL], - ) - dataset_df = dataset_df[dataset_df["Condition"] != "Empty"] - dataset_df.rename(columns={"Charge": PEPTIDE_CHARGE}, inplace=True) - else: - print("Warning: Only support label free, TMT and ITRAQ experiment!") - exit(1) + print("Sum all peptidoforms per Sample...") + print("Number of peptides before sum selection: " + str(len(dataset_df.index))) + dataset_df = sum_peptidoform_intensities(dataset_df) + print("Number of peptides after sum: " + str(len(dataset_df.index))) - # Remove the intermediate variables and free the memory - del feature_df, sdrf_df - gc.collect() + print("Average all peptidoforms per Peptide/Sample...") + print("Number of peptides before average: " + str(len(dataset_df.index))) + dataset_df = average_peptide_intensities(dataset_df) + print("Number of peptides after average: " + str(len(dataset_df.index))) + + if verbose: + log_after_norm = ( + nmethod == "msstats" + or nmethod == "qnorm" + or ((nmethod == "quantile" or nmethod == "robust") and not log2) + ) + density = plot_distributions( + dataset_df, + NORM_INTENSITY, + SAMPLE_ID, + log2=log_after_norm, + width=plot_width, + title="Peptide intensity distribution method: " + nmethod, + ) + plt.show() + pdf.savefig(density) + box = plot_box_plot( + dataset_df, + NORM_INTENSITY, + SAMPLE_ID, + log2=log_after_norm, + width=plot_width, + title="Peptide intensity distribution method: " + nmethod, + violin=violin, + ) + plt.show() + pdf.savefig(box) + + if remove_low_frequency_peptides: + print( + "Peptides before removing low frequency peptides: " + + str(len(dataset_df.index)) + ) + print(dataset_df) + dataset_df = remove_low_frequency_peptides_(dataset_df, 0.20) + print_dataset_size( + dataset_df, "Peptides after remove low frequency peptides: ", verbose + ) + + # Perform imputation using Random Forest in Peptide Intensities + # TODO: Check if this is necessary (Probably we can do some research if imputation at peptide level is necessary + # if 
impute: + # dataset_df = impute_peptide_intensities(dataset_df, field=NORM_INTENSITY, class_field=SAMPLE_ID) + + if pnormalization: + print("Normalize at Peptide level...") + dataset_df = peptide_intensity_normalization( + dataset_df, + field=NORM_INTENSITY, + class_field=SAMPLE_ID, + scaling_method=nmethod, + ) + + if verbose: + log_after_norm = ( + nmethod == "msstats" + or nmethod == "qnorm" + or ((nmethod == "quantile" or nmethod == "robust") and not log2) + ) + density = plot_distributions( + dataset_df, + NORM_INTENSITY, + SAMPLE_ID, + log2=log_after_norm, + width=plot_width, + title="Normalization at peptide level method: " + nmethod, + ) + plt.show() + pdf.savefig(density) + box = plot_box_plot( + dataset_df, + NORM_INTENSITY, + SAMPLE_ID, + log2=log_after_norm, + width=plot_width, + title="Normalization at peptide level method: " + nmethod, + violin=violin, + ) + plt.show() + pdf.savefig(box) + pdf.close() + + print("Save the normalized peptide intensities...") + dataset_df.to_csv(output, index=False, sep=",") else: - dataset_df = pd.read_parquet(parquet)[FEATURE_COLUMNS] - dataset_df = dataset_df.rename(columns=parquet_map) - dataset_df[PROTEIN_NAME] = dataset_df.apply( - lambda x: ",".join(x[PROTEIN_NAME]), axis=1 - ) - label_type = dataset_df[CHANNEL].unique().tolist() - if len(label_type) == 1: - dataset_df.drop(CHANNEL, inplace=True, axis=1) - dataset_df = dataset_df[dataset_df["Condition"] != "Empty"] - - # Remove 0 intensity signals from the msstats file - dataset_df = dataset_df[dataset_df[INTENSITY] > 0] - dataset_df[PEPTIDE_CANONICAL] = dataset_df.apply( - lambda x: get_canonical_peptide(x[PEPTIDE_SEQUENCE]), axis=1 - ) - # Only peptides with more than min_aa (default: 7) amino acids are retained - dataset_df = dataset_df[ - dataset_df.apply(lambda x: len(x[PEPTIDE_CANONICAL]) >= min_aa, axis=1) - ] - # Only proteins with unique peptides number greater than min_unique (default: 2) are retained - unique_peptides = set( - dataset_df.groupby(PEPTIDE_CANONICAL) - .filter(lambda x: len(set(x[PROTEIN_NAME])) == 1)[PEPTIDE_CANONICAL] - .tolist() - ) - strong_proteins = set( - dataset_df[dataset_df[PEPTIDE_CANONICAL].isin(unique_peptides)] - .groupby(PROTEIN_NAME) - .filter(lambda x: len(set(x[PEPTIDE_CANONICAL])) >= min_unique)[PROTEIN_NAME] - .tolist() - ) - dataset_df = dataset_df[dataset_df[PROTEIN_NAME].isin(strong_proteins)] - - if msstats: - dataset_df.rename(columns={"source name": SAMPLE_ID}, inplace=True) - dataset_df[STUDY_ID] = dataset_df[SAMPLE_ID].apply(get_study_accession) - dataset_df = dataset_df.filter( - items=[ - PEPTIDE_SEQUENCE, - PEPTIDE_CHARGE, - FRACTION, - RUN, - BIOREPLICATE, - PROTEIN_NAME, - STUDY_ID, - CONDITION, - SAMPLE_ID, - INTENSITY, - ] - ) - dataset_df[CONDITION] = pd.Categorical(dataset_df[CONDITION]) - dataset_df[STUDY_ID] = pd.Categorical(dataset_df[STUDY_ID]) - dataset_df[SAMPLE_ID] = pd.Categorical(dataset_df[SAMPLE_ID]) + if parquet is None: + sdrf_df, label, sample_names, choice = analyse_sdrf( + sdrf, compression_method + ) + msstats_chunks = pd.read_csv( + msstats, + sep=",", + compression=compression_method, + dtype={CONDITION: "category", ISOTOPE_LABEL_TYPE: "category"}, + chunksize=chunksize, + ) + else: + label, sample_names, choice = analyse_feature_parquet( + parquet, batch_size=chunksize + ) + msstats_chunks = read_large_parquet(parquet, batch_size=chunksize) - pd.set_option("display.max_columns", None) - print("Loading data..") - print_dataset_size(dataset_df, "Number of peptides: ", verbose) + # TODO: Stream processing to 
obtain strong proteins with more than 2 unique peptides
+        temp = f"Temp-{str(uuid.uuid4())}/"
+        os.mkdir(temp)
+        print(f"IBAQPY WARNING: Writing files into {temp}...")
+        unique_peptides = {}
+        group_intensities = {}
+        quantile = {}
+        for msstats_df in msstats_chunks:
+            if parquet is None:
+                msstats_df = msstats_common_process(msstats_df)
+                msstats_df = merge_sdrf(label, sdrf_df, msstats_df)
+            else:
+                # parquet_common_process needs the label type and channel map
+                msstats_df = parquet_common_process(msstats_df, label, choice)
+            result_df = data_common_process(msstats_df, min_aa)
+
+            # Write CSVs by Sample ID
+            for sample in sample_names:
+                file_name = f"{temp}/{sample}.csv"
+                write_mode = "a" if os.path.exists(file_name) else "w"
+                header = not os.path.exists(file_name)
+                result_df[result_df[SAMPLE_ID] == sample].to_csv(
+                    file_name, index=False, header=header, mode=write_mode
+                )
+            unique_df = result_df.groupby([PEPTIDE_CANONICAL]).filter(
+                lambda x: len(set(x[PROTEIN_NAME])) == 1
+            )[[PEPTIDE_CANONICAL, PROTEIN_NAME]]
+            unique_dict = dict(
+                zip(unique_df[PEPTIDE_CANONICAL], unique_df[PROTEIN_NAME])
+            )
+            for i in unique_dict.keys():
+                if i in unique_peptides.keys() and unique_dict[i] != unique_peptides[i]:
+                    unique_peptides.pop(i)
+                else:
+                    unique_peptides[i] = unique_dict[i]
+
+        proteins_list = list(unique_peptides.values())
+        count_dict = {
+            element: proteins_list.count(element) for element in set(proteins_list)
+        }
+        strong_proteins = [
+            element for element in count_dict if count_dict[element] >= min_unique
+        ]
+        del proteins_list, count_dict
+        print(f"Number of unique peptides: {len(list(unique_peptides.keys()))}")
+        print(f"Number of strong proteins: {len(strong_proteins)}")
+
+        # TODO: Filter proteins with fewer unique peptides than min_unique (default: 2)
+        plot_samples = random.sample(sample_names, min(len(sample_names), 20))
+        plot_width = 10 + len(plot_samples) * 0.5
+        pdf = PdfPages(qc_report)
+        original_intensities_df = pd.DataFrame()
+
+        for sample in sample_names:
+            print(
+                f"{sample} -> Filter out proteins with fewer than {min_unique} unique peptides.."
+            )
+            msstats_df = pd.read_csv(f"{temp}/{sample}.csv", sep=",")
+            msstats_df = msstats_df[msstats_df[PROTEIN_NAME].isin(strong_proteins)]
+            print(f"{sample} -> Logarithmic if specified..")
+            msstats_df.loc[msstats_df.Intensity == 0, INTENSITY] = 1
+            msstats_df[NORM_INTENSITY] = (
+                np.log2(msstats_df[INTENSITY]) if log2 else msstats_df[INTENSITY]
+            )
+            msstats_df.to_csv(f"{temp}/{sample}.csv", index=False, sep=",")
+            if sample in plot_samples:
+                original_intensities_df = pd.concat(
+                    [original_intensities_df, msstats_df]
+                )
+            if not skip_normalization:
+                if nmethod == "msstats":
+                    if label in ["TMT", "ITRAQ"]:
+                        g = msstats_df.groupby(["Run", "Channel"])
+                    else:
+                        g = msstats_df.groupby(["Run", "Fraction"])
+                    for name, group in g:
+                        group_intensity = group[NORM_INTENSITY].tolist()
+                        if name not in group_intensities:
+                            group_intensities[name] = group_intensity
+                        else:
+                            # Extend the intensities accumulated so far for this group
+                            group_intensities.update(
+                                {
+                                    name: group_intensities[name]
+                                    + group_intensity
+                                }
+                            )
+                elif nmethod == "qnorm":
+
+                    def recalculate(original, count, value):
+                        return (original * count + value) / (count + 1), count + 1
+
+                    dic = (
+                        msstats_df[NORM_INTENSITY]
+                        .dropna()
+                        .sort_values(ascending=False)
+                        .reset_index(drop=True)
+                        .to_dict()
+                    )
+                    if len(quantile) == 0:
+                        quantile = {k: (v, 1) for k, v in dic.items()}
+                    else:
+                        update = min(len(quantile), len(dic))
+                        for i in range(0, update):
+                            original, count = quantile[i]
+                            quantile[i] = recalculate(original, count, dic[i])
+                        if len(dic) <= len(quantile):
+                            continue
+                        else:
+                            quantile.update(
+                                {k: (v, 1) for k, v in dic.items() if k >= update}
+                            )
+        # Save original intensities QC plots
+        original_intensities_df = original_intensities_df.reset_index(drop=True)
         density = plot_distributions(
-            dataset_df,
-            NORM_INTENSITY,
+            original_intensities_df,
+            INTENSITY,
             SAMPLE_ID,
             log2=not log2,
             width=plot_width,
             title="Original peptidoform intensity distribution (no normalization)",
         )
-        plt.show()
         pdf.savefig(density)
         box = plot_box_plot(
-            dataset_df,
-            NORM_INTENSITY,
+            original_intensities_df,
+            INTENSITY,
             SAMPLE_ID,
             log2=not log2,
             width=plot_width,
@@ -596,143 +985,233 @@ def peptide_normalization(
             violin=violin,
         )
         plt.show()
         pdf.savefig(box)
+        del original_intensities_df
 
-    # Remove high abundant and contaminants proteins and the outliers
-    if remove_ids is not None:
-        print("Remove proteins from file...")
-        dataset_df = remove_protein_by_ids(dataset_df, remove_ids)
-    if remove_decoy_contaminants:
-        print("Remove decoy and contaminants...")
-        dataset_df = remove_contaminants_decoys(dataset_df)
+        def normalization(dataset_df, label, sample, skip_normalization, nmethod):
+            # Remove high abundant and contaminants proteins and the outliers
+            if remove_ids is not None:
+                print(f"{sample} -> Remove contaminants...")
+                dataset_df = remove_protein_by_ids(dataset_df, remove_ids)
+            if remove_decoy_contaminants:
+                print(f"{sample} -> Remove decoy and contaminants...")
+                dataset_df = remove_contaminants_decoys(dataset_df)
+            print(
+                f"{sample} -> Peptides after contaminants removal: {len(dataset_df[PEPTIDE_SEQUENCE].unique().tolist())}"
+            )
 
-    print_dataset_size(dataset_df, "Peptides after contaminants removal: ", verbose)
+            if not skip_normalization:
+                print(f"{sample} -> Normalize intensities.. 
") + field = NORM_INTENSITY + if nmethod == "msstats": + # For ISO normalization + if label in ["TMT", "ITRAQ"]: + dataset_df.loc[:, NORM_INTENSITY] = dataset_df.apply( + lambda x: x[field] + - group_intensities[(x["Run"], x["Channel"])] + + median_baseline, + axis=1, + ) + else: + dataset_df.loc[:, NORM_INTENSITY] = dataset_df.apply( + lambda x: x[field] + - group_intensities[(x["Run"], x["Fraction"])] + + np.median( + [ + group_intensities[i] + for i in group_intensities.keys() + if i[1] == x["Fraction"] + ] + ), + axis=1, + ) + elif nmethod == "qnorm": + # pivot to have one col per sample + ref_dict = ( + dataset_df[NORM_INTENSITY] + .dropna() + .drop_duplicates() + .sort_values(ascending=False) + .reset_index(drop=True) + .to_dict() + ) + ref_dict = {v: norm_intensity[k] for k, v in ref_dict.items()} + dataset_df.loc[:, NORM_INTENSITY] = dataset_df.apply( + lambda x: ref_dict[x[NORM_INTENSITY]] + if x[NORM_INTENSITY] in ref_dict.keys() + else np.nan, + axis=1, + ) - print("Normalize intensities.. ") - # dataset_df = dataset_df.dropna(how="any") - if not skip_normalization: - dataset_df = intensity_normalization( - dataset_df, - field=NORM_INTENSITY, - class_field=SAMPLE_ID, - scaling_method=nmethod, - ) - if verbose: - log_after_norm = ( - nmethod == "msstats" - or nmethod == "qnorm" - or ((nmethod == "quantile" or nmethod == "robust") and not log2) - ) - density = plot_distributions( - dataset_df, - NORM_INTENSITY, - SAMPLE_ID, - log2=log_after_norm, - width=plot_width, - title="Peptidoform intensity distribution after normalization, method: " - + nmethod, - ) - plt.show() - pdf.savefig(density) - box = plot_box_plot( - dataset_df, - NORM_INTENSITY, - SAMPLE_ID, - log2=log_after_norm, - width=plot_width, - title="Peptidoform intensity distribution after normalization, method: " - + nmethod, - violin=violin, - ) - plt.show() - pdf.savefig(box) + print(f"{sample} -> Select the best peptidoform across fractions...") + print( + f"{sample} -> Number of peptides before peptidofrom selection: " + + str(len(dataset_df.index)) + ) + dataset_df = get_peptidoform_normalize_intensities(dataset_df) + print( + f"{sample} -> Number of peptides after peptidofrom selection: " + + str(len(dataset_df.index)) + ) - print("Select the best peptidoform across fractions...") - print( - "Number of peptides before peptidofrom selection: " + str(len(dataset_df.index)) - ) - dataset_df = get_peptidoform_normalize_intensities(dataset_df) - print( - "Number of peptides after peptidofrom selection: " + str(len(dataset_df.index)) - ) + # Add the peptide sequence canonical without the modifications + if PEPTIDE_CANONICAL not in dataset_df.columns: + print(f"{sample} -> Add Canonical peptides to the dataframe...") + dataset_df[PEPTIDE_CANONICAL] = dataset_df[PEPTIDE_SEQUENCE].apply( + get_canonical_peptide + ) - # Add the peptide sequence canonical without the modifications - if PEPTIDE_CANONICAL not in dataset_df.columns: - print("Add Canonical peptides to the dataframe...") - dataset_df[PEPTIDE_CANONICAL] = dataset_df[PEPTIDE_SEQUENCE].apply( - lambda x: get_canonical_peptide(x) - ) + print(f"{sample} -> Sum all peptidoforms per Sample...") + print( + f"{sample} -> Number of peptides before sum selection: " + + str(len(dataset_df.index)) + ) + dataset_df = sum_peptidoform_intensities(dataset_df) + print( + f"{sample} -> Number of peptides after sum: " + + str(len(dataset_df.index)) + ) - print("Sum all peptidoforms per Sample...") - print("Number of peptides before sum selection: " + str(len(dataset_df.index))) - 
dataset_df = sum_peptidoform_intensities(dataset_df) - print("Number of peptides after sum: " + str(len(dataset_df.index))) + print(f"{sample} -> Average all peptidoforms per Peptide/Sample...") + print( + f"{sample} -> Number of peptides before average: " + + str(len(dataset_df.index)) + ) + dataset_df = average_peptide_intensities(dataset_df) + print( + f"{sample} -> Number of peptides after average: " + + str(len(dataset_df.index)) + ) - print("Average all peptidoforms per Peptide/Sample...") - print("Number of peptides before average: " + str(len(dataset_df.index))) - dataset_df = average_peptide_intensities(dataset_df) - print("Number of peptides after average: " + str(len(dataset_df.index))) + dataset_df = dataset_df.drop_duplicates() + dataset_df = dataset_df[dataset_df[NORM_INTENSITY].notna()] + return dataset_df - if verbose: + # TODO: Peptide intensity normalization + peptides_count = {} + norm_intensities_df = pd.DataFrame() + if not skip_normalization: + if nmethod == "qnorm": + norm_intensity = {k: v[0] for k, v in quantile.items()} + elif nmethod == "msstats": + # For ISO normalization + print(f"Label -> {label}") + if label in ["TMT", "ITRAQ"]: + median_baseline = np.median( + list(set(sum(group_intensities.values(), []))) + ) + group_intensities = { + key: np.median(list(values)) + for key, values in group_intensities.items() + } + else: + fractions = [i[1] for i in group_intensities.keys()] + fraction_median = {} + for fraction in fractions: + fraction_keys = [ + i for i in group_intensities.keys() if i[1] == fraction + ] + fraction_intensities = [] + for key in fraction_keys: + fraction_intensities.extend(group_intensities[key]) + fraction_median[fraction] = np.median(fraction_intensities) + group_intensities = { + key: np.median(values) + for key, values in group_intensities.items() + } + for sample in sample_names: + dataset_df = pd.read_csv(f"{temp}/{sample}.csv", sep=",") + if len(dataset_df) != 0: + norm_df = normalization( + dataset_df, label, sample, skip_normalization, nmethod + ) + else: + continue + sample_peptides = norm_df[PEPTIDE_CANONICAL].unique().tolist() + if remove_low_frequency_peptides: + peptides_count = { + peptide: peptides_count.get(peptide, 0) + 1 + for peptide in sample_peptides + } + norm_df.to_csv(f"{temp}/{sample}.csv", sep=",", index=False) + if sample in plot_samples: + norm_intensities_df = pd.concat([norm_intensities_df, norm_df]) + del group_intensities, quantile + # Save normalized intensities QC plots + norm_intensities_df = norm_intensities_df.reset_index(drop=True) log_after_norm = ( nmethod == "msstats" or nmethod == "qnorm" or ((nmethod == "quantile" or nmethod == "robust") and not log2) ) density = plot_distributions( - dataset_df, + norm_intensities_df, NORM_INTENSITY, SAMPLE_ID, log2=log_after_norm, width=plot_width, - title="Peptide intensity distribution method: " + nmethod, + title="Peptidoform intensity distribution after normalization, method: " + + nmethod, ) plt.show() pdf.savefig(density) box = plot_box_plot( - dataset_df, + norm_intensities_df, NORM_INTENSITY, SAMPLE_ID, log2=log_after_norm, width=plot_width, - title="Peptide intensity distribution method: " + nmethod, + title="Peptidoform intensity distribution after normalization, method: " + + nmethod, violin=violin, ) plt.show() pdf.savefig(box) + del norm_intensities_df, strong_proteins - if remove_low_frequency_peptides: - print( - "Peptides before removing low frequency peptides: " - + str(len(dataset_df.index)) - ) - print(dataset_df) - dataset_df = 
remove_low_frequency_peptides_(dataset_df, 0.20) - print_dataset_size( - dataset_df, "Peptides after remove low frequency peptides: ", verbose - ) + print("IBAQPY WARNING: Writing normalized intensities into CSV...") + if remove_low_frequency_peptides: + sample_number = len(sample_names) + min_sample = 1 if sample_number > 1 else 0 + peptides_count = { + k: v / sample_number + for k, v in peptides_count.items() + if v / sample_number >= 0.2 and v > min_sample + } + strong_peptides = list(peptides_count.keys()) + del peptides_count - # Perform imputation using Random Forest in Peptide Intensities - # TODO: Check if this is necessary (Probably we can do some research if imputation at peptide level is necessary - # if impute: - # dataset_df = impute_peptide_intensities(dataset_df, field=NORM_INTENSITY, class_field=SAMPLE_ID) - - if pnormalization: - print("Normalize at Peptide level...") - dataset_df = peptide_intensity_normalization( - dataset_df, - field=NORM_INTENSITY, - class_field=SAMPLE_ID, - scaling_method=nmethod, - ) + final_norm_intensities_df = pd.DataFrame() + for sample in sample_names: + dataset_df = pd.read_csv(f"{temp}/{sample}.csv", sep=",") + if remove_low_frequency_peptides: + # Filter low-frequency peptides, which indicate whether the peptide occurs less than 20% in all samples or + # only in one sample + dataset_df = dataset_df[ + dataset_df[PEPTIDE_CANONICAL].isin(strong_peptides) + ] + dataset_df = dataset_df[ + [PEPTIDE_CANONICAL, PROTEIN_NAME, SAMPLE_ID, NORM_INTENSITY, CONDITION] + ] + write_mode = "a" if os.path.exists(output) else "w" + header = False if os.path.exists(output) else True + dataset_df.to_csv(output, index=False, header=header, mode=write_mode) + dataset_df.to_csv(f"{temp}/{sample}.csv", sep=",", index=False) + if sample in plot_samples: + final_norm_intensities_df = pd.concat( + [final_norm_intensities_df, dataset_df] + ) + if remove_low_frequency_peptides: + del strong_peptides - if verbose: + # Save final normalized intensities QC plots log_after_norm = ( nmethod == "msstats" or nmethod == "qnorm" or ((nmethod == "quantile" or nmethod == "robust") and not log2) ) + final_norm_intensities_df = final_norm_intensities_df.reset_index(drop=True) density = plot_distributions( - dataset_df, + final_norm_intensities_df, NORM_INTENSITY, SAMPLE_ID, log2=log_after_norm, @@ -742,7 +1221,7 @@ def peptide_normalization( plt.show() pdf.savefig(density) box = plot_box_plot( - dataset_df, + final_norm_intensities_df, NORM_INTENSITY, SAMPLE_ID, log2=log_after_norm, @@ -754,9 +1233,6 @@ def peptide_normalization( pdf.savefig(box) pdf.close() - print("Save the normalized peptide intensities...") - dataset_df.to_csv(output, index=False, sep=",") - if __name__ == "__main__": peptide_normalization() diff --git a/bin/peptide_normalization_stream.py b/bin/peptide_normalization_stream.py deleted file mode 100644 index a0edc2b..0000000 --- a/bin/peptide_normalization_stream.py +++ /dev/null @@ -1,702 +0,0 @@ -#!/usr/bin/env python - -import os -import random -import uuid -from matplotlib.backends.backend_pdf import PdfPages -import pyarrow.parquet as pq -from ibaq.ibaqpy_commons import * - - -def read_large_parquet(parquet_path: str, batch_size: int = 100000): - parquet_file = pq.ParquetFile(parquet_path) - for batch in parquet_file.iter_batches(batch_size=batch_size): - batch_df = batch.to_pandas() - yield batch_df - - -def extract_label_from_sdrf(sdrf_path: str, compression: bool) -> tuple: - sdrf_df = pd.read_csv(sdrf_path, sep="\t", compression=compression) - 
sdrf_df[REFERENCE] = sdrf_df["comment[data file]"].apply(get_spectrum_prefix) - - # Determine label type - labels = set(sdrf_df["comment[label]"]) - choice = None - if len(labels) == 1: - label = "LFQ" - elif "TMT" in ",".join(labels) or "tmt" in ",".join(labels): - if ( - len(labels) > 11 - or "TMT134N" in labels - or "TMT133C" in labels - or "TMT133N" in labels - or "TMT132C" in labels - or "TMT132N" in labels - ): - choice = TMT16plex - elif len(labels) == 11 or "TMT131C" in labels: - choice = TMT11plex - elif len(labels) > 6: - choice = TMT10plex - else: - choice = TMT6plex - choice_df = ( - pd.DataFrame.from_dict(choice, orient="index", columns=[CHANNEL]) - .reset_index() - .rename(columns={"index": "comment[label]"}) - ) - sdrf_df = sdrf_df.merge(choice_df, on="comment[label]", how="left") - label = "TMT" - elif "ITRAQ" in ",".join(labels) or "itraq" in ",".join(labels): - if len(labels) > 4: - choice = ITRAQ8plex - else: - choice = ITRAQ4plex - choice_df = ( - pd.DataFrame.from_dict(choice, orient="index", columns=[CHANNEL]) - .reset_index() - .rename(columns={"index": "comment[label]"}) - ) - sdrf_df = sdrf_df.merge(choice_df, on="comment[label]", how="left") - label = "ITRAQ" - else: - print("Warning: Only support label free, TMT and ITRAQ experiment!") - exit(1) - sample_names = sdrf_df["source name"].unique().tolist() - - return sdrf_df, label, sample_names, choice - - -def extract_label_from_parquet(parquet_path: str, batch_size: int = 100000) -> tuple: - parquet_chunks = read_large_parquet(parquet_path, batch_size) - labels, samples = list(), list() - for chunk in parquet_chunks: - samples.extend(chunk["sample_accession"].unique().tolist()) - labels.extend(chunk["isotope_label_type"].unique().tolist()) - samples = list(set(samples)) - labels = list(set(labels)) - choice = None - if len(labels) == 1: - label = "LFQ" - elif "TMT" in ",".join(labels) or "tmt" in ",".join(labels): - if ( - len(labels) > 11 - or "TMT134N" in labels - or "TMT133C" in labels - or "TMT133N" in labels - or "TMT132C" in labels - or "TMT132N" in labels - ): - choice = TMT16plex - elif len(labels) == 11 or "TMT131C" in labels: - choice = TMT11plex - elif len(labels) > 6: - choice = TMT10plex - else: - choice = TMT6plex - label = "TMT" - elif "ITRAQ" in ",".join(labels) or "itraq" in ",".join(labels): - if len(labels) > 4: - choice = ITRAQ8plex - else: - choice = ITRAQ4plex - label = "ITRAQ" - else: - print("Warning: Only support label free, TMT and ITRAQ experiment!") - exit(1) - - return label, samples, choice - - - -@click.command() -@click.option( - "-m", "--msstats", help="MsStats file import generated by quantms", default=None -) -@click.option( - "-p", - "--parquet", - help="Feature parquet import generated by quantms.io", - default=None, -) -@click.option( - "-s", - "--sdrf", - help="SDRF file import generated by quantms", -) -@click.option("--compress", help="Read all files compress", is_flag=True) -@click.option( - "--chunksize", - help="The number of rows of MSstats read using pandas streaming", - default=1000000, -) -@click.option( - "--min_aa", help="Minimum number of amino acids to filter peptides", default=7 -) -@click.option( - "--min_unique", - help="Minimum number of unique peptides to filter proteins", - default=2, -) -@click.option( - "--remove_ids", - help="Remove specific protein ids from the analysis using a file with one id per line", -) -@click.option( - "--remove_decoy_contaminants", - help="Remove decoy and contaminants proteins from the analysis", - is_flag=True, - 
default=False, -) -@click.option( - "--remove_low_frequency_peptides", - help="Remove peptides that are present in less than 20% of the samples", - is_flag=True, - default=False, -) -@click.option( - "--skip_normalization", help="Skip normalization step", is_flag=True, default=False -) -@click.option( - "--nmethod", - help="Normalization method used to normalize intensities for all samples (options: qnorm)", - default="qnorm", -) -@click.option( - "--output", - help="Peptide intensity file including other all properties for normalization", -) -@click.option( - "--compress", - help="Read the input peptides file in compress gzip file", - is_flag=True, -) -@click.option( - "--log2", - help="Transform to log2 the peptide intensity values before normalization", - is_flag=True, -) -@click.option( - "--violin", - help="Use violin plot instead of boxplot for distribution representations", - is_flag=True, -) -@click.option( - "--qc_report", - help="PDF file to store multiple QC images", - default=f"StreamPeptideNorm-QCprofile.pdf", -) -def peptide_normalization( - msstats: str, - parquet: str, - sdrf: str, - remove_ids: str, - remove_decoy_contaminants: bool, - remove_low_frequency_peptides: bool, - output: str, - skip_normalization: bool, - min_aa: int, - min_unique: int, - nmethod: str, - compress: bool, - chunksize: int, - log2: bool, - violin: bool, - qc_report: str, -) -> None: - """Intensity normalization of stream processing peptide performed from MSstats - - :param msstats: - :param sdrf: - :param contaminants: - :param output: - :param skip_normalization: - :param min_aa: - :param min_unique: - :param nmethod: - :param compress: - :param chunksize: - :param log2: - :param violin: - :param qc_report: - :return: - """ - - if output is None: - print_help_msg(peptide_normalization) - exit(1) - - pd.set_option("display.max_columns", None) - print("Loading data..") - compression_method = "gzip" if compress else None - - if parquet is None: - sdrf_df, label, sample_names, choice = extract_label_from_sdrf( - sdrf, compression_method - ) - msstats_chunks = pd.read_csv( - msstats, - sep=",", - compression=compression_method, - dtype={CONDITION: "category", ISOTOPE_LABEL_TYPE: "category"}, - chunksize=chunksize, - ) - else: - label, sample_names, choice = extract_label_from_parquet(parquet, batch_size=chunksize) - msstats_chunks = read_large_parquet(parquet, batch_size=chunksize) - - # TODO: Stream processing to obtain strong proteins with more than 2 uniqe peptides - temp = f"Temp-{str(uuid.uuid4())}/" - os.mkdir(temp) - print(f"IBAQPY WARNING: Writing files into {temp}...") - unique_peptides = {} - canonical_dict = {} - group_intensities = {} - quantile = {} - for msstats_df in msstats_chunks: - if parquet is None: - msstats_df.rename( - columns={ - "ProteinName": PROTEIN_NAME, - "PeptideSequence": PEPTIDE_SEQUENCE, - "PrecursorCharge": PEPTIDE_CHARGE, - "Run": RUN, - "Condition": CONDITION, - "Intensity": INTENSITY, - }, - inplace=True, - ) - else: - msstats_df = msstats_df[FEATURE_COLUMNS] - msstats_df = msstats_df.rename(columns=parquet_map) - msstats_df[PROTEIN_NAME] = msstats_df.apply( - lambda x: ",".join(x[PROTEIN_NAME]), axis=1 - ) - if label == "LFQ": - msstats_df.drop(CHANNEL, inplace=True, axis=1) - else: - msstats_df[CHANNEL] = msstats_df[CHANNEL].map(choice) - msstats_df = msstats_df[msstats_df["Condition"] != "Empty"] - msstats_df = msstats_df[msstats_df[INTENSITY] > 0] - if PEPTIDE_CANONICAL not in msstats_df.columns: - modified_seqs = msstats_df[PEPTIDE_SEQUENCE].unique().tolist() - 
canonical_seqs = [get_canonical_peptide(i) for i in modified_seqs] - inner_canonical_dict = dict(zip(modified_seqs, canonical_seqs)) - canonical_dict.update(inner_canonical_dict) - msstats_df[PEPTIDE_CANONICAL] = msstats_df.apply( - lambda x: inner_canonical_dict[x[PEPTIDE_SEQUENCE]], axis=1 - ) - # Filter peptides with less amino acids than min_aa (default: 7) - msstats_df = msstats_df[ - msstats_df.apply(lambda x: len(x[PEPTIDE_CANONICAL]) >= min_aa, axis=1) - ] - msstats_df[PROTEIN_NAME] = msstats_df[PROTEIN_NAME].apply( - parse_uniprot_accession - ) - - if FRACTION not in msstats_df.columns: - msstats_df[FRACTION] = 1 - msstats_df = msstats_df[ - [ - PROTEIN_NAME, - PEPTIDE_SEQUENCE, - PEPTIDE_CANONICAL, - PEPTIDE_CHARGE, - INTENSITY, - REFERENCE, - CONDITION, - RUN, - BIOREPLICATE, - FRACTION, - FRAGMENT_ION, - ISOTOPE_LABEL_TYPE, - ] - ] - - if parquet is None: - # Merged the SDRF with the Resulted file - if label == "LFQ": - msstats_df[REFERENCE] = msstats_df[REFERENCE].apply(get_spectrum_prefix) - result_df = pd.merge( - msstats_df, - sdrf_df[["source name", REFERENCE]], - how="left", - on=[REFERENCE], - ) - elif label == "TMT": - msstats_df[REFERENCE] = msstats_df[REFERENCE].apply(get_spectrum_prefix) - result_df = pd.merge( - msstats_df, - sdrf_df[["source name", REFERENCE, CHANNEL]], - how="left", - on=[REFERENCE, CHANNEL], - ) - result_df = result_df[result_df["Condition"] != "Empty"] - result_df.rename(columns={"Charge": PEPTIDE_CHARGE}, inplace=True) - elif label == "ITRAQ": - msstats_df[REFERENCE] = msstats_df[REFERENCE].apply(get_spectrum_prefix) - result_df = pd.merge( - msstats_df, - sdrf_df[["source name", REFERENCE, CHANNEL]], - how="left", - on=[REFERENCE, CHANNEL], - ) - result_df = result_df[result_df["Condition"] != "Empty"] - result_df.rename(columns={"Charge": PEPTIDE_CHARGE}, inplace=True) - result_df.rename(columns={"source name": SAMPLE_ID}, inplace=True) - else: - result_df = msstats_df[msstats_df["Condition"] != "Empty"] - result_df[STUDY_ID] = result_df[SAMPLE_ID].str.split("-").str[0] - - # Write CSVs by Sample ID - for sample in sample_names: - file_name = f"{temp}/{sample}.csv" - write_mode = "a" if os.path.exists(file_name) else "w" - header = False if os.path.exists(file_name) else True - result_df[result_df[SAMPLE_ID] == sample].to_csv( - file_name, index=False, header=header, mode=write_mode - ) - unique_df = result_df.groupby([PEPTIDE_CANONICAL]).filter( - lambda x: len(set(x[PROTEIN_NAME])) == 1 - )[[PEPTIDE_CANONICAL, PROTEIN_NAME]] - unique_dict = dict(zip(unique_df[PEPTIDE_CANONICAL], unique_df[PROTEIN_NAME])) - for i in unique_dict.keys(): - if i in unique_peptides.keys() and unique_dict[i] != unique_peptides[i]: - unique_peptides.pop(i) - else: - unique_peptides[i] = unique_dict[i] - - proteins_list = list(unique_peptides.values()) - count_dict = { - element: proteins_list.count(element) for element in set(proteins_list) - } - strong_proteins = [ - element for element in count_dict if count_dict[element] >= min_unique - ] - del proteins_list, count_dict - print(f"Number of unique peptides: {len(list(unique_peptides.keys()))}") - print(f"Number of strong proteins: {len(strong_proteins)}") - - # TODO: Filter proteins with less unique peptides than min_unique (default: 2) - plot_samples = random.sample(sample_names, min(len(sample_names), 20)) - plot_width = 10 + len(plot_samples) * 0.5 - pdf = PdfPages(qc_report) - original_intensities_df = pd.DataFrame() - - for sample in sample_names: - print( - f"{sample} -> Filter out proteins containing 
unique peptides fewer than {min_unique}.." - ) - msstats_df = pd.read_csv(f"{temp}/{sample}.csv", sep=",") - msstats_df = msstats_df[msstats_df[PROTEIN_NAME].isin(strong_proteins)] - print(f"{sample} -> Logarithmic if specified..") - msstats_df.loc[msstats_df.Intensity == 0, INTENSITY] = 1 - msstats_df[NORM_INTENSITY] = ( - np.log2(msstats_df[INTENSITY]) if log2 else msstats_df[INTENSITY] - ) - msstats_df.to_csv(f"{temp}/{sample}.csv", index=False, sep=",") - if sample in plot_samples: - original_intensities_df = pd.concat([original_intensities_df, msstats_df]) - if not skip_normalization: - if nmethod == "msstats": - if label in ["TMT", "ITRAQ"]: - g = msstats_df.groupby(["Run", "Channel"]) - else: - g = msstats_df.groupby(["Run", "Fraction"]) - for name, group in g: - group_intensity = group[NORM_INTENSITY].tolist() - if name not in group_intensities: - group_intensities[name] = group_intensity - else: - group_intensities.update( - {name: group_intensities[NORM_INTENSITY] + group_intensity} - ) - elif nmethod == "qnorm": - - def recalculate(original, count, value): - return (original * count + value) / (count + 1), count + 1 - - dic = ( - msstats_df[NORM_INTENSITY] - .dropna() - .sort_values(ascending=False) - .reset_index(drop=True) - .to_dict() - ) - if len(quantile) == 0: - quantile = {k: (v, 1) for k, v in dic.items()} - else: - update = min(len(quantile), len(dic)) - for i in range(0, update): - original, count = quantile[i] - quantile[i] = recalculate(original, count, dic[i]) - if len(dic) <= len(quantile): - continue - else: - quantile.update( - {k: (v, 1) for k, v in dic.items() if k >= update} - ) - # Save original intensities QC plots - original_intensities_df = original_intensities_df.reset_index(drop=True) - density = plot_distributions( - original_intensities_df, - INTENSITY, - SAMPLE_ID, - log2=not log2, - width=plot_width, - title="Original peptidoform intensity distribution (no normalization)", - ) - pdf.savefig(density) - box = plot_box_plot( - original_intensities_df, - INTENSITY, - SAMPLE_ID, - log2=not log2, - width=plot_width, - title="Original peptidoform intensity distribution (no normalization)", - violin=violin, - ) - pdf.savefig(box) - del original_intensities_df - - def normalization(dataset_df, label, sample, skip_normalization, nmethod): - # Remove high abundant and contaminants proteins and the outliers - if remove_ids is not None: - print(f"{sample} -> Remove contaminants...") - dataset_df = remove_protein_by_ids(dataset_df, remove_ids) - if remove_decoy_contaminants: - print(f"{sample} -> Remove decoy and contaminants...") - dataset_df = remove_contaminants_decoys(dataset_df) - print( - f"{sample} -> Peptides after contaminants removal: {len(dataset_df[PEPTIDE_SEQUENCE].unique().tolist())}" - ) - - if not skip_normalization: - print(f"{sample} -> Normalize intensities.. 
") - field = NORM_INTENSITY - if nmethod == "msstats": - # For ISO normalization - if label in ["TMT", "ITRAQ"]: - dataset_df.loc[:, NORM_INTENSITY] = dataset_df.apply( - lambda x: x[field] - - group_intensities[(x["Run"], x["Channel"])] - + median_baseline, - axis=1, - ) - else: - dataset_df.loc[:, NORM_INTENSITY] = dataset_df.apply( - lambda x: x[field] - - group_intensities[(x["Run"], x["Fraction"])] - + np.median( - [ - group_intensities[i] - for i in group_intensities.keys() - if i[1] == x["Fraction"] - ] - ), - axis=1, - ) - elif nmethod == "qnorm": - # pivot to have one col per sample - ref_dict = ( - dataset_df[NORM_INTENSITY] - .dropna() - .drop_duplicates() - .sort_values(ascending=False) - .reset_index(drop=True) - .to_dict() - ) - ref_dict = {v: norm_intensity[k] for k, v in ref_dict.items()} - dataset_df.loc[:, NORM_INTENSITY] = dataset_df.apply( - lambda x: ref_dict[x[NORM_INTENSITY]] - if x[NORM_INTENSITY] in ref_dict.keys() - else np.nan, - axis=1, - ) - - print(f"{sample} -> Select the best peptidoform across fractions...") - print( - f"{sample} -> Number of peptides before peptidofrom selection: " - + str(len(dataset_df.index)) - ) - dataset_df = get_peptidoform_normalize_intensities(dataset_df) - print( - f"{sample} -> Number of peptides after peptidofrom selection: " - + str(len(dataset_df.index)) - ) - - # Add the peptide sequence canonical without the modifications - if PEPTIDE_CANONICAL not in dataset_df.columns: - print(f"{sample} -> Add Canonical peptides to the dataframe...") - dataset_df[PEPTIDE_CANONICAL] = dataset_df[PEPTIDE_SEQUENCE].apply( - get_canonical_peptide - ) - - print(f"{sample} -> Sum all peptidoforms per Sample...") - print( - f"{sample} -> Number of peptides before sum selection: " - + str(len(dataset_df.index)) - ) - dataset_df = sum_peptidoform_intensities(dataset_df) - print( - f"{sample} -> Number of peptides after sum: " + str(len(dataset_df.index)) - ) - - print(f"{sample} -> Average all peptidoforms per Peptide/Sample...") - print( - f"{sample} -> Number of peptides before average: " - + str(len(dataset_df.index)) - ) - dataset_df = average_peptide_intensities(dataset_df) - print( - f"{sample} -> Number of peptides after average: " - + str(len(dataset_df.index)) - ) - - dataset_df = dataset_df.drop_duplicates() - dataset_df = dataset_df[dataset_df[NORM_INTENSITY].notna()] - return dataset_df - - # TODO: Peptide intensity normalization - peptides_count = {} - norm_intensities_df = pd.DataFrame() - if not skip_normalization: - if nmethod == "qnorm": - norm_intensity = {k: v[0] for k, v in quantile.items()} - elif nmethod == "msstats": - # For ISO normalization - print(f"Label -> {label}") - if label in ["TMT", "ITRAQ"]: - median_baseline = np.median( - list(set(sum(group_intensities.values(), []))) - ) - group_intensities = { - key: np.median(list(values)) - for key, values in group_intensities.items() - } - else: - fractions = [i[1] for i in group_intensities.keys()] - fraction_median = {} - for fraction in fractions: - fraction_keys = [ - i for i in group_intensities.keys() if i[1] == fraction - ] - fraction_intensities = [] - for key in fraction_keys: - fraction_intensities.extend(group_intensities[key]) - fraction_median[fraction] = np.median(fraction_intensities) - group_intensities = { - key: np.median(values) for key, values in group_intensities.items() - } - for sample in sample_names: - dataset_df = pd.read_csv(f"{temp}/{sample}.csv", sep=",") - if len(dataset_df) != 0: - norm_df = normalization( - dataset_df, label, sample, 
skip_normalization, nmethod - ) - else: - continue - sample_peptides = norm_df[PEPTIDE_CANONICAL].unique().tolist() - if remove_low_frequency_peptides: - peptides_count = { - peptide: peptides_count.get(peptide, 0) + 1 - for peptide in sample_peptides - } - norm_df.to_csv(f"{temp}/{sample}.csv", sep=",", index=False) - if sample in plot_samples: - norm_intensities_df = pd.concat([norm_intensities_df, norm_df]) - del group_intensities, quantile - # Save normalized intensities QC plots - norm_intensities_df = norm_intensities_df.reset_index(drop=True) - log_after_norm = ( - nmethod == "msstats" - or nmethod == "qnorm" - or ((nmethod == "quantile" or nmethod == "robust") and not log2) - ) - density = plot_distributions( - norm_intensities_df, - NORM_INTENSITY, - SAMPLE_ID, - log2=log_after_norm, - width=plot_width, - title="Peptidoform intensity distribution after normalization, method: " - + nmethod, - ) - pdf.savefig(density) - box = plot_box_plot( - norm_intensities_df, - NORM_INTENSITY, - SAMPLE_ID, - log2=log_after_norm, - width=plot_width, - title="Peptidoform intensity distribution after normalization, method: " - + nmethod, - violin=violin, - ) - pdf.savefig(box) - del norm_intensities_df, strong_proteins - - print("IBAQPY WARNING: Writing normalized intensities into CSV...") - if remove_low_frequency_peptides: - sample_number = len(sample_names) - min_sample = 1 if sample_number > 1 else 0 - peptides_count = { - k: v / sample_number - for k, v in peptides_count.items() - if v / sample_number >= 0.2 and v > min_sample - } - strong_peptides = list(peptides_count.keys()) - del peptides_count - - final_norm_intensities_df = pd.DataFrame() - for sample in sample_names: - dataset_df = pd.read_csv(f"{temp}/{sample}.csv", sep=",") - if remove_low_frequency_peptides: - # Filter low-frequency peptides, which indicate whether the peptide occurs less than 20% in all samples or - # only in one sample - dataset_df = dataset_df[dataset_df[PEPTIDE_CANONICAL].isin(strong_peptides)] - dataset_df = dataset_df[ - [PEPTIDE_CANONICAL, PROTEIN_NAME, SAMPLE_ID, NORM_INTENSITY, CONDITION] - ] - write_mode = "a" if os.path.exists(output) else "w" - header = False if os.path.exists(output) else True - dataset_df.to_csv(output, index=False, header=header, mode=write_mode) - dataset_df.to_csv(f"{temp}/{sample}.csv", sep=",", index=False) - if sample in plot_samples: - final_norm_intensities_df = pd.concat( - [final_norm_intensities_df, dataset_df] - ) - if remove_low_frequency_peptides: - del strong_peptides - - # Save final normalized intensities QC plots - log_after_norm = ( - nmethod == "msstats" - or nmethod == "qnorm" - or ((nmethod == "quantile" or nmethod == "robust") and not log2) - ) - final_norm_intensities_df = final_norm_intensities_df.reset_index(drop=True) - density = plot_distributions( - final_norm_intensities_df, - NORM_INTENSITY, - SAMPLE_ID, - log2=log_after_norm, - width=plot_width, - title="Normalization at peptide level method: " + nmethod, - ) - pdf.savefig(density) - box = plot_box_plot( - final_norm_intensities_df, - NORM_INTENSITY, - SAMPLE_ID, - log2=log_after_norm, - width=plot_width, - title="Normalization at peptide level method: " + nmethod, - violin=violin, - ) - pdf.savefig(box) - pdf.close() - - -if __name__ == "__main__": - peptide_normalization() diff --git a/ibaq/ibaqpy_commons.py b/ibaq/ibaqpy_commons.py index 5dab149..236d6b6 100644 --- a/ibaq/ibaqpy_commons.py +++ b/ibaq/ibaqpy_commons.py @@ -1,6 +1,5 @@ import os import re -from typing import OrderedDict import click 
import matplotlib @@ -10,7 +9,7 @@ from matplotlib import pyplot as plt from pandas import DataFrame -FEATURE_COLUMNS = [ +PARQUET_COLUMNS = [ "protein_accessions", "peptidoform", "sequence", @@ -403,50 +402,6 @@ def average_peptide_intensities(dataset: DataFrame) -> DataFrame: return dataset_df -# TODO: Useless funcs -def get_mbr_hit(scan: str): - """ - This function annotates if the peptide is inferred or not by Match between Runs algorithm (1), 0 if the peptide is - identified in the corresponding file. - :param scan: scan value - :return: - """ - return 1 if pd.isna(scan) else 0 - - -def get_run_mztab(ms_run: str, metadata: OrderedDict) -> str: - """ - Convert the ms_run into a reference file for merging with msstats output - :param ms_run: ms_run index in mztab - :param metadata: metadata information in mztab - :return: file name - """ - m = re.search(r"\[([A-Za-z0-9_]+)\]", ms_run) - file_location = metadata["ms_run[" + str(m.group(1)) + "]-location"] - file_location = get_spectrum_prefix(file_location) - return os.path.basename(file_location) - - -def get_scan_mztab(ms_run: str) -> str: - """ - Get the scan number for an mzML spectrum in mzTab. The format of the reference - must be controllerType=0 controllerNumber=1 scan=30121 - :param ms_run: the original ms_run reference in mzTab - :return: the scan index - """ - reference_parts = ms_run.split() - return reference_parts[-1] - - -def best_probability_error_bestsearch_engine(probability: float) -> float: - """ - Convert probability to a Best search engine score - :param probability: probability - :return: - """ - return 1 - probability - - # Functions needed by Combiner def load_sdrf(sdrf_path: str) -> DataFrame: """ From 34395842baf8a6883f8325532763baee54795264 Mon Sep 17 00:00:00 2001 From: WangHong007 <88552471+WangHong007@users.noreply.github.com> Date: Sat, 25 Nov 2023 17:32:08 +0800 Subject: [PATCH 2/7] Unified log content --- bin/peptide_normalization.py | 107 +++++++++++------------------------ 1 file changed, 33 insertions(+), 74 deletions(-) diff --git a/bin/peptide_normalization.py b/bin/peptide_normalization.py index c5b4d34..39e952d 100644 --- a/bin/peptide_normalization.py +++ b/bin/peptide_normalization.py @@ -315,6 +315,7 @@ def intensity_normalization( dataset, index=[ PEPTIDE_SEQUENCE, + PEPTIDE_CANONICAL, PEPTIDE_CHARGE, FRACTION, RUN, @@ -333,6 +334,7 @@ def intensity_normalization( normalize_df = normalize_df.melt( id_vars=[ PEPTIDE_SEQUENCE, + PEPTIDE_CANONICAL, PEPTIDE_CHARGE, FRACTION, RUN, @@ -637,7 +639,8 @@ def peptide_normalization( ) dataset_df = dataset_df[dataset_df[PROTEIN_NAME].isin(strong_proteins)] - print_dataset_size(dataset_df, "Number of peptides: ", verbose) + print(f"Number of unique peptides: {len(unique_peptides)}") + print(f"Number of strong proteins: {len(strong_proteins)}") print("Logarithmic if specified..") dataset_df.loc[dataset_df.Intensity == 0, INTENSITY] = 1 @@ -721,32 +724,22 @@ def peptide_normalization( ) plt.show() pdf.savefig(box) - - print("Select the best peptidoform across fractions...") print( - "Number of peptides before peptidofrom selection: " + "Number of peptides after normalization: " + str(len(dataset_df.index)) ) + print("Select the best peptidoform across fractions...") dataset_df = get_peptidoform_normalize_intensities(dataset_df) print( "Number of peptides after peptidofrom selection: " + str(len(dataset_df.index)) ) - # Add the peptide sequence canonical without the modifications - if PEPTIDE_CANONICAL not in dataset_df.columns: - print("Add Canonical 
peptides to the dataframe...") - dataset_df[PEPTIDE_CANONICAL] = dataset_df[PEPTIDE_SEQUENCE].apply( - lambda x: get_canonical_peptide(x) - ) - print("Sum all peptidoforms per Sample...") - print("Number of peptides before sum selection: " + str(len(dataset_df.index))) dataset_df = sum_peptidoform_intensities(dataset_df) - print("Number of peptides after sum: " + str(len(dataset_df.index))) + print("Number of peptides after selection: " + str(len(dataset_df.index))) print("Average all peptidoforms per Peptide/Sample...") - print("Number of peptides before average: " + str(len(dataset_df.index))) dataset_df = average_peptide_intensities(dataset_df) print("Number of peptides after average: " + str(len(dataset_df.index))) @@ -779,10 +772,6 @@ def peptide_normalization( pdf.savefig(box) if remove_low_frequency_peptides: - print( - "Peptides before removing low frequency peptides: " - + str(len(dataset_df.index)) - ) print(dataset_df) dataset_df = remove_low_frequency_peptides_(dataset_df, 0.20) print_dataset_size( @@ -855,10 +844,11 @@ def peptide_normalization( # TODO: Stream processing to obtain strong proteins with more than 2 uniqe peptides temp = f"Temp-{str(uuid.uuid4())}/" os.mkdir(temp) - print(f"IBAQPY WARNING: Writing files into {temp}...") + print(f"INFO: Writing files into {temp}...") unique_peptides = {} group_intensities = {} quantile = {} + print("INFO: First iteration to get unique peptides and strong proteins...") for msstats_df in msstats_chunks: if parquet is None: msstats_df = msstats_common_process(msstats_df) @@ -872,7 +862,6 @@ def peptide_normalization( file_name = f"{temp}/{sample}.csv" write_mode = "a" if os.path.exists(file_name) else "w" header = False if os.path.exists(file_name) else True - print(result_df[SAMPLE_ID]) result_df[result_df[SAMPLE_ID] == sample].to_csv( file_name, index=False, header=header, mode=write_mode ) @@ -905,13 +894,11 @@ def peptide_normalization( pdf = PdfPages(qc_report) original_intensities_df = pd.DataFrame() + print("INFO: Second iteration to filter data and prepare normalization...") + print("Logarithmic if specified..") for sample in sample_names: - print( - f"{sample} -> Filter out proteins containing unique peptides fewer than {min_unique}.." - ) msstats_df = pd.read_csv(f"{temp}/{sample}.csv", sep=",") msstats_df = msstats_df[msstats_df[PROTEIN_NAME].isin(strong_proteins)] - print(f"{sample} -> Logarithmic if specified..") msstats_df.loc[msstats_df.Intensity == 0, INTENSITY] = 1 msstats_df[NORM_INTENSITY] = ( np.log2(msstats_df[INTENSITY]) if log2 else msstats_df[INTENSITY] @@ -987,20 +974,15 @@ def recalculate(original, count, value): pdf.savefig(box) del original_intensities_df - def normalization(dataset_df, label, sample, skip_normalization, nmethod): + def normalization(dataset_df, label, sample, skip_normalization, nmethod, record): # Remove high abundant and contaminants proteins and the outliers if remove_ids is not None: - print(f"{sample} -> Remove contaminants...") dataset_df = remove_protein_by_ids(dataset_df, remove_ids) if remove_decoy_contaminants: - print(f"{sample} -> Remove decoy and contaminants...") dataset_df = remove_contaminants_decoys(dataset_df) - print( - f"{sample} -> Peptides after contaminants removal: {len(dataset_df[PEPTIDE_SEQUENCE].unique().tolist())}" - ) + record[0] += len(dataset_df.index) if not skip_normalization: - print(f"{sample} -> Normalize intensities.. 
") field = NORM_INTENSITY if nmethod == "msstats": # For ISO normalization @@ -1041,60 +1023,27 @@ def normalization(dataset_df, label, sample, skip_normalization, nmethod): else np.nan, axis=1, ) - - print(f"{sample} -> Select the best peptidoform across fractions...") - print( - f"{sample} -> Number of peptides before peptidofrom selection: " - + str(len(dataset_df.index)) - ) + record[1] += len(dataset_df.index) dataset_df = get_peptidoform_normalize_intensities(dataset_df) - print( - f"{sample} -> Number of peptides after peptidofrom selection: " - + str(len(dataset_df.index)) - ) - - # Add the peptide sequence canonical without the modifications - if PEPTIDE_CANONICAL not in dataset_df.columns: - print(f"{sample} -> Add Canonical peptides to the dataframe...") - dataset_df[PEPTIDE_CANONICAL] = dataset_df[PEPTIDE_SEQUENCE].apply( - get_canonical_peptide - ) - - print(f"{sample} -> Sum all peptidoforms per Sample...") - print( - f"{sample} -> Number of peptides before sum selection: " - + str(len(dataset_df.index)) - ) + dataset_df = dataset_df[dataset_df[NORM_INTENSITY].notna()] + record[2] += len(dataset_df.index) dataset_df = sum_peptidoform_intensities(dataset_df) - print( - f"{sample} -> Number of peptides after sum: " - + str(len(dataset_df.index)) - ) - - print(f"{sample} -> Average all peptidoforms per Peptide/Sample...") - print( - f"{sample} -> Number of peptides before average: " - + str(len(dataset_df.index)) - ) + record[3] += len(dataset_df.index) dataset_df = average_peptide_intensities(dataset_df) - print( - f"{sample} -> Number of peptides after average: " - + str(len(dataset_df.index)) - ) + record[4] += len(dataset_df.index) dataset_df = dataset_df.drop_duplicates() - dataset_df = dataset_df[dataset_df[NORM_INTENSITY].notna()] - return dataset_df + return dataset_df, record # TODO: Peptide intensity normalization peptides_count = {} + size_record = [0] * 5 norm_intensities_df = pd.DataFrame() if not skip_normalization: if nmethod == "qnorm": norm_intensity = {k: v[0] for k, v in quantile.items()} elif nmethod == "msstats": # For ISO normalization - print(f"Label -> {label}") if label in ["TMT", "ITRAQ"]: median_baseline = np.median( list(set(sum(group_intensities.values(), []))) @@ -1118,11 +1067,12 @@ def normalization(dataset_df, label, sample, skip_normalization, nmethod): key: np.median(values) for key, values in group_intensities.items() } + print("INFO: Third iteration to normalize and counting peptides frequency...") for sample in sample_names: dataset_df = pd.read_csv(f"{temp}/{sample}.csv", sep=",") if len(dataset_df) != 0: - norm_df = normalization( - dataset_df, label, sample, skip_normalization, nmethod + norm_df, size_record = normalization( + dataset_df, label, sample, skip_normalization, nmethod, size_record ) else: continue @@ -1136,6 +1086,11 @@ def normalization(dataset_df, label, sample, skip_normalization, nmethod): if sample in plot_samples: norm_intensities_df = pd.concat([norm_intensities_df, norm_df]) del group_intensities, quantile + print(f"Peptides after contaminants removal: {size_record[0]}") + print(f"Number of peptides after normalization: {size_record[1]}") + print(f"Number of peptides after peptidofrom selection: {size_record[2]}") + print(f"Number of peptides after selection: {size_record[3]}") + print(f"Number of peptides after average: {size_record[4]}") # Save normalized intensities QC plots norm_intensities_df = norm_intensities_df.reset_index(drop=True) log_after_norm = ( @@ -1168,7 +1123,7 @@ def normalization(dataset_df, label, 
sample, skip_normalization, nmethod): pdf.savefig(box) del norm_intensities_df, strong_proteins - print("IBAQPY WARNING: Writing normalized intensities into CSV...") + print("INFO: Writing normalized intensities into CSV...") if remove_low_frequency_peptides: sample_number = len(sample_names) min_sample = 1 if sample_number > 1 else 0 @@ -1181,6 +1136,7 @@ def normalization(dataset_df, label, sample, skip_normalization, nmethod): del peptides_count final_norm_intensities_df = pd.DataFrame() + size_record = 0 for sample in sample_names: dataset_df = pd.read_csv(f"{temp}/{sample}.csv", sep=",") if remove_low_frequency_peptides: @@ -1189,6 +1145,7 @@ def normalization(dataset_df, label, sample, skip_normalization, nmethod): dataset_df = dataset_df[ dataset_df[PEPTIDE_CANONICAL].isin(strong_peptides) ] + size_record += len(dataset_df.index) dataset_df = dataset_df[ [PEPTIDE_CANONICAL, PROTEIN_NAME, SAMPLE_ID, NORM_INTENSITY, CONDITION] ] @@ -1200,9 +1157,11 @@ def normalization(dataset_df, label, sample, skip_normalization, nmethod): final_norm_intensities_df = pd.concat( [final_norm_intensities_df, dataset_df] ) + print(f"Peptides after remove low frequency peptides: {size_record}") if remove_low_frequency_peptides: del strong_peptides + # TODO: No peptides intensity normalization applied in stream processing. # Save final normalized intensities QC plots log_after_norm = ( nmethod == "msstats" From 48e4354d65fc3cb02869ba168a73d2b94705e6a0 Mon Sep 17 00:00:00 2001 From: WangHong007 <88552471+WangHong007@users.noreply.github.com> Date: Wed, 29 Nov 2023 19:28:09 +0800 Subject: [PATCH 3/7] update peptide_normalization --- bin/peptide_normalization.py | 24 ++++++++++++------------ setup.py | 1 - 2 files changed, 12 insertions(+), 13 deletions(-) diff --git a/bin/peptide_normalization.py b/bin/peptide_normalization.py index 39e952d..9c08c82 100644 --- a/bin/peptide_normalization.py +++ b/bin/peptide_normalization.py @@ -268,6 +268,7 @@ def map_canonical_seq(data_df: pd.DataFrame) -> (pd.DataFrame, dict): FRAGMENT_ION, ISOTOPE_LABEL_TYPE, STUDY_ID, + SAMPLE_ID, ] ] data_df[CONDITION] = pd.Categorical(data_df[CONDITION]) @@ -345,7 +346,9 @@ def intensity_normalization( ] ) normalize_df.rename(columns={"value": NORM_INTENSITY}, inplace=True) - print(dataset.head()) + normalize_df = normalize_df[normalize_df[NORM_INTENSITY].notna()] + normalize_df = normalize_df.drop_duplicates() + print(normalize_df.head()) return normalize_df return dataset @@ -371,12 +374,10 @@ def remove_low_frequency_peptides_( ) # Count the number of null values in each row null_count = normalize_df.isnull().sum(axis=1) - # Find the rows that have null values above the threshold rows_to_drop = null_count[ null_count >= (1 - percentage_samples) * normalize_df.shape[1] ].index - # Drop the rows with too many null values normalize_df = normalize_df.drop(rows_to_drop) @@ -771,7 +772,7 @@ def peptide_normalization( plt.show() pdf.savefig(box) - if remove_low_frequency_peptides: + if remove_low_frequency_peptides and len(sample_names) > 1: print(dataset_df) dataset_df = remove_low_frequency_peptides_(dataset_df, 0.20) print_dataset_size( @@ -840,6 +841,7 @@ def peptide_normalization( parquet, batch_size=chunksize ) msstats_chunks = read_large_parquet(parquet, batch_size=chunksize) + sample_number = len(sample_names) # TODO: Stream processing to obtain strong proteins with more than 2 uniqe peptides temp = f"Temp-{str(uuid.uuid4())}/" @@ -1023,16 +1025,16 @@ def normalization(dataset_df, label, sample, skip_normalization, nmethod, 
record else np.nan, axis=1, ) + dataset_df = dataset_df.drop_duplicates() + dataset_df = dataset_df[dataset_df[NORM_INTENSITY].notna()] record[1] += len(dataset_df.index) dataset_df = get_peptidoform_normalize_intensities(dataset_df) - dataset_df = dataset_df[dataset_df[NORM_INTENSITY].notna()] record[2] += len(dataset_df.index) dataset_df = sum_peptidoform_intensities(dataset_df) record[3] += len(dataset_df.index) dataset_df = average_peptide_intensities(dataset_df) record[4] += len(dataset_df.index) - dataset_df = dataset_df.drop_duplicates() return dataset_df, record # TODO: Peptide intensity normalization @@ -1077,7 +1079,7 @@ def normalization(dataset_df, label, sample, skip_normalization, nmethod, record else: continue sample_peptides = norm_df[PEPTIDE_CANONICAL].unique().tolist() - if remove_low_frequency_peptides: + if remove_low_frequency_peptides and sample_number > 1: peptides_count = { peptide: peptides_count.get(peptide, 0) + 1 for peptide in sample_peptides @@ -1124,13 +1126,11 @@ def normalization(dataset_df, label, sample, skip_normalization, nmethod, record del norm_intensities_df, strong_proteins print("INFO: Writing normalized intensities into CSV...") - if remove_low_frequency_peptides: - sample_number = len(sample_names) - min_sample = 1 if sample_number > 1 else 0 + if remove_low_frequency_peptides and sample_number > 1: peptides_count = { k: v / sample_number for k, v in peptides_count.items() - if v / sample_number >= 0.2 and v > min_sample + if v / sample_number >= 0.80 } strong_peptides = list(peptides_count.keys()) del peptides_count @@ -1139,7 +1139,7 @@ def normalization(dataset_df, label, sample, skip_normalization, nmethod, record size_record = 0 for sample in sample_names: dataset_df = pd.read_csv(f"{temp}/{sample}.csv", sep=",") - if remove_low_frequency_peptides: + if remove_low_frequency_peptides and sample_number > 1: # Filter low-frequency peptides, which indicate whether the peptide occurs less than 20% in all samples or # only in one sample dataset_df = dataset_df[ diff --git a/setup.py b/setup.py index fb786c1..9952b3e 100644 --- a/setup.py +++ b/setup.py @@ -40,7 +40,6 @@ def readme(): ], scripts=['bin/compute_ibaq.py', 'bin/peptide_normalization.py', - 'bin/peptide_normalization_stream.py', 'bin/compute_tpa.py', 'bin/datasets_merger.py', 'bin/tsne_visualization.py', From 6333befeb813611f3b75c0d82327694af19c18d5 Mon Sep 17 00:00:00 2001 From: WangHong007 <88552471+WangHong007@users.noreply.github.com> Date: Wed, 29 Nov 2023 19:38:08 +0800 Subject: [PATCH 4/7] Update peptide_normalization.py --- bin/peptide_normalization.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/peptide_normalization.py b/bin/peptide_normalization.py index 9c08c82..51d62cd 100644 --- a/bin/peptide_normalization.py +++ b/bin/peptide_normalization.py @@ -620,7 +620,7 @@ def peptide_normalization( del feature_df, sdrf_df gc.collect() else: - dataset_df = pd.read_parquet(parquet)[FEATURE_COLUMNS] + dataset_df = pd.read_parquet(parquet)[PARQUET_COLUMNS] dataset_df = parquet_common_process(dataset_df, label, choice) dataset_df = data_common_process(dataset_df, min_aa) From 6ed03f89f4a167dbad155d8f16023cfee50afd06 Mon Sep 17 00:00:00 2001 From: WangHong007 <88552471+WangHong007@users.noreply.github.com> Date: Tue, 5 Dec 2023 15:21:26 +0800 Subject: [PATCH 5/7] Make sure to get same proteins and peptides in normal and stream ways using default quantile normalization --- README.md | 6 +- bin/peptide_normalization.py | 303 +++++++++++++++++++---------------- 
 requirements.txt             |   1 +
 3 files changed, 172 insertions(+), 138 deletions(-)

diff --git a/README.md b/README.md
index 2b36b1c..d6e4fb4 100644
--- a/README.md
+++ b/README.md
@@ -90,9 +90,9 @@ Options:
                                   properties for normalization
   --skip_normalization            Skip normalization step
   --nmethod TEXT                  Normalization method used to normalize
-                                  intensities for all samples (options: qnorm)
+                                  intensities for all samples (options: msstats, quantile, qnorm)
   --pnormalization                Normalize the peptide intensities using
-                                  different methods (options: qnorm)
+                                  different methods (options: quantile, qnorm)
   --compress                      Read the input peptides file in compress gzip
                                   file
   --log2                          Transform to log2 the peptide intensity
@@ -131,7 +131,7 @@ The first step is to remove contaminants and decoys. The script `peptide_normali
 
 A peptidoform is a combination of a `PeptideSequence(Modifications) + Charge + BioReplicate + Fraction`. In the current version of the file, each row corresponds to one peptidoform.
 
-The current version of the tool uses the package [qnorm](https://pypi.org/project/qnorm/) to normalize the intensities for each peptidoform. **qnorm** implements a quantile normalization method.
+The current version of the tool uses the package [qnorm](https://pypi.org/project/qnorm/) to normalize the intensities for each peptidoform. **qnorm** implements a quantile normalization method. However, the current version of qnorm cannot handle NA values, which introduces even more NA values into the data. For now, we suggest using the default method 'quantile' instead.
 
 #### 3. Peptidoform to Peptide Summarization
 
diff --git a/bin/peptide_normalization.py b/bin/peptide_normalization.py
index 51d62cd..0e706f0 100644
--- a/bin/peptide_normalization.py
+++ b/bin/peptide_normalization.py
@@ -5,6 +5,7 @@
 import numpy as np
 import pandas as pd
 import qnorm
+from scipy.stats import rankdata
 import os
 import random
 import uuid
@@ -278,11 +279,32 @@ def map_canonical_seq(data_df: pd.DataFrame) -> (pd.DataFrame, dict):
     return data_df
 
 
+def quantile_normalize(
+    data: pd.DataFrame, index: pd.core.indexes, cols: pd.core.indexes
+) -> pd.DataFrame:
+    """Quantile normalization which considers NA values.
+
+    :param data: Pivoted dataframe containing the intensities for each peptide.
+    :param index: Dataframe index.
+    :param cols: Dataframe columns.
+    :return: Normalized dataframe.
+    """
+    data = data.to_numpy()
+    sorted_data = np.sort(data, axis=0)
+    row_quantiles = np.nanmean(sorted_data, axis=1)
+    rank_data = rankdata(data, axis=0, nan_policy="omit")
+    for i in range(data.shape[1]):
+        locs = (~np.isnan(data[:, i])).nonzero()
+        data[locs, i] = row_quantiles[[int(r) for r in (rank_data[locs, i] - 1)[0]]]
+
+    return pd.DataFrame(data, index=index, columns=cols)
+
+
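The helper added above is easiest to see on a tiny example. The following is a minimal, self-contained sketch of the same NA-aware quantile scheme; the toy peptide/sample table and all names in it are invented for illustration and are not part of the tool:

```python
# Sketch of the NA-aware quantile normalization implemented by the
# quantile_normalize() helper above, run on an invented toy table.
import numpy as np
import pandas as pd
from scipy.stats import rankdata

# Rows are peptides, columns are samples; NaN marks a missing intensity.
df = pd.DataFrame(
    {"S1": [10.0, 2.0, 9.0, 7.0], "S2": [4.0, 1.0, np.nan, 8.0]},
    index=["p1", "p2", "p3", "p4"],
)

data = df.to_numpy()
# Shared reference distribution: mean of the column-wise sorted values,
# ignoring NaN (np.sort pushes NaN to the end of each column).
row_quantiles = np.nanmean(np.sort(data, axis=0), axis=1)
# Rank observed values per column; NaN entries are left out of the ranking.
ranks = rankdata(data, axis=0, nan_policy="omit")
for i in range(data.shape[1]):
    obs = ~np.isnan(data[:, i])
    data[obs, i] = row_quantiles[(ranks[obs, i] - 1).astype(int)]

print(pd.DataFrame(data, index=df.index, columns=df.columns))
# p3/S2 stays NaN; every observed value maps onto the shared reference.
```

Each column keeps its NA pattern while its observed values are projected onto the common reference distribution; tied intensities receive averaged ranks, which the integer cast truncates downward, matching the `int()` cast in the helper.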
 def intensity_normalization(
     dataset: DataFrame,
     field: str,
-    class_field: str = "all",
-    scaling_method: str = "msstats",
+    class_field: str,
+    scaling_method: str = "quantile",
 ) -> DataFrame:
     # TODO add imputation and/or removal to those two norm strategies
     if scaling_method == "msstats":
@@ -309,7 +331,7 @@ def intensity_normalization(
         )
         return dataset
 
-    elif scaling_method == "qnorm":
+    else:
         # pivot to have one col per sample
         print("Transforming to wide format dataset size {}".format(len(dataset.index)))
         normalize_df = pd.pivot_table(
@@ -330,7 +352,12 @@ def intensity_normalization(
             aggfunc={field: np.mean},
             observed=True,
         )
-        normalize_df = qnorm.quantile_normalize(normalize_df, axis=1)
+        if scaling_method == "qnorm":
+            normalize_df = qnorm.quantile_normalize(normalize_df, axis=1)
+        elif scaling_method == "quantile":
+            normalize_df = quantile_normalize(
+                normalize_df, normalize_df.index, normalize_df.columns
+            )
         normalize_df = normalize_df.reset_index()
         normalize_df = normalize_df.melt(
             id_vars=[
@@ -414,26 +441,30 @@ def peptide_intensity_normalization(
     :param scaling_method: method to use for the normalization
     :return:
     """
+    # pivot to have one col per sample
+    normalize_df = pd.pivot_table(
+        dataset_df,
+        index=[PEPTIDE_CANONICAL, PROTEIN_NAME, CONDITION],
+        columns=class_field,
+        values=field,
+        aggfunc={field: np.mean},
+        observed=True,
+    )
     if scaling_method == "qnorm":
-        # pivot to have one col per sample
-        normalize_df = pd.pivot_table(
-            dataset_df,
-            index=[PEPTIDE_CANONICAL, PROTEIN_NAME, CONDITION],
-            columns=class_field,
-            values=field,
-            aggfunc={field: np.mean},
-            observed=True,
-        )
         normalize_df = qnorm.quantile_normalize(normalize_df, axis=1)
-        normalize_df = normalize_df.reset_index()
-        normalize_df = normalize_df.melt(
-            id_vars=[PEPTIDE_CANONICAL, PROTEIN_NAME, CONDITION]
+    elif scaling_method == "quantile":
+        normalize_df = quantile_normalize(
+            normalize_df, normalize_df.index, normalize_df.columns
         )
-        normalize_df.rename(columns={"value": NORM_INTENSITY}, inplace=True)
-        normalize_df = normalize_df[normalize_df[NORM_INTENSITY].notna()]
-        return normalize_df
-
-    return dataset_df
+    else:
+        exit("Peptide intensity normalization works only with qnorm or quantile methods!")
+    normalize_df = normalize_df.reset_index()
+    normalize_df = normalize_df.melt(
+        id_vars=[PEPTIDE_CANONICAL, PROTEIN_NAME, CONDITION]
+    )
+    normalize_df.rename(columns={"value": NORM_INTENSITY}, inplace=True)
+    normalize_df = normalize_df[normalize_df[NORM_INTENSITY].notna()]
+    return normalize_df
 
 
 def impute_peptide_intensities(dataset_df, field, class_field):
@@ -530,8 +561,8 @@ def impute_peptide_intensities(dataset_df, field, class_field):
 )
 @click.option(
     "--nmethod",
-    help="Normalization method used to normalize intensities for all samples (options: qnorm)",
-    default="qnorm",
+    help="Normalization method used to normalize intensities for all samples (options: quantile, msstats, qnorm)",
+    default="quantile",
 )
 @click.option(
     "--pnormalization",
@@ -725,10 +756,7 @@ def peptide_normalization(
         )
         plt.show()
         pdf.savefig(box)
-        print(
-            "Number of peptides after normalization: "
-            + 
str(len(dataset_df.index)) - ) + print("Number of peptides after normalization: " + str(len(dataset_df.index))) print("Select the best peptidoform across fractions...") dataset_df = get_peptidoform_normalize_intensities(dataset_df) print( @@ -898,14 +926,20 @@ def peptide_normalization( print("INFO: Second iteration to filter data and prepare normalization...") print("Logarithmic if specified..") + norm_record = [0] * 2 for sample in sample_names: msstats_df = pd.read_csv(f"{temp}/{sample}.csv", sep=",") msstats_df = msstats_df[msstats_df[PROTEIN_NAME].isin(strong_proteins)] + # Remove high abundant and contaminants proteins and the outliers + if remove_ids is not None: + msstats_df = remove_protein_by_ids(msstats_df, remove_ids) + if remove_decoy_contaminants: + msstats_df = remove_contaminants_decoys(msstats_df) + norm_record[0] += len(msstats_df) msstats_df.loc[msstats_df.Intensity == 0, INTENSITY] = 1 msstats_df[NORM_INTENSITY] = ( np.log2(msstats_df[INTENSITY]) if log2 else msstats_df[INTENSITY] ) - msstats_df.to_csv(f"{temp}/{sample}.csv", index=False, sep=",") if sample in plot_samples: original_intensities_df = pd.concat( [original_intensities_df, msstats_df] @@ -927,31 +961,49 @@ def peptide_normalization( + group_intensity } ) - elif nmethod == "qnorm": - - def recalculate(original, count, value): - return (original * count + value) / (count + 1), count + 1 - - dic = ( - msstats_df[NORM_INTENSITY] - .dropna() - .sort_values(ascending=False) - .reset_index(drop=True) - .to_dict() + elif nmethod == "quantile": + msstats_df = ( + msstats_df.groupby( + [ + PEPTIDE_SEQUENCE, + PEPTIDE_CANONICAL, + PEPTIDE_CHARGE, + FRACTION, + RUN, + BIOREPLICATE, + PROTEIN_NAME, + STUDY_ID, + CONDITION, + ] + )[NORM_INTENSITY] + .agg(np.mean) + .reset_index() ) + rank = msstats_df[NORM_INTENSITY].rank(method="average") + dic = dict(zip(rank, msstats_df[NORM_INTENSITY])) if len(quantile) == 0: quantile = {k: (v, 1) for k, v in dic.items()} else: - update = min(len(quantile), len(dic)) - for i in range(0, update): - original, count = quantile[i] - quantile[i] = recalculate(original, count, dic[i]) - if len(dic) <= len(quantile): - continue - else: - quantile.update( - {k: (v, 1) for k, v in dic.items() if k >= update} - ) + # update = min(len(quantile), len(dic)) + intersec = set(quantile.keys()) & set(dic.keys()) + update = set(dic.keys()) - set(quantile.keys()) + quantile.update( + { + i: (quantile[i][0] + dic[i], quantile[i][1] + 1) + for i in intersec + } + ) + if len(update) > 0: + quantile.update({k: (dic[k], 1) for k in update}) + msstats_df[SAMPLE_ID] = sample + else: + exit("Stream process only supports msstats and quantile methods!") + msstats_df.to_csv(f"{temp}/{sample}.csv", index=False, sep=",") + norm_record[1] += len(msstats_df) + if not skip_normalization and nmethod == "quantile": + quantile = {k: v[0] / v[1] for k, v in quantile.items()} + print(f"Peptides after contaminants removal: {norm_record[0]}") + print(f"Number of peptides after normalization: {norm_record[1]}") # Save original intensities QC plots original_intensities_df = original_intensities_df.reset_index(drop=True) density = plot_distributions( @@ -976,14 +1028,41 @@ def recalculate(original, count, value): pdf.savefig(box) del original_intensities_df - def normalization(dataset_df, label, sample, skip_normalization, nmethod, record): - # Remove high abundant and contaminants proteins and the outliers - if remove_ids is not None: - dataset_df = remove_protein_by_ids(dataset_df, remove_ids) - if remove_decoy_contaminants: 
- dataset_df = remove_contaminants_decoys(dataset_df) - record[0] += len(dataset_df.index) + # TODO: Peptide intensity normalization + peptides_count = pd.DataFrame( + columns=[PROTEIN_NAME, PEPTIDE_CANONICAL, "count"] + ) + norm_intensities_df = pd.DataFrame() + if not skip_normalization and nmethod == "msstats": + # For ISO normalization + if label in ["TMT", "ITRAQ"]: + median_baseline = np.median( + list(set(sum(group_intensities.values(), []))) + ) + group_intensities = { + key: np.median(list(values)) + for key, values in group_intensities.items() + } + else: + fractions = [i[1] for i in group_intensities.keys()] + fraction_median = {} + for fraction in fractions: + fraction_keys = [ + i for i in group_intensities.keys() if i[1] == fraction + ] + fraction_intensities = [] + for key in fraction_keys: + fraction_intensities.extend(group_intensities[key]) + fraction_median[fraction] = np.median(fraction_intensities) + group_intensities = { + key: np.median(values) for key, values in group_intensities.items() + } + print("INFO: Third iteration to normalize and counting peptides frequency...") + size_record = [0] * 3 + def normalization( + dataset_df, label, sample, skip_normalization, nmethod, record + ): if not skip_normalization: field = NORM_INTENSITY if nmethod == "msstats": @@ -1008,68 +1087,25 @@ def normalization(dataset_df, label, sample, skip_normalization, nmethod, record ), axis=1, ) - elif nmethod == "qnorm": - # pivot to have one col per sample - ref_dict = ( - dataset_df[NORM_INTENSITY] - .dropna() - .drop_duplicates() - .sort_values(ascending=False) - .reset_index(drop=True) - .to_dict() - ) - ref_dict = {v: norm_intensity[k] for k, v in ref_dict.items()} + elif nmethod == "quantile": + rank = dataset_df[NORM_INTENSITY].rank(method="average") + ref_dict = dict(zip(rank, dataset_df[NORM_INTENSITY])) + ref_dict = {v: quantile[k] for k, v in ref_dict.items()} dataset_df.loc[:, NORM_INTENSITY] = dataset_df.apply( - lambda x: ref_dict[x[NORM_INTENSITY]] - if x[NORM_INTENSITY] in ref_dict.keys() - else np.nan, + lambda x: ref_dict.get(x[NORM_INTENSITY], np.nan), axis=1, ) dataset_df = dataset_df.drop_duplicates() dataset_df = dataset_df[dataset_df[NORM_INTENSITY].notna()] - record[1] += len(dataset_df.index) dataset_df = get_peptidoform_normalize_intensities(dataset_df) - record[2] += len(dataset_df.index) + record[0] += len(dataset_df.index) dataset_df = sum_peptidoform_intensities(dataset_df) - record[3] += len(dataset_df.index) + record[1] += len(dataset_df.index) dataset_df = average_peptide_intensities(dataset_df) - record[4] += len(dataset_df.index) + record[2] += len(dataset_df.index) return dataset_df, record - # TODO: Peptide intensity normalization - peptides_count = {} - size_record = [0] * 5 - norm_intensities_df = pd.DataFrame() - if not skip_normalization: - if nmethod == "qnorm": - norm_intensity = {k: v[0] for k, v in quantile.items()} - elif nmethod == "msstats": - # For ISO normalization - if label in ["TMT", "ITRAQ"]: - median_baseline = np.median( - list(set(sum(group_intensities.values(), []))) - ) - group_intensities = { - key: np.median(list(values)) - for key, values in group_intensities.items() - } - else: - fractions = [i[1] for i in group_intensities.keys()] - fraction_median = {} - for fraction in fractions: - fraction_keys = [ - i for i in group_intensities.keys() if i[1] == fraction - ] - fraction_intensities = [] - for key in fraction_keys: - fraction_intensities.extend(group_intensities[key]) - fraction_median[fraction] = 
np.median(fraction_intensities) - group_intensities = { - key: np.median(values) - for key, values in group_intensities.items() - } - print("INFO: Third iteration to normalize and counting peptides frequency...") for sample in sample_names: dataset_df = pd.read_csv(f"{temp}/{sample}.csv", sep=",") if len(dataset_df) != 0: @@ -1080,25 +1116,27 @@ def normalization(dataset_df, label, sample, skip_normalization, nmethod, record continue sample_peptides = norm_df[PEPTIDE_CANONICAL].unique().tolist() if remove_low_frequency_peptides and sample_number > 1: - peptides_count = { - peptide: peptides_count.get(peptide, 0) + 1 - for peptide in sample_peptides - } + sample_peptides = norm_df[ + [PROTEIN_NAME, PEPTIDE_CANONICAL] + ].drop_duplicates() + sample_peptides["count"] = 1 + peptides_count = ( + pd.concat([peptides_count, sample_peptides]) + .groupby([PROTEIN_NAME, PEPTIDE_CANONICAL]) + .agg(sum) + .reset_index() + ) norm_df.to_csv(f"{temp}/{sample}.csv", sep=",", index=False) if sample in plot_samples: norm_intensities_df = pd.concat([norm_intensities_df, norm_df]) del group_intensities, quantile - print(f"Peptides after contaminants removal: {size_record[0]}") - print(f"Number of peptides after normalization: {size_record[1]}") - print(f"Number of peptides after peptidofrom selection: {size_record[2]}") - print(f"Number of peptides after selection: {size_record[3]}") - print(f"Number of peptides after average: {size_record[4]}") + print(f"Number of peptides after peptidofrom selection: {size_record[0]}") + print(f"Number of peptides after selection: {size_record[1]}") + print(f"Number of peptides after average: {size_record[2]}") # Save normalized intensities QC plots norm_intensities_df = norm_intensities_df.reset_index(drop=True) - log_after_norm = ( - nmethod == "msstats" - or nmethod == "qnorm" - or ((nmethod == "quantile" or nmethod == "robust") and not log2) + log_after_norm = nmethod == "msstats" or ( + (nmethod == "quantile" or nmethod == "robust") and not log2 ) density = plot_distributions( norm_intensities_df, @@ -1127,13 +1165,10 @@ def normalization(dataset_df, label, sample, skip_normalization, nmethod, record print("INFO: Writing normalized intensities into CSV...") if remove_low_frequency_peptides and sample_number > 1: - peptides_count = { - k: v / sample_number - for k, v in peptides_count.items() - if v / sample_number >= 0.80 - } - strong_peptides = list(peptides_count.keys()) - del peptides_count + peptides_count = peptides_count.loc[ + (peptides_count["count"] > 0.20 * sample_number) + & (peptides_count["count"] != sample_number - 1) + ] final_norm_intensities_df = pd.DataFrame() size_record = 0 @@ -1142,9 +1177,9 @@ def normalization(dataset_df, label, sample, skip_normalization, nmethod, record if remove_low_frequency_peptides and sample_number > 1: # Filter low-frequency peptides, which indicate whether the peptide occurs less than 20% in all samples or # only in one sample - dataset_df = dataset_df[ - dataset_df[PEPTIDE_CANONICAL].isin(strong_peptides) - ] + dataset_df = dataset_df.merge( + peptides_count[[PEPTIDE_CANONICAL, PROTEIN_NAME]], how="inner" + ) size_record += len(dataset_df.index) dataset_df = dataset_df[ [PEPTIDE_CANONICAL, PROTEIN_NAME, SAMPLE_ID, NORM_INTENSITY, CONDITION] @@ -1159,14 +1194,12 @@ def normalization(dataset_df, label, sample, skip_normalization, nmethod, record ) print(f"Peptides after remove low frequency peptides: {size_record}") if remove_low_frequency_peptides: - del strong_peptides + del peptides_count # TODO: No peptides intensity 
normalization applied in stream processing. # Save final normalized intensities QC plots - log_after_norm = ( - nmethod == "msstats" - or nmethod == "qnorm" - or ((nmethod == "quantile" or nmethod == "robust") and not log2) + log_after_norm = nmethod == "msstats" or ( + (nmethod == "quantile" or nmethod == "robust") and not log2 ) final_norm_intensities_df = final_norm_intensities_df.reset_index(drop=True) density = plot_distributions( diff --git a/requirements.txt b/requirements.txt index 036b3da..d4d8772 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,6 +6,7 @@ pandas==2.0.2 matplotlib pyarrow qnorm +scipy==1.11.4 seaborn typing_extensions==4.6.3 setuptools==67.8.0 \ No newline at end of file From 102383ae5f1893b2045c464073edf5e2da4b9ff4 Mon Sep 17 00:00:00 2001 From: WangHong007 <88552471+WangHong007@users.noreply.github.com> Date: Tue, 5 Dec 2023 15:28:56 +0800 Subject: [PATCH 6/7] Update requirements.txt --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index d4d8772..b99850a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,7 +6,7 @@ pandas==2.0.2 matplotlib pyarrow qnorm -scipy==1.11.4 +scipy>=1.10 seaborn typing_extensions==4.6.3 setuptools==67.8.0 \ No newline at end of file From d06252565d60b0252cf1999e2126801541a8b4d6 Mon Sep 17 00:00:00 2001 From: WangHong007 <88552471+WangHong007@users.noreply.github.com> Date: Wed, 6 Dec 2023 16:24:46 +0800 Subject: [PATCH 7/7] Update peptide_normalization.py --- bin/peptide_normalization.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/bin/peptide_normalization.py b/bin/peptide_normalization.py index 0e706f0..b9fc6e2 100644 --- a/bin/peptide_normalization.py +++ b/bin/peptide_normalization.py @@ -279,6 +279,8 @@ def map_canonical_seq(data_df: pd.DataFrame) -> (pd.DataFrame, dict): return data_df +# TODO: Here we present a method to apply quantile normalization. However, +# this function runs slowly and needs to be improved. def quantile_normalize( data: pd.DataFrame, index: pd.core.indexes, cols: pd.core.indexes ) -> pd.DataFrame: @@ -358,6 +360,9 @@ def intensity_normalization( normalize_df = quantile_normalize( normalize_df, normalize_df.index, normalize_df.columns ) + # TODO: When restoring the pivot table here, the previous grouping caused + # the dataframe to produce a large number of rows with NORM_INTENSITY of + # NA at melt. This results in an unbearable memory consumption. normalize_df = normalize_df.reset_index() normalize_df = normalize_df.melt( id_vars=[ @@ -457,7 +462,9 @@ def peptide_intensity_normalization( normalize_df, normalize_df.index, normalize_df.columns ) else: - exit("Peptide intensity normalization works only with qnorm or quantile methods!") + exit( + "Peptide intensity normalization works only with qnorm or quantile methods!" + ) normalize_df = normalize_df.reset_index() normalize_df = normalize_df.melt( id_vars=[PEPTIDE_CANONICAL, PROTEIN_NAME, CONDITION]
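The rank-keyed `quantile` dictionary built in the stream mode's second iteration and applied per sample in the third (PATCH 5) follows a two-pass scheme, condensed below as a sketch. This is an illustration only: the `Temp-*/` per-sample CSV layout mirrors the script's temporary files, but the glob pattern and the `NormIntensity` column name are stand-ins for internal constants, not a public API.

```python
# Condensed two-pass sketch of the streaming quantile normalization used in
# stream mode. Pass 1 accumulates a rank -> (sum, count) reference across the
# per-sample CSVs; pass 2 maps each sample back onto the averaged reference.
import glob

import numpy as np
import pandas as pd

reference = {}  # rank -> (running sum of intensities, number of samples seen)
for path in glob.glob("Temp-*/*.csv"):
    intensities = pd.read_csv(path)["NormIntensity"].dropna()
    ranks = intensities.rank(method="average")
    for r, v in zip(ranks, intensities):
        s, c = reference.get(r, (0.0, 0))
        reference[r] = (s + v, c + 1)

# Average the accumulated intensities at each rank.
reference = {r: s / c for r, (s, c) in reference.items()}

for path in glob.glob("Temp-*/*.csv"):
    df = pd.read_csv(path)
    ranks = df["NormIntensity"].rank(method="average")
    # Ranks never seen during pass 1 fall back to NaN, as in the script.
    df["NormIntensity"] = [reference.get(r, np.nan) for r in ranks]
    df.to_csv(path, index=False)
```

Keeping only rank statistics means memory grows with the number of distinct ranks rather than with the full peptide-by-sample table, which is how the stream mode approximates the in-memory quantile normalization over all samples.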