diff --git a/bin/peptide_file_generation.py b/bin/peptide_file_generation.py index 84e3381..a3c498c 100644 --- a/bin/peptide_file_generation.py +++ b/bin/peptide_file_generation.py @@ -56,6 +56,16 @@ def get_study_accession(sample_id: str) -> str: return sample_id.split('-')[0] +def get_reference_name(reference_spectrum: str) -> str: + """ + Get the reference name from Reference column. The function expected a reference name in the following format eg. + 20150820_Haura-Pilot-TMT1-bRPLC03-2.mzML_controllerType=0 controllerNumber=1 scan=16340 + :param reference_spectrum: + :return: reference name + """ + return re.split(r'\.mzML|\.MZML|\.raw|\.RAW', reference_spectrum)[0] + + def get_run_mztab(ms_run: str, metadata: OrderedDict) -> str: """ Convert the ms_run into a reference file for merging with msstats output @@ -123,7 +133,6 @@ def peptide_file_generation(msstats: str, sdrf: str, compress: bool, output: str # Read the msstats file msstats_df = pd.read_csv(msstats, sep=',', compression=compression_method) - msstats_df[REFERENCE] = msstats_df[REFERENCE].apply(remove_extension_file) msstats_df.rename( columns={'ProteinName': PROTEIN_NAME, 'PeptideSequence': PEPTIDE_SEQUENCE, 'PrecursorCharge': PEPTIDE_CHARGE, @@ -146,7 +155,32 @@ def peptide_file_generation(msstats: str, sdrf: str, compress: bool, output: str BIOREPLICATE, FRACTION, FRAGMENT_ION, ISOTOPE_LABEL_TYPE]] # Merged the SDRF with Resulted file - result_df = pd.merge(msstats_df, sdrf_df[['source name', REFERENCE]], how='left', on=[REFERENCE]) + labels = set(sdrf_df['comment[label]']) + if CHANNEL not in msstats_df.columns: + msstats_df[REFERENCE] = msstats_df[REFERENCE].apply(remove_extension_file) + result_df = pd.merge(msstats_df, sdrf_df[['source name', REFERENCE]], how='left', on=[REFERENCE]) + elif 'TMT' in ','.join(labels) or 'tmt' in ','.join(labels): + if (len(labels) > 11 or "TMT134N" in labels or "TMT133C" in labels + or "TMT133N" in labels or "TMT132C" in labels or "TMT132N" in labels): + choice = TMT16plex + elif len(labels) == 11 or "TMT131C" in labels: + choice = TMT11plex + elif len(labels) > 6: + choice = TMT10plex + else: + choice = TMT6plex + choice = pd.DataFrame.from_dict(choice, orient='index', columns=[CHANNEL]).reset_index().rename( + columns={'index': 'comment[label]'}) + sdrf_df = sdrf_df.merge(choice, on='comment[label]', how='left') + msstats_df[REFERENCE] = msstats_df[REFERENCE].apply(get_reference_name) + result_df = pd.merge(msstats_df, sdrf_df[['source name', REFERENCE, CHANNEL]], how='left', + on=[REFERENCE, CHANNEL]) + result_df.drop(CHANNEL, axis=1, inplace=True) + result_df.rename(columns={'Charge': PEPTIDE_CHARGE}, inplace=True) + else: + print("Warning: Only support label free and TMT experiment!") + exit(1) + result_df.rename(columns={'source name': SAMPLE_ID}, inplace=True) result_df[STUDY_ID] = result_df[SAMPLE_ID].apply(get_study_accession) diff --git a/bin/peptide_normalization.py b/bin/peptide_normalization.py index 7f4c081..64f18c9 100644 --- a/bin/peptide_normalization.py +++ b/bin/peptide_normalization.py @@ -18,6 +18,7 @@ def print_dataset_size(dataset: DataFrame, message: str, verbose: bool) -> None: if verbose: print(message + str(len(dataset.index))) + def print_help_msg(command) -> None: """ Print help information @@ -27,6 +28,7 @@ def print_help_msg(command) -> None: with click.Context(command) as ctx: click.echo(command.get_help(ctx)) + def remove_outliers_iqr(dataset: DataFrame): """ This method removes outliers from the dataframe inplace, the variable used for the outlier removal is Intensity @@ -39,6 +41,7 @@ def remove_outliers_iqr(dataset: DataFrame): dataset.query('(@Q1 - 1.5 * @IQR) <= Intensity <= (@Q3 + 1.5 * @IQR)', inplace=True) + def remove_missing_values(normalize_df: DataFrame, ratio: float = 0.3) -> DataFrame: """ Remove missing values if the peptide do not have values in the most of the samples @@ -50,40 +53,53 @@ def remove_missing_values(normalize_df: DataFrame, ratio: float = 0.3) -> DataFr normalize_df = normalize_df.dropna(thresh=round(n_samples * ratio)) return normalize_df -def get_canonical_peptide(peptide_sequence: str)-> str: + +def get_canonical_peptide(peptide_sequence: str) -> str: """ This function returns a peptide sequence without the modification information :param peptide_sequence: peptide sequence with mods :return: peptide sequence """ clean_peptide = re.sub("[\(\[].*?[\)\]]", "", peptide_sequence) - clean_peptide = clean_peptide.replace(".","") + clean_peptide = clean_peptide.replace(".", "") return clean_peptide -def intensity_normalization(dataset: DataFrame, field: str, class_field: str = "all", scaling_method: str = "msstats") -> DataFrame: - ## TODO add imputation and/or removal to those two norm strategies +def intensity_normalization(dataset: DataFrame, field: str, class_field: str = "all", + scaling_method: str = "msstats") -> DataFrame: + # TODO add imputation and/or removal to those two norm strategies if scaling_method == 'msstats': - g = dataset.groupby(['Run', 'Fraction'])[INTENSITY].apply(np.median) - g.name = 'RunMedian' - dataset = dataset.join(g, on=['Run', 'Fraction']) - dataset['FractionMedian'] = dataset['RunMedian'].groupby(dataset['Fraction']).transform('median') - dataset[NORM_INTENSITY] = dataset[INTENSITY] - dataset['RunMedian'] + dataset['FractionMedian'] + # For TMT normalization + if "Channel" in dataset.columns: + g = dataset.groupby(['Run', 'Channel'])[field].apply(np.median) + g.name = 'RunMedian' + dataset = dataset.join(g, on=['Run', 'Channel']) + median_baseline = dataset.drop_duplicates(subset=["Run", "Channel", field])[field].median() + dataset[NORM_INTENSITY] = dataset[field] - dataset['RunMedian'] + median_baseline + else: + g = dataset.groupby(['Run', 'Fraction'])[field].apply(np.median) + g.name = 'RunMedian' + dataset = dataset.join(g, on=['Run', 'Fraction']) + dataset['FractionMedian'] = dataset['RunMedian'].groupby(dataset['Fraction']).transform('median') + dataset[NORM_INTENSITY] = dataset[field] - dataset['RunMedian'] + dataset['FractionMedian'] return dataset elif scaling_method == 'qnorm': # pivot to have one col per sample - normalize_df = pd.pivot_table(dataset, index=[PEPTIDE_SEQUENCE, PEPTIDE_CHARGE, FRACTION, RUN, BIOREPLICATE, PROTEIN_NAME, STUDY_ID, CONDITION], + normalize_df = pd.pivot_table(dataset, index=[PEPTIDE_SEQUENCE, PEPTIDE_CHARGE, FRACTION, RUN, BIOREPLICATE, + PROTEIN_NAME, STUDY_ID, CONDITION], columns=class_field, values=field, aggfunc={field: np.mean}) normalize_df = qnorm.quantile_normalize(normalize_df, axis=1) normalize_df = normalize_df.reset_index() - normalize_df = normalize_df.melt(id_vars=[PEPTIDE_SEQUENCE, PEPTIDE_CHARGE, FRACTION, RUN, BIOREPLICATE, PROTEIN_NAME, STUDY_ID, CONDITION]) + normalize_df = normalize_df.melt( + id_vars=[PEPTIDE_SEQUENCE, PEPTIDE_CHARGE, FRACTION, RUN, BIOREPLICATE, PROTEIN_NAME, STUDY_ID, CONDITION]) normalize_df.rename(columns={'value': NORM_INTENSITY}, inplace=True) print(dataset.head()) return normalize_df return dataset + def get_peptidoform_normalize_intensities(dataset: DataFrame, higher_intensity: bool = True) -> DataFrame: """ Select the best peptidoform for the same sample and the same replicates. A peptidoform is the combination of @@ -94,12 +110,14 @@ def get_peptidoform_normalize_intensities(dataset: DataFrame, higher_intensity: """ dataset = dataset[dataset[NORM_INTENSITY].notna()] if higher_intensity: - dataset = dataset.loc[dataset.groupby([PEPTIDE_SEQUENCE, PEPTIDE_CHARGE, SAMPLE_ID, CONDITION, BIOREPLICATE])[NORM_INTENSITY].idxmax()].reset_index(drop=True) + dataset = dataset.loc[dataset.groupby([PEPTIDE_SEQUENCE, PEPTIDE_CHARGE, SAMPLE_ID, CONDITION, BIOREPLICATE])[ + NORM_INTENSITY].idxmax()].reset_index(drop=True) else: dataset = dataset.loc[dataset.groupby([PEPTIDE_SEQUENCE, PEPTIDE_CHARGE, SAMPLE_ID, CONDITION, BIOREPLICATE])[ SEARCH_ENGINE].idxmax()].reset_index(drop=True) return dataset + def sum_peptidoform_intensities(dataset: DataFrame) -> DataFrame: """ Sum the peptidoform intensities for all peptidofrom across replicates of the same sample. @@ -109,9 +127,12 @@ def sum_peptidoform_intensities(dataset: DataFrame) -> DataFrame: dataset = dataset[dataset[NORM_INTENSITY].notna()] normalize_df = dataset.groupby([PEPTIDE_CANONICAL, SAMPLE_ID, BIOREPLICATE, CONDITION])[NORM_INTENSITY].sum() normalize_df = normalize_df.reset_index() - normalize_df = pd.merge(normalize_df, dataset[[PROTEIN_NAME, PEPTIDE_CANONICAL, SAMPLE_ID, BIOREPLICATE, CONDITION]], how='left', on=[PEPTIDE_CANONICAL, SAMPLE_ID, BIOREPLICATE, CONDITION]) + normalize_df = pd.merge(normalize_df, + dataset[[PROTEIN_NAME, PEPTIDE_CANONICAL, SAMPLE_ID, BIOREPLICATE, CONDITION]], how='left', + on=[PEPTIDE_CANONICAL, SAMPLE_ID, BIOREPLICATE, CONDITION]) return normalize_df + def average_peptide_intensities(dataset: DataFrame) -> DataFrame: """ Median the intensities of all the peptidoforms for a specific peptide sample combination. @@ -120,9 +141,11 @@ def average_peptide_intensities(dataset: DataFrame) -> DataFrame: """ dataset_df = dataset.groupby([PEPTIDE_CANONICAL, SAMPLE_ID, CONDITION])[NORM_INTENSITY].median() dataset_df = dataset_df.reset_index() - dataset_df = pd.merge(dataset_df, dataset[[PROTEIN_NAME, PEPTIDE_CANONICAL, SAMPLE_ID, CONDITION]], how='left', on=[PEPTIDE_CANONICAL, SAMPLE_ID, CONDITION]) + dataset_df = pd.merge(dataset_df, dataset[[PROTEIN_NAME, PEPTIDE_CANONICAL, SAMPLE_ID, CONDITION]], how='left', + on=[PEPTIDE_CANONICAL, SAMPLE_ID, CONDITION]) return dataset_df + def remove_low_frequency_peptides(dataset_df: DataFrame, percentage_samples: float = 0.20): """ Remove peptides that are present in less than 20% of the samples. @@ -131,7 +154,7 @@ def remove_low_frequency_peptides(dataset_df: DataFrame, percentage_samples: flo :return: """ - normalize_df = pd.pivot_table(dataset_df,index=[PEPTIDE_CANONICAL, PROTEIN_NAME, CONDITION], + normalize_df = pd.pivot_table(dataset_df, index=[PEPTIDE_CANONICAL, PROTEIN_NAME], columns=SAMPLE_ID, values=NORM_INTENSITY, aggfunc={NORM_INTENSITY: np.mean}) # Count the number of null values in each row null_count = normalize_df.isnull().sum(axis=1) @@ -144,18 +167,22 @@ def remove_low_frequency_peptides(dataset_df: DataFrame, percentage_samples: flo # Remove rows with non-null values in only one column normalize_df = normalize_df[normalize_df.notnull().sum(axis=1) != normalize_df.shape[1] - 1] - normalize_df = normalize_df.reset_index() normalize_df = normalize_df.melt( - id_vars=[PEPTIDE_CANONICAL, PROTEIN_NAME, CONDITION]) + id_vars=[PEPTIDE_CANONICAL, PROTEIN_NAME]) normalize_df.rename(columns={'value': NORM_INTENSITY}, inplace=True) + # recover condition column + normalize_df = normalize_df.merge(dataset_df[[SAMPLE_ID, CONDITION]].drop_duplicates(subset=[SAMPLE_ID]), + on=SAMPLE_ID, how="left") + # Remove rows with null values in NORMALIZE_INTENSITY normalize_df = normalize_df[normalize_df[NORM_INTENSITY].notna()] print(normalize_df.head()) return normalize_df + def peptide_intensity_normalization(dataset_df: DataFrame, field: str, class_field: str, scaling_method: str): """ Normalize the peptide intensities using different methods. @@ -178,6 +205,7 @@ def peptide_intensity_normalization(dataset_df: DataFrame, field: str, class_fie return dataset_df + def impute_peptide_intensities(dataset_df, field, class_field): """ Impute the missing values using different methods. @@ -186,35 +214,52 @@ def impute_peptide_intensities(dataset_df, field, class_field): :param class_field: field to use as class :return: """ - # pivot to have one col per sample - normalize_df = pd.pivot_table(dataset_df, index=[PEPTIDE_CANONICAL, PROTEIN_NAME, CONDITION], - columns=class_field, values=field, aggfunc={field: np.mean}) - # Impute the missing values - imputer = MissForest(max_iter=5) - imputed_data = imputer.fit_transform(normalize_df) - normalize_df = pd.DataFrame(imputed_data, columns=normalize_df.columns, index=normalize_df.index) - - # Melt the dataframe - normalize_df = normalize_df.reset_index() - normalize_df = normalize_df.melt(id_vars=[PEPTIDE_CANONICAL, PROTEIN_NAME, CONDITION]) - normalize_df.rename(columns={'value': NORM_INTENSITY}, inplace=True) + normalize_df = pd.DataFrame() + # group by condition to detect missing values + for c, g in dataset_df.groupby(CONDITION): + # pivot to have one col per sample + group_normalize_df = pd.pivot_table(g, index=[PEPTIDE_CANONICAL, PROTEIN_NAME, CONDITION], + columns=class_field, values=field, aggfunc={field: np.mean}) + + # no missing values group -> only one sample + if len(group_normalize_df.columns) < 2: + group_normalize_df = group_normalize_df.reset_index() + group_normalize_df = group_normalize_df.melt(id_vars=[PEPTIDE_CANONICAL, PROTEIN_NAME, CONDITION]) + group_normalize_df.rename(columns={'value': NORM_INTENSITY}, inplace=True) + normalize_df = normalize_df.append(group_normalize_df, ignore_index=True) + else: + # Impute the missing values + imputer = MissForest(max_iter=5) + imputed_data = imputer.fit_transform(group_normalize_df) + group_normalize_df = pd.DataFrame(imputed_data, columns=group_normalize_df.columns, + index=group_normalize_df.index) + # Melt the dataframe + group_normalize_df = group_normalize_df.reset_index() + group_normalize_df = group_normalize_df.melt(id_vars=[PEPTIDE_CANONICAL, PROTEIN_NAME, CONDITION]) + group_normalize_df.rename(columns={'value': NORM_INTENSITY}, inplace=True) + normalize_df = normalize_df.append(group_normalize_df, ignore_index=True) + return normalize_df + @click.command() @click.option("--peptides", help="Peptides files from the peptide file generation tool") @click.option("--contaminants", help="Contaminants and high abundant proteins to be removed") @click.option("--output", help="Peptide intensity file including other all properties for normalization") -@click.option('--nmethod', help="Normalization method used to normalize intensities for all samples (options: qnorm)", default="qnorm") +@click.option('--nmethod', help="Normalization method used to normalize intensities for all samples (options: qnorm)", + default="qnorm") @click.option("--impute", help="Impute the missing values using MissForest", is_flag=True) -@click.option("--pnormalization", help="Normalize the peptide intensities using different methods (options: qnorm)", is_flag=True) -@click.option("--compress", help="Read the input peptides file in compress gzip file", is_flag= True) +@click.option("--pnormalization", help="Normalize the peptide intensities using different methods (options: qnorm)", + is_flag=True) +@click.option("--compress", help="Read the input peptides file in compress gzip file", is_flag=True) @click.option("--log2", help="Transform to log2 the peptide intensity values before normalization", is_flag=True) @click.option("--violin", help="Use violin plot instead of boxplot for distribution representations", is_flag=True) @click.option("--verbose", help="Print addition information about the distributions of the intensities, number of peptides remove " "after normalization, etc.", is_flag=True) -def peptide_normalization(peptides: str, contaminants: str, output: str, nmethod: str, impute: bool, pnormalization: bool, compress: bool, log2: bool, +def peptide_normalization(peptides: str, contaminants: str, output: str, nmethod: str, impute: bool, + pnormalization: bool, compress: bool, log2: bool, violin: bool, verbose: bool) -> None: """ Normalize the peptide intensities using different methods. @@ -261,10 +306,11 @@ def peptide_normalization(peptides: str, contaminants: str, output: str, nmethod print_dataset_size(dataset_df, "Peptides after contaminants removal: ", verbose) print("Normalize intensities.. ") - dataset_df = intensity_normalization(dataset_df, field=NORM_INTENSITY, class_field=SAMPLE_ID, scaling_method=nmethod) - + dataset_df = intensity_normalization(dataset_df, field=NORM_INTENSITY, class_field=SAMPLE_ID, + scaling_method=nmethod) if verbose: - log_after_norm = nmethod == "msstats" or nmethod == "qnorm" or ((nmethod == "quantile" or nmethod == "robust") and not log2) + log_after_norm = nmethod == "msstats" or nmethod == "qnorm" or ( + (nmethod == "quantile" or nmethod == "robust") and not log2) plot_distributions(dataset_df, NORM_INTENSITY, SAMPLE_ID, log2=log_after_norm) plot_box_plot(dataset_df, NORM_INTENSITY, SAMPLE_ID, log2=log_after_norm, title="Peptidoform intensity distribution after normalization, method: " + nmethod, violin=violin) @@ -289,7 +335,8 @@ def peptide_normalization(peptides: str, contaminants: str, output: str, nmethod print("Number of peptides after average: " + str(len(dataset_df.index))) if verbose: - log_after_norm = nmethod == "msstats" or nmethod == "qnorm" or ((nmethod == "quantile" or nmethod == "robust") and not log2) + log_after_norm = nmethod == "msstats" or nmethod == "qnorm" or ( + (nmethod == "quantile" or nmethod == "robust") and not log2) plot_distributions(dataset_df, NORM_INTENSITY, SAMPLE_ID, log2=log_after_norm) plot_box_plot(dataset_df, NORM_INTENSITY, SAMPLE_ID, log2=log_after_norm, title="Peptide intensity distribution method: " + nmethod, violin=violin) @@ -298,17 +345,18 @@ def peptide_normalization(peptides: str, contaminants: str, output: str, nmethod dataset_df = remove_low_frequency_peptides(dataset_df, 0.20) print_dataset_size(dataset_df, "Peptides after remove low frequency peptides: ", verbose) - # Perform imputation using Random Forest in Peptide Intensities if impute: - dataset_df = impute_peptide_intensities(dataset_df, field=NORM_INTENSITY, class_field=SAMPLE_ID) + dataset_df = impute_peptide_intensities(dataset_df, field=NORM_INTENSITY, class_field=SAMPLE_ID) if pnormalization: - print("Normalize at Peptide level...") - dataset_df = peptide_intensity_normalization(dataset_df, field=NORM_INTENSITY, class_field=SAMPLE_ID, scaling_method=nmethod) + print("Normalize at Peptide level...") + dataset_df = peptide_intensity_normalization(dataset_df, field=NORM_INTENSITY, class_field=SAMPLE_ID, + scaling_method=nmethod) if verbose: - log_after_norm = nmethod == "msstats" or nmethod == "qnorm" or ((nmethod == "quantile" or nmethod == "robust") and not log2) + log_after_norm = nmethod == "msstats" or nmethod == "qnorm" or ( + (nmethod == "quantile" or nmethod == "robust") and not log2) plot_distributions(dataset_df, NORM_INTENSITY, SAMPLE_ID, log2=log_after_norm) plot_box_plot(dataset_df, NORM_INTENSITY, SAMPLE_ID, log2=log_after_norm, title="Normalization at peptide level method: " + nmethod, violin=violin) @@ -317,6 +365,5 @@ def peptide_normalization(peptides: str, contaminants: str, output: str, nmethod dataset_df.to_csv(output, index=False, sep=',') - if __name__ == '__main__': peptide_normalization() diff --git a/ibaq/ibaqpy_commons.py b/ibaq/ibaqpy_commons.py index 765c78e..866e833 100644 --- a/ibaq/ibaqpy_commons.py +++ b/ibaq/ibaqpy_commons.py @@ -12,6 +12,9 @@ FRAGMENT_ION = 'FragmentIon' PRODUCT_CHARGE = 'ProductCharge' ISOTOPE_LABEL_TYPE = 'IsotopeLabelType' +CHANNEL = 'Channel' +MIXTRUE = 'Mixture' +TECHREPMIXTURE = 'TechRepMixture' CONDITION = 'Condition' BIOREPLICATE = 'BioReplicate' RUN = 'Run' @@ -30,6 +33,54 @@ IBAQ_LOG = 'IbaqLog' IBAQ_PPB = 'IbaqPpb' +TMT16plex = { + "TMT126": 1, + "TMT127N": 2, + "TMT127C": 3, + "TMT128N": 4, + "TMT128C": 5, + "TMT129N": 6, + "TMT129C": 7, + "TMT130N": 8, + "TMT130C": 9, + "TMT131N": 10, + "TMT131C": 11, + "TMT132N": 12, + "TMT132C": 13, + "TMT133N": 14, + "TMT133C": 15, + "TMT134N": 16, +} + +TMT11plex = { + "TMT126": 1, + "TMT127N": 2, + "TMT127C": 3, + "TMT128N": 4, + "TMT128C": 5, + "TMT129N": 6, + "TMT129C": 7, + "TMT130N": 8, + "TMT130C": 9, + "TMT131N": 10, + "TMT131C": 11, +} + +TMT10plex = { + "TMT126": 1, + "TMT127N": 2, + "TMT127C": 3, + "TMT128N": 4, + "TMT128C": 5, + "TMT129N": 6, + "TMT129C": 7, + "TMT130N": 8, + "TMT130C": 9, + "TMT131": 10, +} + +TMT6plex = {"TMT126": 1, "TMT127": 2, "TMT128": 3, "TMT129": 4, "TMT130": 5, "TMT131": 6} + def remove_contaminants_decoys(dataset: DataFrame, contaminants_file: str, protein_field=PROTEIN_NAME) -> DataFrame: """ @@ -53,6 +104,7 @@ def remove_contaminants_decoys(dataset: DataFrame, contaminants_file: str, prote return dataset[~dataset[protein_field].str.contains(cregex)] + def plot_distributions(dataset: DataFrame, field: str, class_field: str, log2: bool = True) -> None: """ Print the quantile plot for the dataset @@ -73,6 +125,7 @@ def plot_distributions(dataset: DataFrame, field: str, class_field: str, log2: b data_wide.plot.kde(figsize=(8, 6), linewidth=2, legend=False) pd.set_option('mode.chained_assignment', 'warn') + def plot_box_plot(dataset: DataFrame, field: str, class_field: str, log2: bool = False, weigth: int = 10, rotation: int = 45, title: str = "", violin: bool = False) -> None: """