Merge pull request #6 from ypriverol/master

q-values when Posterior Error probability
bigbio · May 13, 2022 · 9805a94 · 9805a94
2 parents a50a6fe + bcce411
commit 9805a94
Show file tree

Hide file tree

Showing 19 changed files with 126,385 additions and 192 deletions.
diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml
@@ -19,10 +19,10 @@ jobs:
 
     steps:
     - uses: actions/checkout@v3
-    - name: Set up Python 3.10
+    - name: Set up Python 3.7
       uses: actions/setup-python@v3
       with:
-        python-version: "3.10"
+        python-version: "3.7"
     - name: Install dependencies
       run: |
         python -m pip install --upgrade pip
@@ -34,6 +34,10 @@ jobs:
         flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
         # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
         flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
-    - name: Test with pytest
+    - name: Test peptide file generation
       run: |
-        pytest
+        python peptide_file_generation.py --mztab data/PXD020192-heart.mzTab.gz --msstats data/PXD020192-heart-msstats.tsv.gz --triqler data/PXD020192-heart-triqler.tsv.gz --sdrf data/PXD020192-heart.sdrf.tsv.gz --output data/PXD020192-Peptide-Intensities.tsv --compress
+    - name: Test with normalization
+      run: |
+        python peptide_normalization.py --log2 --peptides ./data/heart-grouped-Intensities.tsv --contaminants contaminants_ids.tsv --routliers --output data/heart-grouped-Intensities-Norm.tsv --verbose --nmethod qnorm
+
diff --git a/compute_ibaq.py b/compute_ibaq.py
@@ -7,7 +7,6 @@
 
 from ibaqpy_commons import remove_contaminants_decoys, PROTEIN_NAME, INTENSITY, CONDITION, IBAQ, IBAQ_LOG, IBAQ_PPB
 
-
 def print_help_msg(command):
     """
     Print the help of the command
@@ -17,7 +16,6 @@ def print_help_msg(command):
     with click.Context(command) as ctx:
         click.echo(command.get_help(ctx))
 
-
 def normalize_ibaq(res: DataFrame) -> DataFrame:
     """
     Normalize the ibaq values using the total ibaq of the sample. The resulted
@@ -35,7 +33,6 @@ def normalize_ibaq(res: DataFrame) -> DataFrame:
     res[IBAQ_PPB] = res[IBAQ].apply(lambda x: (x / total_ibaq) * 100000000)
     return res
 
-
 @click.command()
 @click.option("-f", "--fasta", help="Protein database to compute IBAQ values")
 @click.option("-p", "--peptides", help="Peptide identifications with intensities following the peptide intensity output")
@@ -109,6 +106,5 @@ def get_average_nr_peptides_unique_bygroup(pdrow: Series) -> Series:
 
     res.to_csv(output, index=False)
 
-
 if __name__ == '__main__':
     ibaq_compute()
diff --git a/data/PXD004682-Peptide-Intensities.tsv.gz b/data/PXD004682-Peptide-Intensities.tsv.gz
diff --git a/data/PXD004682-out.mztab.gz b/data/PXD004682-out.mztab.gz
diff --git a/data/PXD004682-out_msstats.csv.gz b/data/PXD004682-out_msstats.csv.gz
diff --git a/data/PXD004682-out_triqler.tsv.gz b/data/PXD004682-out_triqler.tsv.gz
diff --git a/data/PXD004682.sdrf.tsv.gz b/data/PXD004682.sdrf.tsv.gz
diff --git a/data/PXD008934-Peptide-Intensities.tsv.gz b/data/PXD008934-Peptide-Intensities.tsv.gz
diff --git a/data/PXD020192-Peptide-Intensities.tsv.gz b/data/PXD020192-Peptide-Intensities.tsv.gz
diff --git a/data/PXD020192-heart-msstats.tsv.gz b/data/PXD020192-heart-msstats.tsv.gz
diff --git a/data/PXD020192-heart-triqler.tsv.gz b/data/PXD020192-heart-triqler.tsv.gz
diff --git a/data/PXD020192-heart.mzTab.gz b/data/PXD020192-heart.mzTab.gz
diff --git a/data/PXD020192-heart.sdrf.tsv.gz b/data/PXD020192-heart.sdrf.tsv.gz
diff --git a/data/heart-grouped-Intensities.tsv b/data/heart-grouped-Intensities.tsv
diff --git a/ibaqpy_commons.py b/ibaqpy_commons.py
@@ -2,6 +2,7 @@
 
 PROTEIN_NAME = 'ProteinName'
 PEPTIDE_SEQUENCE = 'PeptideSequence'
+PEPTIDE_CANONICAL = "PeptideCanonical"
 PEPTIDE_CHARGE = 'PrecursorCharge'
 FRAGMENT_ION = 'FragmentIon'
 PRODUCT_CHARGE = 'ProductCharge'

diff --git a/merge_condition_generation.py b/merge_condition_generation.py
@@ -0,0 +1,46 @@
+import gzip
+import os
+import re
+import shutil
+
+import click
+import pandas as pd
+from typing_extensions import OrderedDict
+
+from ibaqpy_commons import *
+
+def print_help_msg(command) -> None:
+  """
+  Echo the help text of a click command.
+  :param command: click command whose help text is printed
+  :return: None
+  """
+  with click.Context(command) as context:
+    help_text = command.get_help(context)
+    click.echo(help_text)
+
+
+@click.command()
+@click.option("-i", "--input", help="Folder with all the Intensity files", required=True)
+@click.option("-o", "--output", help="Prefix name for the file to group by condition")
+@click.option("-p", "--pattern", help="Prefix of the pattern name for all the files in the folder")
+def merge_condition_generation(input: str, output: str, pattern: str) -> None:
+  """
+  Merge the peptide intensity files of a folder and write one file per condition.
+
+  Every file in ``input`` whose name contains ``pattern`` is read as a tab-separated table, the
+  tables are concatenated, the condition column is lower-cased and each condition is written to
+  ``<output>/<condition>-grouped-Intensities.tsv``.
+
+  :param input: Input folder containing all the peptide Intensity files
+  :param output: Folder where the per-condition files are written
+  :param pattern: Text a file name must contain to be merged; every file is used when omitted
+  :return: None
+  :raises ValueError: if no file in the input folder matches the pattern
+  """
+
+  # click passes None when --pattern is omitted; `None in name` would raise a TypeError.
+  # Sorting makes the row order of the merged table independent of the directory listing order.
+  files = sorted(f for f in os.listdir(input) if pattern is None or pattern in f)
+  if not files:
+    raise ValueError(f"No files matching pattern '{pattern}' found in folder '{input}'")
+
+  df_from_each_file = (pd.read_csv(os.path.join(input, f), sep="\t") for f in files)
+  concatenated_df = pd.concat(df_from_each_file, ignore_index=True)
+  concatenated_df[CONDITION] = concatenated_df[CONDITION].str.lower()
+  print(concatenated_df.head())
+
+  # Group by the bare column name: grouping by a one-element list makes newer pandas yield
+  # 1-tuples as keys, which would leak "('condition',)" into the output file names.
+  for condition, group in concatenated_df.groupby(CONDITION):
+    group.to_csv(f'{output}/{condition}-grouped-Intensities.tsv', index=False, sep='\t')
+
+
+# Run the click command when this module is executed as a script.
+if __name__ == '__main__':
+  merge_condition_generation()
diff --git a/peptide_combat_normalization.py b/peptide_combat_normalization.py
diff --git a/peptide_file_generation.py b/peptide_file_generation.py
@@ -143,8 +143,17 @@ def peptide_file_generation(triqler: str, msstats: str, mztab: str, sdrf: str, c
   psms_df = mztab_df.spectrum_match_table
   psms_df[REFERENCE] = psms_df['spectra_ref'].apply(get_run_mztab, metadata=mztab_df.metadata)
 
-  psms_df['psmSearchScore'] = psms_df['opt_global_Posterior_Error_Probability_score'].apply(
+  psms_df['psmSearchScore'] = None
+
+  # Prefer the posterior error probability column and fall back to the q-value column.
+  # Both branches must test column membership: a bare string literal is always truthy,
+  # which would make the final `else` unreachable and turn a missing score into a KeyError.
+  if "opt_global_Posterior_Error_Probability_score" in psms_df.columns:
+    psms_df['psmSearchScore'] = psms_df['opt_global_Posterior_Error_Probability_score'].apply(
     best_probability_error_bestsearch_engine)
+  elif "opt_global_q-value" in psms_df.columns:
+    psms_df['psmSearchScore'] = psms_df['opt_global_q-value'].apply(
+      best_probability_error_bestsearch_engine)
+  else:
+    raise Exception('The peptide quality score is not present in the mzTab')
+
   psms_df[SCAN] = psms_df['spectra_ref'].apply(get_scan_mztab)
   psms_df.rename(columns={'opt_global_cv_MS:1000889_peptidoform_sequence': PEPTIDE_SEQUENCE, 'charge': PEPTIDE_CHARGE,
                           'retention_time': RT}, inplace=True)