Merge pull request #19 from blab/multiple-alignment-feature

Multiple alignment feature
blab · Jun 3, 2024 · 900653f · 900653f
2 parents bd0556b + 7530d08
commit 900653f
Show file tree

Hide file tree

Showing 27 changed files with 5,214 additions and 1,002 deletions.
diff --git a/CHANGES.md b/CHANGES.md
@@ -9,7 +9,8 @@
 
 ### Features
 
-* Add optional output from pathogen-embed that produces the boxplot figure of Euclidean by genetic distance ([#14][])
+* Add support for multiple alignment and/or distance matrix inputs to `pathogen-embed` ([#19][])
+* Add optional output from `pathogen-embed` that produces the boxplot figure of Euclidean by genetic distance ([#14][])
 
 ### Bug Fixes
 
@@ -18,6 +19,7 @@
 [#12]: https://github.com/blab/pathogen-embed/pull/12
 [#13]: https://github.com/blab/pathogen-embed/pull/13
 [#14]: https://github.com/blab/pathogen-embed/pull/14
+[#19]: https://github.com/blab/pathogen-embed/pull/19
 
 ## 1.1.2
 

diff --git a/README.md b/README.md
@@ -46,7 +46,7 @@ pathogen-embed \
   --output-figure tsne.pdf \
   --output-pairwise-distance-figure tsne_pairwise_distances.pdf \
   t-sne \
-    --perplexity 50.0
+    --perplexity 45.0
 ```
 
 The following figure shows the resulting embedding.
@@ -76,6 +76,53 @@ These unassigned samples receive a cluster label of "-1".
 
 If you know the minimum genetic distance you want to require between clusters, you can use the equation from the pairwise distance figure above to determine the corresponding minimum Euclidean distance to pass to `pathogen-cluster`'s `--distance-threshold` argument.
 
+## Example: Identify reassortment groups from multiple gene alignments
+
+To identify potential reassortment groups for viruses with segmented genomes, you can calculate one distance matrix per gene and pass multiple distance matrices to `pathogen-embed`.
+Internally, `pathogen-embed` sums the given distances matrices into a single matrix to use for an embedding.
+The clusters in the resulting embedding represent genetic diversity in each gene individually and potential reassortment between genes.
+The following example shows how to apply this approach to alignments for seasonal influenza A/H3N2 HA and NA.
+
+Calculate a separate distance matrix per gene alignment for HA and NA.
+
+```bash
+pathogen-distance \
+  --alignment tests/data/h3n2_ha_alignment.fasta \
+  --output ha_distances.csv
+
+pathogen-distance \
+  --alignment tests/data/h3n2_na_alignment.fasta \
+  --output na_distances.csv
+```
+
+Create a t-SNE embedding using the HA/NA alignments and distance matrices.
+The t-SNE embedding gets initialized by a PCA embedding from the alignments.
+
+```bash
+pathogen-embed \
+  --alignment tests/data/h3n2_ha_alignment.fasta tests/data/h3n2_na_alignment.fasta \
+  --distance-matrix ha_distances.csv na_distances.csv \
+  --output-dataframe tsne.csv \
+  --output-figure tsne.pdf \
+  --output-pairwise-distance-figure tsne_pairwise_distances.pdf \
+  t-sne \
+    --perplexity 45.0
+```
+
+Finally, find clusters in the embedding which represent the within and between diversity of the given HA and NA sequences.
+
+```bash
+pathogen-cluster \
+  --embedding tsne.csv \
+  --label-attribute tsne_label \
+  --output-dataframe tsne_with_clusters.csv \
+  --output-figure tsne_with_clusters.pdf
+```
+
+Compare the resulting embedding and clusters to the embedding above from only HA sequences, to get a sense of how including the NA sequences affects the results.
+
+![Example t-SNE embedding of seasonal influenza A/H3N2 hemagglutinin and neuraminidase sequences colored by the cluster label assigned by pathogen-cluster](images/example-tsne-ha-na-embedding-with-clusters.png)
+
 ## Build documentation
 
 Build the [Documentation](https://blab.github.io/pathogen-embed/):

diff --git a/images/example-tsne-embedding-with-clusters.png b/images/example-tsne-embedding-with-clusters.png
diff --git a/images/example-tsne-embedding.png b/images/example-tsne-embedding.png
diff --git a/images/example-tsne-ha-na-embedding-with-clusters.png b/images/example-tsne-ha-na-embedding-with-clusters.png
diff --git a/images/example-tsne-pairwise-distances.png b/images/example-tsne-pairwise-distances.png
diff --git a/src/pathogen_embed/__main__.py b/src/pathogen_embed/__main__.py
@@ -17,8 +17,8 @@ def autoOrFloat(values):
 def make_parser_embed():
     parser = argparse.ArgumentParser(description = "Reduced dimension embeddings for pathogen sequences", formatter_class=argparse.ArgumentDefaultsHelpFormatter)
 
-    parser.add_argument("--alignment", required = True, help="an aligned FASTA file to create a distance matrix with. Make sure the strain order in this file matches the order in the distance matrix.")
-    parser.add_argument("--distance-matrix", help="a distance matrix that can be read in by pandas, index column as row 0")
+    parser.add_argument("--alignment", nargs='+', help="one or more aligned FASTA files used for PCA, for PCA initialization of t-SNE, or to create distance matrices on the fly when distances are needed and none are provided.")
+    parser.add_argument("--distance-matrix", nargs='+', help="one or more distance matrix CSVs with sequence names in a header row and the first column.")
     parser.add_argument("--separator", default=",", help="separator between columns in the given distance matrix")
     parser.add_argument("--indel-distance", action="store_true", help="include insertions/deletions in genetic distance calculations")
     parser.add_argument("--random-seed", default = 314159, type=int, help="an integer used for reproducible results.")

diff --git a/src/pathogen_embed/pathogen_embed.py b/src/pathogen_embed/pathogen_embed.py
@@ -184,7 +184,6 @@ def get_hamming_distances(genomes, count_indels=False):
 
     return hamming_distances
 
-
 def embed(args):
     # Setting Random seed for numpy
     np.random.seed(seed=args.random_seed)
@@ -194,40 +193,83 @@ def embed(args):
         sys.exit(1)
 
     if args.alignment is None and args.command == "pca":
-        print("You must specify an alignment for pca, not a distance matrix", file=sys.stderr)
+        print("ERROR: PCA requires an alignment input to create the embedding.", file=sys.stderr)
         sys.exit(1)
-    # getting or creating the distance matrix
-    distance_matrix = None
-    if args.distance_matrix is not None:
-        if not args.distance_matrix.endswith('.csv'):
-            print("You must supply a CSV file for distance_matrix.", file=sys.stderr)
-            sys.exit(1)
-        else: 
-            distance_matrix  = pd.read_csv(args.distance_matrix, index_col=0)
 
-    if args.alignment is not None:
-        sequences_by_name = OrderedDict()
+    if args.alignment is None and args.command == "t-sne":
+        print("ERROR: t-SNE requires an alignment input to initialize the embedding.", file=sys.stderr)
+        sys.exit(1)
 
-        for sequence in Bio.SeqIO.parse(args.alignment, "fasta"):
-            sequences_by_name[sequence.id] = str(sequence.seq)
+    if args.distance_matrix is not None and not all((distance_matrix.endswith(".csv") for distance_matrix in args.distance_matrix)):
+        print("ERROR: The distance matrix input(s) must be in comma-separate value (CSV) format.", file=sys.stderr)
+        sys.exit(1)
+
+    if args.alignment is not None and args.distance_matrix is not None and len(args.alignment) != len(args.distance_matrix):
+        print("ERROR: If giving multiple alignments and distance matrices the number of both must match.", file=sys.stderr)
+        sys.exit(1)
 
-        sequence_names = list(sequences_by_name.keys())
-        if args.command != "pca" and distance_matrix is None:
-            # Calculate Distance Matrix
-            hamming_distances = get_hamming_distances(
-                list(sequences_by_name.values()),
+    # Load distance matrices, if they have been provided, and sum them across
+    # all inputs.
+    distance_matrix = None
+    if args.distance_matrix is not None and args.command != "pca":
+        distance_path = args.distance_matrix[0]
+        distance_df = pd.read_csv(distance_path, index_col=0).sort_index(axis=0).sort_index(axis=1)
+        distance_array = distance_df.values.astype(float)
+
+        # Add the numpy arrays element-wise
+        for distance_path in args.distance_matrix[1:]:
+            other_distance_df = pd.read_csv(distance_path, index_col=0).sort_index(axis=0).sort_index(axis=1)
+            if not np.array_equal(distance_df.index.values, other_distance_df.index.values):
+                print("ERROR: The given distance matrices do not have the same sequence names.", file=sys.stderr)
+                sys.exit(1)
+
+            other_distance_array = other_distance_df.values.astype(float)
+            distance_array = distance_array + other_distance_array
+
+        # Confirm that the distance matrix is square.
+        if distance_array.shape[0] != distance_array.shape[1]:
+            print("ERROR: Distance matrices must be square (with the same number of rows and columns).")
+            sys.exit(1)
+
+        distance_matrix = pd.DataFrame(distance_array, index=distance_df.index)
+
+    # If we have alignments but no distance matrices and we need distances for
+    # the method, calculate distances on the fly.
+    if args.alignment is not None and distance_matrix is None and args.command != "pca":
+        for alignment in args.alignment:
+            # Calculate a distance matrix on the fly and sort the matrix
+            # alphabetically by sequence name, so we can safely sum values
+            # across all inputs.
+            new_distance_matrix = calculate_distances_from_alignment(
+                alignment,
                 args.indel_distance,
+            ).sort_index(
+                axis=0,
+            ).sort_index(
+                axis=1,
             )
-            distance_matrix = pd.DataFrame(squareform(hamming_distances))
-            distance_matrix.index = sequence_names
+
+            if distance_matrix is None:
+                distance_matrix = new_distance_matrix
+            else:
+                # Confirm that we have the same strain names in the same order
+                # for each matrix.
+                if not np.array_equal(
+                    new_distance_matrix.index.values,
+                    distance_matrix.index.values,
+                ):
+                    print("ERROR: The given alignments do not have the same sequence names.", file=sys.stderr)
+                    sys.exit(1)
+
+                distance_matrix += new_distance_matrix
 
     # Load embedding parameters from an external CSV file, if possible.
     external_embedding_parameters = None
     if args.embedding_parameters is not None:
         if not args.embedding_parameters.endswith('.csv'):
             print("You must supply a CSV file for embedding parameters.", file=sys.stderr)
             sys.exit(1)
-        else: 
+        else:
             external_embedding_parameters_df = pd.read_csv(args.embedding_parameters)
 
         # Get a dictionary of additional parameters provided by the external
@@ -242,17 +284,47 @@ def embed(args):
 
     # Use PCA as its own embedding or as an initialization for t-SNE.
     if args.command == "pca" or args.command == "t-sne":
-        sequence_names = list(sequences_by_name.keys())
-
-        numbers = list(sequences_by_name.values())[:]
-        for i in range(0,len(list(sequences_by_name.values()))):
-            numbers[i] = re.sub(r'[^AGCT]', '5', numbers[i])
-            numbers[i] = list(numbers[i].replace('A','1').replace('G','2').replace('C', '3').replace('T','4'))
-            numbers[i] = [int(j) for j in numbers[i]]
-
-        genomes_df = pd.DataFrame(numbers)
-        genomes_df.columns = ["Site " + str(k) for k in range(0,len(numbers[i]))]
-
+        genomes_df = None
+        sequence_names = None
+        for alignment in args.alignment:
+
+            sequences_by_name = OrderedDict()
+
+            for sequence in Bio.SeqIO.parse(alignment, "fasta"):
+                sequences_by_name[sequence.id] = str(sequence.seq)
+
+            seq_sorted = sorted(list(sequences_by_name.keys()))
+            if (sequence_names is not None):
+                if (sequence_names != seq_sorted):
+                    print("ERROR: The given alignments do not have the same sequence names.", file=sys.stderr)
+                    sys.exit(1)
+
+            sequence_names = seq_sorted
+            numbers = []
+            for sequence_name in seq_sorted:
+                sequence = sequences_by_name[sequence_name]
+                sequence = re.sub(r'[^AGCT]', '5', sequence)
+                sequence = list(sequence.replace('A','1').replace('G','2').replace('C', '3').replace('T','4'))
+                sequence = [int(j) for j in sequence]
+                numbers.append(sequence)
+
+            if genomes_df is None:
+                genomes_df = pd.DataFrame(numbers)
+                genomes_df.columns = ["Site " + str(k) for k in range(0,len(numbers[0]))]
+            else:
+                second_df = pd.DataFrame(numbers)
+                second_df.columns = ["Site " + str(k) for k in range(genomes_df.shape[1],genomes_df.shape[1] + len(numbers[0]))]
+                genomes_df = pd.concat([genomes_df, second_df], axis=1)
+
+        # If we're using PCA to initialize the t-SNE embedding, confirm that the
+        # input alignments used for PCA have the same sequence names as the
+        # input distance matrices.
+        if (
+                args.command == "t-sne" and
+                not np.array_equal(distance_matrix.index.values, np.array(sequence_names))
+        ):
+            print("ERROR: The sequence names for the distance matrix inputs do not match the names in the alignment inputs.", file=sys.stderr)
+            sys.exit(1)
 
         #performing PCA on my pandas dataframe
         pca = PCA(
@@ -311,6 +383,7 @@ def embed(args):
                 embedding_parameters[key] = value_type(value)
 
     if args.command != "pca":
+        #TODO: distance matrices are no longer symmetrics/not square? Check into this
         embedder = embedding_class(**embedding_parameters)
         embedding = embedder.fit_transform(distance_matrix)
 
@@ -371,7 +444,10 @@ def embed(args):
         # Group Euclidean distances across the range of observed genetic
         # distances, so we can make a separate boxplot of Euclidean distances
         # per genetic distance value.
-        genetic_distance_range = range(genetic_distances.min(), genetic_distances.max() + 1)
+        genetic_distance_range = range(
+            int(genetic_distances.min()),
+            int(genetic_distances.max()) + 1,
+        )
         grouped_euclidean_distances = []
         for genetic_distance in genetic_distance_range:
             grouped_euclidean_distances.append(
@@ -416,9 +492,6 @@ def embed(args):
         ax.xaxis.set_major_formatter(distance_tick_formatter)
         ax.xaxis.set_major_locator(MultipleLocator(5))
 
-        ax.yaxis.set_major_formatter(distance_tick_formatter)
-        ax.yaxis.set_major_locator(MultipleLocator(5))
-
         ax.set_xlim(left=-1)
         ax.set_ylim(bottom=-1)
 
@@ -431,7 +504,7 @@ def cluster(args):
     if not args.embedding.endswith('.csv'):
         print("You must supply a CSV file for the embedding.", file=sys.stderr)
         sys.exit(1)
-    else: 
+    else:
         embedding_df = pd.read_csv(args.embedding, index_col=0)
 
     clustering_parameters = {
@@ -478,20 +551,27 @@ def cluster(args):
     if args.output_dataframe is not None:
         embedding_df.to_csv(args.output_dataframe, index_label="strain")
 
-def distance(args):
+def calculate_distances_from_alignment(alignment_path, indel_distance):
     sequences_by_name = OrderedDict()
 
-    for sequence in Bio.SeqIO.parse(args.alignment, "fasta"):
+    for sequence in Bio.SeqIO.parse(alignment_path, "fasta"):
         sequences_by_name[sequence.id] = str(sequence.seq)
 
     sequence_names = list(sequences_by_name.keys())
 
     hamming_distances = get_hamming_distances(
         list(sequences_by_name.values()),
-        args.indel_distance,
+        indel_distance,
     )
     distance_matrix = pd.DataFrame(squareform(hamming_distances))
     distance_matrix.index = sequence_names
     distance_matrix.columns = distance_matrix.index
 
+    return distance_matrix
+
+def distance(args):
+    distance_matrix = calculate_distances_from_alignment(
+        args.alignment,
+        args.indel_distance,
+    )
     distance_matrix.to_csv(args.output)