machine-learning.html


<!DOCTYPE html>

<html lang="en">
  <head>
    <meta charset="utf-8" />
    <meta name="viewport" content="width=device-width, initial-scale=1.0" />
    <title>Machine learning in bioinformatics &#8212; An Introduction to Applied Bioinformatics</title>
    
  <link href="_static/css/theme.css" rel="stylesheet" />
  <link href="_static/css/index.c5995385ac14fb8791e8eb36b4908be2.css" rel="stylesheet" />

    
  <link rel="stylesheet"
    href="_static/vendor/fontawesome/5.13.0/css/all.min.css">
  <link rel="preload" as="font" type="font/woff2" crossorigin
    href="_static/vendor/fontawesome/5.13.0/webfonts/fa-solid-900.woff2">
  <link rel="preload" as="font" type="font/woff2" crossorigin
    href="_static/vendor/fontawesome/5.13.0/webfonts/fa-brands-400.woff2">

    
    <link rel="stylesheet" href="_static/pygments.css" type="text/css" />
    <link rel="stylesheet" href="_static/sphinx-book-theme.acff12b8f9c144ce68a297486a2fa670.css" type="text/css" />
    <link rel="stylesheet" type="text/css" href="_static/togglebutton.css" />
    <link rel="stylesheet" type="text/css" href="_static/copybutton.css" />
    <link rel="stylesheet" type="text/css" href="_static/mystnb.css" />
    <link rel="stylesheet" type="text/css" href="_static/sphinx-thebe.css" />
    <link rel="stylesheet" type="text/css" href="_static/panels-main.c949a650a448cc0ae9fd3441c0e17fb0.css" />
    <link rel="stylesheet" type="text/css" href="_static/panels-variables.06eb56fa6e07937060861dad626602ad.css" />
    
  <link rel="preload" as="script" href="_static/js/index.1c5a1a01449ed65a7b51.js">

    <script id="documentation_options" data-url_root="./" src="_static/documentation_options.js"></script>
    <script src="_static/jquery.js"></script>
    <script src="_static/underscore.js"></script>
    <script src="_static/doctools.js"></script>
    <script src="_static/togglebutton.js"></script>
    <script src="_static/clipboard.min.js"></script>
    <script src="_static/copybutton.js"></script>
    <script >var togglebuttonSelector = '.toggle, .admonition.dropdown, .tag_hide_input div.cell_input, .tag_hide-input div.cell_input, .tag_hide_output div.cell_output, .tag_hide-output div.cell_output, .tag_hide_cell.cell, .tag_hide-cell.cell';</script>
    <script src="_static/sphinx-book-theme.12a9622fbb08dcb3a2a40b2c02b83a57.js"></script>
    <script async="async" src="https://unpkg.com/thebelab@latest/lib/index.js"></script>
    <script >
        const thebe_selector = ".thebe"
        const thebe_selector_input = "pre"
        const thebe_selector_output = ".output"
    </script>
    <script async="async" src="_static/sphinx-thebe.js"></script>
    <script async="async" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.7/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
    <script type="text/x-mathjax-config">MathJax.Hub.Config({"tex2jax": {"inlineMath": [["\\(", "\\)"]], "displayMath": [["\\[", "\\]"]], "processRefs": false, "processEnvironments": false}})</script>
    <link rel="index" title="Index" href="genindex.html" />
    <link rel="search" title="Search" href="search.html" />
    <link rel="prev" title="Sequence homology searching" href="database-searching.html" />
    <meta name="viewport" content="width=device-width, initial-scale=1" />
    <meta name="docsearch:language" content="en" />
    
  </head>
  <body data-spy="scroll" data-target="#bd-toc-nav" data-offset="80">
    
    <div class="container-fluid" id="banner"></div>

    
    <div class="container-xl">
      <div class="row">
          
<div class="col-12 col-md-3 bd-sidebar site-navigation show" id="site-navigation">
    
        <div class="navbar-brand-box">
    <a class="navbar-brand text-wrap" href="index.html">
      
      <img src="_static/logo.png" class="logo" alt="logo">
      
      
      <h1 class="site-logo" id="site-title">An Introduction to Applied Bioinformatics</h1>
      
    </a>
</div><form class="bd-search d-flex align-items-center" action="search.html" method="get">
  <i class="icon fas fa-search"></i>
  <input type="search" class="form-control" name="q" id="search-input" placeholder="Search this book..." aria-label="Search this book..." autocomplete="off" >
</form><nav class="bd-links" id="bd-docs-nav" aria-label="Main navigation">
    <div class="bd-toc-item active">
        <ul class="nav bd-sidenav">
 <li class="toctree-l1">
  <a class="reference internal" href="introduction.html">
   Introduction
  </a>
 </li>
</ul>
<ul class="current nav bd-sidenav">
 <li class="toctree-l1">
  <a class="reference internal" href="biological-information.html">
   Biological Information
  </a>
 </li>
 <li class="toctree-l1">
  <a class="reference internal" href="pairwise-alignment.html">
   Pairwise sequence alignment
  </a>
 </li>
 <li class="toctree-l1">
  <a class="reference internal" href="database-searching.html">
   Sequence homology searching
  </a>
 </li>
 <li class="toctree-l1 current active">
  <a class="current reference internal" href="#">
   Machine learning in bioinformatics
  </a>
 </li>
</ul>

    </div>
</nav> <!-- To handle the deprecated key -->

<div class="navbar_extra_footer">
  Powered by <a href="https://jupyterbook.org">Jupyter Book</a>
</div>

</div>


<main class="col py-md-3 pl-md-4 bd-content overflow-auto" role="main">
    
    <div class="topbar container-xl fixed-top">
    <div class="topbar-contents row">
        <div class="col-12 col-md-3 bd-topbar-whitespace site-navigation show"></div>
        <div class="col pl-md-4 topbar-main">
            
            <button id="navbar-toggler" class="navbar-toggler ml-0" type="button" data-toggle="collapse"
                data-placement="bottom" data-target=".site-navigation" aria-controls="site-navigation"
                aria-expanded="true" aria-label="Toggle navigation"
                title="Toggle navigation">
                <i class="fas fa-bars"></i>
                <i class="fas fa-arrow-left"></i>
                <i class="fas fa-arrow-up"></i>
            </button>
            
            
<div class="dropdown-buttons-trigger">
    <button id="dropdown-buttons-trigger" class="btn btn-secondary topbarbtn" aria-label="Download this page"><i
            class="fas fa-download"></i></button>

    <div class="dropdown-buttons">
        <!-- ipynb file if we had a myst markdown file -->
        <a class="dropdown-buttons"
            href="_sources/machine-learning.ipynb"><button type="button"
                class="btn btn-secondary topbarbtn" title="Download notebook file" data-toggle="tooltip"
                data-placement="left">.ipynb</button></a>
        <!-- Download raw file -->
        <a class="dropdown-buttons" href="_sources/machine-learning.md"><button type="button"
                class="btn btn-secondary topbarbtn" title="Download source file" data-toggle="tooltip"
                data-placement="left">.md</button></a>
        <!-- Download PDF via print -->
        <button type="button" id="download-print" class="btn btn-secondary topbarbtn" title="Print to PDF"
            onClick="window.print()" data-toggle="tooltip" data-placement="left">.pdf</button>
    </div>
</div>

            <!-- Source interaction buttons -->

            <!-- Full screen (wrap in <a> to have style consistency -->

<a class="full-screen-button"><button type="button" class="btn btn-secondary topbarbtn" data-toggle="tooltip"
        data-placement="bottom" onclick="toggleFullScreen()" aria-label="Fullscreen mode"
        title="Fullscreen mode"><i
            class="fas fa-expand"></i></button></a>

            <!-- Launch buttons -->

<div class="dropdown-buttons-trigger">
    <button id="dropdown-buttons-trigger" class="btn btn-secondary topbarbtn"
        aria-label="Launch interactive content"><i class="fas fa-rocket"></i></button>
    <div class="dropdown-buttons">
        
        <a class="binder-button" href="https://mybinder.org/v2/gh/applied-bioinformatics/iab2/main?urlpath=tree/book/machine-learning.md"><button type="button"
                class="btn btn-secondary topbarbtn" title="Launch Binder" data-toggle="tooltip"
                data-placement="left"><img class="binder-button-logo"
                    src="_static/images/logo_binder.svg"
                    alt="Interact on binder">Binder</button></a>
        
        
    </div>
</div>

        </div>

        <!-- Table of contents -->
        <div class="d-none d-md-block col-md-2 bd-toc show">
            
            <div class="tocsection onthispage pt-5 pb-3">
                <i class="fas fa-list"></i> Contents
            </div>
            <nav id="bd-toc-nav">
                <ul class="visible nav section-nav flex-column">
 <li class="toc-h2 nav-item toc-entry">
  <a class="reference internal nav-link" href="#the-feature-table">
   The feature table
  </a>
  <ul class="nav section-nav flex-column">
   <li class="toc-h3 nav-item toc-entry">
    <a class="reference internal nav-link" href="#the-iris-dataset">
     The Iris dataset
    </a>
   </li>
  </ul>
 </li>
 <li class="toc-h2 nav-item toc-entry">
  <a class="reference internal nav-link" href="#unsupervised-versus-supervised-learning-methods">
   Unsupervised versus supervised learning methods
  </a>
 </li>
 <li class="toc-h2 nav-item toc-entry">
  <a class="reference internal nav-link" href="#machine-learning-methods-applied-to-microbial-sequence-data">
   Machine learning methods applied to microbial sequence data
  </a>
 </li>
 <li class="toc-h2 nav-item toc-entry">
  <a class="reference internal nav-link" href="#unsupervised-learning">
   Unsupervised learning
  </a>
  <ul class="nav section-nav flex-column">
   <li class="toc-h3 nav-item toc-entry">
    <a class="reference internal nav-link" href="#computing-distances-between-samples">
     Computing distances between samples
    </a>
   </li>
   <li class="toc-h3 nav-item toc-entry">
    <a class="reference internal nav-link" href="#polar-ordination">
     Polar ordination
    </a>
   </li>
   <li class="toc-h3 nav-item toc-entry">
    <a class="reference internal nav-link" href="#interpreting-ordination-plots">
     Interpreting ordination plots
    </a>
    <ul class="nav section-nav flex-column">
     <li class="toc-h4 nav-item toc-entry">
      <a class="reference internal nav-link" href="#axis-order">
       Axis order
      </a>
     </li>
     <li class="toc-h4 nav-item toc-entry">
      <a class="reference internal nav-link" href="#uncorrelated-axes">
       Uncorrelated axes
      </a>
     </li>
     <li class="toc-h4 nav-item toc-entry">
      <a class="reference internal nav-link" href="#directionality-of-the-axes">
       Directionality of the axes
      </a>
     </li>
    </ul>
   </li>
   <li class="toc-h3 nav-item toc-entry">
    <a class="reference internal nav-link" href="#principal-coordinates-analysis-pcoa">
     Principal Coordinates Analysis (PCoA)
    </a>
   </li>
  </ul>
 </li>
 <li class="toc-h2 nav-item toc-entry">
  <a class="reference internal nav-link" href="#supervised-classification">
   Supervised classification
  </a>
  <ul class="nav section-nav flex-column">
   <li class="toc-h3 nav-item toc-entry">
    <a class="reference internal nav-link" href="#defining-a-classification-task">
     Defining a classification task
    </a>
   </li>
   <li class="toc-h3 nav-item toc-entry">
    <a class="reference internal nav-link" href="#training-data-test-data-and-cross-validation">
     Training data, test data, and cross-validation
    </a>
   </li>
   <li class="toc-h3 nav-item toc-entry">
    <a class="reference internal nav-link" href="#evaluating-a-binary-classifier">
     Evaluating a binary classifier
    </a>
   </li>
   <li class="toc-h3 nav-item toc-entry">
    <a class="reference internal nav-link" href="#naive-bayes-classifiers">
     Naive Bayes classifiers
    </a>
   </li>
   <li class="toc-h3 nav-item toc-entry">
    <a class="reference internal nav-link" href="#training-a-native-bayes-classifier">
     Training a Naive Bayes classifier
    </a>
   </li>
   <li class="toc-h3 nav-item toc-entry">
    <a class="reference internal nav-link" href="#applying-a-naive-bayes-classifier">
     Applying a Naive Bayes classifier
    </a>
   </li>
   <li class="toc-h3 nav-item toc-entry">
    <a class="reference internal nav-link" href="#evaluating-our-confidence-in-the-results-of-the-naive-bayes-classifier">
     Evaluating our confidence in the results of the Naive Bayes classifier
    </a>
   </li>
  </ul>
 </li>
 <li class="toc-h2 nav-item toc-entry">
  <a class="reference internal nav-link" href="#variations-on-the-input-to-machine-learning-algorithms">
   Variations on the input to machine learning algorithms
  </a>
 </li>
 <li class="toc-h2 nav-item toc-entry">
  <a class="reference internal nav-link" href="#list-of-works-cited">
   List of works cited
  </a>
 </li>
</ul>

            </nav>
        </div>
    </div>
</div>
    <div id="main-content" class="row">
        <div class="col-12 col-md-9 pl-md-3 pr-md-0">
        
              <div>
                
  <div class="section" id="machine-learning-in-bioinformatics">
<h1>Machine learning in bioinformatics<a class="headerlink" href="#machine-learning-in-bioinformatics" title="Permalink to this headline">¶</a></h1>
<p>In this chapter we’ll begin talking about machine learning algorithms. Machine learning algorithms are used in bioinformatics for tasks where the user would like an algorithm to assist in the identification of patterns in a complex dataset. As is typically the case in this book, we’ll work through implementing a few algorithms but these are not the implementations that you should use in practice. The code is written to be accessible for learning. <a class="reference external" href="http://scikit-learn.org/">scikit-learn</a> is a popular and well-documented Python library for machine learning which many bioinformatics researchers and software developers use in their work. If you’d like to start trying some of these tools out, scikit-learn is a great place to start.</p>
<div class="admonition warning">
<p class="admonition-title">Warning</p>
<p>Machine learning algorithms can easily be misused, either intentionally or unintentionally, to provide misleading results. This chapter will cover some guidelines for how to use these techniques, but it is only intended as a primer to introduce machine learning. It’s not a detailed discussion of how machine learning algorithms should and shouldn’t be used. If you want to start applying machine learning tools in your own research, I recommend moving from this chapter to the scikit-learn documentation, and their content on <a class="reference external" href="https://scikit-learn.org/stable/common_pitfalls.html">Common pitfalls and recommended practices</a>.</p>
</div>
<div class="section" id="the-feature-table">
<h2>The feature table<a class="headerlink" href="#the-feature-table" title="Permalink to this headline">¶</a></h2>
<p>Machine learning algorithms generally are provided with a table of <strong>samples</strong> and user-defined <strong>features</strong> of those samples. These data are typically represented in a matrix, where samples are the rows and features are the columns. This matrix is referred to as a <strong>feature table</strong>, and it is central to machine learning and many subfields of bioinformatics. The terms used here are purposefully general. Samples are intended to be any unit of study, and features are attributes of those samples. Sometimes <strong>labels</strong> or <strong>response variables</strong> will also be associated with the samples, in which case a different class of methods can be applied.</p>
<p>scikit-learn provides a few example datasets that can be used for learning. Let’s start by taking a look at one of them to get an idea of what input might look like in a machine learning task.</p>
<div class="section" id="the-iris-dataset">
<h3>The Iris dataset<a class="headerlink" href="#the-iris-dataset" title="Permalink to this headline">¶</a></h3>
<p>The <a class="reference external" href="https://scikit-learn.org/stable/datasets/toy_dataset.html#iris-plants-dataset">Iris dataset</a> is a classic example used in machine learning, originally published by RA Fisher <span id="id1">[<a class="reference internal" href="#id28">Fis36</a>]</span>. This feature table describes four features of 150 specimens of Iris, a genus of flowering plant, representing three species. The feature table follows:</p>
<div class="cell tag_hide-cell docutils container">
<div class="cell_input docutils container">
<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="c1"># This cell loads data from scikit-learn and organizes it into some structures that</span>
<span class="c1"># we&#39;ll use to conveniently view the data.</span>

<span class="kn">import</span> <span class="nn">sklearn.datasets</span>
<span class="kn">import</span> <span class="nn">pandas</span> <span class="k">as</span> <span class="nn">pd</span>

<span class="n">iris_dataset</span> <span class="o">=</span> <span class="n">sklearn</span><span class="o">.</span><span class="n">datasets</span><span class="o">.</span><span class="n">load_iris</span><span class="p">(</span><span class="n">as_frame</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
<span class="n">iris_feature_table</span> <span class="o">=</span> <span class="n">iris_dataset</span><span class="o">.</span><span class="n">frame</span><span class="o">.</span><span class="n">drop</span><span class="p">(</span><span class="s1">&#39;target&#39;</span><span class="p">,</span> <span class="n">axis</span><span class="o">=</span><span class="mi">1</span><span class="p">)</span>
<span class="n">iris_feature_table</span><span class="o">.</span><span class="n">index</span><span class="o">.</span><span class="n">name</span> <span class="o">=</span> <span class="s1">&#39;sample-id&#39;</span>
<span class="c1"># map target integers onto species names</span>
<span class="n">iris_labels</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">Series</span><span class="p">(</span><span class="n">iris_dataset</span><span class="o">.</span><span class="n">target_names</span><span class="p">[</span><span class="n">iris_dataset</span><span class="o">.</span><span class="n">target</span><span class="p">],</span> 
                        <span class="n">index</span><span class="o">=</span><span class="n">iris_dataset</span><span class="o">.</span><span class="n">target</span><span class="o">.</span><span class="n">index</span><span class="p">,</span> <span class="n">name</span><span class="o">=</span><span class="s1">&#39;species&#39;</span><span class="p">)</span><span class="o">.</span><span class="n">to_frame</span><span class="p">()</span>
<span class="n">iris_labels</span><span class="o">.</span><span class="n">index</span><span class="o">.</span><span class="n">name</span> <span class="o">=</span> <span class="s1">&#39;sample-id&#39;</span>
</pre></div>
</div>
</div>
</div>
<div class="cell docutils container">
<div class="cell_input docutils container">
<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">iris_feature_table</span>
</pre></div>
</div>
</div>
<div class="cell_output docutils container">
<div class="output text_html"><div>
<style scoped>
    .dataframe tbody tr th:only-of-type {
        vertical-align: middle;
    }

    .dataframe tbody tr th {
        vertical-align: top;
    }

    .dataframe thead th {
        text-align: right;
    }
</style>
<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th></th>
      <th>sepal length (cm)</th>
      <th>sepal width (cm)</th>
      <th>petal length (cm)</th>
      <th>petal width (cm)</th>
    </tr>
    <tr>
      <th>sample-id</th>
      <th></th>
      <th></th>
      <th></th>
      <th></th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>0</th>
      <td>5.1</td>
      <td>3.5</td>
      <td>1.4</td>
      <td>0.2</td>
    </tr>
    <tr>
      <th>1</th>
      <td>4.9</td>
      <td>3.0</td>
      <td>1.4</td>
      <td>0.2</td>
    </tr>
    <tr>
      <th>2</th>
      <td>4.7</td>
      <td>3.2</td>
      <td>1.3</td>
      <td>0.2</td>
    </tr>
    <tr>
      <th>3</th>
      <td>4.6</td>
      <td>3.1</td>
      <td>1.5</td>
      <td>0.2</td>
    </tr>
    <tr>
      <th>4</th>
      <td>5.0</td>
      <td>3.6</td>
      <td>1.4</td>
      <td>0.2</td>
    </tr>
    <tr>
      <th>...</th>
      <td>...</td>
      <td>...</td>
      <td>...</td>
      <td>...</td>
    </tr>
    <tr>
      <th>145</th>
      <td>6.7</td>
      <td>3.0</td>
      <td>5.2</td>
      <td>2.3</td>
    </tr>
    <tr>
      <th>146</th>
      <td>6.3</td>
      <td>2.5</td>
      <td>5.0</td>
      <td>1.9</td>
    </tr>
    <tr>
      <th>147</th>
      <td>6.5</td>
      <td>3.0</td>
      <td>5.2</td>
      <td>2.0</td>
    </tr>
    <tr>
      <th>148</th>
      <td>6.2</td>
      <td>3.4</td>
      <td>5.4</td>
      <td>2.3</td>
    </tr>
    <tr>
      <th>149</th>
      <td>5.9</td>
      <td>3.0</td>
      <td>5.1</td>
      <td>1.8</td>
    </tr>
  </tbody>
</table>
<p>150 rows × 4 columns</p>
</div></div></div>
</div>
<p>The rows in this table represent our samples - in this case specimens of Iris. The columns represent features, or attributes of the samples. Each <strong>sample vector</strong> (i.e., row) will include a unique identifier for the sample which we usually call the <em>sample id</em> (here these are simply integers), and values for each feature for that sample. Each <strong>feature vector</strong> (i.e., column) will similarly contain an identifier for the feature, or the <em>feature id</em>. These are often simplistic descriptions of the features, as they are in this example, but they don’t need to be (integers would work fine as feature ids). The feature vector then contains the values measured for that feature in each sample.</p>
<p>This feature table on its own can serve as an input dataset for unsupervised learning tasks, which we’ll cover first in this chapter. A goal of unsupervised learning might be to determine if there are groups of samples that are most similar to one another.</p>
<p>In addition to this feature table, the Iris dataset contains labels for each of the 150 samples indicating which species each sample belongs to:</p>
<div class="cell docutils container">
<div class="cell_input docutils container">
<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">iris_labels</span>
</pre></div>
</div>
</div>
<div class="cell_output docutils container">
<div class="output text_html"><div>
<style scoped>
    .dataframe tbody tr th:only-of-type {
        vertical-align: middle;
    }

    .dataframe tbody tr th {
        vertical-align: top;
    }

    .dataframe thead th {
        text-align: right;
    }
</style>
<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th></th>
      <th>species</th>
    </tr>
    <tr>
      <th>sample-id</th>
      <th></th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>0</th>
      <td>setosa</td>
    </tr>
    <tr>
      <th>1</th>
      <td>setosa</td>
    </tr>
    <tr>
      <th>2</th>
      <td>setosa</td>
    </tr>
    <tr>
      <th>3</th>
      <td>setosa</td>
    </tr>
    <tr>
      <th>4</th>
      <td>setosa</td>
    </tr>
    <tr>
      <th>...</th>
      <td>...</td>
    </tr>
    <tr>
      <th>145</th>
      <td>virginica</td>
    </tr>
    <tr>
      <th>146</th>
      <td>virginica</td>
    </tr>
    <tr>
      <th>147</th>
      <td>virginica</td>
    </tr>
    <tr>
      <th>148</th>
      <td>virginica</td>
    </tr>
    <tr>
      <th>149</th>
      <td>virginica</td>
    </tr>
  </tbody>
</table>
<p>150 rows × 1 columns</p>
</div></div></div>
</div>
<p>The sample ids in this label vector must be the same as the sample ids in the feature table. The feature table and the sample labels together can be used as input data for supervised learning tasks, which we’ll cover second in this chapter. A goal of supervised learning might be to develop a classifier that could report the species of an Iris if provided with values for its sepal length and width and its petal length and width (i.e., the features that the algorithm originally had access to).</p>
<p>There are three different labels, or classes, in this dataset:</p>
<div class="cell docutils container">
<div class="cell_input docutils container">
<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">iris_labels</span><span class="p">[</span><span class="s1">&#39;species&#39;</span><span class="p">]</span><span class="o">.</span><span class="n">unique</span><span class="p">()</span>
</pre></div>
</div>
</div>
<div class="cell_output docutils container">
<div class="output text_plain highlight-myst-ansi notranslate"><div class="highlight"><pre><span></span>array([&#39;setosa&#39;, &#39;versicolor&#39;, &#39;virginica&#39;], dtype=object)
</pre></div>
</div>
</div>
</div>
</div>
</div>
<div class="section" id="unsupervised-versus-supervised-learning-methods">
<h2>Unsupervised versus supervised learning methods<a class="headerlink" href="#unsupervised-versus-supervised-learning-methods" title="Permalink to this headline">¶</a></h2>
<p>Many machine learning methods are classified at a high level as either unsupervised or supervised learning methods.</p>
<p>In <strong>unsupervised learning</strong> we either don’t have or don’t use sample labels, and the algorithm therefore operates on a feature table alone. Typically the user is hoping to discover some structure in the data that can help them to understand which samples are most similar to each other based on their feature values. In this chapter we’ll introduce ordination as an unsupervised learning task. Ordination is very widely used in biology - you may have already encountered ordination plots (such as PCoA or NMDS plots) in some of your own work.</p>
<p>In <strong>supervised learning</strong>, on the other hand, sample labels are used in addition to a feature table. The sample labels can be discrete, as in the Iris dataset, or continuous, and that distinction defines whether we’re working on a classification or regression task, respectively. The goal of a supervised learning task is typically to have the computer develop a model that can accurately predict an unlabeled sample’s label from its feature values (for example, what species does an Iris specimen belong to, based on its sepal and petal length and width).</p>
</div>
<div class="section" id="machine-learning-methods-applied-to-microbial-sequence-data">
<h2>Machine learning methods applied to microbial sequence data<a class="headerlink" href="#machine-learning-methods-applied-to-microbial-sequence-data" title="Permalink to this headline">¶</a></h2>
<div class="cell tag_hide-cell docutils container">
<div class="cell_input docutils container">
<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="c1"># This cell performs some configuration for this notebook. It&#39;s hidden by</span>
<span class="c1"># default because it&#39;s not relevant to the content of this chapter. You&#39;ll</span>
<span class="c1"># occasionally notice that I hide this type of information so it&#39;s not </span>
<span class="c1"># distracting.</span>

<span class="o">%</span><span class="k">pylab</span> inline

<span class="kn">import</span> <span class="nn">pandas</span> <span class="k">as</span> <span class="nn">pd</span>
<span class="kn">import</span> <span class="nn">skbio</span>
<span class="kn">import</span> <span class="nn">numpy</span> <span class="k">as</span> <span class="nn">np</span>
<span class="kn">import</span> <span class="nn">itertools</span>
<span class="kn">import</span> <span class="nn">collections</span>
<span class="kn">import</span> <span class="nn">random</span>
</pre></div>
</div>
</div>
<div class="cell_output docutils container">
<div class="output stream highlight-myst-ansi notranslate"><div class="highlight"><pre><span></span>Populating the interactive namespace from numpy and matplotlib
</pre></div>
</div>
</div>
</div>
<p>In this chapter, we’ll work with 16S rRNA data <a class="reference internal" href="database-searching.html#load-qdr"><span class="std std-ref">as we did previously</span></a>. Specifically, we’ll load sequences from the Greengenes database and construct a feature table from them. We’ll use this feature table in an unsupervised learning task and a supervised learning task. We’ll also load labels for the sequences which we’ll primarily use in our supervised learning task, but which we’ll also use to aid in interpretation in our unsupervised learning task.</p>
<p>Our goal with these tasks will be to explore species-level taxonomy of a few microbial species based on sequence data. In our unsupervised learning task, we’ll determine if samples (i.e., sequences) coming from the same species appear to generally be more similar to each other than samples coming from different species. In our supervised learning task, we’ll determine if we can develop a classifier to predict microbial species from an unlabeled sequence.</p>
<p>Let’s start by loading five sequences from each of five specific microbial species from Greengenes.</p>
<div class="cell docutils container">
<div class="cell_input docutils container">
<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="kn">import</span> <span class="nn">collections</span>
<span class="kn">import</span> <span class="nn">qiime_default_reference</span> <span class="k">as</span> <span class="nn">qdr</span>
<span class="kn">import</span> <span class="nn">skbio</span>

<span class="k">def</span> <span class="nf">load_annotated_sequences</span><span class="p">(</span><span class="n">taxa_of_interest</span><span class="p">,</span> <span class="n">class_size</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">sequence_length</span><span class="o">=</span><span class="mi">500</span><span class="p">,</span> 
                             <span class="n">verbose</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">ids_to_exclude</span><span class="o">=</span><span class="kc">None</span><span class="p">):</span>
    
    <span class="c1"># Load the taxonomic data</span>
    <span class="n">result</span> <span class="o">=</span> <span class="p">{}</span>
    <span class="n">SequenceRecord</span> <span class="o">=</span> <span class="n">collections</span><span class="o">.</span><span class="n">namedtuple</span><span class="p">(</span><span class="n">typename</span><span class="o">=</span><span class="s1">&#39;SequenceRecord&#39;</span><span class="p">,</span>
                                            <span class="n">field_names</span><span class="o">=</span><span class="p">[</span><span class="s1">&#39;identifier&#39;</span><span class="p">,</span> <span class="s1">&#39;split_taxonomy&#39;</span><span class="p">,</span> <span class="s1">&#39;taxonomy&#39;</span><span class="p">,</span> <span class="s1">&#39;sequence&#39;</span><span class="p">])</span>
    
    <span class="n">taxon_to_sequence_records</span> <span class="o">=</span> <span class="p">{</span><span class="n">t</span><span class="p">:</span> <span class="nb">list</span><span class="p">()</span> <span class="k">for</span> <span class="n">t</span> <span class="ow">in</span> <span class="n">taxa_of_interest</span><span class="p">}</span>        
    
    <span class="n">id_to_taxonomy_record</span> <span class="o">=</span> <span class="p">{}</span>
    <span class="k">for</span> <span class="n">line</span> <span class="ow">in</span> <span class="nb">open</span><span class="p">(</span><span class="n">qdr</span><span class="o">.</span><span class="n">get_reference_taxonomy</span><span class="p">()):</span>
        <span class="n">identifier</span><span class="p">,</span> <span class="n">taxonomy</span> <span class="o">=</span> <span class="n">line</span><span class="o">.</span><span class="n">strip</span><span class="p">()</span><span class="o">.</span><span class="n">split</span><span class="p">(</span><span class="s1">&#39;</span><span class="se">\t</span><span class="s1">&#39;</span><span class="p">)</span>
        <span class="n">id_to_taxonomy_record</span><span class="p">[</span><span class="n">identifier</span><span class="p">]</span> <span class="o">=</span> <span class="n">taxonomy</span>
    
    <span class="k">for</span> <span class="n">seq</span> <span class="ow">in</span> <span class="n">skbio</span><span class="o">.</span><span class="n">io</span><span class="o">.</span><span class="n">read</span><span class="p">(</span><span class="n">qdr</span><span class="o">.</span><span class="n">get_reference_sequences</span><span class="p">(),</span> <span class="nb">format</span><span class="o">=</span><span class="s1">&#39;fasta&#39;</span><span class="p">,</span> 
                             <span class="n">constructor</span><span class="o">=</span><span class="n">skbio</span><span class="o">.</span><span class="n">DNA</span><span class="p">):</span>
        <span class="n">identifier</span> <span class="o">=</span> <span class="n">seq</span><span class="o">.</span><span class="n">metadata</span><span class="p">[</span><span class="s1">&#39;id&#39;</span><span class="p">]</span>
        <span class="k">if</span> <span class="n">ids_to_exclude</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> <span class="ow">and</span> <span class="n">identifier</span> <span class="ow">in</span> <span class="n">ids_to_exclude</span><span class="p">:</span>
            <span class="c1"># if this id was tagged to not be included in the result, </span>
            <span class="c1"># move on to the next record</span>
            <span class="k">continue</span>
        
        <span class="n">tax</span> <span class="o">=</span> <span class="n">id_to_taxonomy_record</span><span class="p">[</span><span class="n">identifier</span><span class="p">]</span>
        <span class="n">split_taxonomy</span> <span class="o">=</span> <span class="p">[</span><span class="n">e</span><span class="o">.</span><span class="n">strip</span><span class="p">()</span> <span class="k">for</span> <span class="n">e</span> <span class="ow">in</span> <span class="n">tax</span><span class="o">.</span><span class="n">split</span><span class="p">(</span><span class="s1">&#39;;&#39;</span><span class="p">)]</span>
        <span class="n">taxonomy</span> <span class="o">=</span> <span class="s1">&#39;;&#39;</span><span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="n">split_taxonomy</span><span class="p">)</span>
        <span class="k">if</span> <span class="n">taxonomy</span> <span class="ow">not</span> <span class="ow">in</span> <span class="n">taxon_to_sequence_records</span><span class="p">:</span>
            <span class="c1"># if this is not one of the taxa that we&#39;re interested in, </span>
            <span class="c1"># move on to the next record. </span>
            <span class="k">continue</span>
        
        <span class="k">if</span> <span class="n">seq</span><span class="o">.</span><span class="n">has_degenerates</span><span class="p">():</span>
            <span class="c1"># for the purpose of this exercise we&#39;ll skip records </span>
            <span class="c1"># that have non-ACGT characters. if degenerate characters</span>
            <span class="c1"># are present, move on to the next record</span>
            <span class="k">continue</span>
            
        <span class="k">if</span> <span class="n">sequence_length</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
            <span class="n">sequence</span> <span class="o">=</span> <span class="n">seq</span><span class="p">[:</span><span class="n">sequence_length</span><span class="p">]</span>
        <span class="k">else</span><span class="p">:</span>
            <span class="n">sequence</span> <span class="o">=</span> <span class="n">seq</span>

        <span class="n">sr</span> <span class="o">=</span> <span class="n">SequenceRecord</span><span class="p">(</span><span class="n">identifier</span><span class="o">=</span><span class="n">identifier</span><span class="p">,</span>
                            <span class="n">split_taxonomy</span><span class="o">=</span><span class="n">split_taxonomy</span><span class="p">,</span>
                            <span class="n">taxonomy</span><span class="o">=</span><span class="n">taxonomy</span><span class="p">,</span>
                            <span class="n">sequence</span><span class="o">=</span><span class="n">sequence</span><span class="p">)</span>
        <span class="n">taxon_to_sequence_records</span><span class="p">[</span><span class="n">taxonomy</span><span class="p">]</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">sr</span><span class="p">)</span>
        
    <span class="k">if</span> <span class="n">verbose</span><span class="p">:</span>
        <span class="k">for</span> <span class="n">taxon</span><span class="p">,</span> <span class="n">srs</span> <span class="ow">in</span> <span class="n">taxon_to_sequence_records</span><span class="o">.</span><span class="n">items</span><span class="p">():</span>
            <span class="nb">print</span><span class="p">(</span><span class="s2">&quot;</span><span class="si">%d</span><span class="s2"> sequences were identified for taxon </span><span class="si">%s</span><span class="s2">.&quot;</span> <span class="o">%</span> <span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="n">srs</span><span class="p">),</span> <span class="n">taxon</span><span class="p">))</span>
    
    <span class="k">if</span> <span class="n">class_size</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
        <span class="n">result</span> <span class="o">=</span> <span class="p">{</span><span class="n">sr</span><span class="o">.</span><span class="n">identifier</span><span class="p">:</span> <span class="n">sr</span> <span class="k">for</span> <span class="n">srs</span> <span class="ow">in</span> <span class="n">taxon_to_sequence_records</span><span class="o">.</span><span class="n">values</span><span class="p">()</span> <span class="k">for</span> <span class="n">sr</span> <span class="ow">in</span> <span class="n">srs</span><span class="p">}</span>
    <span class="k">else</span><span class="p">:</span>
        <span class="n">result</span> <span class="o">=</span> <span class="p">{}</span>
        <span class="k">for</span> <span class="n">taxon</span><span class="p">,</span> <span class="n">srs</span> <span class="ow">in</span> <span class="n">taxon_to_sequence_records</span><span class="o">.</span><span class="n">items</span><span class="p">():</span>
            <span class="k">if</span> <span class="n">class_size</span> <span class="o">&gt;</span> <span class="nb">len</span><span class="p">(</span><span class="n">srs</span><span class="p">):</span>
                <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">&quot;Class size (</span><span class="si">%d</span><span class="s2">) too large for taxon </span><span class="si">%s</span><span class="s2">, which has only </span><span class="si">%d</span><span class="s2"> non-degenerate sequences.&quot;</span> <span class="o">%</span> 
                                 <span class="p">(</span><span class="n">class_size</span><span class="p">,</span> <span class="n">taxon</span><span class="p">,</span> <span class="nb">len</span><span class="p">(</span><span class="n">srs</span><span class="p">)))</span>
            <span class="n">sampled_sequence_records</span> <span class="o">=</span> <span class="n">random</span><span class="o">.</span><span class="n">sample</span><span class="p">(</span><span class="n">srs</span><span class="p">,</span> <span class="n">k</span><span class="o">=</span><span class="n">class_size</span><span class="p">)</span>
            <span class="n">result</span><span class="o">.</span><span class="n">update</span><span class="p">({</span><span class="n">sr</span><span class="o">.</span><span class="n">identifier</span><span class="p">:</span> <span class="n">sr</span> <span class="k">for</span> <span class="n">sr</span> <span class="ow">in</span> <span class="n">sampled_sequence_records</span><span class="p">})</span>

    <span class="k">return</span> <span class="n">result</span>
</pre></div>
</div>
</div>
</div>
<div class="cell docutils container" id="ml-define-sequences-per-speciesm">
<div class="cell_input docutils container">
<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">taxa_of_interest</span> <span class="o">=</span> <span class="p">{</span>
    <span class="s1">&#39;k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o__Bacteroidales;f__Prevotellaceae;g__Prevotella;s__stercorea&#39;</span><span class="p">,</span>
    <span class="s1">&#39;k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o__Bacteroidales;f__Prevotellaceae;g__Prevotella;s__copri&#39;</span><span class="p">,</span>
    <span class="s1">&#39;k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o__Bacteroidales;f__Prevotellaceae;g__Prevotella;s__melaninogenica&#39;</span><span class="p">,</span>
    <span class="s1">&#39;k__Bacteria;p__Bacteroidetes;c__Flavobacteriia;o__Flavobacteriales;f__Flavobacteriaceae;g__Flavobacterium;s__succinicans&#39;</span><span class="p">,</span>
    <span class="s1">&#39;k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Actinomycetales;f__Propionibacteriaceae;g__Propionibacterium;s__acnes&#39;</span><span class="p">,</span>
    <span class="s1">&#39;k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Pseudomonadales;f__Pseudomonadaceae;g__Pseudomonas;s__veronii&#39;</span><span class="p">,</span>
    <span class="s1">&#39;k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Pseudomonadales;f__Pseudomonadaceae;g__Pseudomonas;s__viridiflava&#39;</span>
<span class="p">}</span>
<span class="n">sequences_per_taxon</span> <span class="o">=</span> <span class="mi">5</span>

<span class="n">seq_data</span> <span class="o">=</span> <span class="n">load_annotated_sequences</span><span class="p">(</span><span class="n">taxa_of_interest</span><span class="p">,</span> <span class="n">class_size</span><span class="o">=</span><span class="n">sequences_per_taxon</span><span class="p">)</span>
</pre></div>
</div>
</div>
<div class="cell_output docutils container">
<div class="output stream highlight-myst-ansi notranslate"><div class="highlight"><pre><span></span>15 sequences were identified for taxon k__Bacteria;p__Bacteroidetes;c__Flavobacteriia;o__Flavobacteriales;f__Flavobacteriaceae;g__Flavobacterium;s__succinicans.
127 sequences were identified for taxon k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Actinomycetales;f__Propionibacteriaceae;g__Propionibacterium;s__acnes.
26 sequences were identified for taxon k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o__Bacteroidales;f__Prevotellaceae;g__Prevotella;s__melaninogenica.
35 sequences were identified for taxon k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o__Bacteroidales;f__Prevotellaceae;g__Prevotella;s__stercorea.
17 sequences were identified for taxon k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Pseudomonadales;f__Pseudomonadaceae;g__Pseudomonas;s__viridiflava.
24 sequences were identified for taxon k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Pseudomonadales;f__Pseudomonadaceae;g__Pseudomonas;s__veronii.
121 sequences were identified for taxon k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o__Bacteroidales;f__Prevotellaceae;g__Prevotella;s__copri.
</pre></div>
</div>
</div>
</div>
<p>We can look at a few randomly selected records from the data that was just compiled as follows. For each, we have a unique identifier, the source species for the sequence record, and a 16S rRNA sequence.</p>
<div class="cell docutils container">
<div class="cell_input docutils container">
<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="k">for</span> <span class="n">sr</span> <span class="ow">in</span> <span class="n">random</span><span class="o">.</span><span class="n">sample</span><span class="p">(</span><span class="nb">list</span><span class="p">(</span><span class="n">seq_data</span><span class="o">.</span><span class="n">values</span><span class="p">()),</span> <span class="mi">3</span><span class="p">):</span>
    <span class="nb">print</span><span class="p">(</span><span class="n">sr</span><span class="o">.</span><span class="n">identifier</span><span class="p">)</span>
    <span class="nb">print</span><span class="p">(</span><span class="n">sr</span><span class="o">.</span><span class="n">taxonomy</span><span class="p">)</span>
    <span class="nb">print</span><span class="p">(</span><span class="n">sr</span><span class="o">.</span><span class="n">sequence</span><span class="p">)</span>
    <span class="nb">print</span><span class="p">(</span><span class="s1">&#39;🦠&#39;</span><span class="p">)</span>
</pre></div>
</div>
</div>
<div class="cell_output docutils container">
<div class="output stream highlight-myst-ansi notranslate"><div class="highlight"><pre><span></span>970921
k__Bacteria;p__Bacteroidetes;c__Flavobacteriia;o__Flavobacteriales;f__Flavobacteriaceae;g__Flavobacterium;s__succinicans
GATGAACGCTAGCGGCACGCTTAACACATGCAAGTCGAGGGGTATAAGTCTTCGGATTTAGAGACCGGCGCACGGGTGCCTAACCCGTATGCTATCTACCTTTTACAGAGGGATAGCCCATACAAATTTGGATTAATACCTCATAACATAGCAATCTCGCATGACATCGCTATTAAAGTCACGACGGTCAAAGATGAGCATGCCTCCCATTAGCTACTTGGTAACGTAACGGCTTACCAAGGGTACTATGGGTAGGGGTCCTGAAAGGGAGATCCCCCACACTGGTACTGAGACCCCGACCATACTCCCACGGGAGGCAGAATCGAGGAATATTGGACAATGGGCACTAGCCTGATCCAGCCATGCCGCGTGCACGATGACGGTCCTATGGATTGTAAACTGCTTTTATACTAGAACACACACTCCTTCGAGAAGGAATTTGACTGTATCGTAACAATAAGGATCGGCTAACTCCGTGCCAGCAGCCGCTGTAATACTGA
🦠
299830
k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o__Bacteroidales;f__Prevotellaceae;g__Prevotella;s__copri
TAGAGTTTGATCCTGGCTCAGGATGAACGCTAGCTACAGGCTTAACACATGCAAGTCGAGGGGCAGCATGACGGAAGCTTGCTTTCGTTGATGGCGACCGGCGCACGGGTGAGTAACGCGTATCCAACCTGCCCTTGTCCATCGGATAACCCGTCGAAAGGCGGCCTAACACGATATGCGGTTCACCGCAGGCATCTAACGTGAACGAAATGTGAAGGAGAAGGATGGGGATGCGTCTGATTAGCTTGTTGGTGGGGTAACGGCCCACCAAGGCGACGATCAGTAGGGGTTCTGAGAGGAAGGTCCCCCACATTGGAACTGAGACACGGTCCAAACTCCTACGGGAGGCAGCAGTGAGGAATATTGGTCAATGGGCGAGAGCCTGAACCAGCCAAGTAGCGTGCAGGATGACGGCCCTATGGGTTGTAAACTGCTTTTATACGGGGATAAAGTTGGGGACGTGTCCCCATTTGTAGGTACCGTATGAATAAGGACCGGCT
🦠
4321402
k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o__Bacteroidales;f__Prevotellaceae;g__Prevotella;s__melaninogenica
CAGGAAGAACGCTAGCCCCAGGCTTCACACACGCAAGTTCGCGGGGAAAACGACATTCGAAGTCTCGCTTCGAACGGGCGTTCGACCGGCGCACGGGAGAGTCACGCGTCTCCAACCCGCCTCCGACTAAGGGATAACCCGGCGAAAGTTCGGACTAATACCTTACGAGGTTTTCTCGCAGACATCTAATCGAAAACGAAAGAATTATCGGTCAGTCGATGGGGATCGCGTCTGATTAGCTTCGTTGGCGGGGTAACGGCCCACCAAGGCAACGATCAGTAGGGGTTTCTGAGAGGAAGGTCCCCCACATTCGGAACTGAGACACGGTCCAAACTCCTACGGGAGGCAGCAGTGAGGAATATTGGTCAATGGACGGAAGTTCTGAACCAGCCCAAGTAGCGTGCAGGATGACGGCCCTATGGGTTCGTAAACTGCTTTTGTATGGGGATAAAGTTTAGGGACGTGTCCCTATTTTGCAGGTACCATACGAATAAGGACCG
🦠
</pre></div>
</div>
</div>
</div>
<p>The first thing we need to generate from these data is our feature table, which raises the question of which features we want our machine learning algorithms to work with. In the last chapter, we discussed k-mers, which are length-k stretches of adjacent characters in a sequence. Those k-mers helped us to identify relevant sequences in our database searching, so they may be useful here as well. However, we don’t necessarily know how long our k-mers should be (i.e., what value <code class="docutils literal notranslate"><span class="pre">k</span></code> should be set to). The longer our k-mers, the more likely they are to be specific to certain taxa, which is helpful for machine learning tasks. If they get too long, though, it becomes less likely that we’ll observe those k-mers in other sequences, because the longer a k-mer sequence is, the more likely we are to see variation across closely related organisms. This is a problem for machine learning tasks, because we need to identify features that are shared among related samples.</p>
<p>Let’s set <span class="math notranslate nohighlight">\(k=4\)</span>, and use k-mers as the features that will define our sequence records for the examples in this chapter. I chose this value of <span class="math notranslate nohighlight">\(k\)</span> for our work here based on experimentation with multiple Greengenes subsamples. The features could be based on different values of <span class="math notranslate nohighlight">\(k\)</span>, or other features of sequences that you identify. If you have ideas about other values that you could compute from these sequences, come back here and try it out after you’ve finished reading this chapter.</p>
<div class="cell docutils container" id="ml-define-k">
<div class="cell_input docutils container">
<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">k</span> <span class="o">=</span> <span class="mi">4</span>
</pre></div>
</div>
</div>
</div>
<div class="cell docutils container">
<div class="cell_input docutils container">
<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="k">def</span> <span class="nf">feature_table_from_sequence_records</span><span class="p">(</span><span class="n">sequence_records</span><span class="p">,</span> <span class="n">k</span><span class="p">):</span>
    <span class="n">kmer_frequencies</span> <span class="o">=</span> <span class="p">{</span><span class="n">id_</span> <span class="p">:</span> <span class="n">sr</span><span class="o">.</span><span class="n">sequence</span><span class="o">.</span><span class="n">kmer_frequencies</span><span class="p">(</span><span class="n">k</span><span class="o">=</span><span class="n">k</span><span class="p">)</span> <span class="k">for</span> <span class="n">id_</span><span class="p">,</span> <span class="n">sr</span> <span class="ow">in</span> <span class="n">sequence_records</span><span class="o">.</span><span class="n">items</span><span class="p">()}</span>
    <span class="n">result</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">(</span><span class="n">kmer_frequencies</span><span class="p">)</span><span class="o">.</span><span class="n">fillna</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span><span class="o">.</span><span class="n">astype</span><span class="p">(</span><span class="nb">int</span><span class="p">)</span><span class="o">.</span><span class="n">T</span>
    <span class="n">result</span><span class="o">.</span><span class="n">index</span><span class="o">.</span><span class="n">name</span> <span class="o">=</span> <span class="s1">&#39;id&#39;</span>
    <span class="k">return</span> <span class="n">result</span>
</pre></div>
</div>
</div>
</div>
<p>After extracting all k-mers from the sequences and putting them in a table where the rows are our sequences (indexed by the unique sequence identifiers), the columns represent unique k-mers (labeled by the k-mer itself), and the values are the number of times each k-mer is observed in each sequence, we end up with our feature table for unsupervised and supervised learning.</p>
<div class="cell docutils container">
<div class="cell_input docutils container">
<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">sequence_feature_table</span> <span class="o">=</span> <span class="n">feature_table_from_sequence_records</span><span class="p">(</span><span class="n">seq_data</span><span class="p">,</span> <span class="n">k</span><span class="p">)</span>
<span class="n">sequence_feature_table</span><span class="p">[:</span><span class="mi">12</span><span class="p">]</span>
</pre></div>
</div>
</div>
<div class="cell_output docutils container">
<div class="output text_html"><div>
<style scoped>
    .dataframe tbody tr th:only-of-type {
        vertical-align: middle;
    }

    .dataframe tbody tr th {
        vertical-align: top;
    }

    .dataframe thead th {
        text-align: right;
    }
</style>
<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th></th>
      <th>GATG</th>
      <th>ATGA</th>
      <th>TGAA</th>
      <th>GAAC</th>
      <th>AACG</th>
      <th>ACGC</th>
      <th>CGCT</th>
      <th>GCTA</th>
      <th>CTAG</th>
      <th>TAGC</th>
      <th>...</th>
      <th>CCCT</th>
      <th>GCTC</th>
      <th>GCGA</th>
      <th>TTGC</th>
      <th>CGCC</th>
      <th>TGTG</th>
      <th>CTCT</th>
      <th>TTCC</th>
      <th>GTTC</th>
      <th>ATTC</th>
    </tr>
    <tr>
      <th>id</th>
      <th></th>
      <th></th>
      <th></th>
      <th></th>
      <th></th>
      <th></th>
      <th></th>
      <th></th>
      <th></th>
      <th></th>
      <th></th>
      <th></th>
      <th></th>
      <th></th>
      <th></th>
      <th></th>
      <th></th>
      <th></th>
      <th></th>
      <th></th>
      <th></th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>1020921</th>
      <td>4</td>
      <td>3</td>
      <td>2</td>
      <td>2</td>
      <td>4</td>
      <td>2</td>
      <td>1</td>
      <td>3</td>
      <td>1</td>
      <td>3</td>
      <td>...</td>
      <td>0</td>
      <td>0</td>
      <td>0</td>
      <td>0</td>
      <td>0</td>
      <td>0</td>
      <td>0</td>
      <td>0</td>
      <td>0</td>
      <td>0</td>
    </tr>
    <tr>
      <th>1111241</th>
      <td>5</td>
      <td>2</td>
      <td>2</td>
      <td>1</td>
      <td>3</td>
      <td>1</td>
      <td>0</td>
      <td>3</td>
      <td>1</td>
      <td>2</td>
      <td>...</td>
      <td>0</td>
      <td>0</td>
      <td>0</td>
      <td>0</td>
      <td>0</td>
      <td>0</td>
      <td>0</td>
      <td>0</td>
      <td>0</td>
      <td>0</td>
    </tr>
    <tr>
      <th>241971</th>
      <td>6</td>
      <td>5</td>
      <td>4</td>
      <td>3</td>
      <td>4</td>
      <td>2</td>
      <td>1</td>
      <td>4</td>
      <td>1</td>
      <td>5</td>
      <td>...</td>
      <td>0</td>
      <td>0</td>
      <td>0</td>
      <td>0</td>
      <td>0</td>
      <td>0</td>
      <td>0</td>
      <td>0</td>
      <td>0</td>
      <td>0</td>
    </tr>
    <tr>
      <th>970921</th>
      <td>3</td>
      <td>4</td>
      <td>2</td>
      <td>2</td>
      <td>3</td>
      <td>2</td>
      <td>4</td>
      <td>5</td>
      <td>3</td>
      <td>5</td>
      <td>...</td>
      <td>0</td>
      <td>0</td>
      <td>0</td>
      <td>0</td>
      <td>0</td>
      <td>0</td>
      <td>0</td>
      <td>0</td>
      <td>0</td>
      <td>0</td>
    </tr>
    <tr>
      <th>867450</th>
      <td>5</td>
      <td>3</td>
      <td>2</td>
      <td>1</td>
      <td>4</td>
      <td>2</td>
      <td>1</td>
      <td>4</td>
      <td>2</td>
      <td>3</td>
      <td>...</td>
      <td>0</td>
      <td>0</td>
      <td>0</td>
      <td>0</td>
      <td>0</td>
      <td>0</td>
      <td>0</td>
      <td>0</td>
      <td>0</td>
      <td>0</td>
    </tr>
    <tr>
      <th>4226754</th>
      <td>3</td>
      <td>1</td>
      <td>1</td>
      <td>3</td>
      <td>4</td>
      <td>2</td>
      <td>2</td>
      <td>2</td>
      <td>0</td>
      <td>1</td>
      <td>...</td>
      <td>2</td>
      <td>2</td>
      <td>2</td>
      <td>1</td>
      <td>2</td>
      <td>1</td>
      <td>0</td>
      <td>0</td>
      <td>0</td>
      <td>0</td>
    </tr>
    <tr>
      <th>403853</th>
      <td>3</td>
      <td>1</td>
      <td>0</td>
      <td>1</td>
      <td>2</td>
      <td>1</td>
      <td>1</td>
      <td>2</td>
      <td>0</td>
      <td>1</td>
      <td>...</td>
      <td>2</td>
      <td>3</td>
      <td>2</td>
      <td>1</td>
      <td>2</td>
      <td>2</td>
      <td>0</td>
      <td>0</td>
      <td>0</td>
      <td>0</td>
    </tr>
    <tr>
      <th>862869</th>
      <td>2</td>
      <td>1</td>
      <td>1</td>
      <td>4</td>
      <td>4</td>
      <td>2</td>
      <td>4</td>
      <td>2</td>
      <td>0</td>
      <td>1</td>
      <td>...</td>
      <td>2</td>
      <td>2</td>
      <td>2</td>
      <td>3</td>
      <td>4</td>
      <td>5</td>
      <td>0</td>
      <td>0</td>
      <td>0</td>
      <td>0</td>
    </tr>
    <tr>
      <th>4121939</th>
      <td>3</td>
      <td>1</td>
      <td>0</td>
      <td>3</td>
      <td>4</td>
      <td>2</td>
      <td>2</td>
      <td>2</td>
      <td>0</td>
      <td>1</td>
      <td>...</td>
      <td>2</td>
      <td>2</td>
      <td>2</td>
      <td>1</td>
      <td>2</td>
      <td>2</td>
      <td>0</td>
      <td>0</td>
      <td>0</td>
      <td>0</td>
    </tr>
    <tr>
      <th>1028571</th>
      <td>3</td>
      <td>1</td>
      <td>0</td>
      <td>4</td>
      <td>3</td>
      <td>3</td>
      <td>2</td>
      <td>1</td>
      <td>0</td>
      <td>1</td>
      <td>...</td>
      <td>3</td>
      <td>1</td>
      <td>2</td>
      <td>1</td>
      <td>4</td>
      <td>2</td>
      <td>0</td>
      <td>0</td>
      <td>0</td>
      <td>0</td>
    </tr>
    <tr>
      <th>535359</th>
      <td>4</td>
      <td>2</td>
      <td>2</td>
      <td>2</td>
      <td>3</td>
      <td>1</td>
      <td>0</td>
      <td>1</td>
      <td>0</td>
      <td>2</td>
      <td>...</td>
      <td>1</td>
      <td>0</td>
      <td>2</td>
      <td>2</td>
      <td>0</td>
      <td>2</td>
      <td>1</td>
      <td>2</td>
      <td>1</td>
      <td>1</td>
    </tr>
    <tr>
      <th>981912</th>
      <td>4</td>
      <td>1</td>
      <td>3</td>
      <td>3</td>
      <td>5</td>
      <td>2</td>
      <td>1</td>
      <td>3</td>
      <td>1</td>
      <td>3</td>
      <td>...</td>
      <td>1</td>
      <td>0</td>
      <td>0</td>
      <td>3</td>
      <td>0</td>
      <td>1</td>
      <td>0</td>
      <td>2</td>
      <td>1</td>
      <td>1</td>
    </tr>
  </tbody>
</table>
<p>12 rows × 256 columns</p>
</div></div></div>
</div>
<p>As mentioned above, supervised learning tasks also require labels. In this example, the labels will be the species that each sequence was identified in. We’ll next compile our sample label vector.</p>
<div class="cell docutils container">
<div class="cell_input docutils container">
<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="k">def</span> <span class="nf">feature_labels_from_sequence_records</span><span class="p">(</span><span class="n">sequence_records</span><span class="p">):</span>
    <span class="n">result</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">({</span><span class="n">id_</span><span class="p">:</span><span class="n">sr</span><span class="o">.</span><span class="n">split_taxonomy</span> <span class="k">for</span> <span class="n">id_</span><span class="p">,</span> <span class="n">sr</span> <span class="ow">in</span> <span class="n">sequence_records</span><span class="o">.</span><span class="n">items</span><span class="p">()})</span><span class="o">.</span><span class="n">T</span>
    <span class="n">result</span><span class="o">.</span><span class="n">columns</span> <span class="o">=</span> <span class="p">[</span><span class="s1">&#39;domain&#39;</span><span class="p">,</span> <span class="s1">&#39;phylum&#39;</span><span class="p">,</span> <span class="s1">&#39;class&#39;</span><span class="p">,</span> <span class="s1">&#39;order&#39;</span><span class="p">,</span> <span class="s1">&#39;family&#39;</span><span class="p">,</span> <span class="s1">&#39;genus&#39;</span><span class="p">,</span> <span class="s1">&#39;species&#39;</span><span class="p">]</span>
    <span class="n">result</span><span class="o">.</span><span class="n">index</span><span class="o">.</span><span class="n">name</span> <span class="o">=</span> <span class="s1">&#39;id&#39;</span>
    <span class="n">legend_entries</span> <span class="o">=</span> <span class="p">[]</span>
    <span class="k">for</span> <span class="n">_</span><span class="p">,</span> <span class="p">(</span><span class="n">g</span><span class="p">,</span> <span class="n">s</span><span class="p">,</span> <span class="n">p</span><span class="p">)</span> <span class="ow">in</span> <span class="n">result</span><span class="p">[[</span><span class="s1">&#39;genus&#39;</span><span class="p">,</span> <span class="s1">&#39;species&#39;</span><span class="p">,</span> <span class="s1">&#39;phylum&#39;</span><span class="p">]]</span><span class="o">.</span><span class="n">iterrows</span><span class="p">():</span>
        <span class="n">legend_entries</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="s1">&#39;</span><span class="si">%s</span><span class="s1"> </span><span class="si">%s</span><span class="s1"> (</span><span class="si">%s</span><span class="s1">)&#39;</span> <span class="o">%</span> <span class="p">(</span><span class="n">g</span><span class="p">[</span><span class="mi">3</span><span class="p">:],</span> <span class="n">s</span><span class="p">[</span><span class="mi">3</span><span class="p">:],</span> <span class="n">p</span><span class="p">[</span><span class="mi">3</span><span class="p">:]))</span>
    <span class="n">result</span><span class="p">[</span><span class="s1">&#39;legend entry&#39;</span><span class="p">]</span> <span class="o">=</span> <span class="n">legend_entries</span>
    <span class="k">return</span> <span class="n">result</span>
</pre></div>
</div>
</div>
</div>
<div class="cell docutils container">
<div class="cell_input docutils container">
<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">sequence_labels</span> <span class="o">=</span> <span class="n">feature_labels_from_sequence_records</span><span class="p">(</span><span class="n">seq_data</span><span class="p">)</span>
<span class="n">sequence_labels</span><span class="p">[:</span><span class="mi">12</span><span class="p">]</span>
</pre></div>
</div>
</div>
<div class="cell_output docutils container">
<div class="output text_html"><div>
<style scoped>
    .dataframe tbody tr th:only-of-type {
        vertical-align: middle;
    }

    .dataframe tbody tr th {
        vertical-align: top;
    }

    .dataframe thead th {
        text-align: right;
    }
</style>
<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th></th>
      <th>domain</th>
      <th>phylum</th>
      <th>class</th>
      <th>order</th>
      <th>family</th>
      <th>genus</th>
      <th>species</th>
      <th>legend entry</th>
    </tr>
    <tr>
      <th>id</th>
      <th></th>
      <th></th>
      <th></th>
      <th></th>
      <th></th>
      <th></th>
      <th></th>
      <th></th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>1020921</th>
      <td>k__Bacteria</td>
      <td>p__Bacteroidetes</td>
      <td>c__Flavobacteriia</td>
      <td>o__Flavobacteriales</td>
      <td>f__Flavobacteriaceae</td>
      <td>g__Flavobacterium</td>
      <td>s__succinicans</td>
      <td>Flavobacterium succinicans (Bacteroidetes)</td>
    </tr>
    <tr>
      <th>1111241</th>
      <td>k__Bacteria</td>
      <td>p__Bacteroidetes</td>
      <td>c__Flavobacteriia</td>
      <td>o__Flavobacteriales</td>
      <td>f__Flavobacteriaceae</td>
      <td>g__Flavobacterium</td>
      <td>s__succinicans</td>
      <td>Flavobacterium succinicans (Bacteroidetes)</td>
    </tr>
    <tr>
      <th>241971</th>
      <td>k__Bacteria</td>
      <td>p__Bacteroidetes</td>
      <td>c__Flavobacteriia</td>
      <td>o__Flavobacteriales</td>
      <td>f__Flavobacteriaceae</td>
      <td>g__Flavobacterium</td>
      <td>s__succinicans</td>
      <td>Flavobacterium succinicans (Bacteroidetes)</td>
    </tr>
    <tr>
      <th>970921</th>
      <td>k__Bacteria</td>
      <td>p__Bacteroidetes</td>
      <td>c__Flavobacteriia</td>
      <td>o__Flavobacteriales</td>
      <td>f__Flavobacteriaceae</td>
      <td>g__Flavobacterium</td>
      <td>s__succinicans</td>
      <td>Flavobacterium succinicans (Bacteroidetes)</td>
    </tr>
    <tr>
      <th>867450</th>
      <td>k__Bacteria</td>
      <td>p__Bacteroidetes</td>
      <td>c__Flavobacteriia</td>
      <td>o__Flavobacteriales</td>
      <td>f__Flavobacteriaceae</td>
      <td>g__Flavobacterium</td>
      <td>s__succinicans</td>
      <td>Flavobacterium succinicans (Bacteroidetes)</td>
    </tr>
    <tr>
      <th>4226754</th>
      <td>k__Bacteria</td>
      <td>p__Actinobacteria</td>
      <td>c__Actinobacteria</td>
      <td>o__Actinomycetales</td>
      <td>f__Propionibacteriaceae</td>
      <td>g__Propionibacterium</td>
      <td>s__acnes</td>
      <td>Propionibacterium acnes (Actinobacteria)</td>
    </tr>
    <tr>
      <th>403853</th>
      <td>k__Bacteria</td>
      <td>p__Actinobacteria</td>
      <td>c__Actinobacteria</td>
      <td>o__Actinomycetales</td>
      <td>f__Propionibacteriaceae</td>
      <td>g__Propionibacterium</td>
      <td>s__acnes</td>
      <td>Propionibacterium acnes (Actinobacteria)</td>
    </tr>
    <tr>
      <th>862869</th>
      <td>k__Bacteria</td>
      <td>p__Actinobacteria</td>
      <td>c__Actinobacteria</td>
      <td>o__Actinomycetales</td>
      <td>f__Propionibacteriaceae</td>
      <td>g__Propionibacterium</td>
      <td>s__acnes</td>
      <td>Propionibacterium acnes (Actinobacteria)</td>
    </tr>
    <tr>
      <th>4121939</th>
      <td>k__Bacteria</td>
      <td>p__Actinobacteria</td>
      <td>c__Actinobacteria</td>
      <td>o__Actinomycetales</td>
      <td>f__Propionibacteriaceae</td>
      <td>g__Propionibacterium</td>
      <td>s__acnes</td>
      <td>Propionibacterium acnes (Actinobacteria)</td>
    </tr>
    <tr>
      <th>1028571</th>
      <td>k__Bacteria</td>
      <td>p__Actinobacteria</td>
      <td>c__Actinobacteria</td>
      <td>o__Actinomycetales</td>
      <td>f__Propionibacteriaceae</td>
      <td>g__Propionibacterium</td>
      <td>s__acnes</td>
      <td>Propionibacterium acnes (Actinobacteria)</td>
    </tr>
    <tr>
      <th>535359</th>
      <td>k__Bacteria</td>
      <td>p__Bacteroidetes</td>
      <td>c__Bacteroidia</td>
      <td>o__Bacteroidales</td>
      <td>f__Prevotellaceae</td>
      <td>g__Prevotella</td>
      <td>s__melaninogenica</td>
      <td>Prevotella melaninogenica (Bacteroidetes)</td>
    </tr>
    <tr>
      <th>981912</th>
      <td>k__Bacteria</td>
      <td>p__Bacteroidetes</td>
      <td>c__Bacteroidia</td>
      <td>o__Bacteroidales</td>
      <td>f__Prevotellaceae</td>
      <td>g__Prevotella</td>
      <td>s__melaninogenica</td>
      <td>Prevotella melaninogenica (Bacteroidetes)</td>
    </tr>
  </tbody>
</table>
</div></div></div>
</div>
<p>Our data is ready, so let’s get started with unsupervised learning.</p>
</div>
<div class="section" id="unsupervised-learning">
<h2>Unsupervised learning<a class="headerlink" href="#unsupervised-learning" title="Permalink to this headline">¶</a></h2>
<p>We’ll begin our exploration of machine learning approaches with unsupervised learning, and specifically with ordination. We’ll work through ordination in two strokes. First, we’ll explore an approach called <strong>Polar Ordination</strong>, where the math is simple but which isn’t widely used in practice because more informative techniques exist. Working through this on a small data set will give you an idea of how ordination techniques can reduce the dimensionality of a data set and how to interpret the results of an ordination. Then, we’ll apply an approach called <strong>Principal Coordinates Analysis (PCoA)</strong>. The math for PCoA is a bit more complicated than I want to get into in this book, but we’ll look at how to apply PCoA using scikit-bio.</p>
<div class="section" id="computing-distances-between-samples">
<h3>Computing distances between samples<a class="headerlink" href="#computing-distances-between-samples" title="Permalink to this headline">¶</a></h3>
<p>Most ordination techniques begin with computing <strong>distances</strong> between all pairs of samples. A simple and useful way to define distances between our samples is by computing the fraction of k-mers that are unique to either sample. This is known as the Jaccard Distance between the samples. This metric derives from set theory, and is the complement of the Jaccard Index (or Jaccard Similarity). The Jaccard Index is defined as follows:</p>
<div class="math notranslate nohighlight" id="equation-jaccard-sim">
<span class="eqno">(1)<a class="headerlink" href="#equation-jaccard-sim" title="Permalink to this equation">¶</a></span>\[Jaccard \, Index_{(A,B)} = \frac{| A \cap B |}{| A \cup B |}\]</div>
<p>Let’s break this formula down. First, <span class="math notranslate nohighlight">\((A,B)\)</span> defines the two samples we’re computing distances between. We refer to them here with the variables <span class="math notranslate nohighlight">\(A\)</span> and <span class="math notranslate nohighlight">\(B\)</span>. <span class="math notranslate nohighlight">\(| A \cap B |\)</span> is the count of features that are observed, or that have a value of one or more, in both samples. If you’ve studied set theory, you may recognize this as the size of the intersection of the sets of k-mers in samples <span class="math notranslate nohighlight">\(A\)</span> and <span class="math notranslate nohighlight">\(B\)</span>. That number is divided by <span class="math notranslate nohighlight">\(| A \cup B |\)</span>, which is the count of features that are observed in either or both of the samples. In set theory terminology, this is the size of the union of the sets of k-mers in samples <span class="math notranslate nohighlight">\(A\)</span> and <span class="math notranslate nohighlight">\(B\)</span>. The resulting value is a measure of the similarity of the two samples. To make this a distance, we simply subtract that value from 1.</p>
<div class="math notranslate nohighlight" id="equation-jaccard-dist">
<span class="eqno">(2)<a class="headerlink" href="#equation-jaccard-dist" title="Permalink to this equation">¶</a></span>\[Jaccard \, Distance_{(A,B)} = 1 - Jaccard \, Index_{(A,B)}\]</div>
<p>If we apply this computation to all pairs of samples in our feature table, we can store the results in a <strong>distance matrix</strong>. This can be computed from our feature table as follows using scikit-bio.</p>
<div class="cell docutils container">
<div class="cell_input docutils container">
<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="kn">import</span> <span class="nn">skbio.diversity</span>

<span class="n">sequence_distance_matrix</span> <span class="o">=</span> <span class="n">skbio</span><span class="o">.</span><span class="n">diversity</span><span class="o">.</span><span class="n">beta_diversity</span><span class="p">(</span><span class="s1">&#39;jaccard&#39;</span><span class="p">,</span> <span class="n">sequence_feature_table</span><span class="p">,</span> <span class="n">ids</span><span class="o">=</span><span class="n">sequence_feature_table</span><span class="o">.</span><span class="n">index</span><span class="p">)</span>
</pre></div>
</div>
</div>
<div class="cell_output docutils container">
<div class="output stderr highlight-myst-ansi notranslate"><div class="highlight"><pre><span></span>/usr/share/miniconda/envs/iab2/lib/python3.8/site-packages/sklearn/metrics/pairwise.py:1776: DataConversionWarning: Data was converted to boolean for metric jaccard
  warnings.warn(msg, DataConversionWarning)
</pre></div>
</div>
</div>
</div>
<p>A convenient way to get an initial glance at patterns in a small dataset like the one we’re working with here would be to plot the distance matrix as a heatmap. Here, colors represent the distances and larger distances imply that a pair of samples are dissimilar from each other in their k-mer content.</p>
<p>Notice that the values along the diagonal are zero: this is because the diagonal represents distances between a sequence and itself, which is always zero. In other words, <span class="math notranslate nohighlight">\(Jaccard \, Distance_{(A,A)} = 0\)</span>. Also notice that the matrix is symmetric, meaning that if you were to flip the values across the diagonal they would be equal to each other. In other words, <span class="math notranslate nohighlight">\(Jaccard \, Distance_{(A,B)} = Jaccard \, Distance_{(B,A)}\)</span>.</p>
<div class="cell docutils container">
<div class="cell_input docutils container">
<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">sequence_distance_matrix</span>
</pre></div>
</div>
</div>
<div class="cell_output docutils container">
<img alt="_images/machine-learning_26_0.svg" src="_images/machine-learning_26_0.svg" /></div>
</div>
<div class="admonition-exercise admonition">
<p class="admonition-title">Exercise</p>
<p>If you refer back to the table of our sample labels (defined in the <code class="docutils literal notranslate"><span class="pre">sequence_labels</span></code> variable) above, do you notice any patterns emerging from this heatmap?</p>
</div>
<p>If you want to look up the distance between a specific pair of samples, you can do that as follows:</p>
<div class="cell docutils container">
<div class="cell_input docutils container">
<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">sample_id1</span> <span class="o">=</span> <span class="n">sequence_feature_table</span><span class="o">.</span><span class="n">index</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span>
<span class="n">sample_id2</span> <span class="o">=</span> <span class="n">sequence_feature_table</span><span class="o">.</span><span class="n">index</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span>
<span class="n">d</span> <span class="o">=</span> <span class="n">sequence_distance_matrix</span><span class="p">[</span><span class="n">sample_id1</span><span class="p">,</span> <span class="n">sample_id2</span><span class="p">]</span>
<span class="nb">print</span><span class="p">(</span><span class="s1">&#39;The Jaccard Distance between samples </span><span class="si">%s</span><span class="s1"> and </span><span class="si">%s</span><span class="s1"> is </span><span class="si">%1.3f</span><span class="s1">.&#39;</span> <span class="o">%</span> <span class="p">(</span><span class="n">sample_id1</span><span class="p">,</span> <span class="n">sample_id2</span><span class="p">,</span> <span class="n">d</span><span class="p">))</span>
</pre></div>
</div>
</div>
<div class="cell_output docutils container">
<div class="output stream highlight-myst-ansi notranslate"><div class="highlight"><pre><span></span>The Jaccard Distance between samples 1020921 and 1111241 is 0.150.
</pre></div>
</div>
</div>
</div>
</div>
<div class="section" id="polar-ordination">
<h3>Polar ordination<a class="headerlink" href="#polar-ordination" title="Permalink to this headline">¶</a></h3>
<p>Now that we have distances between all pairs of samples, we can perform a polar ordination on the samples. Results of ordination are typically viewed in a scatterplot with two or three dimensions, so I find it useful to think through polar ordination as an approach to build a scatterplot from this distance matrix.</p>
<p>First, identify the largest distance in the distance matrix and note the sample ids associated with that distance. We’ll refer to this distance as <span class="math notranslate nohighlight">\(D\)</span>.</p>
<div class="cell docutils container">
<div class="cell_input docutils container">
<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">sorted_indices</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">argsort</span><span class="p">(</span><span class="n">sequence_distance_matrix</span><span class="o">.</span><span class="n">data</span><span class="p">,</span> <span class="n">axis</span><span class="o">=</span><span class="kc">None</span><span class="p">)[::</span><span class="o">-</span><span class="mi">1</span><span class="p">]</span>
<span class="n">sorted_indices</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">unravel_index</span><span class="p">(</span><span class="n">sorted_indices</span><span class="p">,</span> <span class="n">shape</span><span class="o">=</span><span class="n">sequence_distance_matrix</span><span class="o">.</span><span class="n">shape</span><span class="p">)</span>
<span class="n">sample_id1</span> <span class="o">=</span> <span class="n">sequence_distance_matrix</span><span class="o">.</span><span class="n">ids</span><span class="p">[</span><span class="n">sorted_indices</span><span class="p">[</span><span class="mi">0</span><span class="p">][</span><span class="mi">0</span><span class="p">]]</span>
<span class="n">sample_id2</span> <span class="o">=</span> <span class="n">sequence_distance_matrix</span><span class="o">.</span><span class="n">ids</span><span class="p">[</span><span class="n">sorted_indices</span><span class="p">[</span><span class="mi">0</span><span class="p">][</span><span class="mi">1</span><span class="p">]]</span>
<span class="n">D</span> <span class="o">=</span> <span class="n">sequence_distance_matrix</span><span class="p">[</span><span class="n">sample_id1</span><span class="p">,</span> <span class="n">sample_id2</span><span class="p">]</span>
<span class="nb">print</span><span class="p">(</span><span class="s1">&#39;The largest distance in the distance matrix is </span><span class="si">%1.3f</span><span class="s1">, between samples </span><span class="si">%s</span><span class="s1"> and </span><span class="si">%s</span><span class="s1">.&#39;</span> <span class="o">%</span> 
    <span class="p">(</span><span class="n">D</span><span class="p">,</span> <span class="n">sample_id1</span><span class="p">,</span> <span class="n">sample_id2</span><span class="p">))</span>
</pre></div>
</div>
</div>
<div class="cell_output docutils container">
<div class="output stream highlight-myst-ansi notranslate"><div class="highlight"><pre><span></span>The largest distance in the distance matrix is 0.355, between samples 970921 and 4121939.
</pre></div>
</div>
</div>
</div>
<p>These two samples define the first axis of your scatter plot. The length of this axis is <span class="math notranslate nohighlight">\(D\)</span>, and each of the two samples will be placed at an endpoint on this axis. Choose one sample to be plotted at zero on this axis, and the other sample to plot at <span class="math notranslate nohighlight">\(D\)</span> on this axis. It doesn’t matter which sample you choose to plot at which endpoint.</p>
<p>Next, we’ll identify the location of every other sample on this first axis. For each sample <span class="math notranslate nohighlight">\(s\)</span>, this is computed with the following formula.</p>
<div class="math notranslate nohighlight" id="equation-polar-ordination-axis">
<span class="eqno">(3)<a class="headerlink" href="#equation-polar-ordination-axis" title="Permalink to this equation">¶</a></span>\[A_s = \frac{D^2 + D1^2 - D2^2}{2 \times D}\]</div>
<p>In this formula, <span class="math notranslate nohighlight">\(A_s\)</span> is the location of the current sample on the current axis. <span class="math notranslate nohighlight">\(D\)</span> is the distance between the endpoints. <span class="math notranslate nohighlight">\(D1\)</span> is the distance between the current sample and the sample at <span class="math notranslate nohighlight">\(0\)</span> on this axis, which you can look up in the distance matrix computed above. <span class="math notranslate nohighlight">\(D2\)</span> is the distance between the current sample and the sample at <span class="math notranslate nohighlight">\(D\)</span> on this axis, which is also looked up in the distance matrix.</p>
<p>The following Python function can be applied to compute the placement of all samples on a polar ordination axis, given the distances between all pairs of samples and the identifiers of the samples serving as the endpoints of this axis.</p>
<div class="cell docutils container">
<div class="cell_input docutils container">
<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="k">def</span> <span class="nf">compute_axis</span><span class="p">(</span><span class="n">dm</span><span class="p">,</span> <span class="n">endpoint1</span><span class="p">,</span> <span class="n">endpoint2</span><span class="p">):</span>
    <span class="n">d</span> <span class="o">=</span> <span class="n">dm</span><span class="p">[</span><span class="n">endpoint1</span><span class="p">,</span> <span class="n">endpoint2</span><span class="p">]</span>
    <span class="n">result</span> <span class="o">=</span> <span class="p">{</span><span class="n">endpoint1</span><span class="p">:</span> <span class="mi">0</span><span class="p">,</span> <span class="n">endpoint2</span><span class="p">:</span> <span class="n">d</span><span class="p">}</span>
    <span class="n">non_endpoints</span> <span class="o">=</span> <span class="nb">set</span><span class="p">(</span><span class="n">dm</span><span class="o">.</span><span class="n">ids</span><span class="p">)</span> <span class="o">-</span> <span class="nb">set</span><span class="p">([</span><span class="n">endpoint1</span><span class="p">,</span> <span class="n">endpoint2</span><span class="p">])</span>
    <span class="k">for</span> <span class="n">e</span> <span class="ow">in</span> <span class="n">non_endpoints</span><span class="p">:</span>
        <span class="n">d1</span> <span class="o">=</span> <span class="n">dm</span><span class="p">[</span><span class="n">endpoint1</span><span class="p">,</span> <span class="n">e</span><span class="p">]</span>
        <span class="n">d2</span> <span class="o">=</span> <span class="n">dm</span><span class="p">[</span><span class="n">endpoint2</span><span class="p">,</span> <span class="n">e</span><span class="p">]</span>
        <span class="n">result</span><span class="p">[</span><span class="n">e</span><span class="p">]</span> <span class="o">=</span> <span class="p">(</span><span class="n">d</span><span class="o">**</span><span class="mi">2</span> <span class="o">+</span> <span class="n">d1</span><span class="o">**</span><span class="mi">2</span> <span class="o">-</span> <span class="n">d2</span><span class="o">**</span><span class="mi">2</span><span class="p">)</span> <span class="o">/</span> <span class="p">(</span><span class="mi">2</span> <span class="o">*</span> <span class="n">d</span><span class="p">)</span>
    <span class="k">return</span> <span class="n">pd</span><span class="o">.</span><span class="n">Series</span><span class="p">(</span><span class="n">result</span><span class="p">)</span>
</pre></div>
</div>
</div>
</div>
<div class="cell docutils container">
<div class="cell_input docutils container">
<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">axis1_values</span> <span class="o">=</span> <span class="n">compute_axis</span><span class="p">(</span><span class="n">sequence_distance_matrix</span><span class="p">,</span> 
                            <span class="n">sample_id1</span><span class="p">,</span>
                            <span class="n">sample_id2</span><span class="p">)</span>
</pre></div>
</div>
</div>
</div>
<p>At this stage, we have computed our first polar ordination axis. If we sort and view this axis we may even be able to see some clustering or grouping of samples along this axis.</p>
<div class="cell docutils container">
<div class="cell_input docutils container">
<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">axis1_values</span><span class="o">.</span><span class="n">sort_values</span><span class="p">()</span>
</pre></div>
</div>
</div>
<div class="cell_output docutils container">
<div class="output text_plain highlight-myst-ansi notranslate"><div class="highlight"><pre><span></span>970921     0.000000
241971     0.080489
867450     0.084018
1111241    0.093832
1020921    0.101953
3248323    0.110663
347138     0.115381
591785     0.116548
4321402    0.123459
515798     0.130300
981912     0.135637
4307092    0.137678
265715     0.138974
4404222    0.139079
328035     0.140359
299830     0.150428
4321915    0.151231
342638     0.156564
514723     0.157467
4437190    0.157514
535359     0.158603
269937     0.165551
289441     0.173129
4349788    0.178520
264673     0.183357
631620     0.184135
237364     0.189060
130002     0.205811
454228     0.206203
242843     0.206698
862869     0.257691
1028571    0.301334
4226754    0.305866
403853     0.345773
4121939    0.355372
dtype: float64
</pre></div>
</div>
</div>
</div>
<div class="admonition-exercise admonition">
<p class="admonition-title">Exercise</p>
<p>If you again refer back to the table of our sample labels (defined in the <code class="docutils literal notranslate"><span class="pre">sequence_labels</span></code> variable) above, do you notice any patterns in the ordered samples along this axis?</p>
</div>
<p>We can plot this single axis using a strip plot. In this plot, only placement on the horizontal axis is meaningful. The variation in placement of points along the vertical axis is only to aid in visualization. In this plot, each point represents a single sample from our feature table. Samples that are closer in space along this axis are more similar to each other in their k-mer composition.</p>
<div class="cell docutils container">
<div class="cell_input docutils container">
<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="kn">import</span> <span class="nn">seaborn</span> <span class="k">as</span> <span class="nn">sns</span>

<span class="n">_</span> <span class="o">=</span> <span class="n">sns</span><span class="o">.</span><span class="n">stripplot</span><span class="p">(</span><span class="n">x</span><span class="o">=</span><span class="n">axis1_values</span><span class="p">)</span>
</pre></div>
</div>
</div>
<div class="cell_output docutils container">
<img alt="_images/machine-learning_37_0.png" src="_images/machine-learning_37_0.png" />
</div>
</div>
<p>While we may already be able to see some clustering of samples on the first axis, additional axes that are uncorrelated with this first axis can provide more information about which samples are most similar to each other.</p>
<p>Selecting the next axes to plot in polar ordination is more complicated than choosing the first. Generally, you would begin by computing all axes, which would be defined for all pairs of samples. Then, you identify the axes that represent the largest distances (as we did for our first axis above), but that are also uncorrelated with previously selected axes.</p>
<p>Let’s start this by computing all polar ordination axes.</p>
<div class="cell docutils container">
<div class="cell_input docutils container">
<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="k">def</span> <span class="nf">polar_ordination</span><span class="p">(</span><span class="n">dm</span><span class="p">):</span>
    <span class="n">result</span> <span class="o">=</span> <span class="p">{}</span>
    <span class="k">for</span> <span class="n">i</span><span class="p">,</span> <span class="n">id1</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">dm</span><span class="o">.</span><span class="n">ids</span><span class="p">):</span>
        <span class="k">for</span> <span class="n">id2</span> <span class="ow">in</span> <span class="n">dm</span><span class="o">.</span><span class="n">ids</span><span class="p">[:</span><span class="n">i</span><span class="p">]:</span>
            <span class="n">axis_label</span> <span class="o">=</span> <span class="s1">&#39;</span><span class="si">%s</span><span class="s1"> to </span><span class="si">%s</span><span class="s1">&#39;</span> <span class="o">%</span> <span class="p">(</span><span class="n">id1</span><span class="p">,</span> <span class="n">id2</span><span class="p">)</span>
            <span class="n">result</span><span class="p">[</span><span class="n">axis_label</span><span class="p">]</span> <span class="o">=</span> <span class="n">compute_axis</span><span class="p">(</span><span class="n">dm</span><span class="p">,</span> <span class="n">id1</span><span class="p">,</span> <span class="n">id2</span><span class="p">)</span>
    <span class="n">result</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">(</span><span class="n">result</span><span class="p">)</span>
    <span class="k">return</span> <span class="n">result</span>
</pre></div>
</div>
</div>
</div>
<p>We can apply this function to our distance matrix, and see all of the polar ordination axes.</p>
<div class="cell docutils container">
<div class="cell_input docutils container">
<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">sequence_polar_ordination</span> <span class="o">=</span> <span class="n">polar_ordination</span><span class="p">(</span><span class="n">sequence_distance_matrix</span><span class="p">)</span>
<span class="n">sequence_polar_ordination</span>
</pre></div>
</div>
</div>
<div class="cell_output docutils container">
<div class="output text_html"><div>
<style scoped>
    .dataframe tbody tr th:only-of-type {
        vertical-align: middle;
    }

    .dataframe tbody tr th {
        vertical-align: top;
    }

    .dataframe thead th {
        text-align: right;
    }
</style>
<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th></th>
      <th>1111241 to 1020921</th>
      <th>241971 to 1020921</th>
      <th>241971 to 1111241</th>
      <th>970921 to 1020921</th>
      <th>970921 to 1111241</th>
      <th>970921 to 241971</th>
      <th>867450 to 1020921</th>
      <th>867450 to 1111241</th>
      <th>867450 to 241971</th>
      <th>867450 to 970921</th>
      <th>...</th>
      <th>342638 to 265715</th>
      <th>342638 to 4437190</th>
      <th>342638 to 631620</th>
      <th>342638 to 289441</th>
      <th>342638 to 264673</th>
      <th>342638 to 242843</th>
      <th>342638 to 515798</th>
      <th>342638 to 299830</th>
      <th>342638 to 514723</th>
      <th>342638 to 328035</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>1020921</th>
      <td>0.150215</td>
      <td>0.146018</td>
      <td>0.047244</td>
      <td>0.188841</td>
      <td>0.126809</td>
      <td>0.123595</td>
      <td>0.140969</td>
      <td>0.057069</td>
      <td>0.059137</td>
      <td>0.034827</td>
      <td>...</td>
      <td>0.090745</td>
      <td>0.112010</td>
      <td>0.086142</td>
      <td>0.096484</td>
      <td>0.091427</td>
      <td>0.086229</td>
      <td>-0.000239</td>
      <td>0.069525</td>
      <td>0.045682</td>
      <td>0.089937</td>
    </tr>
    <tr>
      <th>1028571</th>
      <td>0.057949</td>
      <td>0.055256</td>
      <td>0.052957</td>
      <td>0.116924</td>
      <td>0.128346</td>
      <td>0.121467</td>
      <td>0.055764</td>
      <td>0.070843</td>
      <td>0.068727</td>
      <td>0.044327</td>
      <td>...</td>
      <td>0.076107</td>
      <td>0.086698</td>
      <td>0.103087</td>
      <td>0.102166</td>
      <td>0.087414</td>
      <td>0.106500</td>
      <td>0.011588</td>
      <td>0.020975</td>
      <td>0.089545</td>
      <td>-0.068627</td>
    </tr>
    <tr>
      <th>1111241</th>
      <td>0.000000</td>
      <td>0.034359</td>
      <td>0.106195</td>
      <td>0.121835</td>
      <td>0.181435</td>
      <td>0.147272</td>
      <td>0.054329</td>
      <td>0.134199</td>
      <td>0.090732</td>
      <td>0.037489</td>
      <td>...</td>
      <td>0.125775</td>
      <td>0.135028</td>
      <td>0.110808</td>
      <td>0.132479</td>
      <td>0.106107</td>
      <td>0.129468</td>
      <td>0.049989</td>
      <td>0.128384</td>
      <td>0.055548</td>
      <td>0.044616</td>
    </tr>
    <tr>
      <th>130002</th>
      <td>0.043865</td>
      <td>0.080792</td>
      <td>0.107992</td>
      <td>0.180682</td>
      <td>0.206366</td>
      <td>0.175371</td>
      <td>0.094276</td>
      <td>0.127062</td>
      <td>0.081859</td>
      <td>0.004313</td>
      <td>...</td>
      <td>0.199372</td>
      <td>0.215031</td>
      <td>0.217350</td>
      <td>0.214494</td>
      <td>0.199590</td>
      <td>0.212951</td>
      <td>0.032673</td>
      <td>0.055750</td>
      <td>0.020269</td>
      <td>0.095954</td>
    </tr>
    <tr>
      <th>237364</th>
      <td>0.073785</td>
      <td>0.119687</td>
      <td>0.119149</td>
      <td>0.174917</td>
      <td>0.175594</td>
      <td>0.131477</td>
      <td>0.095949</td>
      <td>0.095328</td>
      <td>0.039813</td>
      <td>0.012330</td>
      <td>...</td>
      <td>0.152496</td>
      <td>0.168793</td>
      <td>0.181711</td>
      <td>0.176570</td>
      <td>0.162770</td>
      <td>0.198412</td>
      <td>0.066947</td>
      <td>0.055750</td>
      <td>0.032158</td>
      <td>0.095954</td>
    </tr>
    <tr>
      <th>241971</th>
      <td>0.041676</td>
      <td>0.000000</td>
      <td>0.000000</td>
      <td>0.100912</td>
      <td>0.125153</td>
      <td>0.154185</td>
      <td>0.054310</td>
      <td>0.087531</td>
      <td>0.129464</td>
      <td>0.061388</td>
      <td>...</td>
      <td>0.133957</td>
      <td>0.139243</td>
      <td>0.121342</td>
      <td>0.142736</td>
      <td>0.096525</td>
      <td>0.122890</td>
      <td>0.122891</td>
      <td>0.107102</td>
      <td>0.077681</td>
      <td>0.104803</td>
    </tr>
    <tr>
      <th>242843</th>
      <td>0.041527</td>
      <td>0.093675</td>
      <td>0.129014</td>
      <td>0.195946</td>
      <td>0.224189</td>
      <td>0.181865</td>
      <td>0.082909</td>
      <td>0.117739</td>
      <td>0.054952</td>
      <td>-0.022832</td>
      <td>...</td>
      <td>0.181203</td>
      <td>0.185950</td>
      <td>0.197659</td>
      <td>0.187156</td>
      <td>0.180439</td>
      <td>0.225000</td>
      <td>0.032545</td>
      <td>0.055752</td>
      <td>-0.004884</td>
      <td>0.045223</td>
    </tr>
    <tr>
      <th>264673</th>
      <td>0.085438</td>
      <td>0.142716</td>
      <td>0.134332</td>
      <td>0.192688</td>
      <td>0.184444</td>
      <td>0.131433</td>
      <td>0.107171</td>
      <td>0.094073</td>
      <td>0.026058</td>
      <td>0.001592</td>
      <td>...</td>
      <td>0.190645</td>
      <td>0.189863</td>
      <td>0.202339</td>
      <td>0.191041</td>
      <td>0.213389</td>
      <td>0.171127</td>
      <td>-0.001766</td>
      <td>0.055362</td>
      <td>-0.002120</td>
      <td>0.092829</td>
    </tr>
    <tr>
      <th>265715</th>
      <td>0.052104</td>
      <td>0.081399</td>
      <td>0.097173</td>
      <td>0.113181</td>
      <td>0.129289</td>
      <td>0.092124</td>
      <td>0.070485</td>
      <td>0.092848</td>
      <td>0.055269</td>
      <td>0.061166</td>
      <td>...</td>
      <td>0.232365</td>
      <td>0.216483</td>
      <td>0.219110</td>
      <td>0.215744</td>
      <td>0.207599</td>
      <td>0.187134</td>
      <td>0.013984</td>
      <td>0.022814</td>
      <td>0.006204</td>
      <td>0.098638</td>
    </tr>
    <tr>
      <th>269937</th>
      <td>0.105687</td>
      <td>0.147911</td>
      <td>0.112832</td>
      <td>0.215325</td>
      <td>0.191239</td>
      <td>0.154238</td>
      <td>0.139274</td>
      <td>0.105129</td>
      <td>0.055154</td>
      <td>0.003109</td>
      <td>...</td>
      <td>0.000644</td>
      <td>0.016084</td>
      <td>0.018297</td>
      <td>0.028580</td>
      <td>0.029665</td>
      <td>0.034975</td>
      <td>-0.020625</td>
      <td>0.095492</td>
      <td>0.032845</td>
      <td>-0.002020</td>
    </tr>
    <tr>
      <th>289441</th>
      <td>0.053811</td>
      <td>0.080792</td>
      <td>0.093923</td>
      <td>0.150187</td>
      <td>0.166392</td>
      <td>0.138022</td>
      <td>0.082235</td>
      <td>0.103280</td>
      <td>0.068748</td>
      <td>0.028894</td>
      <td>...</td>
      <td>0.203343</td>
      <td>0.212402</td>
      <td>0.211767</td>
      <td>0.219008</td>
      <td>0.196071</td>
      <td>0.182172</td>
      <td>0.032955</td>
      <td>0.024060</td>
      <td>0.009525</td>
      <td>0.093803</td>
    </tr>
    <tr>
      <th>299830</th>
      <td>0.065757</td>
      <td>0.123543</td>
      <td>0.135808</td>
      <td>0.173305</td>
      <td>0.180564</td>
      <td>0.125851</td>
      <td>0.081083</td>
      <td>0.088699</td>
      <td>0.019277</td>
      <td>0.001488</td>
      <td>...</td>
      <td>0.010157</td>
      <td>0.013043</td>
      <td>0.006047</td>
      <td>0.011365</td>
      <td>0.026839</td>
      <td>0.025633</td>
      <td>-0.040234</td>
      <td>0.103448</td>
      <td>-0.015172</td>
      <td>0.029612</td>
    </tr>
    <tr>
      <th>3248323</th>
      <td>0.062865</td>
      <td>0.131480</td>
      <td>0.150812</td>
      <td>0.142280</td>
      <td>0.150666</td>
      <td>0.080336</td>
      <td>0.095372</td>
      <td>0.106945</td>
      <td>0.025884</td>
      <td>0.049141</td>
      <td>...</td>
      <td>0.012938</td>
      <td>0.053948</td>
      <td>0.032071</td>
      <td>0.040877</td>
      <td>0.025596</td>
      <td>0.046895</td>
      <td>-0.009150</td>
      <td>0.066333</td>
      <td>0.009609</td>
      <td>0.038252</td>
    </tr>
    <tr>
      <th>328035</th>
      <td>0.115805</td>
      <td>0.146955</td>
      <td>0.097206</td>
      <td>0.154309</td>
      <td>0.119356</td>
      <td>0.080413</td>
      <td>0.091931</td>
      <td>0.044074</td>
      <td>0.004684</td>
      <td>0.032456</td>
      <td>...</td>
      <td>0.013385</td>
      <td>0.013730</td>
      <td>0.014018</td>
      <td>0.013505</td>
      <td>0.013717</td>
      <td>0.006338</td>
      <td>-0.007675</td>
      <td>0.009026</td>
      <td>-0.016842</td>
      <td>0.031532</td>
    </tr>
    <tr>
      <th>342638</th>
      <td>0.106292</td>
      <td>0.150165</td>
      <td>0.115077</td>
      <td>0.156781</td>
      <td>0.129805</td>
      <td>0.080400</td>
      <td>0.081560</td>
      <td>0.043828</td>
      <td>-0.010230</td>
      <td>0.020782</td>
      <td>...</td>
      <td>0.000000</td>
      <td>0.000000</td>
      <td>0.000000</td>
      <td>0.000000</td>
      <td>0.000000</td>
      <td>0.000000</td>
      <td>0.000000</td>
      <td>0.000000</td>
      <td>0.000000</td>
      <td>0.000000</td>
    </tr>
    <tr>
      <th>347138</th>
      <td>0.042286</td>
      <td>0.117926</td>
      <td>0.161284</td>
      <td>0.112741</td>
      <td>0.136959</td>
      <td>0.056994</td>
      <td>0.035791</td>
      <td>0.067393</td>
      <td>-0.023705</td>
      <td>0.032067</td>
      <td>...</td>
      <td>0.022992</td>
      <td>0.047161</td>
      <td>0.024812</td>
      <td>0.050078</td>
      <td>0.035551</td>
      <td>0.040588</td>
      <td>-0.023172</td>
      <td>0.041550</td>
      <td>-0.021217</td>
      <td>0.076179</td>
    </tr>
    <tr>
      <th>403853</th>
      <td>0.057582</td>
      <td>0.069496</td>
      <td>0.073057</td>
      <td>0.189682</td>
      <td>0.204377</td>
      <td>0.197092</td>
      <td>0.070485</td>
      <td>0.086716</td>
      <td>0.068694</td>
      <td>-0.026274</td>
      <td>...</td>
      <td>0.126580</td>
      <td>0.134919</td>
      <td>0.144389</td>
      <td>0.146819</td>
      <td>0.113966</td>
      <td>0.141219</td>
      <td>0.051404</td>
      <td>0.077668</td>
      <td>0.091493</td>
      <td>-0.077193</td>
    </tr>
    <tr>
      <th>4121939</th>
      <td>0.085191</td>
      <td>0.084538</td>
      <td>0.054687</td>
      <td>0.191860</td>
      <td>0.183786</td>
      <td>0.185515</td>
      <td>0.086029</td>
      <td>0.072141</td>
      <td>0.068655</td>
      <td>-0.015500</td>
      <td>...</td>
      <td>0.099277</td>
      <td>0.115546</td>
      <td>0.134669</td>
      <td>0.129048</td>
      <td>0.104271</td>
      <td>0.132541</td>
      <td>0.007822</td>
      <td>0.018282</td>
      <td>0.091552</td>
      <td>-0.077901</td>
    </tr>
    <tr>
      <th>4226754</th>
      <td>0.099088</td>
      <td>0.097961</td>
      <td>0.053485</td>
      <td>0.150856</td>
      <td>0.129602</td>
      <td>0.122583</td>
      <td>0.056113</td>
      <td>0.025161</td>
      <td>0.020942</td>
      <td>0.005842</td>
      <td>...</td>
      <td>0.082173</td>
      <td>0.085617</td>
      <td>0.092996</td>
      <td>0.101325</td>
      <td>0.065487</td>
      <td>0.096246</td>
      <td>0.030386</td>
      <td>0.019388</td>
      <td>0.064967</td>
      <td>-0.074061</td>
    </tr>
    <tr>
      <th>4307092</th>
      <td>0.115466</td>
      <td>0.146115</td>
      <td>0.096530</td>
      <td>0.182277</td>
      <td>0.148746</td>
      <td>0.115462</td>
      <td>0.102686</td>
      <td>0.055750</td>
      <td>0.017341</td>
      <td>0.009666</td>
      <td>...</td>
      <td>0.011620</td>
      <td>0.026412</td>
      <td>0.020168</td>
      <td>0.030517</td>
      <td>0.039499</td>
      <td>0.044532</td>
      <td>-0.019001</td>
      <td>0.103244</td>
      <td>0.043040</td>
      <td>0.071424</td>
    </tr>
    <tr>
      <th>4321402</th>
      <td>0.015229</td>
      <td>0.069493</td>
      <td>0.132962</td>
      <td>0.104405</td>
      <td>0.150684</td>
      <td>0.092651</td>
      <td>0.031896</td>
      <td>0.093588</td>
      <td>0.026680</td>
      <td>0.038271</td>
      <td>...</td>
      <td>0.040148</td>
      <td>0.026083</td>
      <td>0.019741</td>
      <td>0.037121</td>
      <td>0.019983</td>
      <td>0.051533</td>
      <td>0.015611</td>
      <td>0.099727</td>
      <td>0.055663</td>
      <td>0.044196</td>
    </tr>
    <tr>
      <th>4321915</th>
      <td>0.074872</td>
      <td>0.114357</td>
      <td>0.110284</td>
      <td>0.147687</td>
      <td>0.146353</td>
      <td>0.103174</td>
      <td>0.070485</td>
      <td>0.067363</td>
      <td>0.018098</td>
      <td>0.021726</td>
      <td>...</td>
      <td>0.029315</td>
      <td>0.033013</td>
      <td>0.034566</td>
      <td>0.037976</td>
      <td>0.038832</td>
      <td>0.037339</td>
      <td>0.006845</td>
      <td>0.003808</td>
      <td>0.000686</td>
      <td>-0.059141</td>
    </tr>
    <tr>
      <th>4349788</th>
      <td>0.054978</td>
      <td>0.091533</td>
      <td>0.107040</td>
      <td>0.167584</td>
      <td>0.183533</td>
      <td>0.149158</td>
      <td>0.093323</td>
      <td>0.113621</td>
      <td>0.068708</td>
      <td>0.018470</td>
      <td>...</td>
      <td>0.166280</td>
      <td>0.178456</td>
      <td>0.183994</td>
      <td>0.175935</td>
      <td>0.174255</td>
      <td>0.194107</td>
      <td>0.033456</td>
      <td>0.040272</td>
      <td>0.023391</td>
      <td>0.090399</td>
    </tr>
    <tr>
      <th>4404222</th>
      <td>0.092289</td>
      <td>0.124235</td>
      <td>0.099230</td>
      <td>0.160911</td>
      <td>0.145697</td>
      <td>0.110016</td>
      <td>0.116800</td>
      <td>0.096520</td>
      <td>0.057388</td>
      <td>0.046129</td>
      <td>...</td>
      <td>0.065377</td>
      <td>0.079145</td>
      <td>0.068378</td>
      <td>0.081505</td>
      <td>0.076559</td>
      <td>0.080361</td>
      <td>0.048842</td>
      <td>0.030291</td>
      <td>0.068748</td>
      <td>0.035696</td>
    </tr>
    <tr>
      <th>4437190</th>
      <td>0.075265</td>
      <td>0.112402</td>
      <td>0.107040</td>
      <td>0.154777</td>
      <td>0.153407</td>
      <td>0.113709</td>
      <td>0.103373</td>
      <td>0.101470</td>
      <td>0.056113</td>
      <td>0.041683</td>
      <td>...</td>
      <td>0.188640</td>
      <td>0.202479</td>
      <td>0.196304</td>
      <td>0.196372</td>
      <td>0.180156</td>
      <td>0.167338</td>
      <td>0.033765</td>
      <td>0.025529</td>
      <td>-0.009846</td>
      <td>0.088170</td>
    </tr>
    <tr>
      <th>454228</th>
      <td>0.041717</td>
      <td>0.093519</td>
      <td>0.128530</td>
      <td>0.184117</td>
      <td>0.211720</td>
      <td>0.167526</td>
      <td>0.108273</td>
      <td>0.144170</td>
      <td>0.082747</td>
      <td>0.012330</td>
      <td>...</td>
      <td>0.221447</td>
      <td>0.225474</td>
      <td>0.233137</td>
      <td>0.227860</td>
      <td>0.219201</td>
      <td>0.235749</td>
      <td>0.013032</td>
      <td>0.073139</td>
      <td>0.065928</td>
      <td>0.102101</td>
    </tr>
    <tr>
      <th>514723</th>
      <td>0.096315</td>
      <td>0.116878</td>
      <td>0.083419</td>
      <td>0.121584</td>
      <td>0.101432</td>
      <td>0.068816</td>
      <td>0.119528</td>
      <td>0.094879</td>
      <td>0.068656</td>
      <td>0.093407</td>
      <td>...</td>
      <td>0.004055</td>
      <td>-0.007387</td>
      <td>0.002883</td>
      <td>0.006607</td>
      <td>-0.001509</td>
      <td>-0.003298</td>
      <td>0.048837</td>
      <td>-0.022278</td>
      <td>0.151899</td>
      <td>-0.081134</td>
    </tr>
    <tr>
      <th>515798</th>
      <td>0.074308</td>
      <td>0.069506</td>
      <td>0.049412</td>
      <td>0.103273</td>
      <td>0.100593</td>
      <td>0.091252</td>
      <td>0.047620</td>
      <td>0.043976</td>
      <td>0.043786</td>
      <td>0.052982</td>
      <td>...</td>
      <td>0.005757</td>
      <td>0.015951</td>
      <td>0.009106</td>
      <td>0.014393</td>
      <td>-0.000792</td>
      <td>0.013836</td>
      <td>0.095652</td>
      <td>-0.037201</td>
      <td>0.030753</td>
      <td>-0.023284</td>
    </tr>
    <tr>
      <th>535359</th>
      <td>0.084721</td>
      <td>0.109403</td>
      <td>0.089540</td>
      <td>0.159656</td>
      <td>0.150656</td>
      <td>0.122524</td>
      <td>0.111764</td>
      <td>0.099700</td>
      <td>0.068632</td>
      <td>0.043267</td>
      <td>...</td>
      <td>0.074912</td>
      <td>0.097681</td>
      <td>0.087958</td>
      <td>0.098754</td>
      <td>0.087546</td>
      <td>0.083945</td>
      <td>0.035342</td>
      <td>0.014907</td>
      <td>0.059131</td>
      <td>0.077678</td>
    </tr>
    <tr>
      <th>591785</th>
      <td>0.107295</td>
      <td>0.140532</td>
      <td>0.100411</td>
      <td>0.169246</td>
      <td>0.141948</td>
      <td>0.104790</td>
      <td>0.118441</td>
      <td>0.081446</td>
      <td>0.040794</td>
      <td>0.038002</td>
      <td>...</td>
      <td>-0.012229</td>
      <td>-0.006827</td>
      <td>-0.014814</td>
      <td>-0.001274</td>
      <td>0.016156</td>
      <td>0.022080</td>
      <td>-0.018035</td>
      <td>0.102015</td>
      <td>0.052276</td>
      <td>0.035336</td>
    </tr>
    <tr>
      <th>631620</th>
      <td>0.074688</td>
      <td>0.103768</td>
      <td>0.095983</td>
      <td>0.159208</td>
      <td>0.158496</td>
      <td>0.127312</td>
      <td>0.118441</td>
      <td>0.117945</td>
      <td>0.082259</td>
      <td>0.049476</td>
      <td>...</td>
      <td>0.181489</td>
      <td>0.186598</td>
      <td>0.192469</td>
      <td>0.186105</td>
      <td>0.182502</td>
      <td>0.169081</td>
      <td>0.018322</td>
      <td>0.011251</td>
      <td>0.003653</td>
      <td>0.085564</td>
    </tr>
    <tr>
      <th>862869</th>
      <td>0.085421</td>
      <td>0.081042</td>
      <td>0.049553</td>
      <td>0.161646</td>
      <td>0.152148</td>
      <td>0.151821</td>
      <td>0.120339</td>
      <td>0.107924</td>
      <td>0.109957</td>
      <td>0.048308</td>
      <td>...</td>
      <td>0.110978</td>
      <td>0.129476</td>
      <td>0.111614</td>
      <td>0.126351</td>
      <td>0.105784</td>
      <td>0.115029</td>
      <td>0.050262</td>
      <td>0.039550</td>
      <td>0.110229</td>
      <td>-0.008007</td>
    </tr>
    <tr>
      <th>867450</th>
      <td>0.068907</td>
      <td>0.062355</td>
      <td>0.047219</td>
      <td>0.114079</td>
      <td>0.116312</td>
      <td>0.111259</td>
      <td>0.000000</td>
      <td>0.000000</td>
      <td>0.000000</td>
      <td>0.000000</td>
      <td>...</td>
      <td>0.097464</td>
      <td>0.096824</td>
      <td>0.059130</td>
      <td>0.096050</td>
      <td>0.074509</td>
      <td>0.085384</td>
      <td>0.049781</td>
      <td>0.070175</td>
      <td>0.010446</td>
      <td>0.043571</td>
    </tr>
    <tr>
      <th>970921</th>
      <td>0.065979</td>
      <td>0.032302</td>
      <td>0.010037</td>
      <td>0.000000</td>
      <td>0.000000</td>
      <td>0.000000</td>
      <td>0.040818</td>
      <td>0.046154</td>
      <td>0.078341</td>
      <td>0.165217</td>
      <td>...</td>
      <td>0.126178</td>
      <td>0.113879</td>
      <td>0.083761</td>
      <td>0.102169</td>
      <td>0.059651</td>
      <td>0.053359</td>
      <td>0.105399</td>
      <td>0.039360</td>
      <td>0.089439</td>
      <td>0.104739</td>
    </tr>
    <tr>
      <th>981912</th>
      <td>0.095650</td>
      <td>0.137701</td>
      <td>0.112991</td>
      <td>0.186523</td>
      <td>0.169572</td>
      <td>0.128631</td>
      <td>0.104501</td>
      <td>0.079837</td>
      <td>0.028807</td>
      <td>0.006360</td>
      <td>...</td>
      <td>0.017053</td>
      <td>0.032638</td>
      <td>0.026735</td>
      <td>0.036295</td>
      <td>0.037498</td>
      <td>0.050183</td>
      <td>0.008449</td>
      <td>-0.010451</td>
      <td>0.012845</td>
      <td>-0.042424</td>
    </tr>
  </tbody>
</table>
<p>35 rows × 595 columns</p>
</div></div></div>
</div>
<p>This isn’t much (or any) more interpretable than the distance matrix itself was, so the next step is to select the most informative axes to view. The first (most informative) axis will still be the axis that we identified above as the one representing the largest distance in the distance matrix. The second will be an axis that also contains a large distance (relative to the other distances in the distance matrix), and which is uncorrelated with the first axis. Selecting this axis based on these two criteria is subjective, because there is not a specific definition of “uncorrelated”. We need to come up with an objective approach so a computer can solve the problem. Here, I define a <em>score</em> metric that can be computed for each axis. This score is computed as the largest distance along the current axis divided by the absolute value of the Spearman correlation coefficient between the current axis and the first axis. In other words:</p>
<div class="math notranslate nohighlight" id="equation-polar-ordination-axis-score">
<span class="eqno">(4)<a class="headerlink" href="#equation-polar-ordination-axis-score" title="Permalink to this equation">¶</a></span>\[Score_{A} = \frac{max \, distance \, on \, A}{|Spearman(Axis \, 1, A)|}\]</div>
<p>In this formula, <span class="math notranslate nohighlight">\(A\)</span> represents the current axis, and <span class="math notranslate nohighlight">\(Axis \, 1\)</span> represents the first axis in the polar ordination.</p>
<p>This can be computed as follows. This function takes a polar ordination result as input and returns the identifiers of the first two axes as well as a summary of the maximum distance along each axis, the correlation of each axis with the first axis, and the score (as defined by <a class="reference internal" href="#equation-polar-ordination-axis-score">(4)</a>) for each axis.</p>
<div class="cell docutils container">
<div class="cell_input docutils container">
<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="k">def</span> <span class="nf">select_polar_ordination_axes</span><span class="p">(</span><span class="n">polar_ordination</span><span class="p">):</span>
    <span class="c1"># this function would be better if it defined more axes, </span>
    <span class="c1"># eg by always looking for correlation with preceding axis. </span>
    <span class="n">distance_sorted_ord_axes</span> <span class="o">=</span> <span class="n">polar_ordination</span><span class="o">.</span><span class="n">max</span><span class="p">()</span><span class="o">.</span><span class="n">sort_values</span><span class="p">(</span><span class="n">ascending</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span>
    <span class="n">first_axis_idx</span> <span class="o">=</span> <span class="n">distance_sorted_ord_axes</span><span class="o">.</span><span class="n">index</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span>
    <span class="n">corrs</span> <span class="o">=</span> <span class="n">polar_ordination</span><span class="o">.</span><span class="n">corrwith</span><span class="p">(</span><span class="n">polar_ordination</span><span class="p">[</span><span class="n">first_axis_idx</span><span class="p">],</span> <span class="n">method</span><span class="o">=</span><span class="s1">&#39;spearman&#39;</span><span class="p">)</span><span class="o">.</span><span class="n">abs</span><span class="p">()</span><span class="o">.</span><span class="n">sort_values</span><span class="p">(</span><span class="n">ascending</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span>
    <span class="n">scores</span> <span class="o">=</span> <span class="n">distance_sorted_ord_axes</span> <span class="o">/</span> <span class="n">corrs</span>
    <span class="n">result</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">concat</span><span class="p">([</span><span class="n">distance_sorted_ord_axes</span><span class="p">,</span> <span class="n">corrs</span><span class="p">,</span> <span class="n">scores</span><span class="p">],</span> <span class="n">axis</span><span class="o">=</span><span class="mi">1</span><span class="p">)</span>
    <span class="n">result</span><span class="o">.</span><span class="n">columns</span> <span class="o">=</span> <span class="p">[</span><span class="s1">&#39;maximum distance&#39;</span><span class="p">,</span> <span class="s1">&#39;corr with first axis&#39;</span><span class="p">,</span> <span class="s1">&#39;score&#39;</span><span class="p">]</span>
    <span class="n">result</span><span class="o">.</span><span class="n">sort_values</span><span class="p">(</span><span class="n">by</span><span class="o">=</span><span class="s1">&#39;score&#39;</span><span class="p">,</span> <span class="n">ascending</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span> <span class="n">inplace</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
    <span class="n">second_axis_idx</span> <span class="o">=</span> <span class="n">result</span><span class="o">.</span><span class="n">index</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span>
    <span class="k">return</span> <span class="n">first_axis_idx</span><span class="p">,</span> <span class="n">second_axis_idx</span><span class="p">,</span> <span class="n">result</span>
</pre></div>
</div>
</div>
</div>
<div class="cell docutils container">
<div class="cell_input docutils container">
<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">first_axis_idx</span><span class="p">,</span> <span class="n">second_axis_idx</span><span class="p">,</span> <span class="n">axis_summaries</span> <span class="o">=</span> <span class="n">select_polar_ordination_axes</span><span class="p">(</span><span class="n">sequence_polar_ordination</span><span class="p">)</span>
<span class="nb">print</span><span class="p">(</span><span class="s1">&#39;Axis 1 is defined by the distance from sample </span><span class="si">%s</span><span class="s1">&#39;</span> <span class="o">%</span> <span class="n">first_axis_idx</span><span class="p">)</span>
<span class="nb">print</span><span class="p">(</span><span class="s1">&#39;Axis 2 is defined by the distance from sample </span><span class="si">%s</span><span class="s1">&#39;</span> <span class="o">%</span> <span class="n">second_axis_idx</span><span class="p">)</span>
</pre></div>
</div>
</div>
<div class="cell_output docutils container">
<div class="output stream highlight-myst-ansi notranslate"><div class="highlight"><pre><span></span>Axis 1 is defined by the distance from sample 4121939 to 970921
Axis 2 is defined by the distance from sample 289441 to 862869
</pre></div>
</div>
</div>
</div>
<p>For each axis, we can view the maximum distance along that axis, its correlation coefficient with the first axis, and the score of the axis, by viewing the summary returned from <code class="docutils literal notranslate"><span class="pre">select_polar_ordination_axes</span></code>.</p>
<div class="cell docutils container">
<div class="cell_input docutils container">
<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">axis_summaries</span>
</pre></div>
</div>
</div>
<div class="cell_output docutils container">
<div class="output text_html"><div>
<style scoped>
    .dataframe tbody tr th:only-of-type {
        vertical-align: middle;
    }

    .dataframe tbody tr th {
        vertical-align: top;
    }

    .dataframe thead th {
        text-align: right;
    }
</style>
<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th></th>
      <th>maximum distance</th>
      <th>corr with first axis</th>
      <th>score</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>289441 to 862869</th>
      <td>0.231469</td>
      <td>0.000560</td>
      <td>413.171897</td>
    </tr>
    <tr>
      <th>3248323 to 4321402</th>
      <td>0.229508</td>
      <td>0.001821</td>
      <td>126.044136</td>
    </tr>
    <tr>
      <th>631620 to 4226754</th>
      <td>0.272340</td>
      <td>0.002241</td>
      <td>121.531915</td>
    </tr>
    <tr>
      <th>299830 to 241971</th>
      <td>0.230453</td>
      <td>0.002801</td>
      <td>82.265843</td>
    </tr>
    <tr>
      <th>264673 to 4226754</th>
      <td>0.300847</td>
      <td>0.007003</td>
      <td>42.961017</td>
    </tr>
    <tr>
      <th>...</th>
      <td>...</td>
      <td>...</td>
      <td>...</td>
    </tr>
    <tr>
      <th>4321915 to 981912</th>
      <td>0.092920</td>
      <td>0.562745</td>
      <td>0.165120</td>
    </tr>
    <tr>
      <th>1028571 to 4121939</th>
      <td>0.114260</td>
      <td>0.698319</td>
      <td>0.163621</td>
    </tr>
    <tr>
      <th>4437190 to 454228</th>
      <td>0.105727</td>
      <td>0.694678</td>
      <td>0.152196</td>
    </tr>
    <tr>
      <th>4307092 to 591785</th>
      <td>0.076598</td>
      <td>0.544258</td>
      <td>0.140739</td>
    </tr>
    <tr>
      <th>4404222 to 535359</th>
      <td>0.090750</td>
      <td>0.666480</td>
      <td>0.136163</td>
    </tr>
  </tbody>
</table>
<p>595 rows × 3 columns</p>
</div></div></div>
</div>
<p>Now we can expand our strip plot to a two-dimensional scatter plot where we plot the first two axes of our polar ordination. You should notice that there is some grouping or clustering of samples in this plot. This grouping should be more distinct than it was in the strip plot, since our samples can now separate along two axes.</p>
<div class="cell docutils container">
<div class="cell_input docutils container">
<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">_</span> <span class="o">=</span> <span class="n">sns</span><span class="o">.</span><span class="n">scatterplot</span><span class="p">(</span><span class="n">x</span><span class="o">=</span><span class="n">sequence_polar_ordination</span><span class="p">[</span><span class="n">first_axis_idx</span><span class="p">],</span> 
                    <span class="n">y</span><span class="o">=</span><span class="n">sequence_polar_ordination</span><span class="p">[</span><span class="n">second_axis_idx</span><span class="p">])</span>
</pre></div>
</div>
</div>
<div class="cell_output docutils container">
<img alt="_images/machine-learning_48_0.png" src="_images/machine-learning_48_0.png" />
</div>
</div>
<p>This plot illustrates that there is some structure in our dataset. If you look up distances between samples that are closer to each other in space in the scatterplot, those distances on average will be smaller than the distances between samples that are farther apart in the scatterplot. This structure in the dataset is what has been “learned” by the polar ordination algorithm. Notice that the sequence labels were not used at all in this analysis, but if you look up where each sample is found on the plot, and cross-reference that against the sample labels, you’ll discover that samples that cluster together are from the same species.</p>
<p>When labels are available for a collection of samples, labels can be used with an ordination plot to explore whether samples cluster by sample label. This is often achieved by coloring the points in a scatterplot by their label. We can do this for our sequence ordination as follows. Notice that the colors (the microbial species represented in our dataset) group together.</p>
<div class="cell docutils container">
<div class="cell_input docutils container">
<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="kn">import</span> <span class="nn">matplotlib.pyplot</span> <span class="k">as</span> <span class="nn">plt</span>

<span class="n">_</span> <span class="o">=</span> <span class="n">sns</span><span class="o">.</span><span class="n">scatterplot</span><span class="p">(</span><span class="n">x</span><span class="o">=</span><span class="n">sequence_polar_ordination</span><span class="p">[</span><span class="n">first_axis_idx</span><span class="p">],</span> 
                    <span class="n">y</span><span class="o">=</span><span class="n">sequence_polar_ordination</span><span class="p">[</span><span class="n">second_axis_idx</span><span class="p">],</span> 
                    <span class="n">hue</span><span class="o">=</span><span class="n">sequence_labels</span><span class="p">[</span><span class="s1">&#39;legend entry&#39;</span><span class="p">])</span>
<span class="n">_</span> <span class="o">=</span> <span class="n">plt</span><span class="o">.</span><span class="n">legend</span><span class="p">(</span><span class="n">bbox_to_anchor</span><span class="o">=</span><span class="p">(</span><span class="mf">1.05</span><span class="p">,</span> <span class="mi">1</span><span class="p">),</span> <span class="n">loc</span><span class="o">=</span><span class="mi">2</span><span class="p">,</span> <span class="n">borderaxespad</span><span class="o">=</span><span class="mf">0.</span><span class="p">)</span>
</pre></div>
</div>
</div>
<div class="cell_output docutils container">
<img alt="_images/machine-learning_50_0.png" src="_images/machine-learning_50_0.png" />
</div>
</div>
<div class="admonition note">
<p class="admonition-title">Note</p>
<p>Remember that while we used the sample labels to help us interpret the ordination plot, they were not used at all in computing the ordination. Not using the labels when computing the ordination is what defines this ordination approach as being unsupervised.</p>
</div>
</div>
<div class="section" id="interpreting-ordination-plots">
<h3>Interpreting ordination plots<a class="headerlink" href="#interpreting-ordination-plots" title="Permalink to this headline">¶</a></h3>
<p>Let’s now work through some of the ideas we explored with polar ordination to understand general features of unsupervised ordination plots.</p>
<div class="section" id="axis-order">
<h4>Axis order<a class="headerlink" href="#axis-order" title="Permalink to this headline">¶</a></h4>
<p>First, the order of the axes in an ordination is generally important. The first axis typically represents the largest differences in the data set, and the second axis generally represents the next largest differences that are uncorrelated with the first axis. If we didn’t select axes that represented the largest distances, we would be less likely to see patterns in our data.</p>
<p>For example, let’s focus on just a single axis again. We can sort our axis summary by maximum distance along an axis, and plot the axes that represent the largest and then the smallest distances.</p>
<div class="cell docutils container">
<div class="cell_input docutils container">
<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">distance_sorted_axis_summary</span> <span class="o">=</span> <span class="n">axis_summaries</span><span class="o">.</span><span class="n">sort_values</span><span class="p">(</span><span class="n">by</span><span class="o">=</span><span class="s1">&#39;maximum distance&#39;</span><span class="p">,</span> <span class="n">ascending</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
</pre></div>
</div>
</div>
</div>
<p>First, we’ll plot our samples along the axis representing the largest distance and we’ll separate the samples by species so we can see how their placement along the axis we define differs. You should notice here that the samples within each species roughly group together, and there may even be some separation of species on the axis.</p>
<div class="cell docutils container">
<div class="cell_input docutils container">
<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">largest_distance_axis_idx</span> <span class="o">=</span> <span class="n">distance_sorted_axis_summary</span><span class="o">.</span><span class="n">index</span><span class="p">[</span><span class="o">-</span><span class="mi">1</span><span class="p">]</span>
<span class="n">_</span> <span class="o">=</span> <span class="n">sns</span><span class="o">.</span><span class="n">stripplot</span><span class="p">(</span><span class="n">x</span><span class="o">=</span><span class="n">sequence_polar_ordination</span><span class="p">[</span><span class="n">largest_distance_axis_idx</span><span class="p">],</span> 
                  <span class="n">y</span><span class="o">=</span><span class="n">sequence_labels</span><span class="p">[</span><span class="s1">&#39;legend entry&#39;</span><span class="p">])</span>
</pre></div>
</div>
</div>
<div class="cell_output docutils container">
<img alt="_images/machine-learning_54_0.png" src="_images/machine-learning_54_0.png" />
</div>
</div>
<p>Now contrast this with what we’d see if we generated this same plot, but based on the smallest distance in the distance matrix rather than the largest. Clustering of samples by species should be much less apparent here.</p>
<div class="cell docutils container">
<div class="cell_input docutils container">
<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">smallest_distance_axis_idx</span> <span class="o">=</span> <span class="n">distance_sorted_axis_summary</span><span class="o">.</span><span class="n">index</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span>

<span class="n">_</span> <span class="o">=</span> <span class="n">sns</span><span class="o">.</span><span class="n">stripplot</span><span class="p">(</span><span class="n">x</span><span class="o">=</span><span class="n">sequence_polar_ordination</span><span class="p">[</span><span class="n">smallest_distance_axis_idx</span><span class="p">],</span> 
                  <span class="n">y</span><span class="o">=</span><span class="n">sequence_labels</span><span class="p">[</span><span class="s1">&#39;legend entry&#39;</span><span class="p">])</span>
</pre></div>
</div>
</div>
<div class="cell_output docutils container">
<img alt="_images/machine-learning_56_0.png" src="_images/machine-learning_56_0.png" />
</div>
</div>
<p>As you plot successive axes in an ordination, the axes represent smaller differences between samples. Typically you’ll want to focus on the first few ordination axes to the exclusion of later axes.</p>
</div>
<div class="section" id="uncorrelated-axes">
<h4>Uncorrelated axes<a class="headerlink" href="#uncorrelated-axes" title="Permalink to this headline">¶</a></h4>
<p>Next, ordination algorithms generally present uncorrelated axes as the most important axes. This is important because correlated axes, by definition, will present similar information.</p>
<p>Let’s look at another example here where we’ll plot a second polar ordination axis that is highly correlated with the first axis.</p>
<div class="cell docutils container">
<div class="cell_input docutils container">
<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="c1"># the axis most correlated with first_axis_idx will be first_axis_idx, so </span>
<span class="c1"># select the second axis</span>
<span class="n">correlated_axis_idx</span> <span class="o">=</span> <span class="n">axis_summaries</span><span class="o">.</span><span class="n">sort_values</span><span class="p">(</span><span class="n">by</span><span class="o">=</span><span class="s1">&#39;corr with first axis&#39;</span><span class="p">,</span> <span class="n">ascending</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span><span class="o">.</span><span class="n">index</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span>
</pre></div>
</div>
</div>
</div>
<div class="cell docutils container">
<div class="cell_input docutils container">
<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">_</span> <span class="o">=</span> <span class="n">sns</span><span class="o">.</span><span class="n">scatterplot</span><span class="p">(</span><span class="n">x</span><span class="o">=</span><span class="n">sequence_polar_ordination</span><span class="p">[</span><span class="n">first_axis_idx</span><span class="p">],</span> 
                    <span class="n">y</span><span class="o">=</span><span class="n">sequence_polar_ordination</span><span class="p">[</span><span class="n">correlated_axis_idx</span><span class="p">],</span> 
                    <span class="n">hue</span><span class="o">=</span><span class="n">sequence_labels</span><span class="p">[</span><span class="s1">&#39;legend entry&#39;</span><span class="p">])</span>
<span class="n">_</span> <span class="o">=</span> <span class="n">plt</span><span class="o">.</span><span class="n">legend</span><span class="p">(</span><span class="n">bbox_to_anchor</span><span class="o">=</span><span class="p">(</span><span class="mf">1.05</span><span class="p">,</span> <span class="mi">1</span><span class="p">),</span> <span class="n">loc</span><span class="o">=</span><span class="mi">2</span><span class="p">,</span> <span class="n">borderaxespad</span><span class="o">=</span><span class="mf">0.</span><span class="p">)</span>
</pre></div>
</div>
</div>
<div class="cell_output docutils container">
<img alt="_images/machine-learning_59_0.png" src="_images/machine-learning_59_0.png" />
</div>
</div>
<p>You can see that we don’t get much more information from the second axis than we do from the first. In fact, if we drew a line along the diagonal, the placement of points along that line would be almost identical to those that we observed in the strip plot of the first axis that we started with above. The correlated axis isn’t adding much new information to our understanding of the clustering of samples, so we might as well not have it.</p>
</div>
<div class="section" id="directionality-of-the-axes">
<h4>Directionality of the axes<a class="headerlink" href="#directionality-of-the-axes" title="Permalink to this headline">¶</a></h4>
<p>One thing that you may have noticed as you computed the polar ordination above is that our definition of end points was arbitrary. We first encountered this when computing <a class="reference internal" href="#equation-polar-ordination-axis">(3)</a>, when we defined one of a pair of samples to be placed at <span class="math notranslate nohighlight">\(0\)</span> along an axis, and the other of the pair of samples to be placed at <span class="math notranslate nohighlight">\(D\)</span> along the axis. Let’s again look at the placement of all samples along the axis representing the largest distance in our data set, computed as we did initially.</p>
<div class="cell docutils container">
<div class="cell_input docutils container">
<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">axis1_values_a</span> <span class="o">=</span> <span class="n">compute_axis</span><span class="p">(</span><span class="n">sequence_distance_matrix</span><span class="p">,</span> 
                              <span class="n">sample_id1</span><span class="p">,</span>
                              <span class="n">sample_id2</span><span class="p">)</span>

<span class="n">_</span> <span class="o">=</span> <span class="n">sns</span><span class="o">.</span><span class="n">stripplot</span><span class="p">(</span><span class="n">x</span><span class="o">=</span><span class="n">axis1_values_a</span><span class="p">,</span> 
                  <span class="n">y</span><span class="o">=</span><span class="n">sequence_labels</span><span class="p">[</span><span class="s1">&#39;legend entry&#39;</span><span class="p">])</span>
</pre></div>
</div>
</div>
<div class="cell_output docutils container">
<img alt="_images/machine-learning_61_0.png" src="_images/machine-learning_61_0.png" />
</div>
</div>
<p>Now, let’s reverse the order of the sample ids that we’re providing as input to this function. In practice, this means that the sample that was previously placed at <span class="math notranslate nohighlight">\(0\)</span> will now be placed at <span class="math notranslate nohighlight">\(D\)</span> along this axis, and the sample that was previously placed at <span class="math notranslate nohighlight">\(D\)</span> will now be placed at <span class="math notranslate nohighlight">\(0\)</span> along this axis.</p>
<div class="cell docutils container">
<div class="cell_input docutils container">
<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">axis1_values_b</span> <span class="o">=</span> <span class="n">compute_axis</span><span class="p">(</span><span class="n">sequence_distance_matrix</span><span class="p">,</span> 
                              <span class="n">sample_id2</span><span class="p">,</span>
                              <span class="n">sample_id1</span><span class="p">)</span>

<span class="n">_</span> <span class="o">=</span> <span class="n">sns</span><span class="o">.</span><span class="n">stripplot</span><span class="p">(</span><span class="n">x</span><span class="o">=</span><span class="n">axis1_values_b</span><span class="p">,</span> 
                  <span class="n">y</span><span class="o">=</span><span class="n">sequence_labels</span><span class="p">[</span><span class="s1">&#39;legend entry&#39;</span><span class="p">])</span>
</pre></div>
</div>
</div>
<div class="cell_output docutils container">
<img alt="_images/machine-learning_63_0.png" src="_images/machine-learning_63_0.png" />
</div>
</div>
<p>Notice that these two plots are mirror images of each other. Because they’re perfectly anti-correlated, they present identical information about the grouping of the samples. This will be true for any axis in our ordination, and for this reason the directionality of the axes in an ordination is not meaningful. You can always flip an axis and have the same result. You may also notice that if you run the same ordination multiple times, the directionality of the axes might change across runs. This is typically a result of how the algorithm is implemented, and it doesn’t impact your results at all.</p>
</div>
</div>
<div class="section" id="principal-coordinates-analysis-pcoa">
<h3>Principal Coordinates Analysis (PCoA)<a class="headerlink" href="#principal-coordinates-analysis-pcoa" title="Permalink to this headline">¶</a></h3>
<p>Finally, let’s conclude our introduction to unsupervised learning by plotting these same data using principal coordinates analysis, or PCoA. As mentioned earlier, the math for PCoA is more complex than for polar ordination, but it works better than polar ordination and should be preferred in practice to polar ordination. scikit-bio includes an implementation of PCoA that can be used in practice as illustrated here.</p>
<div class="cell docutils container">
<div class="cell_input docutils container">
<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="kn">import</span> <span class="nn">skbio.stats.ordination</span>
<span class="n">sequence_pcoa_ordination</span> <span class="o">=</span> <span class="n">skbio</span><span class="o">.</span><span class="n">stats</span><span class="o">.</span><span class="n">ordination</span><span class="o">.</span><span class="n">pcoa</span><span class="p">(</span><span class="n">sequence_distance_matrix</span><span class="p">)</span>
</pre></div>
</div>
</div>
<div class="cell_output docutils container">
<div class="output stderr highlight-myst-ansi notranslate"><div class="highlight"><pre><span></span>/usr/share/miniconda/envs/iab2/lib/python3.8/site-packages/skbio/stats/ordination/_principal_coordinate_analysis.py:143: RuntimeWarning: The result contains negative eigenvalues. Please compare their magnitude with the magnitude of some of the largest positive eigenvalues. If the negative ones are smaller, it&#39;s probably safe to ignore them, but if they are large in magnitude, the results won&#39;t be useful. See the Notes section for more details. The smallest eigenvalue is -0.00984962671874657 and the largest is 0.2673648640921239.
  warn(
</pre></div>
</div>
</div>
</div>
<p>Just as with polar ordination, we can view a 2D scatterplot of these data.</p>
<div class="cell docutils container">
<div class="cell_input docutils container">
<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">_</span> <span class="o">=</span> <span class="n">sns</span><span class="o">.</span><span class="n">scatterplot</span><span class="p">(</span><span class="n">x</span><span class="o">=</span><span class="n">sequence_pcoa_ordination</span><span class="o">.</span><span class="n">samples</span><span class="p">[</span><span class="s1">&#39;PC1&#39;</span><span class="p">],</span> 
                    <span class="n">y</span><span class="o">=</span><span class="n">sequence_pcoa_ordination</span><span class="o">.</span><span class="n">samples</span><span class="p">[</span><span class="s1">&#39;PC2&#39;</span><span class="p">])</span>
</pre></div>
</div>
</div>
<div class="cell_output docutils container">
<img alt="_images/machine-learning_67_0.png" src="_images/machine-learning_67_0.png" />
</div>
</div>
<p>That plot becomes more informative when we integrate sample labels, but like polar ordination, those sample labels were not used in the PCoA computation.</p>
<div class="cell docutils container">
<div class="cell_input docutils container">
<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">_</span> <span class="o">=</span> <span class="n">sns</span><span class="o">.</span><span class="n">scatterplot</span><span class="p">(</span><span class="n">x</span><span class="o">=</span><span class="n">sequence_pcoa_ordination</span><span class="o">.</span><span class="n">samples</span><span class="p">[</span><span class="s1">&#39;PC1&#39;</span><span class="p">],</span> 
                    <span class="n">y</span><span class="o">=</span><span class="n">sequence_pcoa_ordination</span><span class="o">.</span><span class="n">samples</span><span class="p">[</span><span class="s1">&#39;PC2&#39;</span><span class="p">],</span> 
                    <span class="n">hue</span><span class="o">=</span><span class="n">sequence_labels</span><span class="p">[</span><span class="s1">&#39;legend entry&#39;</span><span class="p">])</span>
<span class="n">_</span> <span class="o">=</span> <span class="n">plt</span><span class="o">.</span><span class="n">legend</span><span class="p">(</span><span class="n">bbox_to_anchor</span><span class="o">=</span><span class="p">(</span><span class="mf">1.05</span><span class="p">,</span> <span class="mi">1</span><span class="p">),</span> <span class="n">loc</span><span class="o">=</span><span class="mi">2</span><span class="p">,</span> <span class="n">borderaxespad</span><span class="o">=</span><span class="mf">0.</span><span class="p">)</span>
</pre></div>
</div>
</div>
<div class="cell_output docutils container">
<img alt="_images/machine-learning_69_0.png" src="_images/machine-learning_69_0.png" />
</div>
</div>
<p>PCoA (and other) ordination plots are often used in exploratory analysis, for example to see if sample similarity is associated with categories of metadata. This can be achieved by altering the metadata category that is used to define the coloring of samples in the plot. For example, in the following plot samples are colored by phylum. Comparing this with the plot above, where samples are colored by genus and species, illustrates that samples cluster more distinctly by phylum than by species.</p>
<div class="cell docutils container">
<div class="cell_input docutils container">
<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">_</span> <span class="o">=</span> <span class="n">sns</span><span class="o">.</span><span class="n">scatterplot</span><span class="p">(</span><span class="n">x</span><span class="o">=</span><span class="n">sequence_pcoa_ordination</span><span class="o">.</span><span class="n">samples</span><span class="p">[</span><span class="s1">&#39;PC1&#39;</span><span class="p">],</span> 
                    <span class="n">y</span><span class="o">=</span><span class="n">sequence_pcoa_ordination</span><span class="o">.</span><span class="n">samples</span><span class="p">[</span><span class="s1">&#39;PC2&#39;</span><span class="p">],</span> 
                    <span class="n">hue</span><span class="o">=</span><span class="n">sequence_labels</span><span class="p">[</span><span class="s1">&#39;phylum&#39;</span><span class="p">])</span>
<span class="n">_</span> <span class="o">=</span> <span class="n">plt</span><span class="o">.</span><span class="n">legend</span><span class="p">(</span><span class="n">bbox_to_anchor</span><span class="o">=</span><span class="p">(</span><span class="mf">1.05</span><span class="p">,</span> <span class="mi">1</span><span class="p">),</span> <span class="n">loc</span><span class="o">=</span><span class="mi">2</span><span class="p">,</span> <span class="n">borderaxespad</span><span class="o">=</span><span class="mf">0.</span><span class="p">)</span>
</pre></div>
</div>
</div>
<div class="cell_output docutils container">
<img alt="_images/machine-learning_71_0.png" src="_images/machine-learning_71_0.png" />
</div>
</div>
<p>While in the examples presented here we have looked at the first two ordination axes, ordination methods typically generate many axes. The exact number of axes differs by method, but it’s generally a function of the number of samples in the analysis. Ordination axes are typically ordered by the amount of variation in the data that is explained, so the first axis explains more variation than the second; the second axis explains more variation than the third; and so on. For that reason, visualizations of ordination plots typically focus on the first two or three axes. It’s possible to view additional axes, however, and these may illustrate different patterns in your data. Typically, however, you will want to focus on the first few axes, as the later axes may be misleading if they explain relatively small amounts of variation. The following plot illustrates PCoA axis 1 versus PCoA axis 3, where the previous plots illustrate PCoA axis 1 versus PCoA axis 2.</p>
<div class="cell docutils container">
<div class="cell_input docutils container">
<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">_</span> <span class="o">=</span> <span class="n">sns</span><span class="o">.</span><span class="n">scatterplot</span><span class="p">(</span><span class="n">x</span><span class="o">=</span><span class="n">sequence_pcoa_ordination</span><span class="o">.</span><span class="n">samples</span><span class="p">[</span><span class="s1">&#39;PC1&#39;</span><span class="p">],</span> 
                    <span class="n">y</span><span class="o">=</span><span class="n">sequence_pcoa_ordination</span><span class="o">.</span><span class="n">samples</span><span class="p">[</span><span class="s1">&#39;PC3&#39;</span><span class="p">],</span> 
                    <span class="n">hue</span><span class="o">=</span><span class="n">sequence_labels</span><span class="p">[</span><span class="s1">&#39;legend entry&#39;</span><span class="p">])</span>
<span class="n">_</span> <span class="o">=</span> <span class="n">plt</span><span class="o">.</span><span class="n">legend</span><span class="p">(</span><span class="n">bbox_to_anchor</span><span class="o">=</span><span class="p">(</span><span class="mf">1.05</span><span class="p">,</span> <span class="mi">1</span><span class="p">),</span> <span class="n">loc</span><span class="o">=</span><span class="mi">2</span><span class="p">,</span> <span class="n">borderaxespad</span><span class="o">=</span><span class="mf">0.</span><span class="p">)</span>
</pre></div>
</div>
</div>
<div class="cell_output docutils container">
<img alt="_images/machine-learning_73_0.png" src="_images/machine-learning_73_0.png" />
</div>
</div>
<div class="admonition warning">
<p class="admonition-title">Warning</p>
<p>Exploratory analysis of ordination data does not replace statistical comparisons of sample composition, and identifying a pattern in an ordination plot and subsequently testing whether it’s significant is not the same as having an a priori hypothesis about how your samples will group and then testing for that statistically. If you have many metadata categories, and especially if you have relatively few samples, it’s likely that spurious patterns may present themselves. <strong>You should consider observations that result from exploratory analysis of ordination plots as hypotheses that can subsequently be tested with different data.</strong> Remember: exploratory analysis is a tool for hypothesis generation, and hypothesis generation and hypothesis testing cannot be performed on the same data.</p>
</div>
<p>Emperor <span id="id2">[<a class="reference internal" href="#id30">VazquezBPGK13</a>]</span> is a widely used tool for visualizing ordination plots, and it makes interactive exploratory analysis of PCoA plots very straight-forward. For example, coloring samples by different metadata categories, comparing different ordination axes, and inverting ordination axes are all possible to do with a few mouse clicks. Emperor is a great place to get started when doing your own exploratory analysis of ordination plots.</p>
<div class="admonition-exercise admonition">
<p class="admonition-title">Exercise</p>
<p>How does the clustering of samples compare between polar ordination and PCoA?</p>
</div>
</div>
</div>
<div class="section" id="supervised-classification">
<h2>Supervised classification<a class="headerlink" href="#supervised-classification" title="Permalink to this headline">¶</a></h2>
<p>We’ll continue our exploration of machine learning approaches with <strong>supervised classification</strong>, and specifically with an algorithm called <strong>Naive Bayes</strong>.  We’ll implement Naive Bayes to gain an understanding of how it works. Like Polar Ordination, the math involved in Naive Bayes is relatively straight-forward, which is why I chose this algorithm to present here. There are many machine learning algorithms with more complex math, but Naive Bayes is widely used and powerful, so I think it’s a great place to get started.</p>
<p>We’ll explore supervised classification in the context of a now familiar topic: taxonomic classification of 16S rRNA sequences. We previously explored this problem in <a class="reference internal" href="database-searching.html"><span class="doc">Sequence homology searching</span></a>, so it’s worth spending a few minutes skimming that chapter if it’s not fresh in your mind.</p>
<p>Briefly, the problem that we are going to address here is as follows. We have a query sequence (<span class="math notranslate nohighlight">\(q_i\)</span>) which is not taxonomically annotated (meaning we don’t know the taxonomy of the organism whose genome it is found in), and a reference database (<span class="math notranslate nohighlight">\(R\)</span>) of taxonomically annotated sequences (<span class="math notranslate nohighlight">\(r_1, r_2, r_3, \dots, r_n\)</span>). We want to infer a taxonomic annotation for <span class="math notranslate nohighlight">\(q_i\)</span>. In <a class="reference internal" href="database-searching.html"><span class="doc">Sequence homology searching</span></a>, we solved this problem using pairwise sequence alignment. Here, we’ll build a Naive Bayes classifier from our sequence feature table and labels, and then apply that classifier to unlabeled data.</p>
<div class="section" id="defining-a-classification-task">
<h3>Defining a classification task<a class="headerlink" href="#defining-a-classification-task" title="Permalink to this headline">¶</a></h3>
<p>In a classification task, there are two or more pre-defined classes, and the goal is to assign observations to those classes. As humans, we perform these kinds of tasks every day. For example, if you’re browsing a bookstore you might classify titles as ones you want to read versus everything else (the ones you’re not interested in reading). You might group the apps that you have on your phone into folders by classifying them by category (e.g., “school”, “entertainment”, or “social media”).</p>
<p>When we’re working with large data sets, supervised classification algorithms can help us with classification tasks that will make us more efficient or help us understand our data. A classic example of this outside of bioinformatics is an email spam filter. For every email that is received, the spam filter must define it as spam or not spam so the message can be directed either to the user’s spam folder or the user’s inbox. The stakes can be high: a filter that is too permissive will cause the user’s inbox to get filled with junk mail, while a filter that is overly restrictive could cause relevant messages to be directed to the spam folder. In either case, the email user could miss important messages.</p>
<p>In the taxonomic assignment example that we’ll work through in this chapter, our classes will be microbial species. Our species classifier for 16S rRNA sequences will take an unannotated sequence as input and as output present the species that the sequence most likely originated from.</p>
</div>
<div class="section" id="training-data-test-data-and-cross-validation">
<h3>Training data, test data, and cross-validation<a class="headerlink" href="#training-data-test-data-and-cross-validation" title="Permalink to this headline">¶</a></h3>
<p>Supervised classification algorithms need to be provided with data that is used to develop a model for use in classification. Developing this model is referred to as training the classifier. The data that is used for this is a collection of observations with defined classes, and is referred to as the <strong>training data</strong>. These labeled examples are the “supervision” aspect of supervised learning. In the email spam filter example, this would be email messages that are annotated as either spam or not spam. In the species assignment example, this would be 16S sequences that are taxonomically annotated at the species level. It is typically important that the training data be balanced - in other words, that there are roughly the same number of examples of each class.</p>
<p>In addition to the training data, an independent collection of observations with defined classes is needed as <strong>test data</strong>. These observations are not used to train the classifier, but rather to evaluate how the classifier performs on previously unseen data. The goal of testing the classifier on these test data is to predict what performance will be on <strong>real world</strong> data. Real world data refers to data for which the class is currently unknown. In the spam filter example, real world data would be new emails that you are receiving. In the species assignment example, real world data could be sequences that you obtain from the environment using a DNA sequencing instrument. The test data shouldn’t be used for optimization of classifiers: in other words, you shouldn’t develop a classifier on training data, test it on test data, go back and make changes to the classifier, and then re-test on test data. This would risk <strong>over-fitting</strong> the classifier to a particular test data set and performance on that test data may no longer be predictive of how the classifier will perform when it is used on real world data.</p>
<p>Because training and test data sets can be very costly to develop (for example, they may require many hours of annotation by humans), we often use an approach called <strong>k-fold cross validation</strong> during classifier development and optimization (<a class="reference internal" href="#cross-validation-1"><span class="std std-numref">Fig. 7</span></a>). In k-fold cross-validation, the training data is split into <code class="docutils literal notranslate"><span class="pre">k</span></code> different data sets, where <code class="docutils literal notranslate"><span class="pre">k</span></code> is usually five or ten. In each of the data sets, <span class="math notranslate nohighlight">\(1/k\)</span> of the entries are used as test data and all of the other entries are used as training data. In <code class="docutils literal notranslate"><span class="pre">k</span></code> iterations, the classifier is developed on the training data and tested on the test data. The average performance of the classifier is then computed across the <code class="docutils literal notranslate"><span class="pre">k</span></code> iterations. k-fold cross validation therefore allows for developing and optimizing a classifier without using dedicated test data.</p>
<div class="figure align-default" id="cross-validation-1">
<img alt="_images/ml-cross-validation.png" src="_images/ml-cross-validation.png" />
<p class="caption"><span class="caption-number">Fig. 7 </span><span class="caption-text">An illustration of k-fold cross validation where a single data set is split into k independent training and test data sets. Each circle represents a labeled entry for use in training or testing, and colors indicate the class of each entry. In the case of a spam filter, for example, red circles might represent spam messages while green circles represent messages that are not spam.
Image source: <a class="reference external" href="https://commons.wikimedia.org/wiki/File:K-fold_cross_validation_EN.svg">Gufosowa</a>, <a class="reference external" href="https://creativecommons.org/licenses/by-sa/4.0">CC BY-SA 4.0</a>, via Wikimedia Commons.</span><a class="headerlink" href="#cross-validation-1" title="Permalink to this image">¶</a></p>
</div>
</div>
<div class="section" id="evaluating-a-binary-classifier">
<h3>Evaluating a binary classifier<a class="headerlink" href="#evaluating-a-binary-classifier" title="Permalink to this headline">¶</a></h3>
<p>As mentioned above, in a classification task there are two or more pre-defined classes. A binary classifier would be a specific type of classifier for which there are exactly two classes - for example, spam and not spam. We’ll start talking about how classifiers are evaluated by discussing binary classifiers because they’re the easiest to understand.</p>
<p>Imagine we’re building a classifier that attempts to predict whether an individual is healthy or has some specific disease (let’s call it <em>Disease X</em>). Perhaps the data that the classifier uses is based on a variety of medical data that has undergone a feature extraction process to generate features that can be used by a supervised classification algorithm. When a classifier is developed, you can think of it like a function that will take a collection of features for a sample and return a value of “healthy” or “diseased”.</p>
<p>The goal of our classifier is to serve as a diagnostic tool that identifies whether a patient has Disease X based on features of their medical data. A positive test result therefore indicates that the patient has Disease X while a negative test result indicates that they are healthy. When we apply our classifier to test data (i.e., where we know the correct class), there are a few possible outcomes.</p>
<ul class="simple">
<li><p>The classifier predicts a positive test result, and the sample is known to come from a patient with Disease X. This is a <strong>true positive (TP)</strong>.</p></li>
<li><p>The classifier predicts a positive test result, and the sample is known to come from a healthy patient. This is a <strong>false positive (FP)</strong>. FPs are also referred to as type 1 errors.</p></li>
<li><p>The classifier predicts a negative test result, and the sample is known to come from a patient with Disease X. This is a <strong>false negative (FN)</strong>. FNs are also referred to as type 2 errors.</p></li>
<li><p>The classifier predicts a negative test result, and the sample is known to come from a healthy patient. This is a <strong>true negative (TN)</strong>.</p></li>
</ul>
<p>A classifier would typically be evaluated by running it on many samples and tallying the count of TP, FP, FN, and TN results. These tallies are typically presented in a structure known as a <strong>confusion matrix</strong>. From the confusion matrix, there are many different values that can be computed which inform us of some aspect of classifier performance.</p>
<p>The simplest way to think about evaluating the performance of our classifier from a confusion matrix is to compute its <strong>accuracy</strong> as:</p>
<div class="math notranslate nohighlight" id="equation-accuracy">
<span class="eqno">(5)<a class="headerlink" href="#equation-accuracy" title="Permalink to this equation">¶</a></span>\[accuracy = \frac{TP + TN}{TP + FP + FN + TN}\]</div>
<p>In words, accuracy can be defined as the fraction of the total test cases that the classifier classified correctly. Accuracy gives us an idea of the classifier performance, but it hides some potentially relevant information from us. Specifically, it doesn’t tell us whether poor classifier performance is a result of primarily Type 1 errors, primarily Type 2 errors, or a balance of the two. A low accuracy classifier could, for example, frequently return false positives (Type 1 errors) but almost never return false negatives (Type 2 errors). Such a classifier could still be a clinically useful tool. Because false negatives are very infrequent but false positives are common, when the classifier indicates a negative test result it is very likely that the person doesn’t have the disease. If the classifier indicates a positive result, that could be an indicator that additional testing is needed. Of course we would rather our classifier achieve fewer false positives, but if this is a very cheap test and the additional tests are more expensive, it can be a useful first screening approach.</p>
<p>Two other metrics are more widely used for evaluating classifiers, and these are typically computed as a pair. These metrics are <strong>precision</strong> and <strong>recall</strong> and they are more informative than accuracy because they indicate whether a classifier might suffer more from false positives or false negatives.</p>
<p>Precision is the fraction of the positives reported by the classifier that are actually positives, or:</p>
<div class="math notranslate nohighlight" id="equation-precision">
<span class="eqno">(6)<a class="headerlink" href="#equation-precision" title="Permalink to this equation">¶</a></span>\[precision = \frac{TP}{TP + FP}\]</div>
<p>Recall is the fraction of the actual positives that are reported to be positive by the classifier, or:</p>
<div class="math notranslate nohighlight" id="equation-recall">
<span class="eqno">(7)<a class="headerlink" href="#equation-recall" title="Permalink to this equation">¶</a></span>\[recall = \frac{TP}{TP + FN}\]</div>
<p>Precision thus tells us how frequently our classifier yields false positives (the higher the precision, the fewer the false positives), while recall tells us how frequently our classifier yields false negatives (the higher the recall, the fewer the false negatives). We of course would always like both of these values to be high, but depending on the application of our classifier, we may prefer high precision over high recall, or we may prefer high recall over high precision.</p>
</div>
<div class="section" id="naive-bayes-classifiers">
<h3>Naive Bayes classifiers<a class="headerlink" href="#naive-bayes-classifiers" title="Permalink to this headline">¶</a></h3>
<p>In this chapter, instead of using sequence alignment to identify the most likely taxonomic origin of a sequence, we’ll train Naive Bayes classifiers to do this by building <a class="reference internal" href="database-searching.html#kmer"><span class="std std-ref">kmer</span></a>-based models of the 16S sequences of taxa in our training data. We’ll then run test sequences through those models to identify the most likely taxonomic origin of each test sequence. Since we know the taxonomic origin of our test sequences, we can evaluate the accuracy of our classifiers by seeing how often they return the known taxonomy assignment. If our training and testing approaches are well-designed, the performance on our tests will inform us of how accurate we can expect our classifier to be on data where the actual taxonomic origin is unknown.</p>
</div>
<div class="section" id="training-a-native-bayes-classifier">
<h3>Training a Naive Bayes classifier<a class="headerlink" href="#training-a-native-bayes-classifier" title="Permalink to this headline">¶</a></h3>
<p>Naive Bayes classifiers work by building a model of what different classes look like based on labeled training data. As with unsupervised learning tasks, the starting point is a feature table representing instances of the different classes. In addition to the feature table, since this is a supervised learning task, the sequence labels (i.e., the class labels) will also be used to train the classifier.</p>
<p>We’ll again use k-mers as our features, and continue with the value of <code class="docutils literal notranslate"><span class="pre">k</span></code> that we defined above. The first thing our Naive Bayes classifier will need is the set of all possible features, which in our case will be all possible words of length <code class="docutils literal notranslate"><span class="pre">k</span></code>. This will be dependent on the value of <code class="docutils literal notranslate"><span class="pre">k</span></code> and the characters in our alphabet (i.e., the characters that we should expect to find in the sequences in our reference database). This set is referred to as <code class="docutils literal notranslate"><span class="pre">W</span></code>, and can be computed as follows.</p>
<div class="cell docutils container">
<div class="cell_input docutils container">
<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">alphabet</span> <span class="o">=</span> <span class="n">skbio</span><span class="o">.</span><span class="n">DNA</span><span class="o">.</span><span class="n">nondegenerate_chars</span>

<span class="k">def</span> <span class="nf">compute_W</span><span class="p">(</span><span class="n">alphabet</span><span class="p">,</span> <span class="n">k</span><span class="p">):</span>
    <span class="k">return</span> <span class="nb">set</span><span class="p">(</span><span class="nb">map</span><span class="p">(</span><span class="s1">&#39;&#39;</span><span class="o">.</span><span class="n">join</span><span class="p">,</span> <span class="n">itertools</span><span class="o">.</span><span class="n">product</span><span class="p">(</span><span class="n">alphabet</span><span class="p">,</span> <span class="n">repeat</span><span class="o">=</span><span class="n">k</span><span class="p">)))</span>

<span class="n">W</span> <span class="o">=</span> <span class="n">compute_W</span><span class="p">(</span><span class="n">alphabet</span><span class="p">,</span> <span class="n">k</span><span class="p">)</span>

<span class="nb">print</span><span class="p">(</span><span class="s1">&#39;Alphabet contains the characters: </span><span class="si">%s</span><span class="s1">&#39;</span> <span class="o">%</span> <span class="s1">&#39;, &#39;</span><span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="n">alphabet</span><span class="p">))</span>
<span class="nb">print</span><span class="p">(</span><span class="s1">&#39;For an alphabet size of </span><span class="si">%d</span><span class="s1">, W contains </span><span class="si">%d</span><span class="s1"> length-</span><span class="si">%d</span><span class="s1"> kmers.&#39;</span> <span class="o">%</span> <span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="n">alphabet</span><span class="p">),</span> <span class="nb">len</span><span class="p">(</span><span class="n">W</span><span class="p">),</span> <span class="n">k</span><span class="p">))</span>
</pre></div>
</div>
</div>
<div class="cell_output docutils container">
<div class="output stream highlight-myst-ansi notranslate"><div class="highlight"><pre><span></span>Alphabet contains the characters: G, A, T, C
For an alphabet size of 4, W contains 256 length-4 kmers.
</pre></div>
</div>
</div>
</div>
<div class="admonition-exercise admonition">
<p class="admonition-title">Exercise</p>
<p>Given the DNA alphabet (A, C, G, and T), how many different kmers of length 3 are there (i.e., 3-mers)? How many different 5-mers are there? How many 5-mers are there if there are twenty characters in our alphabet (as would be the case if we were working with protein sequences instead of DNA sequences)?</p>
</div>
<p>To train our taxonomic classifier, we also need to know what level of taxonomic specificity we want to classify our sequences to. We should expect to achieve higher accuracy at less specific taxonomic levels such as phylum or class, but these are likely to be less informative biologically than more specific levels such as genus or species. Again, we’ll mirror the choice we made for our unsupervised learning task and attempt to build a species classifier here.</p>
<p>With this information, we can next compute our <strong>k-mer probability table</strong>. The goal for this table is that it accurately represents the probability of observing each k-mer in <code class="docutils literal notranslate"><span class="pre">W</span></code> in a sequence from a given species. Because we don’t know these probabilities, we estimate them based on the frequency that we observe each k-mer in the sequences in our training data. Our k-mer probability table is computed using the following values:</p>
<p><span class="math notranslate nohighlight">\(N\)</span> : The total number of sequences in the training data.</p>
<p><span class="math notranslate nohighlight">\(W\)</span>: The set of all possible kmers, given <span class="math notranslate nohighlight">\(k\)</span> and an alphabet.</p>
<p><span class="math notranslate nohighlight">\(w_i\)</span>: An individual k-mer in <span class="math notranslate nohighlight">\(W\)</span>.</p>
<p><span class="math notranslate nohighlight">\(n(w_i)\)</span> : The total number of training data sequences containing <span class="math notranslate nohighlight">\(w_i\)</span>.</p>
<p><span class="math notranslate nohighlight">\(P_i\)</span> : The probability of observing <span class="math notranslate nohighlight">\(w_i\)</span> in a relevant real world sequence from any species. Initially it might seem as though this would be computed as <span class="math notranslate nohighlight">\(n(w_i) / N\)</span>, but this neglects the possibility that a k-mer in a real world sequence might not be represented in any sequences in our training data (i.e., <span class="math notranslate nohighlight">\(n(w_i) = 0\)</span>). This would cause a problem when classifying that real world sequence - we’ll revisit this shortly. As a result, 0.5 is added to the numerator and 1 is added to the denominator so that this is computed as <span class="math notranslate nohighlight">\((n(w_i) + 0.5) / (N + 1)\)</span>. When we add to our counts in this way, we refer to the values that we’re adding as <strong>pseudocounts</strong>.</p>
<p><span class="math notranslate nohighlight">\(P(w_i | species)\)</span> : The probability of observing <span class="math notranslate nohighlight">\(w_i\)</span> in a relevant real world sequence from a given species. Again, it would seem that this would be computed as the proportion of sequences in the species containing <span class="math notranslate nohighlight">\(w_i\)</span>, but this would neglect that we’re likely to observe k-mers in real-world sequences that are not represented in our training data. A pseudocount is therefore added again to the numerator and denominator. This time the pseudocount in the numerator is scaled by how frequent <span class="math notranslate nohighlight">\(w_i\)</span> is in the reference database as a whole: specifically, it is <span class="math notranslate nohighlight">\(P_i\)</span>. The pseudocount in the denominator is still 1.</p>
<p>Our “kmer probability table” is <span class="math notranslate nohighlight">\(P(w_i | species)\)</span> computed for all kmers in W and all species represented in our reference database. Let’s compute that, and then look at the first 25 rows.</p>
<div class="cell docutils container">
<div class="cell_input docutils container">
<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="k">def</span> <span class="nf">compute_kmer_probability_table</span><span class="p">(</span><span class="n">feature_table</span><span class="p">,</span> <span class="n">sequence_labels</span><span class="p">,</span> <span class="n">W</span><span class="p">):</span>
    <span class="n">N</span> <span class="o">=</span> <span class="n">feature_table</span><span class="o">.</span><span class="n">shape</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> <span class="c1"># number of training sequences</span>

    <span class="c1"># number of sequences containing kmer wi</span>
    <span class="n">n_wi</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">Series</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="n">index</span><span class="o">=</span><span class="n">W</span><span class="p">)</span>
    <span class="n">n_wi</span> <span class="o">=</span> <span class="n">n_wi</span> <span class="o">+</span> <span class="n">feature_table</span><span class="o">.</span><span class="n">astype</span><span class="p">(</span><span class="nb">bool</span><span class="p">)</span><span class="o">.</span><span class="n">sum</span><span class="p">(</span><span class="n">axis</span><span class="o">=</span><span class="mi">0</span><span class="p">)</span>
    <span class="n">n_wi</span> <span class="o">=</span> <span class="n">n_wi</span><span class="o">.</span><span class="n">fillna</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span>
    <span class="n">n_wi</span><span class="o">.</span><span class="n">name</span> <span class="o">=</span> <span class="s1">&#39;n(w_i)&#39;</span>

    <span class="c1"># probabilities of observing each kmer</span>
    <span class="n">Pi</span> <span class="o">=</span> <span class="p">(</span><span class="n">n_wi</span> <span class="o">+</span> <span class="mf">0.5</span><span class="p">)</span> <span class="o">/</span> <span class="p">(</span><span class="n">N</span> <span class="o">+</span> <span class="mi">1</span><span class="p">)</span>
    <span class="n">Pi</span><span class="o">.</span><span class="n">name</span> <span class="o">=</span> <span class="s1">&#39;P_i&#39;</span>
    
    <span class="c1"># number of times each taxon appears in training set</span>
    <span class="n">taxon_counts</span> <span class="o">=</span> <span class="n">collections</span><span class="o">.</span><span class="n">Counter</span><span class="p">(</span><span class="n">sequence_labels</span><span class="p">)</span>
    <span class="n">taxon_table</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="n">index</span><span class="o">=</span><span class="n">taxon_counts</span><span class="o">.</span><span class="n">keys</span><span class="p">(),</span> <span class="n">columns</span><span class="o">=</span><span class="n">W</span><span class="p">)</span>
    <span class="n">taxon_table</span> <span class="o">=</span> <span class="n">taxon_table</span> <span class="o">+</span> <span class="n">feature_table</span><span class="o">.</span><span class="n">astype</span><span class="p">(</span><span class="nb">bool</span><span class="p">)</span><span class="o">.</span><span class="n">groupby</span><span class="p">(</span><span class="n">by</span><span class="o">=</span><span class="n">sequence_labels</span><span class="p">,</span> <span class="n">axis</span><span class="o">=</span><span class="mi">0</span><span class="p">)</span><span class="o">.</span><span class="n">sum</span><span class="p">()</span>
    <span class="n">taxon_table</span> <span class="o">=</span> <span class="n">taxon_table</span><span class="o">.</span><span class="n">fillna</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span>
    
    <span class="c1"># probabilities of observing each kmer in each taxon</span>
    <span class="n">p_wi_t</span> <span class="o">=</span> <span class="p">[]</span>
    <span class="k">for</span> <span class="n">taxon</span><span class="p">,</span> <span class="n">count</span> <span class="ow">in</span> <span class="n">taxon_counts</span><span class="o">.</span><span class="n">items</span><span class="p">():</span>
        <span class="n">p_wi_t</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">pd</span><span class="o">.</span><span class="n">Series</span><span class="p">((</span><span class="n">taxon_table</span><span class="o">.</span><span class="n">loc</span><span class="p">[</span><span class="n">taxon</span><span class="p">]</span> <span class="o">+</span> <span class="n">Pi</span><span class="p">)</span> <span class="o">/</span> <span class="p">(</span><span class="n">count</span> <span class="o">+</span> <span class="mi">1</span><span class="p">),</span> <span class="n">name</span><span class="o">=</span><span class="n">taxon</span><span class="p">))</span>

    <span class="k">return</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">(</span><span class="n">p_wi_t</span><span class="p">)</span><span class="o">.</span><span class="n">T</span>
</pre></div>
</div>
</div>
</div>
<div class="cell docutils container">
<div class="cell_input docutils container">
<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">kmer_probability_table</span> <span class="o">=</span> <span class="n">compute_kmer_probability_table</span><span class="p">(</span><span class="n">sequence_feature_table</span><span class="p">,</span> <span class="n">sequence_labels</span><span class="p">[</span><span class="s1">&#39;legend entry&#39;</span><span class="p">],</span> <span class="n">W</span><span class="p">)</span>
<span class="n">kmer_probability_table</span><span class="p">[:</span><span class="mi">25</span><span class="p">]</span>
</pre></div>
</div>
</div>
<div class="cell_output docutils container">
<div class="output text_html"><div>
<style scoped>
    .dataframe tbody tr th:only-of-type {
        vertical-align: middle;
    }

    .dataframe tbody tr th {
        vertical-align: top;
    }

    .dataframe thead th {
        text-align: right;
    }
</style>
<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th></th>
      <th>Flavobacterium succinicans (Bacteroidetes)</th>
      <th>Propionibacterium acnes (Actinobacteria)</th>
      <th>Prevotella melaninogenica (Bacteroidetes)</th>
      <th>Prevotella stercorea (Bacteroidetes)</th>
      <th>Pseudomonas viridiflava (Proteobacteria)</th>
      <th>Pseudomonas veronii (Proteobacteria)</th>
      <th>Prevotella copri (Bacteroidetes)</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>AAAA</th>
      <td>0.368056</td>
      <td>0.201389</td>
      <td>0.534722</td>
      <td>0.201389</td>
      <td>0.034722</td>
      <td>0.034722</td>
      <td>0.034722</td>
    </tr>
    <tr>
      <th>AAAC</th>
      <td>0.974537</td>
      <td>0.974537</td>
      <td>0.974537</td>
      <td>0.974537</td>
      <td>0.307870</td>
      <td>0.807870</td>
      <td>0.974537</td>
    </tr>
    <tr>
      <th>AAAG</th>
      <td>0.997685</td>
      <td>0.997685</td>
      <td>0.997685</td>
      <td>0.997685</td>
      <td>0.997685</td>
      <td>0.997685</td>
      <td>0.997685</td>
    </tr>
    <tr>
      <th>AAAT</th>
      <td>0.891204</td>
      <td>0.224537</td>
      <td>0.057870</td>
      <td>0.724537</td>
      <td>0.057870</td>
      <td>0.224537</td>
      <td>0.224537</td>
    </tr>
    <tr>
      <th>AACA</th>
      <td>0.974537</td>
      <td>0.974537</td>
      <td>0.641204</td>
      <td>0.974537</td>
      <td>0.641204</td>
      <td>0.807870</td>
      <td>0.974537</td>
    </tr>
    <tr>
      <th>AACC</th>
      <td>0.784722</td>
      <td>0.951389</td>
      <td>0.951389</td>
      <td>0.951389</td>
      <td>0.118056</td>
      <td>0.284722</td>
      <td>0.951389</td>
    </tr>
    <tr>
      <th>AACG</th>
      <td>0.997685</td>
      <td>0.997685</td>
      <td>0.997685</td>
      <td>0.997685</td>
      <td>0.997685</td>
      <td>0.997685</td>
      <td>0.997685</td>
    </tr>
    <tr>
      <th>AACT</th>
      <td>0.997685</td>
      <td>0.997685</td>
      <td>0.997685</td>
      <td>0.997685</td>
      <td>0.997685</td>
      <td>0.997685</td>
      <td>0.997685</td>
    </tr>
    <tr>
      <th>AAGA</th>
      <td>0.965278</td>
      <td>0.965278</td>
      <td>0.965278</td>
      <td>0.131944</td>
      <td>0.965278</td>
      <td>0.798611</td>
      <td>0.798611</td>
    </tr>
    <tr>
      <th>AAGC</th>
      <td>0.789352</td>
      <td>0.956019</td>
      <td>0.122685</td>
      <td>0.456019</td>
      <td>0.956019</td>
      <td>0.956019</td>
      <td>0.956019</td>
    </tr>
    <tr>
      <th>AAGG</th>
      <td>0.997685</td>
      <td>0.997685</td>
      <td>0.997685</td>
      <td>0.997685</td>
      <td>0.997685</td>
      <td>0.997685</td>
      <td>0.997685</td>
    </tr>
    <tr>
      <th>AAGT</th>
      <td>0.993056</td>
      <td>0.993056</td>
      <td>0.993056</td>
      <td>0.826389</td>
      <td>0.993056</td>
      <td>0.993056</td>
      <td>0.993056</td>
    </tr>
    <tr>
      <th>AATA</th>
      <td>0.997685</td>
      <td>0.997685</td>
      <td>0.997685</td>
      <td>0.997685</td>
      <td>0.997685</td>
      <td>0.997685</td>
      <td>0.997685</td>
    </tr>
    <tr>
      <th>AATC</th>
      <td>0.937500</td>
      <td>0.270833</td>
      <td>0.770833</td>
      <td>0.270833</td>
      <td>0.937500</td>
      <td>0.937500</td>
      <td>0.270833</td>
    </tr>
    <tr>
      <th>AATG</th>
      <td>0.826389</td>
      <td>0.993056</td>
      <td>0.993056</td>
      <td>0.993056</td>
      <td>0.993056</td>
      <td>0.993056</td>
      <td>0.993056</td>
    </tr>
    <tr>
      <th>AATT</th>
      <td>0.932870</td>
      <td>0.266204</td>
      <td>0.766204</td>
      <td>0.932870</td>
      <td>0.432870</td>
      <td>0.599537</td>
      <td>0.266204</td>
    </tr>
    <tr>
      <th>ACAA</th>
      <td>0.946759</td>
      <td>0.946759</td>
      <td>0.446759</td>
      <td>0.280093</td>
      <td>0.946759</td>
      <td>0.946759</td>
      <td>0.280093</td>
    </tr>
    <tr>
      <th>ACAC</th>
      <td>0.997685</td>
      <td>0.997685</td>
      <td>0.997685</td>
      <td>0.997685</td>
      <td>0.997685</td>
      <td>0.997685</td>
      <td>0.997685</td>
    </tr>
    <tr>
      <th>ACAG</th>
      <td>0.974537</td>
      <td>0.474537</td>
      <td>0.641204</td>
      <td>0.974537</td>
      <td>0.974537</td>
      <td>0.974537</td>
      <td>0.974537</td>
    </tr>
    <tr>
      <th>ACAT</th>
      <td>0.974537</td>
      <td>0.974537</td>
      <td>0.974537</td>
      <td>0.974537</td>
      <td>0.641204</td>
      <td>0.474537</td>
      <td>0.974537</td>
    </tr>
    <tr>
      <th>ACCA</th>
      <td>0.993056</td>
      <td>0.993056</td>
      <td>0.993056</td>
      <td>0.993056</td>
      <td>0.826389</td>
      <td>0.993056</td>
      <td>0.993056</td>
    </tr>
    <tr>
      <th>ACCC</th>
      <td>0.567130</td>
      <td>0.067130</td>
      <td>0.400463</td>
      <td>0.900463</td>
      <td>0.067130</td>
      <td>0.067130</td>
      <td>0.733796</td>
    </tr>
    <tr>
      <th>ACCG</th>
      <td>0.997685</td>
      <td>0.997685</td>
      <td>0.997685</td>
      <td>0.997685</td>
      <td>0.997685</td>
      <td>0.997685</td>
      <td>0.997685</td>
    </tr>
    <tr>
      <th>ACCT</th>
      <td>0.997685</td>
      <td>0.997685</td>
      <td>0.997685</td>
      <td>0.997685</td>
      <td>0.997685</td>
      <td>0.997685</td>
      <td>0.997685</td>
    </tr>
    <tr>
      <th>ACGA</th>
      <td>0.993056</td>
      <td>0.993056</td>
      <td>0.993056</td>
      <td>0.993056</td>
      <td>0.826389</td>
      <td>0.993056</td>
      <td>0.993056</td>
    </tr>
  </tbody>
</table>
</div></div></div>
</div>
<p>This k-mer probability table represents our k-mer-based models of the species in our training data. We can use this table to compute probabilities of real world sequences belonging to each of the species represented in this table.</p>
</div>
<div class="section" id="applying-a-naive-bayes-classifier">
<h3>Applying a Naive Bayes classifier<a class="headerlink" href="#applying-a-naive-bayes-classifier" title="Permalink to this headline">¶</a></h3>
<p>With our k-mer probability table we are now ready to classify unknown sequences. We’ll begin by selecting sequences that will serve as our test data. We’ll pull sequences for our species of interest at random from our reference database, excluding sequences that were used in our training data.</p>
<div class="cell tag_hide-cell docutils container">
<div class="cell_input docutils container">
<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">test_seq_data</span> <span class="o">=</span> <span class="n">load_annotated_sequences</span><span class="p">(</span><span class="n">taxa_of_interest</span><span class="p">,</span> <span class="n">class_size</span><span class="o">=</span><span class="n">sequences_per_taxon</span><span class="p">,</span> 
                                          <span class="n">verbose</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span> <span class="n">ids_to_exclude</span><span class="o">=</span><span class="n">sequence_labels</span><span class="o">.</span><span class="n">index</span><span class="p">)</span>
<span class="n">test_labels</span> <span class="o">=</span> <span class="n">feature_labels_from_sequence_records</span><span class="p">(</span><span class="n">test_seq_data</span><span class="p">)</span>
</pre></div>
</div>
</div>
</div>
<p>We can now review a few of the sequences that were selected for our test data set.</p>
<div class="cell docutils container">
<div class="cell_input docutils container">
<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="k">for</span> <span class="n">sr</span> <span class="ow">in</span> <span class="n">random</span><span class="o">.</span><span class="n">sample</span><span class="p">(</span><span class="nb">list</span><span class="p">(</span><span class="n">test_seq_data</span><span class="o">.</span><span class="n">values</span><span class="p">()),</span> <span class="mi">3</span><span class="p">):</span>
    <span class="nb">print</span><span class="p">(</span><span class="n">sr</span><span class="o">.</span><span class="n">identifier</span><span class="p">)</span>
    <span class="nb">print</span><span class="p">(</span><span class="n">sr</span><span class="o">.</span><span class="n">taxonomy</span><span class="p">)</span>
    <span class="nb">print</span><span class="p">(</span><span class="n">sr</span><span class="o">.</span><span class="n">sequence</span><span class="p">)</span>
    <span class="nb">print</span><span class="p">(</span><span class="s1">&#39;🦠&#39;</span><span class="p">)</span>
</pre></div>
</div>
</div>
<div class="cell_output docutils container">
<div class="output stream highlight-myst-ansi notranslate"><div class="highlight"><pre><span></span>711843
k__Bacteria;p__Bacteroidetes;c__Flavobacteriia;o__Flavobacteriales;f__Flavobacteriaceae;g__Flavobacterium;s__succinicans
AGAGTTTGATCCTGGCTCAGGATGAACGCTAGCGGCAGGCTTAACACATGCAAGTCGAGGGGTATAGTTCTTCGGAACTAGAGACCGGCGCACGGGTGCGTAACGCGTATGCAATCTACCTTTTACAGAGGGATAGCCCAGAGAAATTTGGATTAATACCTCATAGTATATAGCCCTGGCATCAGGATTATATTAAAGTCACAACGGTAAAAGATGAGCATGCGTCCCATTAGCTAGTTGGTAAGGTAACGGCTTACCAAGGCTACGATGGGTAGGGGTCCTGAGAGGGAGATCCCCCACACTGGTACTGAGACACGGACCAGACTCCTACGGGAGGCAGCAGTGAGGAATATTGGACAATGGGCGCAAGCCTGATCCAGCCATGCCGCGTGCAGGATGACGGTCCTATGGATTGTAAACTGCTTTTATACGAGAAGAAACATCCCGACGTGTCGGGACTTGACGGTATCGTAAGAATAAGGATCGGCTAACTCCGTGCC
🦠
521268
k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o__Bacteroidales;f__Prevotellaceae;g__Prevotella;s__melaninogenica
TAGAGTTTGATCCTGGCTCAGGATGAACGCTAGCTACAGGCTTAACACATGCAAGTCGAGGGGAAACGGGATTGATGGCTTGCACTCTTTGGACGTCCACCGGCGCAGGGGAGGGTAACGCATATCCACCCTTCCCATTACTGGGGGATAACCTGCAGAAAGGCACACTATCACGACGTAATCTTCTATGATCGGGTCATATTTGAAGTAAAGATTTATCGGTAATGGATGGGGATGCGTCTGATTATGTTGTTGGCGGGGTAACGGCCCACCAAGGCGACGATCAGTAGGGGTTCTGAGAGGAAGGTCCCCCACATTGGAACTGAGACACGGTCCAAACTCCTACGGGAGGCAGCAGTGAGGAATATTGGTCAATGGACGGAAGTCTGAACCAGCCAAGTGGCGTGCAGGATGACGGCCCTATGGGTTGTAAACTGCTTTTGTATGGGGATAAAGTTAGGGACGTGTCCCTATTTGCAGGTACCATACGAATAAGGACC
🦠
4427609
k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o__Bacteroidales;f__Prevotellaceae;g__Prevotella;s__melaninogenica
CAGGCCTCACACACGCAAGTCGAGGGGAAAACGGCGTTGAGCGCTCGCACTTCTTCGGAACGTTCGACCGGCGCACGGGCGAGTAACGCGTCATCCAACTCTTTCCCATAACTAAGGGAATAACCCGCCGAAAGGCAGAACTAATACCTTATGTAAATTTCTTCGGATCGACATCAGAAAGAGAAATCGAAAGAATTTATCGGTTATGGATCGGGGATGCCGTCTGATTAGCTTCGTTGGCGGGGTAACGGCCCACCAAGGCGACGATCAGTAGGGGTTCTGAGAGGAAGGTCCCCCACATTGGAACTGAGACACGGTCCAAACTCCTACGGGAGGCAGCAGTGAGGAATATTGGTCAATGGGCGAGAGCCTGAACCAGCCAAGTAGCGTGCAGGAAGACGGCCCTATGGGTTGTAAACTGCTTTTGTATGGGGATAAAGTCAATCACGTGTGATTGTTTGCAGGTACCATACGAATAAGGACCGGCTAATTCCGTGCCA
🦠
</pre></div>
</div>
</div>
</div>
<p>For a sequence that is provided as input to our Naive Bayes classifier, which is generally referred to as a query sequence, taxonomy will be assigned as follows. First, the set of all k-mers will be extracted from the query sequence. This set of k-mers is referred to as <span class="math notranslate nohighlight">\(V\)</span>. Then, for each species represented in the k-mer probability table, the probability of observing the sequence will be computed assuming that the sequence is a representative of that species. This is referred to as the probability of the query sequence given the species, or <span class="math notranslate nohighlight">\(P(query | species)\)</span>. This is computed as the product of the probabilities, for the given species, of all of the k-mers in <span class="math notranslate nohighlight">\(V\)</span>. It should be clear based on this formula why it was necessary to add pseudocounts when computing our k-mer probability table. Without pseudocounts, k-mer probabilities of zero would result in a zero probability of the sequence being derived from that taxon at this step.</p>
<p>After computing <span class="math notranslate nohighlight">\(P(query | species)\)</span> for each species, the taxonomy assignment returned is simply the one achieving the highest probability. Here we’ll classify a sequence and look at the resulting taxonomy assignment.</p>
<div class="cell docutils container">
<div class="cell_input docutils container">
<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="c1"># This function classifies a sequence that has already been split into a list</span>
<span class="c1"># of kmers.</span>
<span class="k">def</span> <span class="nf">classify_V</span><span class="p">(</span><span class="n">V</span><span class="p">,</span> <span class="n">kmer_probability_table</span><span class="p">):</span>
    <span class="n">P_S_t</span> <span class="o">=</span> <span class="p">[]</span> <span class="c1"># probability of the sequence given the taxon</span>
    <span class="k">for</span> <span class="n">taxon</span> <span class="ow">in</span> <span class="n">kmer_probability_table</span><span class="p">:</span>
        <span class="n">kmer_probabilities</span> <span class="o">=</span> <span class="n">kmer_probability_table</span><span class="p">[</span><span class="n">taxon</span><span class="p">]</span>
        <span class="c1"># TODO: Confirm this step</span>
        <span class="c1"># Because we&#39;re multiplying many probabilities, the product can underflow</span>
        <span class="c1"># (i.e., become smaller than the smallest positive number the computer can</span>
        <span class="c1"># represent, and so be rounded to zero). We therefore take the log of each</span>
        <span class="c1"># observed probability and sum those logs.</span>
        <span class="n">query_log_probability</span> <span class="o">=</span> <span class="nb">sum</span><span class="p">(</span><span class="nb">list</span><span class="p">(</span><span class="nb">map</span><span class="p">(</span><span class="n">np</span><span class="o">.</span><span class="n">log</span><span class="p">,</span> <span class="n">kmer_probabilities</span><span class="p">[</span><span class="n">V</span><span class="p">])))</span>
        <span class="n">P_S_t</span><span class="o">.</span><span class="n">append</span><span class="p">((</span><span class="n">query_log_probability</span><span class="p">,</span> <span class="n">taxon</span><span class="p">))</span>
    <span class="k">return</span> <span class="nb">max</span><span class="p">(</span><span class="n">P_S_t</span><span class="p">)[</span><span class="mi">1</span><span class="p">],</span> <span class="n">V</span>

<span class="c1"># This function is more convenient to use. It classifies a sequence </span>
<span class="c1"># directly, first by computing V, and then by calling classify_V.</span>
<span class="k">def</span> <span class="nf">classify_sequence</span><span class="p">(</span><span class="n">query_sequence</span><span class="p">,</span> <span class="n">kmer_probability_table</span><span class="p">,</span> <span class="n">k</span><span class="p">):</span>
    <span class="n">V</span> <span class="o">=</span> <span class="nb">list</span><span class="p">(</span><span class="nb">map</span><span class="p">(</span><span class="nb">str</span><span class="p">,</span> <span class="n">query_sequence</span><span class="o">.</span><span class="n">iter_kmers</span><span class="p">(</span><span class="n">k</span><span class="o">=</span><span class="n">k</span><span class="p">)))</span>
    <span class="k">return</span> <span class="n">classify_V</span><span class="p">(</span><span class="n">V</span><span class="p">,</span> <span class="n">kmer_probability_table</span><span class="p">)</span>
</pre></div>
</div>
</div>
</div>
<p>We can now apply the Naive Bayes classifier to sequences in our test data set. Here I select a single test sequence, and then provide that as input to the <code class="docutils literal notranslate"><span class="pre">classify_sequence</span></code> function.</p>
<div class="cell docutils container">
<div class="cell_input docutils container">
<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="k">def</span> <span class="nf">random_sequence_record_choice</span><span class="p">(</span><span class="n">sequence_records</span><span class="p">):</span>
    <span class="k">return</span> <span class="n">random</span><span class="o">.</span><span class="n">sample</span><span class="p">(</span><span class="nb">list</span><span class="p">(</span><span class="n">sequence_records</span><span class="o">.</span><span class="n">values</span><span class="p">()),</span> <span class="mi">1</span><span class="p">)[</span><span class="mi">0</span><span class="p">]</span>

<span class="n">query_sr</span> <span class="o">=</span> <span class="n">random_sequence_record_choice</span><span class="p">(</span><span class="n">test_seq_data</span><span class="p">)</span>
<span class="n">taxon_assignment</span><span class="p">,</span> <span class="n">V</span> <span class="o">=</span> <span class="n">classify_sequence</span><span class="p">(</span><span class="n">query_sr</span><span class="o">.</span><span class="n">sequence</span><span class="p">,</span> <span class="n">kmer_probability_table</span><span class="p">,</span> <span class="n">k</span><span class="p">)</span>
<span class="nb">print</span><span class="p">(</span><span class="s2">&quot;Sequence </span><span class="si">%s</span><span class="s2"> is predicted to be from the species </span><span class="si">%s</span><span class="s2">.&quot;</span> <span class="o">%</span> <span class="p">(</span><span class="n">query_sr</span><span class="o">.</span><span class="n">identifier</span><span class="p">,</span> <span class="n">taxon_assignment</span><span class="p">))</span>
</pre></div>
</div>
</div>
<div class="cell_output docutils container">
<div class="output stream highlight-myst-ansi notranslate"><div class="highlight"><pre><span></span>Sequence 4465561 is predicted to be from the species Prevotella melaninogenica (Bacteroidetes).
</pre></div>
</div>
</div>
</div>
<p>Because this query sequence is from our test data, we know the actual taxonomy assignment and can look it up to determine if our classifier was correct. This is in contrast to applying our classifier to real world query sequences, where we typically won’t know what the correct assignment is. We can use our test data to estimate how well we expect our classifier to perform on real world data.</p>
<div class="cell docutils container">
<div class="cell_input docutils container">
<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="nb">print</span><span class="p">(</span><span class="s2">&quot;Sequence </span><span class="si">%s</span><span class="s2"> is known to be from the species </span><span class="si">%s</span><span class="s2">.&quot;</span> <span class="o">%</span> <span class="p">(</span><span class="n">query_sr</span><span class="o">.</span><span class="n">identifier</span><span class="p">,</span> <span class="n">test_labels</span><span class="p">[</span><span class="s1">&#39;legend entry&#39;</span><span class="p">][</span><span class="n">query_sr</span><span class="o">.</span><span class="n">identifier</span><span class="p">]))</span>
</pre></div>
</div>
</div>
<div class="cell_output docutils container">
<div class="output stream highlight-myst-ansi notranslate"><div class="highlight"><pre><span></span>Sequence 4465561 is known to be from the species Prevotella melaninogenica (Bacteroidetes).
</pre></div>
</div>
</div>
</div>
<div class="admonition-exercise admonition">
<p class="admonition-title">Exercise</p>
<p>Was this sequence classified as the correct species? If not, was it at least classified to the correct genus? What about the correct phylum?</p>
</div>
<div class="admonition-exercise admonition">
<p class="admonition-title">Exercise</p>
<p>Try classifying a few other query sequences and determining if the returned species assignment was correct. You can do this by running the previous two code cells over again in order. Does this classifier seem to be working well?</p>
</div>
</div>
<div class="section" id="evaluating-our-confidence-in-the-results-of-the-naive-bayes-classifier">
<h3>Evaluating our confidence in the results of the Naive Bayes classifier<a class="headerlink" href="#evaluating-our-confidence-in-the-results-of-the-naive-bayes-classifier" title="Permalink to this headline">¶</a></h3>
<p>Because the training and test sequences that we’re working with were randomly selected from the full reference database, each time you run this notebook you should observe different results. If you run the above steps multiple times you’ll get the wrong taxonomy assignment at least some of the time, most likely. Up to this point, we’ve left out an important piece of information: how confident should we be in our assignment? In other words, how dependent is our taxonomy assignment on our specific query sequence? If there were slight differences in our query sequence (e.g., because we observed a very closely related organism, such as one of the same species but a different strain, or because there are some sequencing errors in our sequence) would we obtain the same taxonomy assignment? If so, we should have higher confidence in our assignment. If not, we should have lower confidence in our assignment. This is important because our classifier as implemented so far will <em>always</em> return one of the classes, even if our query sequence is very different than any of the sequences in our reference database.</p>
<p>We can quantify confidence in our taxonomic assignments using an approach called bootstrapping. With a bootstrap approach, we’ll get our taxonomy assignment as we did above, but then for some user-specified number of iterations, we’ll create random subsets of <span class="math notranslate nohighlight">\(V\)</span> sampled with replacement. We’ll then assign taxonomy to each random subset of <span class="math notranslate nohighlight">\(V\)</span>, and count the number of times the resulting taxonomy assignment is the same as the one we received when assigning taxonomy to <span class="math notranslate nohighlight">\(V\)</span>. The count of times that they are the same divided by the number of iterations we’ve chosen to run will be our confidence in the assignment. If the assignments are often the same we’ll have a high confidence value, up to a maximum confidence value of 1 if the assignments are always the same. If the assignments are often different we’ll have a low confidence value, down to a minimum value of 0 if the assignments are never the same.</p>
<p>The following function will assign taxonomy to a query sequence, and will compute and return a confidence value for the assignment.</p>
<div class="cell docutils container">
<div class="cell_input docutils container">
<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="k">def</span> <span class="nf">classify_sequence_with_confidence</span><span class="p">(</span><span class="n">query_sequence</span><span class="p">,</span> <span class="n">kmer_probability_table</span><span class="p">,</span> <span class="n">k</span><span class="p">,</span>
                                      <span class="n">confidence_iterations</span><span class="o">=</span><span class="mi">100</span><span class="p">):</span>
    <span class="c1"># classify the query sequence, as we did above</span>
    <span class="n">taxon</span><span class="p">,</span> <span class="n">V</span> <span class="o">=</span> <span class="n">classify_sequence</span><span class="p">(</span><span class="n">query_sequence</span><span class="p">,</span> <span class="n">kmer_probability_table</span><span class="p">,</span> <span class="n">k</span><span class="p">)</span>

    <span class="n">count_same_taxon</span> <span class="o">=</span> <span class="mi">0</span>
    <span class="c1"># Define the size of each subsample as 10% of the actual number of</span>
    <span class="c1"># kmers in the query sequence.</span>
    <span class="n">subsample_size</span> <span class="o">=</span> <span class="nb">int</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="n">V</span><span class="p">)</span> <span class="o">*</span> <span class="mf">0.1</span><span class="p">)</span>
    <span class="c1"># Perform n iterations (where n is provided by the user as </span>
    <span class="c1"># confidence_iterations) where a random subset of the query sequence&#39;s</span>
    <span class="c1"># kmers are used for the classification task.</span>
    <span class="c1"># Keep track of the number of times the observed result is the same as</span>
    <span class="c1"># that for the query sequence. </span>
    <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="n">confidence_iterations</span><span class="p">):</span>
        <span class="n">subsample_V</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">random</span><span class="o">.</span><span class="n">choice</span><span class="p">(</span><span class="n">V</span><span class="p">,</span> <span class="n">subsample_size</span><span class="p">,</span> <span class="n">replace</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
        <span class="n">subsample_taxon</span><span class="p">,</span> <span class="n">_</span> <span class="o">=</span> <span class="n">classify_V</span><span class="p">(</span><span class="n">subsample_V</span><span class="p">,</span> <span class="n">kmer_probability_table</span><span class="p">)</span>
        <span class="k">if</span> <span class="n">taxon</span> <span class="o">==</span> <span class="n">subsample_taxon</span><span class="p">:</span>
            <span class="n">count_same_taxon</span> <span class="o">+=</span> <span class="mi">1</span>
    <span class="n">confidence</span> <span class="o">=</span> <span class="n">count_same_taxon</span> <span class="o">/</span> <span class="n">confidence_iterations</span>

    <span class="k">return</span> <span class="p">(</span><span class="n">taxon</span><span class="p">,</span> <span class="n">confidence</span><span class="p">)</span>
</pre></div>
</div>
</div>
</div>
<p>We can apply this to a randomly selected sequence from our test data as follows.</p>
<div class="cell docutils container">
<div class="cell_input docutils container">
<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">query_sr</span> <span class="o">=</span> <span class="n">random_sequence_record_choice</span><span class="p">(</span><span class="n">test_seq_data</span><span class="p">)</span>
<span class="n">taxon_assignment</span><span class="p">,</span> <span class="n">confidence</span> <span class="o">=</span> <span class="n">classify_sequence_with_confidence</span><span class="p">(</span><span class="n">query_sr</span><span class="o">.</span><span class="n">sequence</span><span class="p">,</span> <span class="n">kmer_probability_table</span><span class="p">,</span> <span class="n">k</span><span class="p">)</span>
<span class="nb">print</span><span class="p">(</span><span class="n">taxon_assignment</span><span class="p">)</span>
<span class="nb">print</span><span class="p">(</span><span class="n">confidence</span><span class="p">)</span>
<span class="nb">print</span><span class="p">(</span><span class="s2">&quot;Sequence </span><span class="si">%s</span><span class="s2"> is predicted to be from the species </span><span class="si">%s</span><span class="s2">. Confidence in this assignment is: </span><span class="si">%1.2f</span><span class="s2">.&quot;</span> <span class="o">%</span> <span class="p">(</span><span class="n">query_sr</span><span class="o">.</span><span class="n">identifier</span><span class="p">,</span> <span class="n">taxon_assignment</span><span class="p">,</span> <span class="n">confidence</span><span class="p">))</span>
</pre></div>
</div>
</div>
<div class="cell_output docutils container">
<div class="output stream highlight-myst-ansi notranslate"><div class="highlight"><pre><span></span>Prevotella stercorea (Bacteroidetes)
0.75
</pre></div>
</div>
<div class="output stream highlight-myst-ansi notranslate"><div class="highlight"><pre><span></span>Sequence 326403 is predicted to be from the species Prevotella stercorea (Bacteroidetes). Confidence in this assignment is: 0.75.
</pre></div>
</div>
</div>
</div>
<div class="cell docutils container">
<div class="cell_input docutils container">
<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="nb">print</span><span class="p">(</span><span class="s2">&quot;Sequence </span><span class="si">%s</span><span class="s2"> is known to be from the species </span><span class="si">%s</span><span class="s2">.&quot;</span> <span class="o">%</span> <span class="p">(</span><span class="n">query_sr</span><span class="o">.</span><span class="n">identifier</span><span class="p">,</span> <span class="n">test_labels</span><span class="p">[</span><span class="s1">&#39;legend entry&#39;</span><span class="p">][</span><span class="n">query_sr</span><span class="o">.</span><span class="n">identifier</span><span class="p">]))</span>
</pre></div>
</div>
</div>
<div class="cell_output docutils container">
<div class="output stream highlight-myst-ansi notranslate"><div class="highlight"><pre><span></span>Sequence 326403 is known to be from the species Prevotella stercorea (Bacteroidetes).
</pre></div>
</div>
</div>
</div>
<div class="admonition-exercise admonition">
<p class="admonition-title">Exercise</p>
<p>Was this sequence classified as the correct species? Does the confidence value align with this result?</p>
</div>
<p>At first glance, we don’t necessarily have an idea of what good versus bad confidence scores are, but we can use our test data to explore that. Once we know what a good confidence score is, we can apply a confidence threshold that we can use in our work. For example, we can define a confidence threshold above which we would accept a taxonomy assignment and below which we could label a sequence as “unclassified”. To explore this, let’s compute taxonomy assignments and confidence for all of our test sequences and then see what the distributions of confidence scores look like for correct assignments and incorrect assignments.</p>
<div class="cell docutils container">
<div class="cell_input docutils container">
<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">correct_assignment_confidences</span> <span class="o">=</span> <span class="p">[]</span>
<span class="n">incorrect_assignment_confidences</span> <span class="o">=</span> <span class="p">[]</span>
<span class="n">summary</span> <span class="o">=</span> <span class="p">[]</span>

<span class="k">for</span> <span class="n">query_id</span><span class="p">,</span> <span class="n">query_sr</span> <span class="ow">in</span> <span class="n">test_seq_data</span><span class="o">.</span><span class="n">items</span><span class="p">():</span>
    <span class="n">predicted_taxonomy</span><span class="p">,</span> <span class="n">confidence</span> <span class="o">=</span> <span class="n">classify_sequence_with_confidence</span><span class="p">(</span><span class="n">query_sr</span><span class="o">.</span><span class="n">sequence</span><span class="p">,</span> <span class="n">kmer_probability_table</span><span class="p">,</span> <span class="n">k</span><span class="p">)</span>
    <span class="n">known_taxonomy</span> <span class="o">=</span> <span class="n">test_labels</span><span class="p">[</span><span class="s1">&#39;legend entry&#39;</span><span class="p">][</span><span class="n">query_sr</span><span class="o">.</span><span class="n">identifier</span><span class="p">]</span>
    <span class="n">correct_assignment</span> <span class="o">=</span> <span class="n">known_taxonomy</span> <span class="o">==</span> <span class="n">predicted_taxonomy</span>
    <span class="k">if</span> <span class="n">correct_assignment</span><span class="p">:</span>
        <span class="n">correct_assignment_confidences</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">confidence</span><span class="p">)</span>
    <span class="k">else</span><span class="p">:</span>
        <span class="n">incorrect_assignment_confidences</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">confidence</span><span class="p">)</span>

    <span class="n">summary</span><span class="o">.</span><span class="n">append</span><span class="p">([</span><span class="n">predicted_taxonomy</span><span class="p">,</span> <span class="n">known_taxonomy</span><span class="p">,</span> <span class="n">confidence</span><span class="p">,</span> <span class="n">correct_assignment</span><span class="p">])</span>

<span class="n">summary</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">(</span><span class="n">summary</span><span class="p">,</span> <span class="n">columns</span><span class="o">=</span><span class="p">[</span><span class="s1">&#39;Predicted taxonomy&#39;</span><span class="p">,</span> <span class="s1">&#39;Known taxonomy&#39;</span><span class="p">,</span> <span class="s1">&#39;Confidence&#39;</span><span class="p">,</span> <span class="s1">&#39;Correct assignment&#39;</span><span class="p">])</span>

<span class="n">summary</span>
</pre></div>
</div>
</div>
<div class="cell_output docutils container">
<div class="output text_html"><div>
<style scoped>
    .dataframe tbody tr th:only-of-type {
        vertical-align: middle;
    }

    .dataframe tbody tr th {
        vertical-align: top;
    }

    .dataframe thead th {
        text-align: right;
    }
</style>
<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th></th>
      <th>Predicted taxonomy</th>
      <th>Known taxonomy</th>
      <th>Confidence</th>
      <th>Correct assignment</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>0</th>
      <td>Flavobacterium succinicans (Bacteroidetes)</td>
      <td>Flavobacterium succinicans (Bacteroidetes)</td>
      <td>0.91</td>
      <td>True</td>
    </tr>
    <tr>
      <th>1</th>
      <td>Flavobacterium succinicans (Bacteroidetes)</td>
      <td>Flavobacterium succinicans (Bacteroidetes)</td>
      <td>0.84</td>
      <td>True</td>
    </tr>
    <tr>
      <th>2</th>
      <td>Flavobacterium succinicans (Bacteroidetes)</td>
      <td>Flavobacterium succinicans (Bacteroidetes)</td>
      <td>0.68</td>
      <td>True</td>
    </tr>
    <tr>
      <th>3</th>
      <td>Flavobacterium succinicans (Bacteroidetes)</td>
      <td>Flavobacterium succinicans (Bacteroidetes)</td>
      <td>0.73</td>
      <td>True</td>
    </tr>
    <tr>
      <th>4</th>
      <td>Flavobacterium succinicans (Bacteroidetes)</td>
      <td>Flavobacterium succinicans (Bacteroidetes)</td>
      <td>0.81</td>
      <td>True</td>
    </tr>
    <tr>
      <th>5</th>
      <td>Propionibacterium acnes (Actinobacteria)</td>
      <td>Propionibacterium acnes (Actinobacteria)</td>
      <td>0.81</td>
      <td>True</td>
    </tr>
    <tr>
      <th>6</th>
      <td>Propionibacterium acnes (Actinobacteria)</td>
      <td>Propionibacterium acnes (Actinobacteria)</td>
      <td>0.36</td>
      <td>True</td>
    </tr>
    <tr>
      <th>7</th>
      <td>Propionibacterium acnes (Actinobacteria)</td>
      <td>Propionibacterium acnes (Actinobacteria)</td>
      <td>0.61</td>
      <td>True</td>
    </tr>
    <tr>
      <th>8</th>
      <td>Propionibacterium acnes (Actinobacteria)</td>
      <td>Propionibacterium acnes (Actinobacteria)</td>
      <td>0.91</td>
      <td>True</td>
    </tr>
    <tr>
      <th>9</th>
      <td>Propionibacterium acnes (Actinobacteria)</td>
      <td>Propionibacterium acnes (Actinobacteria)</td>
      <td>0.70</td>
      <td>True</td>
    </tr>
    <tr>
      <th>10</th>
      <td>Prevotella melaninogenica (Bacteroidetes)</td>
      <td>Prevotella melaninogenica (Bacteroidetes)</td>
      <td>0.82</td>
      <td>True</td>
    </tr>
    <tr>
      <th>11</th>
      <td>Prevotella melaninogenica (Bacteroidetes)</td>
      <td>Prevotella melaninogenica (Bacteroidetes)</td>
      <td>0.64</td>
      <td>True</td>
    </tr>
    <tr>
      <th>12</th>
      <td>Prevotella melaninogenica (Bacteroidetes)</td>
      <td>Prevotella melaninogenica (Bacteroidetes)</td>
      <td>0.39</td>
      <td>True</td>
    </tr>
    <tr>
      <th>13</th>
      <td>Prevotella melaninogenica (Bacteroidetes)</td>
      <td>Prevotella melaninogenica (Bacteroidetes)</td>
      <td>0.68</td>
      <td>True</td>
    </tr>
    <tr>
      <th>14</th>
      <td>Prevotella melaninogenica (Bacteroidetes)</td>
      <td>Prevotella melaninogenica (Bacteroidetes)</td>
      <td>0.63</td>
      <td>True</td>
    </tr>
    <tr>
      <th>15</th>
      <td>Prevotella stercorea (Bacteroidetes)</td>
      <td>Prevotella stercorea (Bacteroidetes)</td>
      <td>0.70</td>
      <td>True</td>
    </tr>
    <tr>
      <th>16</th>
      <td>Prevotella copri (Bacteroidetes)</td>
      <td>Prevotella stercorea (Bacteroidetes)</td>
      <td>0.46</td>
      <td>False</td>
    </tr>
    <tr>
      <th>17</th>
      <td>Prevotella copri (Bacteroidetes)</td>
      <td>Prevotella stercorea (Bacteroidetes)</td>
      <td>0.48</td>
      <td>False</td>
    </tr>
    <tr>
      <th>18</th>
      <td>Prevotella copri (Bacteroidetes)</td>
      <td>Prevotella stercorea (Bacteroidetes)</td>
      <td>0.63</td>
      <td>False</td>
    </tr>
    <tr>
      <th>19</th>
      <td>Prevotella copri (Bacteroidetes)</td>
      <td>Prevotella stercorea (Bacteroidetes)</td>
      <td>0.92</td>
      <td>False</td>
    </tr>
    <tr>
      <th>20</th>
      <td>Pseudomonas veronii (Proteobacteria)</td>
      <td>Pseudomonas viridiflava (Proteobacteria)</td>
      <td>0.46</td>
      <td>False</td>
    </tr>
    <tr>
      <th>21</th>
      <td>Pseudomonas viridiflava (Proteobacteria)</td>
      <td>Pseudomonas viridiflava (Proteobacteria)</td>
      <td>0.59</td>
      <td>True</td>
    </tr>
    <tr>
      <th>22</th>
      <td>Pseudomonas viridiflava (Proteobacteria)</td>
      <td>Pseudomonas viridiflava (Proteobacteria)</td>
      <td>0.47</td>
      <td>True</td>
    </tr>
    <tr>
      <th>23</th>
      <td>Pseudomonas viridiflava (Proteobacteria)</td>
      <td>Pseudomonas viridiflava (Proteobacteria)</td>
      <td>0.54</td>
      <td>True</td>
    </tr>
    <tr>
      <th>24</th>
      <td>Pseudomonas veronii (Proteobacteria)</td>
      <td>Pseudomonas viridiflava (Proteobacteria)</td>
      <td>0.55</td>
      <td>False</td>
    </tr>
    <tr>
      <th>25</th>
      <td>Pseudomonas veronii (Proteobacteria)</td>
      <td>Pseudomonas veronii (Proteobacteria)</td>
      <td>0.71</td>
      <td>True</td>
    </tr>
    <tr>
      <th>26</th>
      <td>Pseudomonas veronii (Proteobacteria)</td>
      <td>Pseudomonas veronii (Proteobacteria)</td>
      <td>0.51</td>
      <td>True</td>
    </tr>
    <tr>
      <th>27</th>
      <td>Pseudomonas viridiflava (Proteobacteria)</td>
      <td>Pseudomonas veronii (Proteobacteria)</td>
      <td>0.43</td>
      <td>False</td>
    </tr>
    <tr>
      <th>28</th>
      <td>Pseudomonas veronii (Proteobacteria)</td>
      <td>Pseudomonas veronii (Proteobacteria)</td>
      <td>0.62</td>
      <td>True</td>
    </tr>
    <tr>
      <th>29</th>
      <td>Pseudomonas veronii (Proteobacteria)</td>
      <td>Pseudomonas veronii (Proteobacteria)</td>
      <td>0.73</td>
      <td>True</td>
    </tr>
    <tr>
      <th>30</th>
      <td>Prevotella copri (Bacteroidetes)</td>
      <td>Prevotella copri (Bacteroidetes)</td>
      <td>0.56</td>
      <td>True</td>
    </tr>
    <tr>
      <th>31</th>
      <td>Prevotella copri (Bacteroidetes)</td>
      <td>Prevotella copri (Bacteroidetes)</td>
      <td>0.90</td>
      <td>True</td>
    </tr>
    <tr>
      <th>32</th>
      <td>Prevotella copri (Bacteroidetes)</td>
      <td>Prevotella copri (Bacteroidetes)</td>
      <td>0.63</td>
      <td>True</td>
    </tr>
    <tr>
      <th>33</th>
      <td>Prevotella copri (Bacteroidetes)</td>
      <td>Prevotella copri (Bacteroidetes)</td>
      <td>0.76</td>
      <td>True</td>
    </tr>
    <tr>
      <th>34</th>
      <td>Prevotella copri (Bacteroidetes)</td>
      <td>Prevotella copri (Bacteroidetes)</td>
      <td>0.82</td>
      <td>True</td>
    </tr>
  </tbody>
</table>
</div></div></div>
</div>
<p>Comparing distributions of confidence scores for correct and incorrect assignments is possible from the table above, and the table provides details that can be useful in assessing when the classifier is working well and when it isn’t. A couple of boxplots, however, will make comparing these distributions trivial.</p>
<div class="cell docutils container">
<div class="cell_input docutils container">
<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="kn">import</span> <span class="nn">seaborn</span> <span class="k">as</span> <span class="nn">sns</span>

<span class="n">ax</span> <span class="o">=</span> <span class="n">sns</span><span class="o">.</span><span class="n">boxplot</span><span class="p">(</span><span class="n">data</span><span class="o">=</span><span class="p">[</span><span class="n">correct_assignment_confidences</span><span class="p">,</span> <span class="n">incorrect_assignment_confidences</span><span class="p">])</span>
<span class="n">ax</span> <span class="o">=</span> <span class="n">sns</span><span class="o">.</span><span class="n">swarmplot</span><span class="p">(</span><span class="n">data</span><span class="o">=</span><span class="p">[</span><span class="n">correct_assignment_confidences</span><span class="p">,</span> <span class="n">incorrect_assignment_confidences</span><span class="p">],</span> <span class="n">color</span><span class="o">=</span><span class="s2">&quot;black&quot;</span><span class="p">)</span>
<span class="n">_</span> <span class="o">=</span> <span class="n">ax</span><span class="o">.</span><span class="n">set_xticklabels</span><span class="p">([</span><span class="s1">&#39;Correct assignments&#39;</span><span class="p">,</span> <span class="s1">&#39;Incorrect assignments&#39;</span><span class="p">])</span>
<span class="n">_</span> <span class="o">=</span> <span class="n">ax</span><span class="o">.</span><span class="n">set_ylabel</span><span class="p">(</span><span class="s1">&#39;Confidence&#39;</span><span class="p">)</span>

<span class="n">ax</span>
</pre></div>
</div>
</div>
<div class="cell_output docutils container">
<div class="output text_plain highlight-myst-ansi notranslate"><div class="highlight"><pre><span></span>&lt;AxesSubplot:ylabel=&#39;Confidence&#39;&gt;
</pre></div>
</div>
<img alt="Box plots with overlaid swarm plots comparing confidence scores for correct assignments and incorrect assignments" src="_images/machine-learning_97_1.png" />
</div>
</div>
<p>What does this plot tell you about how well setting a confidence threshold is likely to work? If you never wanted to reject a correct assignment, how often would you accept an incorrect assignment? If you never wanted to accept an incorrect assignment, how often would you reject a correct assignment?</p>
<p>We can also compute the overall accuracy of our classifier as follows:</p>
<div class="cell docutils container">
<div class="cell_input docutils container">
<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">n_correct_assignments</span> <span class="o">=</span> <span class="nb">len</span><span class="p">(</span><span class="n">correct_assignment_confidences</span><span class="p">)</span>
<span class="n">n_incorrect_assignments</span> <span class="o">=</span> <span class="nb">len</span><span class="p">(</span><span class="n">incorrect_assignment_confidences</span><span class="p">)</span>
<span class="n">accuracy</span> <span class="o">=</span> <span class="n">n_correct_assignments</span> <span class="o">/</span> <span class="p">(</span><span class="n">n_correct_assignments</span> <span class="o">+</span> <span class="n">n_incorrect_assignments</span><span class="p">)</span>
<span class="nb">print</span><span class="p">(</span><span class="s1">&#39;The accuracy of the classifier was </span><span class="si">%1.3f</span><span class="s1">.&#39;</span> <span class="o">%</span> <span class="n">accuracy</span><span class="p">)</span>
</pre></div>
</div>
</div>
<div class="cell_output docutils container">
<div class="output stream highlight-myst-ansi notranslate"><div class="highlight"><pre><span></span>The accuracy of the classifier was 0.800.
</pre></div>
</div>
</div>
</div>
<p>Finally, we can summarize how this classifier worked by creating a <strong>confusion matrix</strong>. A confusion matrix has two axes - one corresponding to the actual assignments and one corresponding to the predicted assignments. The values in the confusion matrix represent how each instance of each known taxonomy was classified. The order of the classes on the two axes should always be the same in a confusion matrix. A good classifier will then have high values along the diagonal. If classifier accuracy is not great, you can see which known classes were misassigned, or where the classifier got confused.</p>
<div class="cell docutils container">
<div class="cell_input docutils container">
<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">confusion_matrix</span> <span class="o">=</span> <span class="n">summary</span><span class="o">.</span><span class="n">groupby</span><span class="p">(</span><span class="s1">&#39;Predicted taxonomy&#39;</span><span class="p">)[</span><span class="s1">&#39;Known taxonomy&#39;</span><span class="p">]</span><span class="o">.</span><span class="n">value_counts</span><span class="p">()</span><span class="o">.</span><span class="n">to_frame</span><span class="p">()</span>
<span class="n">confusion_matrix</span><span class="o">.</span><span class="n">columns</span> <span class="o">=</span> <span class="p">[</span><span class="s1">&#39;count&#39;</span><span class="p">]</span>
<span class="n">confusion_matrix</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">pivot_table</span><span class="p">(</span><span class="n">confusion_matrix</span><span class="p">,</span> <span class="n">index</span><span class="o">=</span><span class="p">[</span><span class="s1">&#39;Predicted taxonomy&#39;</span><span class="p">],</span> <span class="n">columns</span><span class="o">=</span><span class="p">[</span><span class="s1">&#39;Known taxonomy&#39;</span><span class="p">],</span> <span class="n">fill_value</span><span class="o">=</span><span class="mi">0</span><span class="p">)</span>
<span class="n">confusion_matrix</span>
</pre></div>
</div>
</div>
<div class="cell_output docutils container">
<div class="output text_html"><div>
<!--
  Styling for the pandas-rendered ".dataframe" table that follows.
    * tbody row headers: top-aligned, except when the <th> is the only one of
      its type in its row, in which case it is vertically centred.
    * thead header cells: left-aligned, except in the last header row, where
      they are right-aligned.
  NOTE(review): "scoped" is no longer part of the HTML standard, so these rules
  presumably apply to every .dataframe table on the page. Confirm.
-->
<style scoped>
    .dataframe tbody tr th:only-of-type {
        vertical-align: middle;
    }

    .dataframe tbody tr th {
        vertical-align: top;
    }

    .dataframe thead tr th {
        text-align: left;
    }

    .dataframe thead tr:last-of-type th {
        text-align: right;
    }
</style>
<!--
  Confusion matrix: columns are the known taxonomy, rows are the predicted
  taxonomy, and each cell is the number of sequences with that pairing.
  Every populated header cell carries an explicit scope so assistive
  technology can associate both header axes with the data cells.
-->
<table border="1" class="dataframe">
  <thead>
    <tr>
      <th></th>
      <th colspan="7" scope="col">count</th>
    </tr>
    <tr>
      <!-- Labels the row of column headers to its right. -->
      <th scope="row">Known taxonomy</th>
      <th scope="col">Flavobacterium succinicans (Bacteroidetes)</th>
      <th scope="col">Prevotella copri (Bacteroidetes)</th>
      <th scope="col">Prevotella melaninogenica (Bacteroidetes)</th>
      <th scope="col">Prevotella stercorea (Bacteroidetes)</th>
      <th scope="col">Propionibacterium acnes (Actinobacteria)</th>
      <th scope="col">Pseudomonas veronii (Proteobacteria)</th>
      <th scope="col">Pseudomonas viridiflava (Proteobacteria)</th>
    </tr>
    <tr>
      <!-- Labels the column of row headers below it. -->
      <th scope="col">Predicted taxonomy</th>
      <th></th>
      <th></th>
      <th></th>
      <th></th>
      <th></th>
      <th></th>
      <th></th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th scope="row">Flavobacterium succinicans (Bacteroidetes)</th>
      <td>5</td>
      <td>0</td>
      <td>0</td>
      <td>0</td>
      <td>0</td>
      <td>0</td>
      <td>0</td>
    </tr>
    <tr>
      <th scope="row">Prevotella copri (Bacteroidetes)</th>
      <td>0</td>
      <td>5</td>
      <td>0</td>
      <td>4</td>
      <td>0</td>
      <td>0</td>
      <td>0</td>
    </tr>
    <tr>
      <th scope="row">Prevotella melaninogenica (Bacteroidetes)</th>
      <td>0</td>
      <td>0</td>
      <td>5</td>
      <td>0</td>
      <td>0</td>
      <td>0</td>
      <td>0</td>
    </tr>
    <tr>
      <th scope="row">Prevotella stercorea (Bacteroidetes)</th>
      <td>0</td>
      <td>0</td>
      <td>0</td>
      <td>1</td>
      <td>0</td>
      <td>0</td>
      <td>0</td>
    </tr>
    <tr>
      <th scope="row">Propionibacterium acnes (Actinobacteria)</th>
      <td>0</td>
      <td>0</td>
      <td>0</td>
      <td>0</td>
      <td>5</td>
      <td>0</td>
      <td>0</td>
    </tr>
    <tr>
      <th scope="row">Pseudomonas veronii (Proteobacteria)</th>
      <td>0</td>
      <td>0</td>
      <td>0</td>
      <td>0</td>
      <td>0</td>
      <td>4</td>
      <td>2</td>
    </tr>
    <tr>
      <th scope="row">Pseudomonas viridiflava (Proteobacteria)</th>
      <td>0</td>
      <td>0</td>
      <td>0</td>
      <td>0</td>
      <td>0</td>
      <td>1</td>
      <td>3</td>
    </tr>
  </tbody>
</table>
</div></div></div>
</div>
<p>If the classifier generated here got some classifications wrong, did the classifier at least assign the sequences to the correct phylum?</p>
<div class="admonition-exercise admonition">
<p class="admonition-title">Exercise</p>
<p>Jump back up to where we <a class="reference internal" href="#ml-define-k"><span class="std std-ref">defined <code class="docutils literal notranslate"><span class="pre">k</span></code> and <code class="docutils literal notranslate"><span class="pre">taxonomic_level</span></code></span></a> and modify those values. How does the accuracy of the classifier change if you increase or decrease <code class="docutils literal notranslate"><span class="pre">k</span></code> while keeping the value of <code class="docutils literal notranslate"><span class="pre">taxonomic_level</span></code> fixed? How does the accuracy change if you increase or decrease the <code class="docutils literal notranslate"><span class="pre">taxonomic_level</span></code> while keeping <code class="docutils literal notranslate"><span class="pre">k</span></code> fixed?</p>
</div>
</div>
</div>
<div class="section" id="variations-on-the-input-to-machine-learning-algorithms">
<h2>Variations on the input to machine learning algorithms<a class="headerlink" href="#variations-on-the-input-to-machine-learning-algorithms" title="Permalink to this headline">¶</a></h2>
<p>As in the Iris dataset, the labels in our microbial data are discrete (i.e., categorical or qualitative) as opposed to continuous (i.e., quantitative). If our labels in a supervised learning project were continuous instead of discrete - for example the abundance of an organism in an environment - we could still apply supervised learning, but we would work with different algorithms. Specifically, we’d use supervised regression algorithms, rather than supervised classification algorithms.</p>
<p>Similarly, while the features we worked with in our unsupervised and supervised learning examples were continuous values, feature values could also be discrete (e.g., the sex of a subject, or the species of a specimen in an environment). The applicable algorithms might change, but machine learning techniques in general would still be available.</p>
<p>scikit-learn provides other example datasets, including <a class="reference external" href="https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_diabetes.html#sklearn.datasets.load_diabetes">the diabetes dataset</a>, <a class="reference external" href="https://scikit-learn.org/stable/datasets/toy_dataset.html#boston-house-prices-dataset">the housing market dataset</a> and <a class="reference external" href="https://scikit-learn.org/stable/datasets/toy_dataset.html#optical-recognition-of-handwritten-digits-dataset">the hand-writing dataset</a>. These are good illustrations of other types of data that can be used in machine learning tasks. The message to take away is that if you can wrangle your data into a feature table, potentially with corresponding sample labels, you will likely be able to apply machine learning techniques to that data. That said, and as I mentioned at the beginning of this chapter, this introduction barely scratches the surface of this complex branch of statistics and computer science. Especially with the accessibility of these methods through software like scikit-learn, it’s easy to get to the point where you know enough to get yourself into trouble by using machine learning methods inappropriately. If you’d like to apply these tools in your research, you must continue your learning. I recommend continuing with <a class="reference external" href="https://scikit-learn.org/">scikit-learn’s documentation</a>.</p>
</div>
<div class="section" id="list-of-works-cited">
<h2>List of works cited<a class="headerlink" href="#list-of-works-cited" title="Permalink to this headline">¶</a></h2>
<!-- Bibliography wrapper is a <div>, not a <p>: a paragraph cannot contain a
     <dl>, so the parser would close it early and leave empty paragraphs
     before and after the list. The id is kept so existing anchors resolve. -->
<div id="id3"><dl class="citation">
<dt class="label" id="id28"><span class="brackets"><a class="fn-backref" href="#id1">Fis36</a></span></dt>
<dd><p>R A Fisher. The use of multiple measurements in taxonomic problems. <em>Ann. Eugen.</em>, 7(2):179–188, September 1936.</p>
</dd>
<dt class="label" id="id30"><span class="brackets"><a class="fn-backref" href="#id2">VazquezBPGK13</a></span></dt>
<dd><p>Yoshiki Vázquez-Baeza, Meg Pirrung, Antonio Gonzalez, and Rob Knight. EMPeror: a tool for visualizing high-throughput microbial community data. <em>Gigascience</em>, 2(1):16, November 2013.</p>
</dd>
</dl>
</div>
</div>
</div>

    <!--
      Thebe configuration block followed by a one-line kernel-name script.
      The first <script> has type "text/x-thebe-config", so the browser does
      not execute it; presumably the Thebe / sphinx-thebe integration reads
      and parses its text at launch (confirm against the Thebe docs).
      It is an object literal with unquoted keys and a trailing comma inside
      binderOptions, i.e. not strict JSON. Fields set here:
        requestKernel     true
        binderOptions     repo "binder-examples/jupyter-stacks-datascience", ref "master"
        codeMirrorConfig  theme "abcdef", mode "python"
        kernelOptions     kernelName "python3", path "./."
        predefinedOutput  true
      The second <script> assigns 'python3' to an undeclared identifier
      kernelName, which creates a global in non-strict mode.
    -->
    <script type="text/x-thebe-config">
    {
        requestKernel: true,
        binderOptions: {
            repo: "binder-examples/jupyter-stacks-datascience",
            ref: "master",
        },
        codeMirrorConfig: {
            theme: "abcdef",
            mode: "python"
        },
        kernelOptions: {
            kernelName: "python3",
            path: "./."
        },
        predefinedOutput: true
    }
    </script>
    <script>kernelName = 'python3'</script>

              </div>
              
        
        <!-- Bottom pager: link back to the preceding chapter. -->
        <div class="prev-next-bottom">
          <a class="left-prev" id="prev-link" href="database-searching.html" title="previous page">Sequence homology searching</a>
        </div>
        
        </div>
    </div>
    <!-- Page footer: author credit and copyright notice. -->
    <footer class="footer mt-5 mt-md-0">
      <div class="container">
        <p>
          By J Gregory Caporaso<br>
          © Copyright 2014-2021.<br>
        </p>
      </div>
    </footer>
</main>


      </div>
    </div>
  
  <script src="_static/js/index.1c5a1a01449ed65a7b51.js"></script>

  
  </body>
</html>