# 5. Influence factors: Soil Properties


In [1]:
#!mkdir /home/lfloerl/cloud/lfloerl/Microterroir/artifacts/16S/lavaux/soil-properties

In [2]:
workdir = '/home/lfloerl/cloud/lfloerl/Microterroir/artifacts/16S/lavaux/soil-properties'
%cd $workdir

/home/lfloerl/cloud/lfloerl/Microterroir/artifacts/16S/lavaux/soil-properties


In [3]:
%env TMPDIR=/scratch/lfloerl/tmpdata

env: TMPDIR=/scratch/lfloerl/tmpdata


In [4]:
from qiime2 import Visualization
import qiime2 as q2
from qiime2 import Visualization
from qiime2 import Artifact
from qiime2 import Metadata
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from qiime2.plugins.diversity.visualizers import alpha_group_significance
from qiime2.plugins.feature_table.methods import (filter_samples, filter_seqs)

from qiime2.plugins import diversity as q2d

%matplotlib inline

# Soil pH 

## PERMANOVA 

### Subset and calculate diversity 

In [8]:
%%bash 

mkdir soil-ph
cd soil-ph

# subset to make sure that all samples have pH measurement (they should)
qiime feature-table filter-samples \
      --i-table /home/lfloerl/cloud/lfloerl/Microterroir/artifacts/16S/lavaux/soil_filtered_table.qza \
      --m-metadata-file /home/lfloerl/microterroir/Microbiome/Metadata/16S_Lavaux_Soil.tsv \
      --p-where "[pH] IS NOT NULL" \
      --o-filtered-table pH_filtered_table.qza

qiime feature-table filter-seqs \
        --i-data /home/lfloerl/cloud/lfloerl/Microterroir/artifacts/16S/bac-dada2-single/dada-rep-seqs-220-ee4-fa4.qza \
        --i-table pH_filtered_table.qza \
        --o-filtered-data soil_pH_filtered_rep_seqs.qza

# Core diversity metrics
qiime diversity core-metrics \
        --i-table pH_filtered_table.qza \
        --p-sampling-depth 5000 \
        --m-metadata-file /home/lfloerl/microterroir/Microbiome/Metadata/16S_Lavaux_Soil.tsv \
        --p-n-jobs 5 \
        --output-dir cm5000/

# Kmer diversity
qiime kmerizer core-metrics \
        --i-sequences soil_pH_filtered_rep_seqs.qza \
        --i-table pH_filtered_table.qza \
        --p-sampling-depth 5000 \
        --m-metadata-file /home/lfloerl/microterroir/Microbiome/Metadata/16S_Lavaux_Soil.tsv \
        --p-n-jobs auto \
        --p-max-features 5000 \
        --output-dir kmer_cm5000/


Saved FeatureTable[Frequency] to: pH_filtered_table.qza
Saved FeatureData[Sequence] to: soil_pH_filtered_rep_seqs.qza
Saved FeatureTable[Frequency] to: cm5000/rarefied_table.qza
Saved SampleData[AlphaDiversity] to: cm5000/observed_features_vector.qza
Saved SampleData[AlphaDiversity] to: cm5000/shannon_vector.qza
Saved SampleData[AlphaDiversity] to: cm5000/evenness_vector.qza
Saved DistanceMatrix to: cm5000/jaccard_distance_matrix.qza
Saved DistanceMatrix to: cm5000/bray_curtis_distance_matrix.qza
Saved PCoAResults to: cm5000/jaccard_pcoa_results.qza
Saved PCoAResults to: cm5000/bray_curtis_pcoa_results.qza
Saved Visualization to: cm5000/jaccard_emperor.qzv
Saved Visualization to: cm5000/bray_curtis_emperor.qzv
Saved FeatureTable[Frequency] to: kmer_cm5000/rarefied_table.qza
Saved FeatureTable[Frequency] to: kmer_cm5000/kmer_table.qza
Saved SampleData[AlphaDiversity] to: kmer_cm5000/observed_features_vector.qza
Saved SampleData[AlphaDiversity] to: kmer_cm5000/shannon_vector.qza
Saved Di

### PERMANOVA thereof

In [14]:
formula = ' Plot_ID * Year * pH'

beta_metrics = ['bray_curtis', 'jaccard']

# PERMANOVA with core metrics 
div_dir = 'cm5000'
for m in beta_metrics:
    !qiime diversity adonis \
        --i-distance-matrix "soil-ph/{div_dir}/{m}_distance_matrix.qza" \
        --m-metadata-file /home/lfloerl/microterroir/Microbiome/Metadata/16S_Lavaux_Soil.tsv \
        --p-formula "{formula}" \
        --p-n-jobs 5 \
        --o-visualization "soil-ph/{div_dir}/adonis_{m}.qzv"
    

# PERMANOVA with kmer core metrics 
kmer_dir = 'kmer_cm5000'
for m in beta_metrics:
    !qiime diversity adonis \
        --i-distance-matrix "soil-ph/{kmer_dir}/{m}_distance_matrix.qza" \
        --m-metadata-file /home/lfloerl/microterroir/Microbiome/Metadata/16S_Lavaux_Soil.tsv \
        --p-formula "{formula}" \
        --p-n-jobs 5 \
        --o-visualization "soil-ph/{kmer_dir}/adonis_{m}.qzv"

[32mSaved Visualization to: soil-ph/cm5000/adonis_bray_curtis.qzv[0m
[0m[32mSaved Visualization to: soil-ph/cm5000/adonis_jaccard.qzv[0m
[0m[32mSaved Visualization to: soil-ph/kmer_cm5000/adonis_bray_curtis.qzv[0m
[0m[32mSaved Visualization to: soil-ph/kmer_cm5000/adonis_jaccard.qzv[0m
[0m

### Check it out! 

In [15]:
Visualization.load('soil-ph/cm5000/adonis_bray_curtis.qzv')

In [16]:
Visualization.load('soil-ph/cm5000/adonis_jaccard.qzv')

In [12]:
Visualization.load('soil-ph/kmer_cm5000/adonis_bray_curtis.qzv')

In [13]:
Visualization.load('soil-ph/kmer_cm5000/adonis_jaccard.qzv')

## Regress Samples

In [17]:
%%bash 

# Run the sample-classifier regress-samples command
qiime sample-classifier regress-samples \
        --i-table soil-ph/pH_filtered_table.qza \
        --m-metadata-file /home/lfloerl/microterroir/Microbiome/Metadata/16S_Lavaux_Soil.tsv \
        --m-metadata-column "pH" \
        --p-n-jobs 10 \
        --output-dir soil-ph/regression

Saved SampleEstimator[Regressor] to: soil-ph/regression/sample_estimator.qza
Saved FeatureData[Importance] to: soil-ph/regression/feature_importance.qza
Saved SampleData[RegressorPredictions] to: soil-ph/regression/predictions.qza
Saved Visualization to: soil-ph/regression/model_summary.qzv
Saved Visualization to: soil-ph/regression/accuracy_results.qzv


In [18]:
Visualization.load('soil-ph/regression/accuracy_results.qzv')

# Soil Conseil: Properties 

## PERMANOVA 

### Subset and calculate diversity 

In [19]:
!rm -r soil-properties

rm: das Entfernen von 'soil-properties' ist nicht möglich: No such file or directory


In [20]:
%%bash 

mkdir soil-properties
cd soil-properties

# subset to make sure that all samples have pH measurement (they should)
qiime feature-table filter-samples \
      --i-table /home/lfloerl/cloud/lfloerl/Microterroir/artifacts/16S/lavaux/soil_filtered_table.qza \
      --m-metadata-file /home/lfloerl/microterroir/Microbiome/Metadata/16S_Lavaux_Soil.tsv \
      --p-where "[C_total] IS NOT NULL" \
      --o-filtered-table filtered_table.qza

qiime feature-table filter-seqs \
        --i-data /home/lfloerl/cloud/lfloerl/Microterroir/artifacts/16S/bac-dada2-single/dada-rep-seqs-220-ee4-fa4.qza \
        --i-table filtered_table.qza \
        --o-filtered-data filtered_rep_seqs.qza

# Core diversity metrics
qiime diversity core-metrics \
        --i-table filtered_table.qza \
        --p-sampling-depth 5000 \
        --m-metadata-file /home/lfloerl/microterroir/Microbiome/Metadata/16S_Lavaux_Soil.tsv \
        --p-n-jobs 5 \
        --output-dir cm5000/

# Kmer diversity
qiime kmerizer core-metrics \
        --i-sequences filtered_rep_seqs.qza \
        --i-table filtered_table.qza \
        --p-sampling-depth 5000 \
        --m-metadata-file /home/lfloerl/microterroir/Microbiome/Metadata/16S_Lavaux_Soil.tsv \
        --p-n-jobs auto \
        --p-max-features 5000 \
        --output-dir kmer_cm5000/

Saved FeatureTable[Frequency] to: filtered_table.qza
Saved FeatureData[Sequence] to: filtered_rep_seqs.qza
Saved FeatureTable[Frequency] to: cm5000/rarefied_table.qza
Saved SampleData[AlphaDiversity] to: cm5000/observed_features_vector.qza
Saved SampleData[AlphaDiversity] to: cm5000/shannon_vector.qza
Saved SampleData[AlphaDiversity] to: cm5000/evenness_vector.qza
Saved DistanceMatrix to: cm5000/jaccard_distance_matrix.qza
Saved DistanceMatrix to: cm5000/bray_curtis_distance_matrix.qza
Saved PCoAResults to: cm5000/jaccard_pcoa_results.qza
Saved PCoAResults to: cm5000/bray_curtis_pcoa_results.qza
Saved Visualization to: cm5000/jaccard_emperor.qzv
Saved Visualization to: cm5000/bray_curtis_emperor.qzv
Saved FeatureTable[Frequency] to: kmer_cm5000/rarefied_table.qza
Saved FeatureTable[Frequency] to: kmer_cm5000/kmer_table.qza
Saved SampleData[AlphaDiversity] to: kmer_cm5000/observed_features_vector.qza
Saved SampleData[AlphaDiversity] to: kmer_cm5000/shannon_vector.qza
Saved DistanceMatri

### PERMANOVA thereof

In [21]:
formula = 'N_total + Soil_thickness + Clay_Percentage + Soil_depth + Organic_Matter + Soil_type + Gravel_Estimate + C_total + CN_ratio + Hydromorphie + Geology'

beta_metrics = ['bray_curtis', 'jaccard']

# PERMANOVA with core metrics 
div_dir = 'cm5000'
for m in beta_metrics:
    !qiime diversity adonis \
        --i-distance-matrix "soil-properties/{div_dir}/{m}_distance_matrix.qza" \
        --m-metadata-file /home/lfloerl/microterroir/Microbiome/Metadata/16S_Lavaux_Soil.tsv \
        --p-formula "{formula}" \
        --p-n-jobs 5 \
        --o-visualization "soil-properties/{div_dir}/adonis_{m}.qzv"
    

# PERMANOVA with kmer core metrics 
kmer_dir = 'kmer_cm5000'
for m in beta_metrics:
    !qiime diversity adonis \
        --i-distance-matrix "soil-properties/{kmer_dir}/{m}_distance_matrix.qza" \
        --m-metadata-file /home/lfloerl/microterroir/Microbiome/Metadata/16S_Lavaux_Soil.tsv \
        --p-formula "{formula}" \
        --p-n-jobs 5 \
        --o-visualization "soil-properties/{kmer_dir}/adonis_{m}.qzv"

[32mSaved Visualization to: soil-properties/cm5000/adonis_bray_curtis.qzv[0m
[0m[32mSaved Visualization to: soil-properties/cm5000/adonis_jaccard.qzv[0m
[0m[32mSaved Visualization to: soil-properties/kmer_cm5000/adonis_bray_curtis.qzv[0m
[0m[32mSaved Visualization to: soil-properties/kmer_cm5000/adonis_jaccard.qzv[0m
[0m

### Check it out! 

In [22]:
Visualization.load('soil-properties/cm5000/adonis_bray_curtis.qzv')

In [23]:
Visualization.load('soil-properties/cm5000/adonis_jaccard.qzv')

In [24]:
Visualization.load('soil-properties/kmer_cm5000/adonis_bray_curtis.qzv')

In [25]:
Visualization.load('soil-properties/kmer_cm5000/adonis_jaccard.qzv')

## Regress Samples

In [26]:
%%bash 

columns=('N_total' 'Soil_thickness' 'Clay_Percentage' 'Soil_depth' 'Organic_Matter' 'Soil_type' 'Gravel_Estimate' 'C_total' 'CN_ratio' 'Hydromorphie' 'Geology')

for column in "${columns[@]}"
do
    echo "Processing regression for $column"
    
    qiime sample-classifier regress-samples \
        --i-table soil-properties/filtered_table.qza \
        --m-metadata-file /home/lfloerl/microterroir/Microbiome/Metadata/16S_Lavaux_Soil.tsv \
        --m-metadata-column "$column" \
        --p-n-jobs 10 \
        --output-dir "soil-properties/$column"
done


Processing regression for N_total
Saved SampleEstimator[Regressor] to: soil-properties/N_total/sample_estimator.qza
Saved FeatureData[Importance] to: soil-properties/N_total/feature_importance.qza
Saved SampleData[RegressorPredictions] to: soil-properties/N_total/predictions.qza
Saved Visualization to: soil-properties/N_total/model_summary.qzv
Saved Visualization to: soil-properties/N_total/accuracy_results.qzv
Processing regression for Soil_thickness


Usage: qiime sample-classifier regress-samples [OPTIONS]

  Predicts a continuous sample metadata column using a supervised learning
  regressor. Splits input data into training and test sets. The training set
  is used to train and test the estimator using a stratified k-fold cross-
  validation scheme. This includes optional steps for automated feature
  extraction and hyperparameter optimization. The test set validates
  classification accuracy of the optimized estimator. Outputs classification
  results for test set. For more details on the learning algorithm, see
  http://scikit-learn.org/stable/supervised_learning.html

Inputs:
  --i-table ARTIFACT FeatureTable[Frequency | RelativeFrequency |
    PresenceAbsence | Composition]
                          Feature table containing all features that should
                          be used for target prediction.            [required]
Parameters:
  --m-metadata-file METADATA
  --m-metadata-column COLUMN  MetadataColumn[Numeric]
       

Processing regression for Clay_Percentage
Saved SampleEstimator[Regressor] to: soil-properties/Clay_Percentage/sample_estimator.qza
Saved FeatureData[Importance] to: soil-properties/Clay_Percentage/feature_importance.qza
Saved SampleData[RegressorPredictions] to: soil-properties/Clay_Percentage/predictions.qza
Saved Visualization to: soil-properties/Clay_Percentage/model_summary.qzv
Saved Visualization to: soil-properties/Clay_Percentage/accuracy_results.qzv
Processing regression for Soil_depth


Usage: qiime sample-classifier regress-samples [OPTIONS]

  Predicts a continuous sample metadata column using a supervised learning
  regressor. Splits input data into training and test sets. The training set
  is used to train and test the estimator using a stratified k-fold cross-
  validation scheme. This includes optional steps for automated feature
  extraction and hyperparameter optimization. The test set validates
  classification accuracy of the optimized estimator. Outputs classification
  results for test set. For more details on the learning algorithm, see
  http://scikit-learn.org/stable/supervised_learning.html

Inputs:
  --i-table ARTIFACT FeatureTable[Frequency | RelativeFrequency |
    PresenceAbsence | Composition]
                          Feature table containing all features that should
                          be used for target prediction.            [required]
Parameters:
  --m-metadata-file METADATA
  --m-metadata-column COLUMN  MetadataColumn[Numeric]
       

Processing regression for Organic_Matter
Saved SampleEstimator[Regressor] to: soil-properties/Organic_Matter/sample_estimator.qza
Saved FeatureData[Importance] to: soil-properties/Organic_Matter/feature_importance.qza
Saved SampleData[RegressorPredictions] to: soil-properties/Organic_Matter/predictions.qza
Saved Visualization to: soil-properties/Organic_Matter/model_summary.qzv
Saved Visualization to: soil-properties/Organic_Matter/accuracy_results.qzv
Processing regression for Soil_type


Usage: qiime sample-classifier regress-samples [OPTIONS]

  Predicts a continuous sample metadata column using a supervised learning
  regressor. Splits input data into training and test sets. The training set
  is used to train and test the estimator using a stratified k-fold cross-
  validation scheme. This includes optional steps for automated feature
  extraction and hyperparameter optimization. The test set validates
  classification accuracy of the optimized estimator. Outputs classification
  results for test set. For more details on the learning algorithm, see
  http://scikit-learn.org/stable/supervised_learning.html

Inputs:
  --i-table ARTIFACT FeatureTable[Frequency | RelativeFrequency |
    PresenceAbsence | Composition]
                          Feature table containing all features that should
                          be used for target prediction.            [required]
Parameters:
  --m-metadata-file METADATA
  --m-metadata-column COLUMN  MetadataColumn[Numeric]
       

Processing regression for Gravel_Estimate


Usage: qiime sample-classifier regress-samples [OPTIONS]

  Predicts a continuous sample metadata column using a supervised learning
  regressor. Splits input data into training and test sets. The training set
  is used to train and test the estimator using a stratified k-fold cross-
  validation scheme. This includes optional steps for automated feature
  extraction and hyperparameter optimization. The test set validates
  classification accuracy of the optimized estimator. Outputs classification
  results for test set. For more details on the learning algorithm, see
  http://scikit-learn.org/stable/supervised_learning.html

Inputs:
  --i-table ARTIFACT FeatureTable[Frequency | RelativeFrequency |
    PresenceAbsence | Composition]
                          Feature table containing all features that should
                          be used for target prediction.            [required]
Parameters:
  --m-metadata-file METADATA
  --m-metadata-column COLUMN  MetadataColumn[Numeric]
       

Processing regression for C_total
Saved SampleEstimator[Regressor] to: soil-properties/C_total/sample_estimator.qza
Saved FeatureData[Importance] to: soil-properties/C_total/feature_importance.qza
Saved SampleData[RegressorPredictions] to: soil-properties/C_total/predictions.qza
Saved Visualization to: soil-properties/C_total/model_summary.qzv
Saved Visualization to: soil-properties/C_total/accuracy_results.qzv
Processing regression for CN_ratio
Saved SampleEstimator[Regressor] to: soil-properties/CN_ratio/sample_estimator.qza
Saved FeatureData[Importance] to: soil-properties/CN_ratio/feature_importance.qza
Saved SampleData[RegressorPredictions] to: soil-properties/CN_ratio/predictions.qza
Saved Visualization to: soil-properties/CN_ratio/model_summary.qzv
Saved Visualization to: soil-properties/CN_ratio/accuracy_results.qzv
Processing regression for Hydromorphie


Usage: qiime sample-classifier regress-samples [OPTIONS]

  Predicts a continuous sample metadata column using a supervised learning
  regressor. Splits input data into training and test sets. The training set
  is used to train and test the estimator using a stratified k-fold cross-
  validation scheme. This includes optional steps for automated feature
  extraction and hyperparameter optimization. The test set validates
  classification accuracy of the optimized estimator. Outputs classification
  results for test set. For more details on the learning algorithm, see
  http://scikit-learn.org/stable/supervised_learning.html

Inputs:
  --i-table ARTIFACT FeatureTable[Frequency | RelativeFrequency |
    PresenceAbsence | Composition]
                          Feature table containing all features that should
                          be used for target prediction.            [required]
Parameters:
  --m-metadata-file METADATA
  --m-metadata-column COLUMN  MetadataColumn[Numeric]
       

Processing regression for Geology


Usage: qiime sample-classifier regress-samples [OPTIONS]

  Predicts a continuous sample metadata column using a supervised learning
  regressor. Splits input data into training and test sets. The training set
  is used to train and test the estimator using a stratified k-fold cross-
  validation scheme. This includes optional steps for automated feature
  extraction and hyperparameter optimization. The test set validates
  classification accuracy of the optimized estimator. Outputs classification
  results for test set. For more details on the learning algorithm, see
  http://scikit-learn.org/stable/supervised_learning.html

Inputs:
  --i-table ARTIFACT FeatureTable[Frequency | RelativeFrequency |
    PresenceAbsence | Composition]
                          Feature table containing all features that should
                          be used for target prediction.            [required]
Parameters:
  --m-metadata-file METADATA
  --m-metadata-column COLUMN  MetadataColumn[Numeric]
       

CalledProcessError: Command 'b'\ncolumns=(\'N_total\' \'Soil_thickness\' \'Clay_Percentage\' \'Soil_depth\' \'Organic_Matter\' \'Soil_type\' \'Gravel_Estimate\' \'C_total\' \'CN_ratio\' \'Hydromorphie\' \'Geology\')\n\nfor column in "${columns[@]}"\ndo\n    echo "Processing regression for $column"\n    \n    qiime sample-classifier regress-samples \\\n        --i-table soil-properties/filtered_table.qza \\\n        --m-metadata-file /home/lfloerl/microterroir/Microbiome/Metadata/16S_Lavaux_Soil.tsv \\\n        --m-metadata-column "$column" \\\n        --p-n-jobs 10 \\\n        --output-dir "soil-properties/$column"\ndone\n'' returned non-zero exit status 1.

In [27]:
Visualization.load('soil-properties/N_total/accuracy_results.qzv')

In [28]:
Visualization.load('soil-properties/Clay_Percentage/accuracy_results.qzv')

In [29]:
Visualization.load('soil-properties/Organic_Matter/accuracy_results.qzv')

In [30]:
Visualization.load('soil-properties/C_total/accuracy_results.qzv')

In [31]:
Visualization.load('soil-properties/CN_ratio/accuracy_results.qzv')

In [32]:
Visualization.load('soil-properties/N_total/accuracy_results.qzv')