### Run this cell :)

In [6]:
#Only need to rerun this cell once everything is downloaded:
import os
import pandas as pd
from qiime2 import Visualization
import matplotlib.pyplot as plt

%matplotlib inline
data_dir = 'CE'

### creating data directory 

In [2]:
data_dir = 'CE'

# Create the data directory if it does not exist yet. exist_ok=True makes the
# cell safe to re-run and avoids the check-then-create race of a separate
# os.path.isdir() test followed by os.makedirs().
os.makedirs(data_dir, exist_ok=True)

In [3]:
# Print the names of all regular files (directories excluded) in the current
# working directory.
print(list(filter(os.path.isfile, os.listdir('.'))))

['CE_file_downloads.ipynb', 'metadata.ipynb', 'README.md', '.gitignore']


### qza file download (large file; no need to rerun once it has been downloaded)

In [4]:
! wget -nv -O $data_dir/cheese-single-end-demux.qza 'https://polybox.ethz.ch/index.php/s/2AYhc9EgbtA7wsE/download'

2022-10-14 13:07:10 URL:https://polybox.ethz.ch/index.php/s/2AYhc9EgbtA7wsE/download [959420090/959420090] -> "CE/cheese-single-end-demux.qza" [1]


In [5]:
! ls -1 $data_dir

cheese-single-end-demux.qza
cheese-single-end-demux.qzv
food-metadata.tsv


### tsv file download

In [6]:
! wget -nv -O $data_dir/food-metadata.tsv 'https://polybox.ethz.ch/index.php/s/nEd4l5CWGWGEtae/download'

2022-10-14 13:07:12 URL:https://polybox.ethz.ch/index.php/s/nEd4l5CWGWGEtae/download [42810/42810] -> "CE/food-metadata.tsv" [1]


In [7]:
! ls -1 $data_dir

cheese-single-end-demux.qza
cheese-single-end-demux.qzv
food-metadata.tsv


In [8]:
# Read the tab-separated metadata table; its first column becomes the index.
metadata_path = f'{data_dir}/food-metadata.tsv'
df_meta = pd.read_csv(metadata_path, sep='\t', index_col=0)

### Denoising

Visual summary of the QIIME Artifact without changes:

In [9]:
! qiime demux summarize \
    --i-data $data_dir/cheese-single-end-demux.qza \
    --o-visualization $data_dir/cheese-single-end-demux.qzv

[32mSaved Visualization to: CE/cheese-single-end-demux.qzv[0m
[0m

In [4]:
Visualization.load(f'{data_dir}/cheese-single-end-demux.qzv')
# Overview: roughly normal distribution from 0 to almost 100000 sequences, with a maximum at about 100 samples.
# Overview: a few scattered bars (at most 10 samples each) between 100000 and 400000 sequences.
# Interactive quality plot looks good: nothing below a quality score of 25 -> no need to trim off lower-quality positions :)

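Denoising:
    --p-trunc-len: position at which the reads are truncated. The command below uses 101 bp, which is the maximum read length, so no bases are actually cut off (a shorter length such as 100 bp could be used instead). Note that DADA2 discards reads shorter than the truncation length.
        **-> maybe don't even need to truncate due to the high sequence quality :)**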

In [14]:
# Denoise the single-end reads with DADA2.
#   --p-trunc-len 101  truncation position; 101 is the maximum read length
#                      recorded for this dataset, so no bases are trimmed.
#                      NOTE(review): per the QIIME 2 docs, reads shorter than
#                      this value are discarded - confirm that is intended.
#   --p-n-threads 3    number of threads; adjust to the machine in use.
# Outputs written to the data directory: the feature table, the representative
# sequences and the per-sample denoising statistics.
! qiime dada2 denoise-single \
    --i-demultiplexed-seqs $data_dir/cheese-single-end-demux.qza \
    --p-trunc-len 101 \
    --p-n-threads 3 \
    --o-table $data_dir/dada2_table.qza \
    --o-representative-sequences $data_dir/dada2_rep_set.qza \
    --o-denoising-stats $data_dir/dada2_stats.qza

[32mSaved FeatureTable[Frequency] to: CE/dada2_table.qza[0m
[32mSaved FeatureData[Sequence] to: CE/dada2_rep_set.qza[0m
[32mSaved SampleData[DADA2Stats] to: CE/dada2_stats.qza[0m
[0m

#### Inspect denoising stats:

In [15]:
! qiime metadata tabulate \
    --m-input-file $data_dir/dada2_stats.qza \
    --o-visualization $data_dir/dada2_stats.qzv

[32mSaved Visualization to: CE/dada2_stats.qzv[0m
[0m

In [7]:
Visualization.load(f'{data_dir}/dada2_stats.qzv')
# Columns are: sample-id, input, filtered, percentage of input passed filter,
# denoised, non-chimeric, percentage of input non-chimeric.
# The table was downloaded as a TSV file.

#### Visualization of the resulting sequences:
- basic statistics about the identified features 
- table mapping feature IDs to DNA sequences

In [18]:
! qiime feature-table tabulate-seqs \
    --i-data $data_dir/dada2_rep_set.qza \
    --o-visualization $data_dir/dada2_rep_set.qzv

[32mSaved Visualization to: CE/dada2_rep_set.qzv[0m
[0m

In [8]:
Visualization.load(f'{data_dir}/dada2_rep_set.qzv')
# Sequence Length Statistics
#   sequence count = 792; min and max length are both 101
# Seven-Number Summary of Sequence Lengths
# Sequence Table
#   Feature ID, Sequence Length and Sequence

#### Visualization of the feature table

In [25]:
! qiime feature-table summarize \
    --i-table $data_dir/dada2_table.qza \
    --o-visualization $data_dir/dada2_table.qzv

[32mSaved Visualization to: CE/dada2_table.qzv[0m
[0m

In [27]:
Visualization.load(f'{data_dir}/dada2_table.qzv')
# Overview
#   Table summary: 362 samples, 792 features, total frequency of 23'794'545
#   Frequency per sample: histogram downloaded as PDF
#   Frequency per feature: histogram not downloaded
# Interactive Sample Detail
#   sampling depth bar
#   sample ID and feature count table
# Feature Detail
#   ID as index; `corresponding frequency` and `# of samples observed in` as columns