In [2]:
# Print the interpreter version so it can be compared by eye with the
# version this notebook was developed on (3.9.5); nothing is enforced here.
import sys
import os
print(sys.version)
%load_ext autoreload
%autoreload 2

3.9.5 (default, May 18 2021, 12:31:01) 
[Clang 10.0.0 ]


# Get the OTUs for our samples

For the Earth Microbiome Project (EMP), preprocessed OTU information is available and can be downloaded from this page: [A communal catalogue reveals Earth's multiscale microbial diversity](https://zenodo.org/record/890000#.YdMPsBOZP_Q)

In [5]:
import utils
import pandas as pd

In [6]:
# Path (relative to this notebook) of the tab-separated OTU summary file
# that the cells below read line by line.
OTU_INFO_FILE = "../data/otu_summary.emp_deblur_90bp.subset_2k.rare_5000.tsv"

In [7]:
# NOTE(review): subset_for_soil() takes no arguments and its return value is
# discarded; presumably it populates utils.emp_sdf as a side effect, so it
# has to run before the next line — confirm in utils.
utils.subset_for_soil()
# Judging by its name, subset_for_coordprec filters on coordinate
# precision — TODO confirm. Its result is the frame used by the cells below.
soil_emp_df = utils.subset_for_coordprec(utils.emp_sdf)

If desired, one could subsample further by filtering out samples from locations that have multiple descriptions...

In [12]:
# Print the scientific name and description of the samples found at each
# distinct (latitude, longitude) pair; the group key itself is not needed.
for _, location_df in soil_emp_df.groupby(["latitude_deg", "longitude_deg"]):
    print(location_df[["sample_scientific_name", "Description"]], end="\n\n\n\n")

     sample_scientific_name                                        Description
2070        soil metagenome  soil sample from Ring 1 of NZ FACE expt . In r...
2071        soil metagenome  soil sample from Ring 1 of NZ FACE expt . In r...
2072        soil metagenome  soil sample from rhizosphere soil of ryegrass ...
2073        soil metagenome  soil sample from rhizosphere soil of browntop ...
2074        soil metagenome  soil sample from Ring 2 of NZ FACE expt . In r...
2075        soil metagenome  soil sample from Ring 2 of NZ FACE expt . In r...
2076        soil metagenome  soil sample from rhizosphere soil of ryegrass ...
2077        soil metagenome  soil sample from rhizosphere soil of browntop ...
2078        soil metagenome  soil sample from Ring 3 of NZ FACE expt . In r...
2079        soil metagenome  soil sample from Ring 3of NZ FACE expt . In ri...
2080        soil metagenome  soil sample from rhizosphere soil of ryegrass ...
2081        soil metagenome  soil sample from rhizos

We skip that filtering this time and simply collect all sample IDs.

In [37]:
# Collect the "#SampleID" column of soil_emp_df as a plain Python list,
# keeping the row order of the frame.
all_samples = soil_emp_df["#SampleID"].to_list()

We read the OTU file and store all occurrences within soil samples in a CSV file.

In [58]:
# Helpers that extract individual fields from one tab-separated line of the
# OTU file. Each takes the raw line as its only argument.
def get_taxon_line(x):
    """Return the taxonomy stored in column 8 of an OTU line as a dict.

    The column is split on "; " and every entry is expected to look like
    ``<rank>__<name>``. Each entry is split on its first "__" only, so a
    name that itself contains "__" is kept intact.

    Raises:
        IndexError: if the line has fewer than nine columns or an entry
            contains no "__" separator.
    """
    taxonomy = {}
    for entry in x.split("\t")[8].split("; "):
        parts = entry.split("__", 1)
        taxonomy[parts[0]] = parts[1]
    return taxonomy


def get_relevant_samples(x):
    """Return the sample IDs in column 9 of an OTU line that are in ``all_samples``.

    Column 9 is a comma-separated list of sample IDs. Surrounding
    whitespace is stripped from every ID, so that a trailing newline on the
    last column does not prevent the final ID from matching. Order and
    duplicates of the line are preserved.

    Raises:
        IndexError: if the line has fewer than ten columns.
    """
    listed = (smp.strip() for smp in x.split("\t")[9].split(","))
    return [smp for smp in listed if smp in all_samples]


def get_sequence(x):
    """Return the second tab-separated field (index 1) of an OTU line.

    The notebook uses this field as the OTU sequence ("otu_seq").
    """
    return x.split("\t")[1]


In [66]:
# Build the OTU-by-sample occurrence matrix and store it as a CSV file:
# one row per OTU found in at least one soil sample, one column per sample.
import csv

OTU_MATRIX = "../data/emp_soil_preciseloc_otu_matrix.csv"
# Refuse to overwrite an existing matrix; delete the file to regenerate it.
assert not os.path.exists(OTU_MATRIX), f"{OTU_MATRIX} already exists"

soil_sample_ids = set(all_samples)  # constant-time membership tests

# The input is opened first so that a missing OTU file does not leave an
# empty matrix behind. newline="" lets the csv module control line endings.
with open(OTU_INFO_FILE, "r") as ofile, open(OTU_MATRIX, "w", newline="") as csv_file:
    csv_writer = csv.DictWriter(csv_file, ["otu_seq"] + all_samples)
    csv_writer.writeheader()

    # Stream the OTU file instead of loading every line into memory.
    for line in ofile:
        fields = line.rstrip("\n").split("\t")
        if len(fields) < 10:
            continue  # no sample-list column on this line

        # Count exact sample IDs from the comma-separated list in column 9.
        # Matching substrings of the whole line would also hit IDs that are
        # part of longer IDs or that happen to occur in another column.
        counts = {}
        for smp in fields[9].split(","):
            smp = smp.strip()
            if smp in soil_sample_ids:
                counts[smp] = counts.get(smp, 0) + 1
        if not counts:
            continue  # OTU does not occur in any soil sample

        # Every sample gets a column; samples without this OTU are 0.
        sample_dict = {smp: counts.get(smp, 0) for smp in all_samples}
        sample_dict["otu_seq"] = get_sequence(line)
        csv_writer.writerow(sample_dict)

Since the taxonomic annotation could also be interesting, we read the OTU file again and store the taxonomy of each OTU in a second CSV file.

In [64]:
# Store the taxonomic annotation of every OTU that occurs in at least one
# soil sample in a CSV file with one column per taxonomic rank.
OTU_INFO = "../data/emp_soil_preciseloc_otu_taxonomy.csv"
# Refuse to overwrite an existing file; delete it to regenerate.
assert not os.path.exists(OTU_INFO), f"{OTU_INFO} already exists"
TAXLEVEL = ["k", "p", "c", "o", "f", "g", "s"]

soil_sample_ids = set(all_samples)  # constant-time membership tests

# The input is opened first so that a missing OTU file does not leave an
# empty output behind. newline="" lets the csv module control line endings.
with open(OTU_INFO_FILE, "r") as ofile, open(OTU_INFO, "w", newline="") as csv_file:
    csv_writer = csv.DictWriter(csv_file, ["otu_seq"] + TAXLEVEL)
    csv_writer.writeheader()

    # Stream the OTU file instead of loading every line into memory.
    for line in ofile:
        fields = line.rstrip("\n").split("\t")
        if len(fields) < 10:
            continue  # no sample-list column on this line

        # Keep the OTU only if an exact soil sample ID appears in the
        # comma-separated sample list (column 9); substring matches on the
        # whole line would also accept IDs embedded in longer IDs.
        if not any(smp.strip() in soil_sample_ids for smp in fields[9].split(",")):
            continue

        if "Unclassified" in fields[8]:
            # No rank information: write empty strings for every level.
            # The dict has to be bound to tax_dict, otherwise the row would
            # reuse the previous OTU's taxonomy (or fail on the first line).
            tax_dict = {ch: "" for ch in TAXLEVEL}
        else:
            tax_dict = get_taxon_line(line)
        tax_dict["otu_seq"] = get_sequence(line)
        csv_writer.writerow(tax_dict)