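# Metadata Microterroir

Builds the 16S and ITS sample metadata tables by merging the FGCZ demultiplexing sheet with our own library-prep sample sheet, then checks sample IDs and per-sample frequencies in the denoised 16S feature table.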

In [1]:
# Directory that holds the raw metadata CSVs; the relative file names used by
# the following cells resolve against it once it is the working directory.
wd = '/home/lfloerl/microterroir/Microbiome/Metadata'
# Switch the notebook's working directory to `wd` (IPython expands $wd).
%cd $wd 

/home/lfloerl/microterroir/Microbiome/Metadata


In [2]:
import pandas as pd
import biom
import re
import numpy as np

## Import raw metadata files

In [3]:
def _last_two_dash_fields(ids):
    """Return the last two '-'-separated fields of each string, re-joined with '-'.

    `ids` is a pandas Series of strings. Entries that are missing, or that
    have fewer than two fields, come out as NaN.
    """
    fields = ids.str.split('-')
    return fields.str[-2] + '-' + fields.str[-1]


# FGCZ metadata: the library ID is taken from the tail of the 'Name' column
md_pooling = pd.read_csv('RawMD-Demux.csv')
md_pooling['Library_ID'] = _last_two_dash_fields(md_pooling['Name'])

# our own sample file: one library ID per amplicon, derived the same way
md_samples = pd.read_csv('RawMD-LibraryPrep-LP3.csv')
md_samples['Library_ID_16S'] = _last_two_dash_fields(md_samples['unique_16S_ID'])
md_samples['Library_ID_ITS'] = _last_two_dash_fields(md_samples['unique_ITS_ID'])

## Make basic 16S / ITS metadata files

In [4]:
# Concentrations at or below zero are replaced by this small positive value.
MIN_CONC_NG_UL = 0.001


def build_amplicon_metadata(pooling, samples, library_id_col, conc_col, floor=MIN_CONC_NG_UL):
    """Build the per-sample metadata table for one amplicon (16S or ITS).

    Parameters
    ----------
    pooling : pandas.DataFrame
        Demultiplexing sheet; must contain 'Name' and 'Library_ID'.
    samples : pandas.DataFrame
        Library-prep sample sheet; must contain `library_id_col`, `conc_col`
        and the descriptive columns selected below.
    library_id_col : str
        Column of `samples` that is matched against `pooling['Library_ID']`.
    conc_col : str
        Name of the concentration column to keep.
    floor : float
        Replacement for concentration values <= 0.

    Returns
    -------
    pandas.DataFrame
        Inner-merged table restricted to the selected columns, indexed by the
        sequencing 'Name' with the index labelled 'id'.
    """
    merged = pd.merge(pooling, samples, left_on='Library_ID', right_on=library_id_col)
    columns = ['Name', 'SAMPLE_NAME', 'CTRL', 'Project', 'DNA_extraction_plate', 'Plate', conc_col]
    # Explicit copy: assigning a column on a bare `df[[...]]` selection is the
    # chained-assignment pattern that triggers SettingWithCopyWarning.
    md = merged[columns].copy()
    # mask() replaces only where the condition is True, so NaN stays NaN,
    # exactly like an element-wise `0.001 if x <= 0 else x`.
    md[conc_col] = md[conc_col].mask(md[conc_col] <= 0, floor)
    md = md.set_index('Name')
    md.index.name = 'id'
    return md


# 16S
md_16S = build_amplicon_metadata(md_pooling, md_samples, 'Library_ID_16S', 'Bacterial conc. (ng/uL)')
md_16S.to_csv('16S_md.tsv', sep='\t')

# ITS
md_ITS = build_amplicon_metadata(md_pooling, md_samples, 'Library_ID_ITS', 'Fungal conc. (ng/uL)')
md_ITS.to_csv('ITS_md.tsv', sep='\t')

In [5]:
# Distinct project labels present in the merged 16S metadata
md_16S.loc[:, 'Project'].unique()

array(['SoilColonialization', 'BotrytizedWine', 'NOT-USE', 'WINE',
       'Lavaux', 'PNA-test', 'SamplingBenchmarking', 'Valais'],
      dtype=object)

## Check sample IDs and per-sample frequency

In [1]:
# Print the current working directory of the notebook's shell.
# NOTE(review): execution counts restart at In [1] here, so this section was
# run in a fresh kernel - the import cell must be re-run before the cells below.
!pwd

/home/lfloerl/microterroir/Microbiome/Metadata


In [4]:
# Export the DADA2 feature-table artifact (.qza) into a directory of the same
# name, so that the contained BIOM file can be loaded directly below.
!qiime tools export --input-path /home/lfloerl/cloud/lfloerl/Microterroir/artifacts/16S/bac-dada2/dada-table-220-120-ee4-fa4.qza  --output-path /home/lfloerl/cloud/lfloerl/Microterroir/artifacts/16S/bac-dada2/dada-table-220-120-ee4-fa4

[32mExported /home/lfloerl/cloud/lfloerl/Microterroir/artifacts/16S/bac-dada2/dada-table-220-120-ee4-fa4.qza as BIOMV210DirFmt to directory /home/lfloerl/cloud/lfloerl/Microterroir/artifacts/16S/bac-dada2/dada-table-220-120-ee4-fa4[0m
[0m

In [5]:
# List the export directory to confirm which files the export produced.
!ls /home/lfloerl/cloud/lfloerl/Microterroir/artifacts/16S/bac-dada2/dada-table-220-120-ee4-fa4

feature-table.biom


In [9]:
# Read the exported BIOM table and transpose it so that rows can be joined
# against the metadata index and summed per sample.
feature_table = biom.load_table("/home/lfloerl/cloud/lfloerl/Microterroir/artifacts/16S/bac-dada2/dada-table-220-120-ee4-fa4/feature-table.biom")
feature_df = feature_table.to_dataframe().T

# 16S metadata table, indexed by its 'id' column
metadata = pd.read_csv("/home/lfloerl/microterroir/Microbiome/Metadata/16S_md.tsv", sep="\t", index_col="id")

# Attach SAMPLE_NAME to every row (left join on the row index) and use it as
# the row label from here on
merged_df = feature_df.join(metadata["SAMPLE_NAME"]).set_index("SAMPLE_NAME")

# Row-wise total over all feature columns = frequency per sample
merged_df['Frequency'] = merged_df.sum(axis=1)

# Keep only the Frequency column, largest first, and write it to CSV
frequency_df = merged_df[['Frequency']].sort_values('Frequency', ascending=False)
frequency_df.to_csv("16S_denoised-220-120-realxed-samples_frequency.csv")

In [35]:
# Display the per-sample frequency table (sorted by Frequency, descending).
frequency_df

SAMPLE_NAME,Frequency
Grenchen_2021-06-11_soil_1A_ref,1584272.0
Grenchen_2021-07-21_soil_1A,1156457.0
Grenchen_2021-07-21_soil_1C,1009814.0
Grenchen_2022-03-08_1A,735721.0
Erlenbach_2022-10-20_3,722455.0
...,...
21DL05S_2751_wine,0
Lavaux_2023-09-20_must_Plot12_RepC,0
Lavaux_2023-08-24_leaf_Plot5_washed,0
Lavaux_2021-09-30_must_Plot20_Row13/Vine20_ID215,0


In [29]:
# Add up the per-sample frequencies to get the total for the whole table
total_frequency = frequency_df.loc[:, 'Frequency'].sum()
print(f"Total overall frequency: {total_frequency}")

Total overall frequency: 25447899.0
