In [1]:
import pandas as pd

In [2]:
# --- Run configuration: the three locations used by the rest of the notebook ---
# Excel workbook that is read with pd.read_excel (projectMetadata, sampleMetadata
# and experimentRunMetadata sheets).
path_faire = (
    '/Users/luke.thompson/Google Drive/My Drive/NOAA/mbon-seus/metadata/'
    'FAIRe-ODE_noaa-aoml-seusmbon_20250909.xlsx'
)
# Directory prefix that is combined with each seq_run_id and sequence filename
# when the manifest files are built.
absolute_path_sequences = '/work/orion/projects/datasets/aomlomics/sequences/MichiganState'
# Directory the generated .tsv files are written to ('.' = current working directory).
# Commented-out alternative: '/Users/luke.thompson/Google Drive/My Drive/NOAA/mbon-seus/metadata/metadata'
output_directory = '.'

In [3]:
# Load the three metadata sheets from the workbook in one read_excel call.
# Passing a list of sheet names returns a dict of DataFrames keyed by sheet name.
# comment='#' makes pandas ignore everything after a '#' on a line.
_faire_sheets = pd.read_excel(
    path_faire,
    sheet_name=['projectMetadata', 'sampleMetadata', 'experimentRunMetadata'],
    comment='#',
)
df_project = _faire_sheets['projectMetadata']
df_sample = _faire_sheets['sampleMetadata']
df_exptrun = _faire_sheets['experimentRunMetadata']

In [4]:
# Reshape projectMetadata from "one row per term" to "one column per term":
# drop the first 2 columns, use the 'term_name' column as the index, then transpose.
# After the transpose the remaining sheet columns (including 'project_level') are the rows.
# The calls are chained so that no slice of df_project is modified in place.
df_project_first_2_cols_removed = df_project.iloc[:, 2:].set_index('term_name')
df_project_wide = df_project_first_2_cols_removed.transpose()
# Get the project_id from the 'project_level' row, selected by label (not by position)
# so it always comes from the same row that is used for filling below.
project_id = df_project_wide.loc['project_level', 'project_id']
# Fill missing values in the other rows with the project-level value of the same term.
# fillna with a Series aligns the Series index (term names) with the column labels.
first_row_values = df_project_wide.loc['project_level']
df_project_filled = df_project_wide.fillna(first_row_values)

In [5]:
# From df_exptrun get the unique values of the 'seq_run_id' and 'assay_name' columns
seq_run_ids = df_exptrun['seq_run_id'].unique()
assay_names = df_exptrun['assay_name'].unique()
# Throw error if the assay names are not the same in projectMetadata and experimentRunMetadata.
# The project side is taken from the row labels of df_project_filled (minus the
# 'project_level' row), because the metadata cell selects each assay with
# df_project_filled.loc[assay_name]; a name missing there would fail later anyway.
project_assay_names = set(df_project_filled.index) - {'project_level'}
exptrun_assay_names = set(df_exptrun['assay_name'].dropna())
# A blank assay_name in experimentRunMetadata is treated as an inconsistency too
has_blank_assay_name = bool(df_exptrun['assay_name'].isna().any())
if has_blank_assay_name or exptrun_assay_names != project_assay_names:
    # str() before sorting so that mixed label types cannot break the error report
    only_in_project = sorted(map(str, project_assay_names - exptrun_assay_names))
    only_in_exptrun = sorted(map(str, exptrun_assay_names - project_assay_names))
    raise ValueError(
        "Inconsistent assay names between projectMetadata and experimentRunMetadata."
        f" Only in projectMetadata: {only_in_project};"
        f" only in experimentRunMetadata: {only_in_exptrun};"
        f" blank assay_name in experimentRunMetadata: {has_blank_assay_name}."
    )

In [None]:
# Write one metadata .tsv per assay_name. The filename carries an abbreviated
# "<gene>[-<subfragment>]" tag derived from the assay's target_gene / target_subfragment.
# Steps per assay: select its experiment-run rows, left-join the sample metadata on
# samp_name, attach the assay's (filled) project metadata row to every record via a
# cross join, drop columns that are entirely empty, and save as tab-separated text.
for assay_name in df_exptrun['assay_name'].unique():
    assay_runs = df_exptrun[df_exptrun['assay_name'] == assay_name]
    metadata = assay_runs.merge(df_sample, on='samp_name', how='left')

    # Single project-metadata row for this assay, as a one-row DataFrame
    assay_project_row = df_project_filled.loc[assay_name]
    metadata = pd.merge(metadata, assay_project_row.to_frame().transpose(), how='cross')
    metadata = metadata.dropna(axis=1, how='all')

    # First space-separated token of target_gene
    gene = assay_project_row['target_gene'].split(' ')[0]

    # First space-separated token of target_subfragment with '-' and '_' removed;
    # empty when the value is not a string or is blank
    subfragment = assay_project_row['target_subfragment']
    if isinstance(subfragment, str) and subfragment.strip():
        subfragment_part = subfragment.split(' ')[0].replace('-', '').replace('_', '')
    else:
        subfragment_part = ''

    target_gene_subfragment = f"{gene}-{subfragment_part}" if subfragment_part else gene
    metadata.to_csv(f"{output_directory}/seusmbon_{target_gene_subfragment}_metadata.tsv", sep='\t', index=False)

In [None]:
# Create manifest file for each seq_run_id
# Each manifest has three columns: sample-id, forward-absolute-filepath and
# reverse-absolute-filepath, and is written as <seq_run_id>_manifest.tsv.
for seq_run_id in seq_run_ids:
    # Directory prefix shared by every sequence file of this run
    run_directory = f"{absolute_path_sequences}/{seq_run_id}"
    run_rows = df_exptrun.loc[df_exptrun['seq_run_id'] == seq_run_id]
    # Build the manifest as a new DataFrame instead of assigning columns into a
    # filtered selection of df_exptrun (which is the SettingWithCopy pattern).
    manifest = pd.DataFrame({
        'sample-id': run_rows['samp_name'],
        'forward-absolute-filepath': run_directory + '/' + run_rows['filename'],
        'reverse-absolute-filepath': run_directory + '/' + run_rows['filename2'],
    })
    manifest.to_csv(f"{output_directory}/{seq_run_id}_manifest.tsv", sep='\t', index=False)