In [32]:
import pandas as pd
import argparse
import os

pd.set_option('future.no_silent_downcasting', True)

path_faire = '~/sedtrap/metadata/FAIRe-NOAA_noaa-aoml-ngmt_20260109.xlsx'
absolute_path_sequences = '/data/sequences/MichiganState'
output_directory = '.'
column_suffix = 'pcr_plate_id'
direction_suffix = 'right'
num_chars_suffix = 8
delimiter_suffix = '_'
         
df_project = pd.read_excel(path_faire, sheet_name='projectMetadata', comment='#')
df_sample = pd.read_excel(path_faire, sheet_name='sampleMetadata', comment='#')
df_exptrun = pd.read_excel(path_faire, sheet_name='experimentRunMetadata', comment='#')

df_project_first_2_cols_removed = df_project.iloc[:, 2:]
df_project_first_2_cols_removed.set_index('term_name', inplace=True)
df_project_wide = df_project_first_2_cols_removed.transpose()
project_id = df_project_wide['project_id'].iloc[0]
first_row_values = df_project_wide.loc['project_level']
df_project_filled = df_project_wide.fillna(first_row_values)

seq_run_ids = df_exptrun['seq_run_id'].unique()
assay_names = df_exptrun['assay_name'].unique()
if set(assay_names) != set(df_exptrun['assay_name'].dropna()):
    raise ValueError("Inconsistent assay names between projectMetadata and experimentRunMetadata.")

# Metadata file for each assay_name
dict_assay_short = {}
for assay_name in df_exptrun['assay_name'].unique()[::-1]:
    # For each assay, merge experiment run metadata with sample metadata and project metadata
    metadata = df_exptrun[df_exptrun['assay_name'] == assay_name].merge(df_sample, on='samp_name', how='left', suffixes=('', '_SAMPLE'))
    metadata = pd.merge(metadata, df_project_filled.loc[assay_name].to_frame().transpose(), how='cross', suffixes=('', '_PROJECT'))
    metadata.dropna(axis=1, how='all', inplace=True)
    metadata.rename(columns={'samp_name': 'sample_name'}, inplace=True)
    
    # Add suffix to sample names that are duplicated within the assay using column_suffix (if provided), direction_suffix, num_chars_suffix, and delimiter_suffix
    if column_suffix is not None:
        duplicated_samples = metadata['sample_name'][metadata['sample_name'].duplicated(keep=False)].unique()
        for sample in duplicated_samples:
            sample_rows = metadata[metadata['sample_name'] == sample]
            for idx, row in sample_rows.iterrows():
                if direction_suffix == 'right':
                    suffix = f'{delimiter_suffix}{row[column_suffix][-num_chars_suffix:]}'  # Placeholder suffix
                else:               
                    suffix = f'{delimiter_suffix}{row[column_suffix][:num_chars_suffix]}'  # Placeholder suffix
                #new_sample_name = f"{row['sample_name']}{suffix}"
                #metadata.at[idx, 'sample_name'] = new_sample_name
    
    # Short assay name to use for output files
    gene = df_project_filled.loc[assay_name]['target_gene'].split(' ')[0]
    subfragment = df_project_filled.loc[assay_name]['target_subfragment']
    subfragment_part = subfragment.split(' ')[0].replace('-', '').replace('_', '') if isinstance(subfragment, str) and subfragment.strip() else ''
    if subfragment_part:
        dict_assay_short[assay_name] = f"{gene}-{subfragment_part}"
    else:
        dict_assay_short[assay_name] = gene
    metadata.to_csv(os.path.join(output_directory, f"{project_id}_{dict_assay_short[assay_name]}_metadata.tsv"), sep='\t', index=False)
    print(f"Generated metadata file {project_id}_{dict_assay_short[assay_name]}_metadata.tsv")

# Manifest file for each seq_run_id
for seq_run_id in seq_run_ids:
    for assay_name in df_exptrun[df_exptrun['seq_run_id'] == seq_run_id]['assay_name'].unique():
        manifest = df_exptrun[(df_exptrun['seq_run_id'] == seq_run_id) & (df_exptrun['assay_name'] == assay_name)][['samp_name', 'filename', 'filename2']]
        manifest['filename'] = manifest['filename'].apply(lambda x: os.path.join(absolute_path_sequences, f'{seq_run_id}', str(x)))
        manifest['filename2'] = manifest['filename2'].apply(lambda x: os.path.join(absolute_path_sequences, f'{seq_run_id}', str(x)))
        manifest.columns = ['sample-id', 'forward-absolute-filepath', 'reverse-absolute-filepath']
        manifest.to_csv(os.path.join(output_directory, f"{seq_run_id}_{dict_assay_short[assay_name]}_manifest.tsv"), sep='\t', index=False)
        print(f"Generated manifest file {seq_run_id}_{dict_assay_short[assay_name]}_manifest.tsv")


Generated metadata file noaa-aoml-ngmt_18S-V9_metadata.tsv
Generated metadata file noaa-aoml-ngmt_16S-V4V5_metadata.tsv
Generated manifest file 20220912_16S-Amplicon_PE250_16S-V4V5_manifest.tsv
Generated manifest file 20231110_AND14513_Amplicon_PE250_16S-V4V5_manifest.tsv
Generated manifest file 20240325_16S-Amplicon_PE250_16S-V4V5_manifest.tsv
Generated manifest file 20250423_16S-Amplicon_PE250_16S-V4V5_manifest.tsv
Generated manifest file 20250728_THO16883_16S-Amplicon_PE250_16S-V4V5_manifest.tsv
Generated manifest file 20220916_18S-Amplicon_PE250_18S-V9_manifest.tsv
Generated manifest file 20231107_AND14514_Amplicon_PE150_18S-V9_manifest.tsv
Generated manifest file 20240401_18S-Amplicon_PE150_18S-V9_manifest.tsv
Generated manifest file 20250423_18S-Amplicon_PE150_18S-V9_manifest.tsv
Generated manifest file 20250730_THO16884_18S-Amplicon_PE150_18S-V9_manifest.tsv


In [33]:
suffix

'_20250626'

In [16]:
metadata[metadata['sample_name'].str.contains('NGMT_GMT1_Sed_550m_10A')]

Unnamed: 0,sample_name,assay_name,pcr_plate_id,pcr_well_position,pcr_well_number,lib_id,seq_run_id,filename,filename2,checksum_filename,...,sequencing_location,platform,instrument,seq_kit,adapter_forward,adapter_reverse,lib_screen,checksum_method,seq_method_additional,ship_crs_expocode
27,NGMT_GMT1_Sed_550m_10A,Bacteria-16S-V4V5-Parada,GMT-1 Sed 16S 2022-09,D4,28,GMT1_16S_10A_S28,20220912_16S-Amplicon_PE250,GMT1_16S_10A_S28_L001_R1_001.fastq.gz,GMT1_16S_10A_S28_L001_R2_001.fastq.gz,d678bd9054c9e861f2e21f3b19f050d7,...,Michigan State University Research Technology ...,ILLUMINA,Illumina MiSeq [OBI_0002003],MiSeq v2 500,ACACTGACGACATGGTTCTACA,TACGGTAGCAGAGACTTGGTCT,The Genomics Core performed secondary PCR usin...,MD5,PhiX control library was spiked in at 10%.,32PE | 320G
49,NGMT_GMT1_Sed_550m_10A_Extracted2306,Bacteria-16S-V4V5-Parada,GMT-2 Sed 16S 2023-08,D1,4,GMT1_dup_16S_10A_S4,20231110_AND14513_Amplicon_PE250,GMT1_dup_16S_10A_S4_L001_R1_001.fastq.gz,GMT1_dup_16S_10A_S4_L001_R2_001.fastq.gz,3c544b57b6949fd3ca628f2596c03dc9,...,Michigan State University Research Technology ...,ILLUMINA,Illumina MiSeq [OBI_0002003],MiSeq v2 500,ACACTGACGACATGGTTCTACA,TACGGTAGCAGAGACTTGGTCT,The Genomics Core performed secondary PCR usin...,MD5,PhiX control library was spiked in at 10%.,32PE | 320G
50,NGMT_GMT1_Sed_550m_10A_Extracted2306,Bacteria-16S-V4V5-Parada,GMT-3 Sed 16S 2023-07-28,F6,46,GMT1_dup2_16S_10A_S91,20231110_AND14513_Amplicon_PE250,GMT1_dup2_16S_10A_S91_L001_R1_001.fastq.gz,GMT1_dup2_16S_10A_S91_L001_R2_001.fastq.gz,4c762a1dafa1b6094495bf1c9ebcc76f,...,Michigan State University Research Technology ...,ILLUMINA,Illumina MiSeq [OBI_0002003],MiSeq v2 500,ACACTGACGACATGGTTCTACA,TACGGTAGCAGAGACTTGGTCT,The Genomics Core performed secondary PCR usin...,MD5,PhiX control library was spiked in at 10%.,32PE | 320G
192,NGMT_GMT1_Sed_550m_10A_Extracted2306,Bacteria-16S-V4V5-Parada,GMT-2 Sed 16S Redo 2024-01-26,D1,4,GMT1_dup_16S_10A_S58,20240325_16S-Amplicon_PE250,GMT1_dup_16S_10A_S58_L001_R1_001.fastq.gz,GMT1_dup_16S_10A_S58_L001_R2_001.fastq.gz,404331aa25b1ac9374de1c00eac5838f,...,Michigan State University Research Technology ...,ILLUMINA,Illumina MiSeq [OBI_0002003],MiSeq v2 500,ACACTGACGACATGGTTCTACA,TACGGTAGCAGAGACTTGGTCT,The Genomics Core performed secondary PCR usin...,MD5,PhiX control library was spiked in at 10%.,32PE | 320G
193,NGMT_GMT1_Sed_550m_10A_Extracted2401,Bacteria-16S-V4V5-Parada,GMT-4 Sed + GMT-5 CTD 16S 2024-01-25,C1,3,GMT1_dup3_16S_10A_S3,20240325_16S-Amplicon_PE250,GMT1_dup3_16S_10A_S3_L001_R1_001.fastq.gz,GMT1_dup3_16S_10A_S3_L001_R2_001.fastq.gz,f5481be9c865798c1a07e4f15329d1c8,...,Michigan State University Research Technology ...,ILLUMINA,Illumina MiSeq [OBI_0002003],MiSeq v2 500,ACACTGACGACATGGTTCTACA,TACGGTAGCAGAGACTTGGTCT,The Genomics Core performed secondary PCR usin...,MD5,PhiX control library was spiked in at 10%.,32PE | 320G
291,NGMT_GMT1_Sed_550m_10A_Extracted2412,Bacteria-16S-V4V5-Parada,GMT-5 Sed 16S 2024-12-18,H8,64,GMT1_10A_dup4_16S_S64,20250423_16S-Amplicon_PE250,GMT1_10A_dup4_16S_S64_L001_R1_001.fastq.gz,GMT1_10A_dup4_16S_S64_L001_R2_001.fastq.gz,58e9115f5c418a301623e380546194b3,...,Michigan State University Research Technology ...,ILLUMINA,Illumina MiSeq [OBI_0002003],MiSeq v2 500,ACACTGACGACATGGTTCTACA,TACGGTAGCAGAGACTTGGTCT,The Genomics Core performed secondary PCR usin...,MD5,PhiX control library was spiked in at 10%.,32PE | 320G
380,NGMT_GMT1_Sed_550m_10A_Extracted2306,Bacteria-16S-V4V5-Parada,GMT-2 Sed 16S Redo 2025-06-26,D1,4,GMT1_10A_dup_16S_S65,20250728_THO16883_16S-Amplicon_PE250,GMT1_10A_dup_16S_S65_L001_R1_001.fastq.gz,GMT1_10A_dup_16S_S65_L001_R2_001.fastq.gz,2852e9d675100a17471bd0d04dcbd07c,...,Michigan State University Research Technology ...,ILLUMINA,Illumina MiSeq [OBI_0002003],MiSeq v2 500,ACACTGACGACATGGTTCTACA,TACGGTAGCAGAGACTTGGTCT,The Genomics Core performed secondary PCR usin...,MD5,PhiX control library was spiked in at 10%.,32PE | 320G
381,NGMT_GMT1_Sed_550m_10A_Extracted2506,Bacteria-16S-V4V5-Parada,GMT-6 Sed + GMT-7 CTD 16S 2025-06-26,D7,52,GMT1_10A_dup5_16S_S52,20250728_THO16883_16S-Amplicon_PE250,GMT1_10A_dup5_16S_S52_L001_R1_001.fastq.gz,GMT1_10A_dup5_16S_S52_L001_R2_001.fastq.gz,41e619ead2a35132ee2309cc443391b3,...,Michigan State University Research Technology ...,ILLUMINA,Illumina MiSeq [OBI_0002003],MiSeq v2 500,ACACTGACGACATGGTTCTACA,TACGGTAGCAGAGACTTGGTCT,The Genomics Core performed secondary PCR usin...,MD5,PhiX control library was spiked in at 10%.,32PE | 320G


In [23]:
ex = 'ABCDEFGHIJ'

In [24]:
ex[:3]

'ABC'

In [27]:
ex[-3:]

'HIJ'