## edna2qiime.ipynb

In [1]:
import pandas as pd

In [2]:
# Directory that holds the three metadata workbooks. Kept in one place so the
# notebook can be pointed at another copy of the data by editing a single line.
DATA_DIR = '/Users/luke.thompson/node/test-data'

# Load the first sheet (sheet_name=0) from each Excel file.
# comment='#' is passed for the sample and library workbooks so that pandas
# ignores content following a '#' when parsing those sheets.
studyMetadata = pd.read_excel(f'{DATA_DIR}/studyMetadata_gomecc4.xlsx', sheet_name=0)
sampleMetadata = pd.read_excel(f'{DATA_DIR}/sampleMetadata_gomecc4.xlsx', sheet_name=0, comment='#')
libraryMetadata = pd.read_excel(f'{DATA_DIR}/libraryMetadata_gomecc4.xlsx', sheet_name=0, comment='#')

In [3]:
# Reshape the study metadata so every field_name becomes a column:
# take the last four columns, index them by 'field_name', then transpose.
# The result has one row per remaining original column and one column per field.
last_four_columns = studyMetadata.iloc[:, -4:].set_index('field_name')
studyMetadataT = last_four_columns.T

In [4]:
# Take the project identifier from the first row of the 'project_id' column.
# The rows of studyMetadataT are labelled by name (e.g. 'study_level'), so the
# first row has to be requested by position; .iloc[0] does that explicitly
# instead of relying on the deprecated integer-as-position fallback of Series[0].
project_id = studyMetadataT['project_id'].iloc[0]

In [5]:
# Collect the distinct sequencing-run ids and assay names listed in libraryMetadata
# (each is the array returned by Series.unique() for that column)
seq_run_ids, assay_names = (
    libraryMetadata[key_column].unique()
    for key_column in ('seq_run_id', 'assay_name')
)

In [6]:
# Sanity check: the assay names found in libraryMetadata should be exactly the
# non-missing 'assay_name' values recorded in the study metadata.
study_assays = set(studyMetadataT['assay_name'].dropna())
library_assays = set(assay_names)
library_assays == study_assays

True

In [None]:
# Build one metadata table per (seq_run_id, assay_name) pair:
#   1. select the library rows for that run and assay,
#   2. join them to sampleMetadata on 'samp_name',
#   3. add the study metadata fields (the assay-specific value when present,
#      otherwise the 'study_level' value),
#   4. drop columns that are entirely empty and write the table to
#      '<project_id>.<seq_run_id>.<assay_name>.tsv' in the working directory.
# Every table is also kept in merged_dfs, keyed by (runID, assay).
merged_dfs = {}

for runID in seq_run_ids:
    for assay in assay_names:
        # Library rows that belong to the current run and assay
        in_run_and_assay = (libraryMetadata['seq_run_id'] == runID) & (libraryMetadata['assay_name'] == assay)
        libraryMetadata_subset = libraryMetadata[in_run_and_assay]
        # Join to the sample metadata on the shared 'samp_name' key (pandas' default inner join)
        merged = pd.merge(libraryMetadata_subset, sampleMetadata, on='samp_name')
        # Nothing to export for this run/assay combination
        if merged.empty:
            continue

        # Resolve one value per study metadata field and decide where it goes
        new_columns = {}
        for column in studyMetadataT.columns:
            # The assay-specific value overrides the study-level one when it is present
            value = studyMetadataT.loc[assay, column]
            if pd.isna(value):
                value = studyMetadataT.loc['study_level', column]
            if pd.isna(value):
                # No value at either level; such a column would be all-NaN and dropped below anyway
                continue
            if column in merged.columns:
                # The field already exists in the merged table (e.g. 'assay_name' comes from
                # libraryMetadata). Appending it again would write a duplicate column header,
                # so keep the per-row values and only fill the gaps with the study value.
                merged[column] = merged[column].where(merged[column].notna(), value)
            else:
                new_columns[column] = value

        # Broadcast the remaining study fields to every row and append them as new columns
        new_columns_df = pd.DataFrame(new_columns, index=merged.index)
        merged = pd.concat([merged, new_columns_df], axis=1)
        # Drop columns that contain only NaN values
        merged = merged.dropna(axis=1, how='all')
        # Keep the table in memory and save it as a tab-delimited file without the index column
        merged_dfs[(runID, assay)] = merged
        merged.to_csv(f'{project_id}.{runID}.{assay}.tsv', sep='\t', index=False)