In [2]:
import os
import pandas as pd
from itertools import takewhile

def merge_and_transpose_corrected(input_dir, output_file, gtdb=False):
    """
    Merge per-sample MetaPhlAn profiles into one samples-by-clades table.

    Walks ``input_dir`` recursively and reads every file whose name ends with
    ``.rmhost.all_metaphlan_bugs_list.tsv``. From each file the
    ``Metaphlan2_Analysis`` column is taken and labelled with the sample ID
    (the file name up to its first dot). The profiles are joined on the clade
    index, missing values are replaced by 0, and the table is transposed so
    that each row is one sample. The result is written tab-separated with the
    sample IDs in a leading ``SampleID`` column.

    Directories and files are visited in sorted order so that the row order of
    the output does not depend on the filesystem.

    Files are skipped (with a printed message) when they have no leading
    ``#`` header lines, cannot be parsed, or lack the ``Metaphlan2_Analysis``
    column. If the first header line differs between files, a message is
    printed and the function returns without writing anything.

    :param input_dir: Directory containing the input .tsv files.
    :param output_file: Path to save the transposed and corrected output file.
    :param gtdb: Accepted for interface compatibility; not used by this function.
    :return: None.
    """
    profile_suffix = '.rmhost.all_metaphlan_bugs_list.tsv'
    value_column = 'Metaphlan2_Analysis'
    versions_seen = set()
    profiles = []

    for root, dirs, files in os.walk(input_dir):
        # Sorting ``dirs`` in place fixes the order in which os.walk descends.
        dirs.sort()
        for file in sorted(files):
            if not file.endswith(profile_suffix):
                continue
            file_path = os.path.join(root, file)

            # Collect the leading '#' lines; the context manager closes the
            # handle immediately instead of leaking one handle per profile.
            with open(file_path) as handle:
                headers = [line.strip()
                           for line in takewhile(lambda line: line.startswith('#'), handle)]

            # An empty or header-less file must not abort the whole merge.
            if not headers:
                print(f"Warning: no '#' header lines found in file {file}. Skipping this file.")
                continue

            # The first header line is used as the version tag of the profile.
            versions_seen.add(headers[0])
            if len(versions_seen) > 1:
                print('merge_metaphlan_tables: Profiles from different versions of MetaPhlAn detected. Please ensure all files use the same MetaPhlAn version.')
                return

            # The last header line holds the column names; split only on the
            # first '#' so a '#' inside a column name is preserved.
            names = headers[-1].split('#', 1)[1].strip().split('\t')
            try:
                df = pd.read_csv(file_path, sep='\t', skiprows=len(headers),
                                 names=names, index_col=0)

                if value_column in df.columns:
                    sample_id = file.split('.')[0]  # file name up to the first dot
                    profiles.append(df[value_column].rename(sample_id))
                else:
                    print(f"Warning: 'Metaphlan2_Analysis' column not found in file {file}. Skipping this file.")
            except Exception as e:
                # Best effort: report the unreadable file and keep merging the rest.
                print(f"Error reading {file}: {e}")

    if not profiles:
        print("No valid files were found to merge.")
        return

    # Join on the clade index (absent clades become 0), then flip so that
    # samples are rows and clades are columns.
    table = pd.concat(profiles, axis=1).fillna(0).T

    # Naming the index makes reset_index() emit a 'SampleID' first column.
    table.index.name = 'SampleID'
    table.reset_index().to_csv(output_file, sep='\t', index=False)

# Where the per-sample MetaPhlAn profiles live, and where the merged
# 'Metaphlan2_Analysis.tsv' table is written
input_dir = 'F:/notebook/data-nas/sample857'
output_file = 'F:/notebook/data-nas/Metaphlan2_Analysis.tsv'

# Build the merged, transposed table from every profile under input_dir
merge_and_transpose_corrected(
    input_dir=input_dir,
    output_file=output_file,
    gtdb=False,
)


In [3]:
# Transform the sampleID into uid
import pandas as pd

# Define the file paths
input_file = 'F:/notebook/data-nas/Metaphlan2_Analysis.tsv'
id_table_file = 'F:/notebook/data-nas/IDtable.csv'
output_file = 'F:/notebook/data-nas/nametrans.tsv'

# Load the merged MetaPhlAn table. SampleID is read as text so the join key
# keeps its exact spelling (e.g. leading zeros) and has the same dtype in
# both tables.
metaphlan_df = pd.read_csv(input_file, sep='\t', dtype={'SampleID': str})

# Load the SampleID -> uid lookup table; uid is read as text so it is written
# back unchanged even when some samples have no match.
id_table_df = pd.read_csv(id_table_file, dtype={'SampleID': str, 'uid': str})

# Left join: every profile row is kept, whether or not it has a uid
merged_df = pd.merge(metaphlan_df, id_table_df, on='SampleID', how='left')

# Samples absent from the ID table come back with a missing uid. Report them
# and keep their original SampleID rather than blanking the identifier.
unmatched_ids = merged_df.loc[merged_df['uid'].isna(), 'SampleID']
if not unmatched_ids.empty:
    print(f"Warning: no uid found for {len(unmatched_ids)} sample(s); "
          f"keeping original SampleID: {unmatched_ids.tolist()}")

# Replace SampleID by uid where one exists, then drop the helper column
merged_df['SampleID'] = merged_df['uid'].fillna(merged_df['SampleID'])
merged_df = merged_df.drop(columns=['uid'])

# Save the modified dataframe to the output file
merged_df.to_csv(output_file, sep='\t', index=False)

print(f"File saved as {output_file}")

File saved as F:/notebook/data-nas/nametrans.tsv
