In [91]:
from Bio import SeqIO
import pandas as pd
import os

# Locations of the input data and of the directory that will hold the per-biome FASTA files.
path_to_data = "/home/abelardoacm/Storage/Abelardo/projects/mini-devel/data/1_environmental_seqs_from_JGI_IMGvir/"
fasta_file_path = os.path.join(path_to_data, "DTRs_20kb.fna")
biomes_csv = os.path.join(path_to_data, "DTRs_20kb.csv")
output_dir = os.path.join(path_to_data, "biome_fasta_files")

# Iterator over the records of the input FASTA file.
# NOTE(review): split_sequences_by_biome re-opens the FASTA file itself and does
# not consume this iterator — confirm whether anything else still needs it.
sequences = SeqIO.parse(fasta_file_path, "fasta")

# Load the biome table from the CSV file.
csv_file = biomes_csv
biomes_df = pd.read_csv(csv_file)

# In columns biome3, biome2 and biome1, replace NaN with "unassigned".
biomes_df = biomes_df.assign(
    **{
        biome_column: biomes_df[biome_column].fillna("unassigned")
        for biome_column in ("biome3", "biome2", "biome1")
    }
)


In [92]:
def identify_corresponding_biomes_from_level_3(biome_level_3, biomes_df):
    """
    Identify the corresponding biomes from higher levels for a given biome level 3.

    The parents are taken from the FIRST row of biomes_df whose 'biome3' value
    matches; if the same level-3 name occurs under several parents, the other
    parents are ignored.

    Parameters:
    biome_level_3 (str): The biome level 3 to identify corresponding biomes for.
        A missing value (NaN/None) matches rows whose 'biome3' is missing.
    biomes_df (DataFrame): The DataFrame containing the biome information
        (columns 'biome1', 'biome2' and 'biome3').

    Returns:
    str: 'biome1_biome2_biome3' of the first matching row. Missing parts are
        written as "unassigned"; non-string parts are converted with str().

    Raises:
    IndexError: If no row of biomes_df has the requested 'biome3' value.

    Example (illustrative values):
        identify_corresponding_biomes_from_level_3("Wetlands", biomes_df)
        -> 'Aquatic_Marine_Wetlands'
    """
    level_values = biomes_df['biome3']
    # NaN never compares equal to anything (itself included), so a missing
    # lookup value has to be matched with isna() instead of ==.
    if pd.isna(biome_level_3):
        matches = level_values.isna()
    else:
        matches = level_values == biome_level_3

    corresponding_biomes = biomes_df.loc[matches, ['biome1', 'biome2', 'biome3']]
    if corresponding_biomes.empty:
        # Same exception type as the former `.values[0]` failure, but with a
        # message that names the offending value.
        raise IndexError(f"biome3 value {biome_level_3!r} not found in biomes_df")

    first_match = corresponding_biomes.iloc[0]
    # str.join() rejects NaN/non-string parts, so normalise each part first.
    return '_'.join(
        'unassigned' if pd.isna(part) else str(part) for part in first_match
    )

def identify_corresponding_biomes_from_level_2(biome_level_2, biomes_df):
    """
    Identify the corresponding biomes from higher levels for a given biome level 2.

    The parent is taken from the FIRST row of biomes_df whose 'biome2' value
    matches; if the same level-2 name occurs under several level-1 biomes, the
    other parents are ignored.

    Parameters:
    biome_level_2 (str): The biome level 2 to identify corresponding biomes for.
        A missing value (NaN/None) matches rows whose 'biome2' is missing.
    biomes_df (DataFrame): The DataFrame containing the biome information
        (columns 'biome1' and 'biome2').

    Returns:
    str: 'biome1_biome2' of the first matching row. Missing parts are written
        as "unassigned"; non-string parts are converted with str().

    Raises:
    IndexError: If no row of biomes_df has the requested 'biome2' value.

    Example (illustrative values):
        identify_corresponding_biomes_from_level_2("Marine", biomes_df)
        -> 'Aquatic_Marine'
    """
    level_values = biomes_df['biome2']
    # NaN never compares equal to anything (itself included), so a missing
    # lookup value has to be matched with isna() instead of ==.
    if pd.isna(biome_level_2):
        matches = level_values.isna()
    else:
        matches = level_values == biome_level_2

    corresponding_biomes = biomes_df.loc[matches, ['biome1', 'biome2']]
    if corresponding_biomes.empty:
        # Same exception type as the former `.values[0]` failure, but with a
        # message that names the offending value.
        raise IndexError(f"biome2 value {biome_level_2!r} not found in biomes_df")

    first_match = corresponding_biomes.iloc[0]
    # str.join() rejects NaN/non-string parts, so normalise each part first.
    return '_'.join(
        'unassigned' if pd.isna(part) else str(part) for part in first_match
    )

In [99]:

def split_sequences_by_biome(biome_level, fasta_file_path, output_dir, biomes_df):
    """
    Split the sequences by biome and write them to separate FASTA files.

    One file named "<biome_level>_<full biome>.fasta" is written per distinct
    biome, where <full biome> joins the parent levels and the level itself
    with "_" (e.g. "biome2_Aquatic_Marine.fasta"). Rows are grouped by this
    full path, so a name that occurs under several parents (for example
    "unassigned") produces one file per parent instead of a single mixed file
    labelled with whichever parent happened to come first.

    Missing (NaN) biome values are labelled "unassigned".

    Parameters:
    biome_level (str): The biome level to split by (e.g., "biome1"). For
        "biome2" and "biome3" the parent levels are part of the grouping; any
        other column name is used on its own.
    fasta_file_path (str): The path to the input FASTA file.
    output_dir (str): The directory to save the output FASTA files
        (created if it does not exist).
    biomes_df (DataFrame): Biome table with a "genome_id" column and the
        biome column(s) required for biome_level.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Columns that make up the full biome path for the requested level.
    hierarchy = ["biome1", "biome2", "biome3"]
    if biome_level in hierarchy:
        level_columns = hierarchy[:hierarchy.index(biome_level) + 1]
    else:
        level_columns = [biome_level]

    # Map each full biome path to the genome ids annotated with it. A dict
    # keeps the biomes in order of first appearance in the table.
    genome_ids_by_biome = {}
    annotated_rows = zip(
        biomes_df[level_columns].itertuples(index=False, name=None),
        biomes_df["genome_id"],
    )
    for biome_parts, genome_id in annotated_rows:
        full_biome = "_".join(
            "unassigned" if pd.isna(part) else str(part) for part in biome_parts
        )
        # Ids are compared against FASTA record ids below, so store them as str.
        genome_ids_by_biome.setdefault(full_biome, set()).add(str(genome_id))

    for full_biome, genome_ids in genome_ids_by_biome.items():
        fasta_output_path = os.path.join(output_dir, f"{biome_level}_{full_biome}.fasta")
        print(f"Processing {biome_level}: {full_biome} into file: {fasta_output_path}")

        # The FASTA file is re-read for every biome so that only one output
        # handle is open at a time.
        with open(fasta_file_path, "r") as fasta_file, \
                open(fasta_output_path, "w") as output_file:
            matching_records = (
                record
                for record in SeqIO.parse(fasta_file, "fasta")
                if record.id in genome_ids
            )
            SeqIO.write(matching_records, output_file, "fasta")


# Split the sequences for the selected levels of the biome hierarchy.
# Only "biome2" is enabled for this run; add "biome3" and/or "biome1" to the
# list to export those levels as well.
levels_to_split = ["biome2"]
for level_to_split in levels_to_split:
    split_sequences_by_biome(level_to_split, fasta_file_path, output_dir, biomes_df=biomes_df)



Processing biome2: Engineered_Wastewater into file: /home/abelardoacm/Storage/Abelardo/projects/mini-devel/data/1_environmental_seqs_from_JGI_IMGvir/biome_fasta_files/biome2_Engineered_Wastewater.fasta
Processing biome2: Host-Associated_Mammals into file: /home/abelardoacm/Storage/Abelardo/projects/mini-devel/data/1_environmental_seqs_from_JGI_IMGvir/biome_fasta_files/biome2_Host-Associated_Mammals.fasta
Processing biome2: Aquatic_Marine into file: /home/abelardoacm/Storage/Abelardo/projects/mini-devel/data/1_environmental_seqs_from_JGI_IMGvir/biome_fasta_files/biome2_Aquatic_Marine.fasta
Processing biome2: Host-Associated_Invertebrates into file: /home/abelardoacm/Storage/Abelardo/projects/mini-devel/data/1_environmental_seqs_from_JGI_IMGvir/biome_fasta_files/biome2_Host-Associated_Invertebrates.fasta
Processing biome2: Engineered_Biotransformation into file: /home/abelardoacm/Storage/Abelardo/projects/mini-devel/data/1_environmental_seqs_from_JGI_IMGvir/biome_fasta_files/biome2_Engin