In [8]:
from Bio import SeqIO
import pandas as pd
import os

In [9]:
# ---------------------------------------------------------------------------
# User configuration: edit the values in this cell before running the notebook.
# ---------------------------------------------------------------------------

# Input paths. Directory paths must keep their trailing '/', because other
# cells build file paths from them by plain string concatenation.
project_root = '/home/abelardoacm/Storage/Abelardo/projects/mini-devel/'
# Derived from project_root so that relocating the project only requires
# changing one value; set an unrelated absolute path here if needed.
VIBRANT_output_path = project_root + 'results/1_biome_fasta_files/biome0/VIBRANT_DTRs_20kb/'
JGI_global_metadata_csv_path = project_root + 'data/1_environmental_seqs_from_JGI_IMGvir/DTRs_20kb.csv'

# Filtering parameters.
min_quality_by_usr = "complete"  # one of: 'low', 'medium', 'high', 'complete'
min_contig_length_by_usr = 0  # in bp
max_contig_length_by_usr = 5000  # in bp
mcp_terms_by_usr = None  # (list, optional): Terms related to MCP. Defaults to a predefined list if None.
false_terms_by_usr = None  # (list, optional): Terms to identify and exclude MCP false positives. Defaults to a predefined list if None.

In [10]:
# Create the directory that receives the filtered results. exist_ok=True makes
# the cell safe to re-run and avoids the race between a separate existence
# check and the creation call.
output_directory = VIBRANT_output_path + '2_filtered_VIBRANT_output/'
os.makedirs(output_directory, exist_ok=True)

# Work from the project root from here on.
# NOTE(review): presumably needed so the `from src....` imports in other cells
# resolve against the project root — confirm.
os.chdir(project_root)

# The VIBRANT folder name is the last non-empty component of the output path,
# so the path may be given with or without a trailing '/'.
VIBRANT_output_path_elements = VIBRANT_output_path.split('/')
VIBRANT_folder_name = next((element for element in reversed(VIBRANT_output_path_elements) if element), None)
if VIBRANT_folder_name is None:
    # Fail with a clear message instead of an AttributeError on None below.
    raise ValueError(f"VIBRANT_output_path has no non-empty path component: {VIBRANT_output_path!r}")

# The input name is everything after the first '_' of the folder name
# (e.g. 'VIBRANT_DTRs_20kb' -> 'DTRs_20kb'); empty if there is no '_'.
VIBRANT_input_name = VIBRANT_folder_name.partition('_')[2]

In [11]:
# JGI global metadata table.
JGI_global_metadata = pd.read_csv(JGI_global_metadata_csv_path)

# The output numbers used below follow the VIBRANT output explanations; refer
# to them for details on each file.

# Output 38: the VIBRANT annotation file (tab-separated).
output38_file_path = f"{VIBRANT_output_path}VIBRANT_results_{VIBRANT_input_name}/VIBRANT_annotations_{VIBRANT_input_name}.tsv"
VIBRANT_annotations = pd.read_csv(output38_file_path, sep='\t')

# Output 42: the list of predicted genome quality and type (tab-separated).
output42_file_path = f"{VIBRANT_output_path}VIBRANT_results_{VIBRANT_input_name}/VIBRANT_genome_quality_{VIBRANT_input_name}.tsv"
VIBRANT_genome_quality = pd.read_csv(output42_file_path, sep='\t')

# Output 23: all encoded proteins among identified phages (FASTA).
# SeqIO.parse yields records lazily and can be iterated only once; re-run this
# cell before iterating over VIBRANT_phages_proteins a second time.
output23_file_path = f"{VIBRANT_output_path}VIBRANT_phages_{VIBRANT_input_name}/{VIBRANT_input_name}.phages_combined.faa"
VIBRANT_phages_proteins = SeqIO.parse(output23_file_path, "fasta")


In [12]:
from src.auxiliary_functions.VIBRANT_filtering_functions import filter_VIBRANT_annotations_by_mcp

# Reduce the VIBRANT annotations to MCP-related rows, using the MCP terms and
# false-positive terms chosen in the configuration cell.
filtered_VIBRANT_annotations = filter_VIBRANT_annotations_by_mcp(
    VIBRANT_annotations,
    mcp_terms=mcp_terms_by_usr,
    false_terms=false_terms_by_usr,
)
filtered_VIBRANT_annotations


Unnamed: 0,protein,scaffold,KO,AMG,KO name,KO evalue,KO score,KO v-score,Pfam,Pfam name,Pfam evalue,Pfam score,Pfam v-score,VOG,VOG name,VOG evalue,VOG score,VOG v-score
26,DTR_156067_2,DTR_156067,K06904,,K06904; uncharacterized protein,5.000000e-41,140.3,2.41,PF05065.13,Phage capsid family,7.200000e-39,133.0,5.12,VOG00633,sp|O64210|CAPSD_BPMD2 Probable major capsid pr...,1.500000e-45,155.6,10.00
213,DTR_381167_12,DTR_381167,,,,,,,PF05065.13,Phage capsid family,2.100000e-37,128.2,5.12,VOG00633,sp|O64210|CAPSD_BPMD2 Probable major capsid pr...,1.400000e-37,129.4,10.00
543,DTR_885259_17,DTR_885259,,,,,,,,,,,,VOG02473,sp|G9M952|CAPSD_BPPS4 Major capsid protein,1.400000e-91,306.7,1.19
580,DTR_316164_22,DTR_316164,,,,,,,PF05065.13,Phage capsid family,2.300000e-20,72.3,5.12,VOG00633,sp|O64210|CAPSD_BPMD2 Probable major capsid pr...,1.900000e-16,59.7,10.00
618,DTR_166448_17,DTR_166448,,,,,,,PF05065.13,Phage capsid family,9.800000e-18,63.7,5.12,VOG00633,sp|O64210|CAPSD_BPMD2 Probable major capsid pr...,2.800000e-23,82.2,10.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
93452,DTR_282867_5,DTR_282867,,,,,,,,,,,,VOG01505,REFSEQ capsid protein,2.700000e-13,49.7,1.00
93501,DTR_545965_14,DTR_545965,,,,,,,PF05065.13,Phage capsid family,1.500000e-41,141.9,5.12,VOG00633,sp|O64210|CAPSD_BPMD2 Probable major capsid pr...,2.000000e-45,155.3,10.00
93625,DTR_890275_1,DTR_890275,,,,,,,PF05065.13,Phage capsid family,4.200000e-14,51.8,5.12,VOG00633,sp|O64210|CAPSD_BPMD2 Probable major capsid pr...,6.100000e-20,71.3,10.00
94005,DTR_348973_11,DTR_348973,,,,,,,PF11651.8,P22 coat protein - gene protein 5,2.700000e-12,45.3,1.00,VOG00241,sp|P19727|CAPSB_BPT7 Minor capsid protein,2.500000e-33,115.1,10.00


In [13]:
from src.auxiliary_functions.VIBRANT_filtering_functions import filter_VIBRANT_genome_quality

# Reduce the VIBRANT genome quality table according to the minimum quality
# chosen in the configuration cell.
filtered_VIBRANT_genome_quality = filter_VIBRANT_genome_quality(
    VIBRANT_genome_quality,
    min_quality_by_usr,
)
filtered_VIBRANT_genome_quality

Unnamed: 0,scaffold,type,Quality
5711,DTR_889600,lysogenic,complete circular
5712,DTR_263482,lysogenic,complete circular
5713,DTR_484729,lysogenic,complete circular
5714,DTR_767073,lysogenic,complete circular
5715,DTR_408256,lysogenic,complete circular
...,...,...,...
11400,DTR_272404,lytic,complete circular
11401,DTR_123191,lytic,complete circular
11402,DTR_630980,lytic,complete circular
11403,DTR_614252,lytic,complete circular


In [14]:
from src.auxiliary_functions.VIBRANT_filtering_functions import filter_scaffolds_by_contig_length

# Reduce the JGI global metadata table according to the contig length bounds
# chosen in the configuration cell.
filtered_JGI_global_metadata = filter_scaffolds_by_contig_length(
    JGI_global_metadata,
    min_contig_length=min_contig_length_by_usr,
    max_contig_length=max_contig_length_by_usr,
)
filtered_JGI_global_metadata

Unnamed: 0,genome_id,data_source,contig_length,dtr_length,genes_viral,genes_plasmid,genes_microbial,genes_unknown,biome1,biome2,biome3,gca_name
1,DTR_003403,IMG/M,4496,91,2,0,0,4,Engineered,Wastewater,Nutrient removal,
4,DTR_003989,IMG/M,4748,50,3,0,0,3,Host-Associated,Mammals,Digestive system,GCA_001503475.1
11,DTR_005113,IMG/M,4691,75,1,0,0,6,Aquatic,Marine,Intertidal zone,
12,DTR_005266,IMG/M,3026,98,1,0,0,2,Host-Associated,Invertebrates,Digestive system,
19,DTR_006007,IMG/M,4651,79,1,0,0,7,Aquatic,Marine,Neritic zone,
...,...,...,...,...,...,...,...,...,...,...,...,...
7629,DTR_893354,GOV2,4751,55,1,0,0,8,Aquatic,Marine,,
7630,DTR_893395,GOV2,4777,50,1,0,0,7,Aquatic,Marine,,
7634,DTR_893440,GOV2,3416,55,1,0,0,7,Aquatic,Marine,,GCA_003651745.1
7641,DTR_893610,GOV2,4726,55,1,0,0,11,Aquatic,Marine,,


In [15]:
def filter_and_write_MCP_sequences(filtered_VIBRANT_annotations, VIBRANT_phages_proteins, output_path):
    """Write the protein records named in the filtered annotations to a FASTA file.

    Args:
        filtered_VIBRANT_annotations: Table with a 'protein' column holding the
            IDs of the records to keep (accessed as ``table['protein']``).
        VIBRANT_phages_proteins: Iterable of sequence records exposing an ``id``
            attribute, e.g. the result of ``SeqIO.parse(path, "fasta")``. It is
            iterated exactly once, so a one-shot iterator is exhausted by this call.
        output_path: Destination handed to ``SeqIO.write``.

    Records are written in the order in which they occur in
    ``VIBRANT_phages_proteins``; duplicate IDs in the annotations have no effect.
    """
    # Use a set so each membership test is O(1) rather than a scan over every
    # annotated ID for every record in the FASTA input.
    mcp_ids = set(filtered_VIBRANT_annotations['protein'].tolist())

    # Keep only the records whose ID is among the annotated MCP IDs.
    mcp_records = [record for record in VIBRANT_phages_proteins if record.id in mcp_ids]

    # Write the retained records in FASTA format.
    SeqIO.write(mcp_records, output_path, "fasta")


