In [1]:
import pandas as pd

# Location of the VIBRANT annotation table (tab-separated values)
file_path = '/home/abelardoacm/Storage/Abelardo/projects/mini-devel/results/biome_fasta_files/biome0/VIBRANT_DTRs_20kb/VIBRANT_results_DTRs_20kb/VIBRANT_annotations_DTRs_20kb.tsv'

# Load the table; read_table uses a tab as its default field separator
VIBRANT_annotations = pd.read_table(file_path)

# Preview the first five rows
VIBRANT_annotations.head(5)


Unnamed: 0,protein,scaffold,KO,AMG,KO name,KO evalue,KO score,KO v-score,Pfam,Pfam name,Pfam evalue,Pfam score,Pfam v-score,VOG,VOG name,VOG evalue,VOG score,VOG v-score
0,DTR_028342_1,DTR_028342,K06909,,xtmB; phage terminase large subunit,1.1e-33,116.2,4.5,,,,,,VOG00381,"sp|P54308|TERL_BPSPP Terminase, large subunit gp2",3.2000000000000003e-27,94.5,8.34
1,DTR_028342_2,DTR_028342,,,,,,,PF05133.14,"Phage portal protein, SPP1 Gp6-like",1.1e-80,271.0,4.16,VOG00213,sp|Q05220|PORTL_BPML5 Portal protein,2.1e-61,207.5,10.0
2,DTR_028342_3,DTR_028342,,,,,,,,,,,,,,,,
3,DTR_028342_4,DTR_028342,,,,,,,,,,,,VOG00241,sp|P19727|CAPSB_BPT7 Minor capsid protein,6.5e-18,64.5,10.0
4,DTR_028342_5,DTR_028342,,,,,,,,,,,,,,,,


In [3]:
def filter_df_by_strings(df, strings_to_exclude):
    """
    Drop every row in which at least one cell equals one of the given values.

    Matching is done on whole cell values (exact, case-sensitive equality),
    not on substrings: excluding "minor" drops a row holding the cell "minor"
    but keeps a row holding the cell "minor capsid protein".

    Args:
        df (pandas.DataFrame): The DataFrame to filter. It is not modified.
        strings_to_exclude (iterable): Cell values that disqualify a row.

    Returns:
        pandas.DataFrame: The rows of `df` in which no cell equals any of the
        given values, keeping the original index labels and columns. A frame
        without rows is returned as an empty frame with the same columns.
    """
    # Vectorized membership test instead of a row-wise df.apply(..., axis=1):
    # besides being slow, the row-wise form does not produce a boolean mask
    # for a frame without rows, so inverting it with `~` raised a TypeError.
    mask = df.isin(list(strings_to_exclude)).any(axis=1)

    # Keep only the rows where no cell matched
    return df[~mask]


def filter_VIBRANT_annotations_mcp(df, mcp_terms=None, false_terms=None):
    """
    Filters a DataFrame down to rows whose annotations look like major coat/capsid proteins (MCP).

    Only columns with 'name' in their titles are searched. A row is kept when at least
    one of those cells contains an MCP term and, in that same cell, none of the false
    positive terms. Matching is by substring and is case-insensitive (both the cell text
    and the terms are lower-cased). Cells are converted with ``str`` first, so a missing
    value is searched as its text representation (e.g. "nan").

    Rows that pass are then handed to ``filter_df_by_strings`` together with
    ``false_terms`` for a final exclusion pass.

    Args:
        df (pandas.DataFrame): The DataFrame to filter. It is not modified.
        mcp_terms (list, optional): Terms related to MCP. Defaults to a predefined list if None.
        false_terms (list, optional): Terms to identify and exclude false positives.
            Defaults to a predefined list if None.

    Returns:
        pandas.DataFrame: The rows of `df` likely related to MCPs, with the original index
        labels and columns. If `df` has no rows, or no row matches, an empty frame with the
        same columns is returned.
    """
    if mcp_terms is None:
        mcp_terms = ["mcp", "major", "coat", "capsid"]
    if false_terms is None:
        false_terms = [
            "minor", "fiber", "tropism", "non-structural", "envelope protein",
            "replicase", "polymerase", "regulatory protein", "accessory protein", "tail", "assembly",
            "protease", "encapsidation"
        ]

    # The cell text is lower-cased before matching, so the terms must be lower-cased
    # as well; otherwise a caller-supplied term such as "Capsid" could never match.
    mcp_lower = [str(term).lower() for term in mcp_terms]
    false_lower = [str(term).lower() for term in false_terms]

    # Identify columns with 'name' in their title; str() tolerates non-string column labels
    name_columns = [col for col in df.columns if 'name' in str(col).lower()]

    def contains_any(text, terms):
        # Boolean Series: True where the cell text contains at least one term as a
        # plain (non-regex) substring. An empty term list yields all False.
        hit = pd.Series(False, index=text.index)
        for term in terms:
            hit = hit | text.str.contains(term, regex=False).astype(bool)
        return hit

    # Build the row mask column by column with vectorized string operations rather than
    # a row-wise df.apply(..., axis=1); this also stays a proper boolean mask when `df`
    # has no rows, so the columns are preserved in that case.
    relevant = pd.Series(False, index=df.index)
    for col in name_columns:
        text = df[col].astype(str).str.lower()
        relevant = relevant | (contains_any(text, mcp_lower) & ~contains_any(text, false_lower))

    filtered_df = df[relevant]

    # Nothing matched (or nothing to search): return the empty result directly instead of
    # running the exclusion pass on a frame without rows.
    if filtered_df.empty:
        return filtered_df

    # NOTE(review): filter_df_by_strings compares whole cell values, so this pass only
    # drops rows that have a cell exactly equal to one of `false_terms`; it does not
    # remove rows where a false term appears inside a longer annotation in another
    # column. Confirm whether substring exclusion was intended here.
    filtered_df = filter_df_by_strings(filtered_df, false_terms)

    return filtered_df

# Keep only the annotation rows that look like major capsid proteins,
# using the function's default MCP and false-positive term lists
filtered_VIBRANT_annotations = filter_VIBRANT_annotations_mcp(df=VIBRANT_annotations)

# Preview the first five rows of the filtered DataFrame
filtered_VIBRANT_annotations.head(5)

Unnamed: 0,protein,scaffold,KO,AMG,KO name,KO evalue,KO score,KO v-score,Pfam,Pfam name,Pfam evalue,Pfam score,Pfam v-score,VOG,VOG name,VOG evalue,VOG score,VOG v-score
26,DTR_156067_2,DTR_156067,K06904,,K06904; uncharacterized protein,5e-41,140.3,2.41,PF05065.13,Phage capsid family,7.2e-39,133.0,5.12,VOG00633,sp|O64210|CAPSD_BPMD2 Probable major capsid pr...,1.5e-45,155.6,10.0
213,DTR_381167_12,DTR_381167,,,,,,,PF05065.13,Phage capsid family,2.1e-37,128.2,5.12,VOG00633,sp|O64210|CAPSD_BPMD2 Probable major capsid pr...,1.4e-37,129.4,10.0
543,DTR_885259_17,DTR_885259,,,,,,,,,,,,VOG02473,sp|G9M952|CAPSD_BPPS4 Major capsid protein,1.4e-91,306.7,1.19
580,DTR_316164_22,DTR_316164,,,,,,,PF05065.13,Phage capsid family,2.3e-20,72.3,5.12,VOG00633,sp|O64210|CAPSD_BPMD2 Probable major capsid pr...,1.9e-16,59.7,10.0
618,DTR_166448_17,DTR_166448,,,,,,,PF05065.13,Phage capsid family,9.8e-18,63.7,5.12,VOG00633,sp|O64210|CAPSD_BPMD2 Probable major capsid pr...,2.8000000000000003e-23,82.2,10.0
