In [96]:
import os
import subprocess


def run_prodigal(prodigal_path, input_fna, output_fasta):
    # Run Prodigal using subprocess with the provided path
    # 
    prodigal_command = f'"{prodigal_path}" -a "{output_fasta}.faa" -d "{output_fasta}.ffn" -n -i "{input_fna}" -o "{output_fasta}.gff"'
    subprocess.run(prodigal_command, shell=True)

def make_AAseq():
    # Replace the path and file names with your actual paths and file names
    #Keep in mind, Prodigal on Windows will have a .exe extension
    prodigal_path = input("Please input directory path to Prodigal (e.g., '/directory/path/to/prodigal'):")
    input_fna = input("Please input directory path to the .fna file(s) (e.g., '/directory/path/to/something/like/GCA_001953955.1.fna'):")
       
    # Extract the directory and base name of the input file
    input_directory, input_base_name = os.path.split(input_fna)
    
    # Generate the output file name with the new extension ".faa"
    output_fasta = os.path.join(input_directory, os.path.splitext(input_base_name)[0])
    output_faa = f'"{output_fasta}.faa"'
    
    
    try:
        # Run Prodigal on the input file
        run_prodigal(prodigal_path, input_fna, output_fasta)
        print(f"> Prodigal successfully run on {input_fna}. Output .faa file saved as {output_fasta}.faa")
    except Exception as e:
        print(f"> Error: {e}")
    
    
make_AAseq()



Please input directory path to Prodigal (e.g., '/directory/path/to/prodigal'): /home/padawan/anaconda3/pkgs/prodigal-2.6.3-hec16e2b_4/bin/prodigal
Please input directory path to the .fna file(s) (e.g., '/directory/path/to/something/like/GCA_001953955.1.fna'): /home/padawan/kofamscan/bin/kofam_scan-1.3.0/GCF_003812925.1_ASM381292v1_genomic.fna


-------------------------------------
PRODIGAL v2.6.3 [February, 2016]         
Univ of Tenn / Oak Ridge National Lab
Doug Hyatt, Loren Hauser, et al.     
-------------------------------------
Request:  Single Genome, Phase:  Training
Reading in the sequence(s) to train...5879112 bp seq created, 55.13 pct GC
Locating all potential starts and stops...279905 nodes
Looking for GC bias in different frames...frame bias scores: 0.71 0.16 2.12
Building initial set of genes to train from...done!
Creating coding model and scoring nodes...done!
Examining upstream regions and training starts...done!
-------------------------------------
Request:  Single Genome, Phase:  Gene Finding
Finding genes in sequence #1 (5864574 bp)...done!


> Prodigal successfully run on /home/padawan/kofamscan/bin/kofam_scan-1.3.0/GCF_003812925.1_ASM381292v1_genomic.fna. Output .faa file saved as /home/padawan/kofamscan/bin/kofam_scan-1.3.0/GCF_003812925.1_ASM381292v1_genomic.faa


Finding genes in sequence #2 (6078 bp)...done!
Finding genes in sequence #3 (8424 bp)...done!


In [101]:
def run_kofamscan(kofamscan_path, kolist, profiles, output_ko, input_faa):
    # Change to KOFamScan's directory, and run using fixed values. Adjust as needed, particularly number of CPUs used.
    kofamscan_command = f'cd {kofamscan_path} ; ./exec_annotation -k {kolist} -p {profiles} --cpu 1 -E 0.00001 -o {output_ko} {input_faa} 1>G12.ko.log 2>G12.ko.err &'
    subprocess.run(kofamscan_command, shell=True)
    print("> Translation to Unix:", kofamscan_command)
        
def analyze_proteins():
    kofamscan_path = input("Please input directory path to the KOFamScan folder containing the exec_annotation file (e.g., '/directory/path/to/kofam_scan-1.3.0'):")
    kolist = input("Please enter the directory path to the KO_list (e.g., 'path/to/db/ko_list'")
    profiles = input("Please enter the directory path to the HMM profiles database (e.g., 'path/to/db/profiles'")
    output_ko = input("Please name the output file (e.g., 'GCA_001953955.1.ko'): ")
    input_faa = input("Please input the path to the .faa file(s) (e.g., '/directory/path/to/something/like/GCA_001953955.1.faa'):")
    
    
    try:
        # Run KOFamScan on the input file
        run_kofamscan(kofamscan_path, kolist, profiles, output_ko, input_faa)
        print(f"> KOFamScan was run on {input_faa}. Output saved as {output_ko}")
        print("If you\'re unable to find your .ko file, check the folder where your exec_annotation is saved!")

    except Exception as e:
        print(f"> Error: {e}")

analyze_proteins()


Please input directory path to the KOFamScan folder containing the exec_annotation file (e.g., '/directory/path/to/kofam_scan-1.3.0'): /home/padawan/kofamscan/bin/kofam_scan-1.3.0
Please enter the directory path to the KO_list (e.g., 'path/to/db/ko_list' /home/padawan/kofamscan/db/ko_list
Please enter the directory path to the HMM profiles database (e.g., 'path/to/db/profiles' /home/padawan/kofamscan/db/profiles
Please name the output file (e.g., 'GCA_001953955.1.ko'):  GCF_003812925.1_ASM381292v1_genomic.ko
Please input the path to the .faa file(s) (e.g., '/directory/path/to/something/like/GCA_001953955.1.faa'): /home/padawan/kofamscan/bin/kofam_scan-1.3.0/GCF_003812925.1_ASM381292v1_genomic.faa


> Translation to Unix: cd /home/padawan/kofamscan/bin/kofam_scan-1.3.0 ; ./exec_annotation -k /home/padawan/kofamscan/db/ko_list -p /home/padawan/kofamscan/db/profiles --cpu 6 -E 0.00001 -o GCF_003812925.1_ASM381292v1_genomic.ko /home/padawan/kofamscan/bin/kofam_scan-1.3.0/GCF_003812925.1_ASM381292v1_genomic.faa 1>G12.ko.log 2>G12.ko.err &
> KOFamScan was run on /home/padawan/kofamscan/bin/kofam_scan-1.3.0/GCF_003812925.1_ASM381292v1_genomic.faa. Output saved as GCF_003812925.1_ASM381292v1_genomic.ko
If you're unable to find your .ko file, check the folder where your exec_annotation is saved!


In [42]:
def install_pandas():
    # Install pandas locally if you don't already have it, ONLY NEEDS TO BE DONE ONCE!!
    pandasinstall = "pip3 install pandas"
    subprocess.run(pandasinstall, shell=True)
install_pandas()



In [106]:
import pandas as pd
from io import StringIO

# Specify the path to .ko file - Check folder where exec_annotation is saved!
output_ko = input("Please input the path to the .ko file (e.g., '/directory/path/to/something/like/GCA_001953955.1.ko'):")

# Read the data line by line, extracting relevant information
data_lines = []
with open(output_ko, 'r') as file:
    for line in file:
        # Skip lines that are only the separator
        if line.strip() == "#-------------------- ------ ------- ------ --------- ---------------------":
            continue

        # Remove space following asterisk
        line = line.replace('* ', '*')

        # Split the line into fields
        fields = line.split()

        # Check if the line has the expected number of fields
        if len(fields) >= 6:
            # Extract relevant information from the line
            gene_name = fields[0]
            ko_number = fields[1]
            threshold = fields[2]
            score = fields[3]
            e_value = fields[4]
            ko_definition = ' '.join(fields[5:])

            # Append the information to the data_lines list
            data_lines.append([gene_name, ko_number, threshold, score, e_value, ko_definition])

# Create a DataFrame from the collected information
data = pd.DataFrame(data_lines, columns=["Gene Name", "KO#", "Threshold", "Score", "E-value", "KO Definition"])

# Sort the DataFrame by "Gene Name"
data.sort_values(by="Gene Name", inplace=True)

# Left-align the content of the DataFrame
styled_data = data.style.set_properties(**{'text-align': 'left'})

# Display the styled DataFrame
styled_data



Please input the path to the .ko file (e.g., '/directory/path/to/something/like/GCA_001953955.1.ko'): /home/padawan/kofamscan/bin/kofam_scan-1.3.0/GCF_003812925.1_ASM381292v1_genomic.ko
Please input the path to save the CSV file (e.g., '/directory/path/to/something/like/output.csv'): /home/padawan/kofamscan/bin/kofam_scan-1.3.0/GCF_003812925.1_ASM381292v1_genomic.ko.csv


In [None]:
# Save the DataFrame to a CSV file
output_csv = input("Please input the path to save the CSV file (e.g., '/directory/path/to/something/like/output.csv'):")
data.to_csv(output_csv, index=False)
