# Neural Network
Understanding the selectivity of kmers in phages, against bacteria targets\
Each phage has a confirmed binding to a bacteria. The kmers of the phage can therefore be denoted with True/False for binding to a bacteria.\
Using MinHash sketches, the kmers are compressed, while its uniqueness is still preserved.\
With MinHashed kmers however, we cant "go back" and extract which kmers were most important.

## Prepping data 


In [121]:
import sourmash, os
from tqdm import tqdm
raw_data_path = "../raw_data/"
data_prod_path = "../data_prod/"
SKETCH_DIR = data_prod_path + "phage_minhash_18/"
K = 18 #kmer size; equal to 6 aa.

def binarize_host_range(host_range_dict):
    binary_dict = {}
    for host, val in host_range_dict.items():
        if pd.isna(val) or val == 0:
            binary_dict[host] = 0
        else:
            binary_dict[host] = 1
    return binary_dict

def short_species_name(full_name):
    if len(full_name.split(" ")) < 2:
        return full_name
    else:
        return full_name.split(" ")[0][0] + "." + full_name.split(" ")[1]

### Phage kmer data - MinHashed sketches

Prepping hostrange data

In [114]:
import pandas as pd

# Load the host range data from the Excel file
file_path = raw_data_path + "phagehost_KU/Hostrange_data_all_crisp_iso.xlsx"
sheet_name = "sum_hostrange"  # replace with your sheet name
host_range_df = pd.read_excel(
    file_path,
    sheet_name='sum_hostrange',
    header=1).drop(columns=["isolate ID", "Hostrange_analysis", "Phage"])

# Create a lookup dictionary for bacteria species based on Seq ID - dict
bact_lookup = host_range_df[["Seq ID", "Species"]].drop_duplicates(subset=['Seq ID']).set_index('Seq ID').to_dict()['Species']
print("Bacteria lookup dictionary created with", len(bact_lookup), "entries.")
print(bact_lookup)

# Make Seq ID to phage name mapping - pandas df
host_range_df = host_range_df.drop(columns=["Species"]).set_index('Seq ID').rename_axis('phage').reset_index()
display(host_range_df.head())

# Convert the host range data into a dictionary
host_range_data = {}
for index, row in host_range_df.iterrows():
    cleaned_index = row[1:].index.str.replace(" ", "")
    curr_bact_series = row[1:]
    curr_bact_series.index = cleaned_index
    host_range_data[row['phage']] = curr_bact_series.to_dict()

host_range_data["J10_21_reoriented"]

Bacteria lookup dictionary created with 110 entries.
{'J14_21_reoriented': 'Acinetobacter calcoaceticus', 'J53_21_reoriented': 'Acinetobacter calcoaceticus', 'J105_22_reoriented': 'Chishuiella', 'J46_21_reoriented': 'Chryseobacterium', 'J50_21_reoriented': 'Chryseobacterium', 'J2264_1_22_KMC_reoriented': 'Chryseobacterium', 'J2264_3_22_KMC_reoriented': 'Chryseobacterium', 'J63_22_reoriented': 'Chryseobacterium', 'J64_22_reoriented': 'Chryseobacterium', 'J1_21_reoriented': 'Lelliottia', 'J91_22_reoriented': 'Lelliottia', 'J51_21_reoriented': 'Morganella morganii', 'J57_21_reoriented': 'Morganella morganii', 'J10_21_reoriented': 'Pectobacterium atrosepticum', 'J11_21_reoriented': 'Pectobacterium atrosepticum', 'J126_23_reoriented': 'Pectobacterium atrosepticum', 'J12_21_reoriented': 'Pectobacterium atrosepticum', 'J16_21_reoriented': 'Pectobacterium atrosepticum', 'J22_21_reoriented': 'Pectobacterium atrosepticum', 'J28_21_reoriented': 'Pectobacterium atrosepticum', 'J33_21_reoriented': 

Unnamed: 0,phage,Ymer,Taid,Poppous,Koroua,Abuela,Amona,Sabo,Mimer,Crus,...,Vims,Echoes,Galvinrad,Uther,Rip,Rup,Slaad,Pantea,Rap,Zann
0,J14_21_reoriented,,,,,,,,,,...,,,,,,,,,,
1,J53_21_reoriented,,,,,,,,,,...,,,,,,,,,,
2,J105_22_reoriented,,,,,,,,,,...,,,,,,,,,,
3,J46_21_reoriented,,,,,,,,,,...,,,,,,,,,,
4,J50_21_reoriented,,,,,,,,,,...,,,,,,,,,,


{'Ymer': nan,
 'Taid': nan,
 'Poppous': nan,
 'Koroua': nan,
 'Abuela': nan,
 'Amona': nan,
 'Sabo': nan,
 'Mimer': nan,
 'Crus': nan,
 'Gander': nan,
 'Guf': nan,
 'Hoejben': nan,
 'Magnum': 200000000,
 'Vims': nan,
 'Echoes': nan,
 'Galvinrad': nan,
 'Uther': nan,
 'Rip': nan,
 'Rup': nan,
 'Slaad': nan,
 'Pantea': nan,
 'Rap': nan,
 'Zann': nan}

In [None]:
def binarize_host_range(host_range_dict):
    binary_dict = {}
    for host, val in host_range_dict.items():
        if pd.isna(val) or val == 0:
            binary_dict[host] = 0
        else:
            binary_dict[host] = 1
    return binary_dict

def short_species_name(full_name):
    if len(full_name.split(" ")) < 2:
        return full_name
    else:
        return full_name.split(" ")[0][0] + "." + full_name.split(" ")[1]

Loading MinHash sketches

In [120]:
from sourmash import load_one_signature
import shutil

parent_out_dir = data_prod_path + f"phage_minhash_{K}_txt/"
if not os.path.exists(parent_out_dir):
    os.makedirs(parent_out_dir)
else:
    shutil.rmtree(parent_out_dir)
    os.makedirs(parent_out_dir)

for selected_bact_species in set(bact_lookup.values()): 
    ### PREPPING HOST RANGE DATA ###
    # Select a specific bacteria species for host range analysis
    print(f"\nProcessing host range data for bacteria species: {selected_bact_species}")

    #obtain all the keys where the value is equal to selected_bact_species
    selected_seqIDs = [key for key, value in bact_lookup.items() if value == selected_bact_species]
    print("Seq IDs for selected species:", selected_seqIDs)

    # Acceptive approach: since all seqIDs for the same species should have similar host ranges, we combine their host range data.
    # if non-zero is found for any seqID, we set it to 1 in the final host range data.
    def combine_host_ranges(seqID_list, approach="acceptive", threshold=0.5, TS = False):
        combined_host_range = {}
        # Acceptive approach: if any seqID has a non-zero value for a host, set to 1
        if approach == "acceptive":
            for seqID in seqID_list:
                curr_host_range = binarize_host_range(host_range_data[seqID])
                for host, val in curr_host_range.items():
                    if host not in combined_host_range:
                        combined_host_range[host] = val
                    else:
                        if not pd.isna(val) and val != 0:
                            combined_host_range[host] = 1
            return combined_host_range
        
        # Count occurrences of non-zero values for each host, if higher than threshold, set to 1
        elif approach == "consensus":
            host_counts = {}
            for seqID in seqID_list:
                curr_host_range = binarize_host_range(host_range_data[seqID])
                for host, val in curr_host_range.items():
                    if host not in host_counts:
                        host_counts[host] = 0
                    if not pd.isna(val) and val != 0:
                        host_counts[host] += 1
            for host, count in host_counts.items():
                if TS: print(f"Host: {host}, Count: {count}, Total SeqIDs: {len(seqID_list)}, Ratio: {count / len(seqID_list)}")
                if count / len(seqID_list) >= threshold:
                    combined_host_range[host] = 1
                else:
                    combined_host_range[host] = 0
            return combined_host_range

    combined_host_range = combine_host_ranges(selected_seqIDs, approach="acceptive")
    print("Combined host range data for selected species:", combined_host_range)

    ### LOADING MINHASH SKETCHES ###
    minhash_data = {}
    print(f"\nLoading sketches from: {SKETCH_DIR}")
    for filename in os.listdir(SKETCH_DIR):
        if filename.endswith(('.sig', '.json')): # sourmash signature files
            filepath = os.path.join(SKETCH_DIR, filename)
            try:
                # sourmash.load_signatures returns an iterator
                sig = load_one_signature(filepath, K)
                
                if not sig:
                    print(f"Warning: No signatures found in {filename}. Skipping.")
                    continue

                phage_name = str(sig)
                
                if phage_name in combined_host_range:
                    # Extract the hash values (sorted for consistency)
                    hashes = sorted(sig.minhash.hashes.keys())
                    minhash_data[phage_name] = hashes
                else:
                    print(f"Warning: Sketch for {phage_name} found, but no matching entry in host range data. Skipping.")
                
            except Exception as e:
                print(f"Error loading sketch file {filename}: {e}. Skipping.")

    print(f"Loaded sketches for {len(minhash_data)} phages.")

    ### OUTPUTTING MINHASH TXT FILES ###
    out_dir = parent_out_dir + f"{str(selected_bact_species).replace(' ', '_')}/"
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    for phage_name in minhash_data.keys():
        print(f"Phage: {phage_name}, Number of hashes: {len(minhash_data[phage_name])}")
        print(f"Writing {out_dir}+{phage_name}_{short_species_name(selected_bact_species)}...")
        with open(os.path.join(out_dir, f"{phage_name}.txt"), 'w') as f:
            for hash_value in minhash_data[phage_name]:
                f.write(f"{hash_value}\t{combined_host_range[phage_name]}\n")


Processing host range data for bacteria species: Pseudomonas marginalis
Seq IDs for selected species: ['J101_22_reoriented']
Combined host range data for selected species: {'Ymer': 0, 'Taid': 0, 'Poppous': 0, 'Koroua': 0, 'Abuela': 0, 'Amona': 0, 'Sabo': 0, 'Mimer': 0, 'Crus': 0, 'Gander': 0, 'Guf': 0, 'Hoejben': 0, 'Magnum': 0, 'Vims': 0, 'Echoes': 0, 'Galvinrad': 0, 'Uther': 0, 'Rip': 0, 'Rup': 0, 'Slaad': 0, 'Pantea': 0, 'Rap': 0, 'Zann': 0}

Loading sketches from: ../data_prod/phage_minhash_18/
Loaded sketches for 23 phages.
Phage: Uther, Number of hashes: 43728
Writing ../data_prod/phage_minhash_18_txt/Pseudomonas_marginalis/+Uther_P.marginalis...
Phage: Echoes, Number of hashes: 59620
Writing ../data_prod/phage_minhash_18_txt/Pseudomonas_marginalis/+Echoes_P.marginalis...
Phage: Gander, Number of hashes: 43574
Writing ../data_prod/phage_minhash_18_txt/Pseudomonas_marginalis/+Gander_P.marginalis...
Phage: Ymer, Number of hashes: 41205
Writing ../data_prod/phage_minhash_18_txt/Pse

### Make combined txt files per bacteria
each file will contain which minhashes has been in a phage that could infect the bacteria.

In [None]:
### Concatenate all phage minhash txt files into a single file for easier NN training ###
all_phage_minhash_txt_path = data_prod_path + f"phage_minhash_18_txt/all_skectes/"
if not os.path.exists(all_phage_minhash_txt_path):
    os.makedirs(all_phage_minhash_txt_path)

for filename in os.listdir(parent_out_dir):
    if os.path.isdir(os.path.join(parent_out_dir, filename)):
        bact_folder = os.path.join(parent_out_dir, filename)
        combined_output_file = os.path.join(all_phage_minhash_txt_path, f"{filename}_combined.txt")
        with open(combined_output_file, 'w') as outfile:
            for phage_file in os.listdir(bact_folder):
                if phage_file.endswith('.txt'):
                    phage_filepath = os.path.join(bact_folder, phage_file)
                    with open(phage_filepath, 'r') as infile:
                        shutil.copyfileobj(infile, outfile)
        print(f"Combined sketches for {filename} into {combined_output_file}")

## Running NN

Inits

In [None]:
selected_bact_species = "Pectobacterium brasiliense" 

### Loading bacteria specific data
each folder in phage_minhash_K_txt is a bacteria name, specifying whether the sketches of each underlying phage txt file, can infect it.

In [None]:
### Concatenate all phage minhash txt files into a single file for easier NN training ###
all_phage_minhash_txt_path = data_prod_path + f"phage_minhash_18_txt/all_skectes/"
if not os.path.exists(all_phage_minhash_txt_path):
    os.makedirs(all_phage_minhash_txt_path)

for filename in os.listdir(parent_out_dir):
    if os.path.isdir(os.path.join(parent_out_dir, filename)):
        bact_folder = os.path.join(parent_out_dir, filename)
        combined_output_file = os.path.join(all_phage_minhash_txt_path, f"{filename}_combined.txt")
        with open(combined_output_file, 'w') as outfile:
            for phage_file in os.listdir(bact_folder):
                if phage_file.endswith('.txt'):
                    phage_filepath = os.path.join(bact_folder, phage_file)
                    with open(phage_filepath, 'r') as infile:
                        shutil.copyfileobj(infile, outfile)
        print(f"Combined sketches for {filename} into {combined_output_file}")

Combined sketches for Chryseobacterium into ../data_prod/phage_minhash_18_txt/all_skectes/Chryseobacterium_combined.txt
Combined sketches for Lelliottia into ../data_prod/phage_minhash_18_txt/all_skectes/Lelliottia_combined.txt
Combined sketches for Acinetobacter_calcoaceticus into ../data_prod/phage_minhash_18_txt/all_skectes/Acinetobacter_calcoaceticus_combined.txt
Combined sketches for Chishuiella into ../data_prod/phage_minhash_18_txt/all_skectes/Chishuiella_combined.txt
Combined sketches for Vagococcus into ../data_prod/phage_minhash_18_txt/all_skectes/Vagococcus_combined.txt
Combined sketches for Pectobacterium_polaris into ../data_prod/phage_minhash_18_txt/all_skectes/Pectobacterium_polaris_combined.txt
Combined sketches for Pseudomonas_chlororaphis into ../data_prod/phage_minhash_18_txt/all_skectes/Pseudomonas_chlororaphis_combined.txt
Combined sketches for Morganella_morganii into ../data_prod/phage_minhash_18_txt/all_skectes/Morganella_morganii_combined.txt
