In [2]:
import sys
import json
import ast
import os
sys.path.append('../utils')
import utils

In [3]:
def calculate_variation_presence(data, input_position, input_kmer):
    """
    Report, per class, how often a k-mer variation is observed at one position.

    Every sequence in every top-level entry of `data` is visited. A sequence's
    class is the text after the last '|' in its ID. For each class, the
    sequences that contain the requested position are counted, along with how
    many of those carry `input_kmer` as their 'variations' value.

    Parameters:
    - data (dict): Nested mapping of the form
      {group: {sequence_id: {position_key: {'variations': ...}}}}, where every
      position_key is a string that `ast.literal_eval` can parse.
    - input_position (tuple or str): The position to look for. A string is parsed
      with `ast.literal_eval` before comparison.
    - input_kmer (str): The variation value that counts as a hit.

    Returns:
    - dict: class ID -> percentage (0-100) of the class's entries at the position
      whose variation equals `input_kmer`. Classes that never contain the position
      map to 0. Classes appear in the order they were first encountered.
    """
    if isinstance(input_position, str):
        target = ast.literal_eval(input_position)
    else:
        target = input_position

    # One tally per (class, position): [entries at the position, entries with the k-mer].
    tallies = {}

    for sequences in data.values():
        for sequence_id, entries in sequences.items():
            label = sequence_id.split('|')[-1]
            # Register the class even when the sequence lacks the position,
            # so it still appears in the result with 0.
            tally = tallies.setdefault((label, target), [0, 0])

            for raw_key, entry in entries.items():
                # Keys are stored as strings; parse them before comparing.
                is_target = target == ast.literal_eval(raw_key)
                if not is_target:
                    continue

                tally[0] += 1
                if entry['variations'] == input_kmer:
                    tally[1] += 1

    # Turn the raw tallies into percentages.
    percentages = {}
    for (label, _), (seen, matched) in tallies.items():
        if seen > 0:
            percentages[label] = (matched / seen) * 100
        else:
            percentages[label] = 0

    return percentages

In [4]:
def get_top_n_scores(file_paths, score_index, top_n):
    """
    Gather one score per entry from several JSON score files and keep the best N.

    Each file is loaded with `utils.load_data_from_json`. Only entries whose value
    is a list long enough to be indexed at `score_index` are considered. A file
    that cannot be processed is reported on stdout and skipped, and contributes
    no entries at all.

    Parameters:
    - file_paths (list): Paths of the JSON score files to read.
    - score_index (int): Index inside each entry's list of the score used for ranking.
    - top_n (int): Maximum number of entries to return.

    Returns:
    - list: Up to `top_n` tuples (file_path, key, score), highest score first.
      Entries with equal scores keep the order in which they were read.
    """
    collected = []

    for path in file_paths:
        try:
            loaded = utils.load_data_from_json(path)

            # Build this file's entries separately so that a failure halfway
            # through leaves the overall result untouched.
            per_file = []
            for key, values in loaded.items():
                if not (isinstance(values, list) and len(values) > score_index):
                    continue
                per_file.append((path, key, values[score_index]))

            collected.extend(per_file)
        except Exception as error:
            print(f"Error processing file {path}: {str(error)}")

    # Highest score first; the sort is stable, so ties keep their read order.
    collected.sort(key=lambda entry: entry[2], reverse=True)
    return collected[:top_n]

In [5]:
def _position_label(key):
    """Return the position part of a key shaped like "(55, 'GACGCGACT')", as a string ('55')."""
    # Everything before the first comma, minus the opening parenthesis.
    return key.split(',', 1)[0][1:]


def process_dataset(name, score_index, n):
    """
    Process a single dataset: print its top N scored positions' k-mer variations
    and the per-class percentage at which each variation occurs.

    The function reads three kinds of files from the dataset's results directory:
    one `<gene>_uncategorized_scores.json` per gene, `<name>_compiled_results.json`
    and `<name>_results.json`. Results are printed; nothing is returned.

    Parameters:
    - name (str): The name of the dataset. Two multi-gene dataset names are
      recognised explicitly; any other name is treated as a single gene stored
      under ../data/Human_betaherpesvirus_5/.
    - score_index (int): The index of the score to sort by.
    - n (int): The number of top scores to retrieve.

    Raises:
    - OSError: If the compiled results file cannot be opened.
    - KeyError: If a matching compiled entry lacks 'variations' or
      'amino_acid_changes'.
    """
    print(f"\nProcessing: {name}")

    # Only the gene list and the results directory depend on the dataset.
    if name == "Human_immunodeficiency_virus_1":
        genes = ['gag-pol', 'gag', 'env']
        base_path = f"../data/{name}/results"
    elif name == "Severe_acute_respiratory_syndrome_coronavirus_2":
        genes = ['ORF1a', 'ORF1b', 'S', 'N', 'M', 'E']
        base_path = f"../data/{name}/results"
    else:
        genes = [name]
        base_path = f"../data/Human_betaherpesvirus_5/{name}/results"
    compiled_results_path = os.path.join(base_path, f"{name}_compiled_results.json")
    data_path = os.path.join(base_path, f"{name}_results.json")

    # Remember which gene each score file belongs to. Recovering the gene from
    # the file name by splitting on '_' would truncate gene names that contain
    # an underscore and silently yield no results for them.
    gene_by_score_file = {
        os.path.join(base_path, f"{gene}_uncategorized_scores.json"): gene
        for gene in genes
    }

    # Get top N scores across all score files of the dataset
    top_n_scores = get_top_n_scores(list(gene_by_score_file), score_index, n)

    # Extract positions and associated genes
    positions = [
        (_position_label(key), gene_by_score_file[file_path], key)
        for file_path, key, _ in top_n_scores
    ]

    if name == "UL55":
        # Extra hand-picked positions, reported whether or not they rank in the top N.
        positions.extend([('82', 'UL55', "(82, 'ACTCGTGGA')"),
                          ('100', 'UL55', "(100, 'ACTCACAGT')"),
                          ('145', 'UL55', "(145, 'CGATCCGGT')")])

    # Load compiled results and data once
    with open(compiled_results_path, 'r') as file:
        compiled_data = json.load(file)
    data = utils.load_data_from_json(data_path)

    # Collect k-mers and their variations. Keying by (key, gene) collapses
    # positions that were listed more than once.
    k_mers_variations_dict = {}
    for position, gene, _ in positions:
        for key, value in compiled_data.get(gene, {}).items():
            if _position_label(key) == position:
                k_mers_variations_dict[(key, gene)] = [value['variations'], value['amino_acid_changes']]

    # Print the results
    for (key, gene), variations in k_mers_variations_dict.items():
        for kmer in variations[0]:
            print(f"K-mer variations: {variations[0]}, Key: {key}, K-mer: {kmer}, Gene: {gene}")
            # NOTE(review): calculate_variation_presence walks every top-level entry
            # of `data`, not only this gene's; presumably identical keys do not occur
            # in two genes -- confirm for the multi-gene datasets.
            variation_percentages = calculate_variation_presence(data, key, kmer)
            print(f"Percentages of {kmer} at key {key}:")
            for class_id, percentage in variation_percentages.items():
                if percentage != 0:
                    print(f"  Class {class_id}: {percentage:.2f}%")
            print()

In [13]:
# Example run: report the top-scoring positions for every dataset of interest.
datasets = [
    # Single genes, stored under ../data/Human_betaherpesvirus_5/
    'US28', 'UL55', 'UL73',
    # Multi-gene datasets with their own directory under ../data/
    'Severe_acute_respiratory_syndrome_coronavirus_2',
    'Human_immunodeficiency_virus_1',
]
score_index = 3  # Which element of each score list is used for ranking
n = 5  # How many top-ranked entries to report per dataset

for dataset_name in datasets:
    process_dataset(name=dataset_name, score_index=score_index, n=n)



Processing: US28
K-mer variations: ['GACGCGGCT', 'GCCGCGACT', 'GAAGCGACT', 'GGAGCAACC', 'GACGCGACT'], Key: (55, 'GACGCGACT'), K-mer: GACGCGGCT, Gene: US28
Percentages of GACGCGGCT at key (55, 'GACGCGACT'):
  Class C: 100.00%

K-mer variations: ['GACGCGGCT', 'GCCGCGACT', 'GAAGCGACT', 'GGAGCAACC', 'GACGCGACT'], Key: (55, 'GACGCGACT'), K-mer: GCCGCGACT, Gene: US28
Percentages of GCCGCGACT at key (55, 'GACGCGACT'):
  Class A1: 100.00%

K-mer variations: ['GACGCGGCT', 'GCCGCGACT', 'GAAGCGACT', 'GGAGCAACC', 'GACGCGACT'], Key: (55, 'GACGCGACT'), K-mer: GAAGCGACT, Gene: US28
Percentages of GAAGCGACT at key (55, 'GACGCGACT'):
  Class D: 100.00%

K-mer variations: ['GACGCGGCT', 'GCCGCGACT', 'GAAGCGACT', 'GGAGCAACC', 'GACGCGACT'], Key: (55, 'GACGCGACT'), K-mer: GGAGCAACC, Gene: US28
Percentages of GGAGCAACC at key (55, 'GACGCGACT'):
  Class B2: 100.00%
  Class B1: 100.00%

K-mer variations: ['GACGCGGCT', 'GCCGCGACT', 'GAAGCGACT', 'GGAGCAACC', 'GACGCGACT'], Key: (55, 'GACGCGACT'), K-mer: GACGCGAC