In [5]:
import re
import sys
sys.path.append('utils')
import utils
import pandas as pd
import mutation_score

# Constants

# Amino-acid change labels that process_data() counts and create_dataframe()
# reports, one output row per label. Each label embeds a numeric position
# (extracted by get_position()) surrounded by one-character symbols.
# NOTE(review): '-' presumably marks a gap (insertion/deletion) on that side
# of the change -- confirm against the tool that produced the JSON results.
MUTATIONS = [
    '-180I', '-180T', 'A103S', 'A103T', 'A234T', 'A272T', 'D14-', 'D14S',
    'D14T', 'D195E', 'D277E', 'D277K', 'D277Q', 'E26G', 'E279K', 'E279R',
    'F156L', 'F41L', 'G176D', 'G176E', 'G179-', 'G179D', 'I111V', 'I149M',
    'I149T', 'I150V', 'I173M', 'I193V', 'I20D', 'I20S', 'I24M', 'I263L',
    'I38L', 'I55V', 'L152I', 'L152T', 'L293V', 'M163V', 'M276T', 'N10-',
    'N10I', 'N10R', 'N10T', 'N175H', 'N178D', 'N183T', 'P275S', 'P27T',
    'Q180E', 'Q180N', 'Q180P', 'Q180T', 'R11N', 'S101G', 'S12N', 'S181N',
    'S182A', 'S182T', 'S291A', 'S29F', 'S362N', 'T15P', 'T177A', 'T361P',
    'T40A', 'T8I', 'T8S', 'T9S', 'V18I', 'V261I', 'Y198H', 'Y198S'
]

# Domain table used by map_domain(): a position belongs to a domain when
# Start_Merlin <= position <= End_Merlin (both bounds inclusive).
# The ranges below are contiguous and non-overlapping, covering 1..412, so
# every position in that span maps to exactly one domain.
# NOTE(review): coordinates are presumably numbered on the Merlin reference
# sequence, and ECL/ICL/TM presumably stand for extracellular loop,
# intracellular loop and transmembrane helix -- confirm with the data owner.
DOMAINS = [
    {"Domain_Name": "N-ter", "Start_Merlin": 1, "End_Merlin": 35},
    {"Domain_Name": "ECL1", "Start_Merlin": 102, "End_Merlin": 106},
    {"Domain_Name": "ECL2", "Start_Merlin": 169, "End_Merlin": 206},
    {"Domain_Name": "ECL3", "Start_Merlin": 266, "End_Merlin": 292},
    {"Domain_Name": "ICL1", "Start_Merlin": 57, "End_Merlin": 80},
    {"Domain_Name": "ICL2", "Start_Merlin": 128, "End_Merlin": 147},
    {"Domain_Name": "ICL3", "Start_Merlin": 228, "End_Merlin": 244},
    {"Domain_Name": "C-ter", "Start_Merlin": 314, "End_Merlin": 412},
    {"Domain_Name": "TM1", "Start_Merlin": 36, "End_Merlin": 56},
    {"Domain_Name": "TM2", "Start_Merlin": 81, "End_Merlin": 101},
    {"Domain_Name": "TM3", "Start_Merlin": 107, "End_Merlin": 127},
    {"Domain_Name": "TM4", "Start_Merlin": 148, "End_Merlin": 168},
    {"Domain_Name": "TM5", "Start_Merlin": 207, "End_Merlin": 227},
    {"Domain_Name": "TM6", "Start_Merlin": 245, "End_Merlin": 265},
    {"Domain_Name": "TM7", "Start_Merlin": 293, "End_Merlin": 313}
]

def get_position(mutation):
    """
    Return the first run of digits found in a mutation label as an integer.

    Only digits are captured, so a leading '-' (as in '-180I') is not treated
    as a sign: the result for that label is 180.

    Parameters:
        mutation (str): Mutation label, e.g. 'A103S' or 'D14-'.

    Returns:
        int: The numeric position, or None when the label contains no digits.
    """
    digits = re.search(r'(\d+)', mutation)
    if digits is None:
        return None
    return int(digits.group(1))

def map_domain(mutation, domains_df):
    """
    Look up the domain whose inclusive range contains a mutation's position.

    Parameters:
        mutation (str): Mutation label; its position comes from get_position().
        domains_df (pd.DataFrame): Table with 'Domain_Name', 'Start_Merlin'
            and 'End_Merlin' columns.

    Returns:
        str: 'Domain_Name' of the first row whose range contains the position,
        or 'Unknown' when the label has no position or no row matches.
    """
    position = get_position(mutation)
    if position is None:
        return "Unknown"

    # Both bounds are inclusive.
    starts_at_or_before = domains_df['Start_Merlin'] <= position
    ends_at_or_after = domains_df['End_Merlin'] >= position
    hits = domains_df[starts_at_or_before & ends_at_or_after]

    if hits.empty:
        return "Unknown"
    # If several rows match, the first one in table order is used.
    return hits.iloc[0]['Domain_Name']

def process_data(data, mutations):
    """
    Count sequences and tracked-mutation occurrences per genotype.

    The genotype of a sequence is the text after the last '|' in its name.
    Within one entry of a sequence's details, duplicate labels in
    'amino_acid_changes' are collapsed, so that entry adds at most 1 to any
    mutation's count.

    NOTE(review): a mutation that appears in several entries of the same
    sequence is counted once per entry, not once per sequence -- confirm this
    matches the JSON layout, since counts are later divided by the number of
    sequences.

    Parameters:
        data (dict): Loaded JSON, shaped as
            {seq_id: {seq_name: {entry_key: {'amino_acid_changes': [...]}}}}.
        mutations (list): Mutation labels to count; others are ignored.

    Returns:
        tuple: (mutation_counts, sequence_counts) where mutation_counts maps
        genotype -> {mutation: count} (every tracked mutation present, zero
        when unseen) and sequence_counts maps genotype -> number of sequences.
    """
    # Built once so the innermost loop does O(1) membership checks instead of
    # scanning the list for every observed change.
    tracked = set(mutations)
    mutation_counts = {}
    sequence_counts = {}

    for seq_info in data.values():
        for seq_name, seq_details in seq_info.items():
            genotype = seq_name.split("|")[-1]
            # Allocate the zero-filled counter only the first time a genotype
            # is seen, rather than building and discarding one per sequence.
            if genotype not in mutation_counts:
                mutation_counts[genotype] = {mutation: 0 for mutation in mutations}
            sequence_counts[genotype] = sequence_counts.get(genotype, 0) + 1
            genotype_counts = mutation_counts[genotype]

            for mutation_info in seq_details.values():
                observed = set(mutation_info.get('amino_acid_changes', []))
                for mutation in observed & tracked:
                    genotype_counts[mutation] += 1
    return mutation_counts, sequence_counts

def create_dataframe(mutations, mutation_counts, sequence_counts, domains_df, mutational_scores):
    """
    Create a DataFrame containing mutation percentages per genotype and domain.

    Each row describes one mutation: its label, one column per genotype with
    the percentage of that genotype's sequences carrying it (rounded to two
    decimals, 0 when the genotype has no sequences), its domain, and its
    mutational score (0 when the mutation has no score).

    Parameters:
        mutations (list): List of mutations; one output row per entry.
        mutation_counts (dict): genotype -> {mutation: count}.
        sequence_counts (dict): genotype -> number of sequences; its keys
            define the genotype columns.
        domains_df (pd.DataFrame): Domain table passed on to map_domain().
        mutational_scores (dict): mutation -> score.

    Returns:
        pd.DataFrame: Rows sorted by 'Mutation' with a fresh 0..n-1 index and
        columns 'Mutation', <genotypes...>, 'Domain', 'Mutational_Score'.
        An empty `mutations` list yields an empty frame with those columns.
    """
    data_for_df = []

    for mutation in mutations:
        row = {'Mutation': mutation}
        for genotype, total_sequences in sequence_counts.items():
            count = mutation_counts[genotype].get(mutation, 0)
            percentage = (count / total_sequences * 100) if total_sequences > 0 else 0
            row[genotype] = round(percentage, 2)
        row['Domain'] = map_domain(mutation, domains_df)
        row['Mutational_Score'] = mutational_scores.get(mutation, 0)
        data_for_df.append(row)

    # Name the columns explicitly so that an empty `mutations` list still
    # produces a frame with a 'Mutation' column; otherwise sort_values()
    # raises KeyError on a column-less frame. dict.fromkeys() keeps the order
    # the row dicts use and drops a genotype that collides with a fixed column.
    columns = list(dict.fromkeys(
        ['Mutation'] + list(sequence_counts) + ['Domain', 'Mutational_Score']
    ))
    df = pd.DataFrame(data_for_df, columns=columns)
    return df.sort_values(by='Mutation').reset_index(drop=True)

def main():
    """
    Build the per-genotype mutation table and export it.

    Loads the UL33 results JSON, counts the tracked mutations per genotype,
    fetches their mutational scores, assembles the summary DataFrame, writes
    it to an Excel file (without the index) and prints it.
    """
    results = utils.load_data_from_json("data/results/UL33_results.json")
    counts_by_genotype, totals_by_genotype = process_data(results, MUTATIONS)
    scores = mutation_score.get_mutational_scores(MUTATIONS, categorize=True)
    table = create_dataframe(
        MUTATIONS,
        counts_by_genotype,
        totals_by_genotype,
        pd.DataFrame(DOMAINS),
        scores,
    )
    table.to_excel("data/results/mutation_percentages_by_genotype.xlsx", index=False)
    print(table)

if __name__ == "__main__":
    main()

   Mutation      C5  TOWNE  MERLIN  AD169  TOLEDO Domain  Mutational_Score
0     -180I   88.89    0.0    0.00    0.0    0.00   ECL2                 5
1     -180T    0.00    0.0    0.00   95.9   91.95   ECL2                 5
2     A103S  100.00    0.0    0.00    0.0    0.00   ECL1                 1
3     A103T    0.00  100.0    0.00    0.0    0.00   ECL1                 2
4     A234T  100.00    0.0    0.00  100.0  100.00   ICL3                 2
..      ...     ...    ...     ...    ...     ...    ...               ...
67      T9S    0.00    0.0    0.00    4.1    8.05  N-ter                 2
68     V18I    0.00    0.0    0.79  100.0  100.00  N-ter                 2
69    V261I    0.00    0.0    0.00    0.0   98.85    TM6                 2
70    Y198H    0.00    0.0    0.00  100.0  100.00   ECL2                 4
71    Y198S  100.00    0.0    0.00    0.0    0.00   ECL2                 4

[72 rows x 8 columns]
