# Convert Exon Amino Acid Positions to Coding Sequence Domains (CDS)
## Resitance Mutations are show in final products, but databases store Amino Acid Coding Regions

We now need to translate.

Let's try to dio this with pyranges. – it seems to be made for this. 

Using this GFF file:
- https://github.com/LaraFuhrmann/Scan-for-mutations-of-interest-NGS-samples/blob/main/resources/GCF_009858895.2_ASM985889v3_genomic.gbff


In [56]:
# load from files

options = {
    "3C-like proteinase": '3CLpro_inhibitors_datasheet.csv',
    "RNA-dependent RNA polymerase": 'RdRP_inhibitors_datasheet.csv',
    "spike glycoprotein": 'spike_mAbs_datasheet.csv'
}

# load the data, the first time i each column is the amino acid change e.g. T124I, ignore the rest
dfs = {}
for product, file in options.items():
    try:
        # Read the CSV file
        df = pd.read_csv(file)
        # Store the dataframe in the dictionary with product as key
        dfs[product] = df
        print(f"Loaded {len(df)} mutations for {product}")
    except FileNotFoundError:
        print(f"Warning: File {file} not found")
    except Exception as e:
        print(f"Error loading {file}: {e}")

Loaded 71 mutations for 3C-like proteinase
Loaded 19 mutations for RNA-dependent RNA polymerase
Loaded 164 mutations for spike glycoprotein


In [59]:
import re


def translate_mutation(mutation, offset):
    """Translate a single mutation by adding an offset to its position."""
    match = re.match(r'([A-Za-z])(\d+)([A-Za-z])', mutation)
    if match:
        original = match.group(1)
        position = int(match.group(2))
        new = match.group(3)
        new_position = position + offset
        return f"{original}{new_position}{new}"
    else:
        print(f"Invalid mutation format: {mutation}")
        return None

def translate_mutations(mutations, subregion, data):
    """Translate a list of mutations from a subregion to its parent ORF."""
    # Define parent ORF and retrieve start positions
    if subregion == "RdRp":
        orf = "ORF1b"
    elif subregion == "3CLpro":
        orf = "ORF1a"
    else:
        raise ValueError("Unknown subregion")

    start_orf = next(item["Start"] for item in data if item["Region"] == orf)
    start_sub = next(item["Start"] for item in data if item["Region"] == subregion)

    # Calculate the starting amino acid position in the ORF
    start_aa = ((start_sub - start_orf) // 3) + 1
    offset = start_aa - 1  # Offset to add to subregion positions

    # Translate each mutation
    translated = []
    for mutation in mutations:
        trans_mut = translate_mutation(mutation, offset)
        if trans_mut:
            translated.append(f"{orf}:{trans_mut}")
    return translated

# Coordinate data from GenBank file
data = [
    {"Region": "ORF1a", "Start": 266, "End": 13468},
    {"Region": "ORF1b", "Start": 13468, "End": 21555},
    {"Region": "RdRp", "Start": 13468, "End": 16236},
    {"Region": "3CLpro", "Start": 10055, "End": 10972}
]

# Example mutation lists
rdrp_mutations = dfs["RNA-dependent RNA polymerase"].Mutation.to_list()
clpro_mutations = dfs["3C-like proteinase"].Mutation.to_list()

# Translate mutations
translated_rdrp = translate_mutations(rdrp_mutations, "RdRp", data)
translated_clpro = translate_mutations(clpro_mutations, "3CLpro", data)

# Output results
print("Translated mutations for RdRp in ORF1b:")
for mut in translated_rdrp:
    print(mut)

print("\nTranslated mutations for 3CLpro in ORF1a:")
for mut in translated_clpro:
    print(mut)

# Save translated mutations to CSV files
import pandas as pd
rdrp_df = pd.DataFrame(translated_rdrp, columns=["Mutation"])
clpro_df = pd.DataFrame(translated_clpro, columns=["Mutation"])
rdrp_df.to_csv("translated_RdRp_mutations.csv", index=False)
clpro_df.to_csv("translated_3CLpro_mutations.csv", index=False)
print("\nTranslated mutations saved to CSV files.")


Translated mutations for RdRp in ORF1b:
ORF1b:V166A
ORF1b:V166L
ORF1b:N198S
ORF1b:R285C
ORF1b:A376V
ORF1b:A449V
ORF1b:F480L
ORF1b:D484Y
ORF1b:A526V
ORF1b:V557L
ORF1b:G671S
ORF1b:S759A
ORF1b:V792I
ORF1b:E796G
ORF1b:C799F
ORF1b:C799R
ORF1b:E802A
ORF1b:E802D
ORF1b:M924R

Translated mutations for 3CLpro in ORF1a:
ORF1a:T3284I
ORF1a:T3288A
ORF1a:T3288N
ORF1a:T3308I
ORF1a:D3311Y
ORF1a:M3312I
ORF1a:M3312L
ORF1a:M3312T
ORF1a:M3312d
ORF1a:L3313F
ORF1a:G3401S
ORF1a:F3403L
ORF1a:F3403S
ORF1a:N3405D
ORF1a:N3405L
ORF1a:N3405S
ORF1a:G3406S
ORF1a:S3407A
ORF1a:S3407E
ORF1a:S3407L
ORF1a:S3407P
ORF1a:C3423F
ORF1a:M3428R
ORF1a:M3428T
ORF1a:E3429A
ORF1a:E3429G
ORF1a:E3429K
ORF1a:E3429Q
ORF1a:E3429V
ORF1a:L3430F
ORF1a:P3431d
ORF1a:T3432I
ORF1a:H3435L
ORF1a:H3435N
ORF1a:H3435Q
ORF1a:H3435Y
ORF1a:A3436T
ORF1a:A3436V
ORF1a:V3449A
ORF1a:R3451G
ORF1a:R3451S
ORF1a:Q3452I
ORF1a:Q3452K
ORF1a:T3453I
ORF1a:A3454T
ORF1a:A3454V
ORF1a:Q3455A
ORF1a:Q3455C
ORF1a:Q3455D
ORF1a:Q3455E
ORF1a:Q3455F
ORF1a:Q3455G
ORF1a:Q3455H
