In [6]:
import csv
import os
import time
import pandas as pd

from src.auto_acmg import AutoACMG, AutoACMGPrediction
from src.defs.genome_builds import GenomeRelease

In [7]:
# Get the current path
from src.core.config import settings

path_to_root = settings.PATH_TO_ROOT

In [8]:
# Load the original dataset
original_path_to_csv = os.path.join(path_to_root, 'src', 'bench', 'results.csv')
original_df = pd.read_csv(original_path_to_csv)

# Load the new dataset
new_path_to_csv = os.path.join(path_to_root, 'output.csv')
new_df = pd.read_csv(new_path_to_csv)


In [10]:
# Ensure that we have the same amount of records and align them properly
assert len(original_df) == len(new_df), "The datasets do not have the same number of records."


In [11]:
new_df.fillna('', inplace=True)
original_df.fillna("", inplace=True)

# Extract relevant columns from the new dataset
new_df = new_df[['acmg_criteria']]

# Rename the new dataset's columns to distinguish them
new_df.rename(columns={'acmg_criteria': 'Genebe Criteria'}, inplace=True)

# Combine datasets based on the index
combined_df = original_df.join(new_df)

# Calculate metrics for Genebe
def calculate_metrics(row, column_name):
    genebe_criteria = row[column_name].split(';')
    expected_criteria = row['Expected Criteria'].split(';')
    
    true_positives = list(set(genebe_criteria) & set(expected_criteria))
    false_positives = list(set(genebe_criteria) - set(expected_criteria))
    false_negatives = list(set(expected_criteria) - set(genebe_criteria))
    
    return ';'.join(true_positives), ';'.join(false_positives), ';'.join(false_negatives)

# Apply the function to create new columns
combined_df['Genebe True Positives'], combined_df['Genebe False Positives'], combined_df['Genebe False Negatives'] = zip(*combined_df.apply(calculate_metrics, axis=1, column_name='Genebe Criteria'))

# Export the dataframe to CSV
combined_df.to_csv("updated_results_with_genebe.csv", index=False)

# Display the first few rows of the dataframe
combined_df.head()

  new_df.fillna('', inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df.rename(columns={'acmg_criteria': 'Genebe Criteria'}, inplace=True)


Unnamed: 0,Variant,Expected Criteria,AutoACMG Criteria,AutoACMG Prediction time,AutoACMG True Positives,AutoACMG False Negatives,AutoACMG False Positives,Intervar Criteria,Intervar Prediction time,Intervar True Positives,Intervar False Negatives,Intervar False Positives,Comment,AutoACMG Full Response,Intervar Full Response,Genebe Criteria,Genebe True Positives,Genebe False Positives,Genebe False Negatives
0,4-113568536-G-GA,PVS1,PVS1;PM1;PM2,3.329291,PVS1,PM1;PM2,,,0.0,,,,Gene: LARP7,"{'pvs1': {'name': 'PVS1', 'prediction': <AutoA...",,,,,PVS1
1,NM_004360.3:c.1085delT,PVS1,PVS1;PM1,1.597754,PVS1,PM1,,,0.0,,,,Gene: CDH1; p.Val362GlyfsTer31; From Recommend...,"{'pvs1': {'name': 'PVS1', 'prediction': <AutoA...",,"PM2_Supporting,PS4_Supporting,PM5_Supporting,PVS1",,"PM2_Supporting,PS4_Supporting,PM5_Supporting,PVS1",PVS1
2,NM_000152.4:c.525delT,PVS1,PVS1;PM1;PM2,1.649675,PVS1,PM1;PM2,,,0.0,,,,Gene: GAA; p.Glu176ArgfsTer45; From Recommenda...,"{'pvs1': {'name': 'PVS1', 'prediction': <AutoA...",,"PVS1,PM2,PM3,PP4",,"PVS1,PM2,PM3,PP4",PVS1
3,NM_000152.4:c.1987delC,PVS1,PVS1;PM1,1.531021,PVS1,PM1,,,0.0,,,,Gene: GAA; p.Gln663SerfsTer33; From Recommenda...,"{'pvs1': {'name': 'PVS1', 'prediction': <AutoA...",,"PVS1,PM2,PP5_Moderate",,"PVS1,PM2,PP5_Moderate",PVS1
4,NM_000152.4:c.2706delG,PVS1,PVS1;PM1,1.397128,PVS1,PM1,,,0.0,,,,Gene: GAA; p.Lys903ArgfsTer2; From Recommendat...,"{'pvs1': {'name': 'PVS1', 'prediction': <AutoA...",,"PVS1,PP4_Moderate,PM3,PM2_Supporting",,"PVS1,PP4_Moderate,PM3,PM2_Supporting",PVS1


In [13]:
import requests
import time

In [15]:

# Function to get GeneBe response
from src.defs.seqvar import SeqVar


def genebe_response(variant: str):
    """
    Implement searching for ACMG classification for SNVs and indels using GeneBe.

    :param variant: variant string
    :return: GeneBe response
    :rtype: dict
    """
    auto_acmg = AutoACMG(variant, GenomeRelease.GRCh37)
    seqvar = auto_acmg.resolve_variant()
    assert isinstance(seqvar, SeqVar)
    chromosome = seqvar.chrom
    position = seqvar.pos
    reference = seqvar.delete
    alternative = seqvar.insert

    if not chromosome or not position or not reference or not alternative:
        return

    url = (
        f"https://api.genebe.net/cloud/api-public/v1/variant?"
        f"chr={chromosome}&pos={position}"
        f"&ref={reference}&alt={alternative}&genome=hg38"
    )
    backend_resp = requests.get(url)
    backend_resp.raise_for_status()
    return backend_resp.json()

# Update the AutoACMG Prediction time for GeneBe
def update_genebe_prediction_time(row):
    try:
        start_time = time.time()
        resp = genebe_response(row['Variant'])
        end_time = time.time()
        
        row["Genebe Prediction time"] = end_time - start_time
        row["Genebe Full Response"] = resp
    except Exception as e:
        print(f"Exception was raised for {row['Variant']} in GeneBe:\n{e}")
    return row

# Apply the function to calculate prediction time for GeneBe
combined_df = combined_df.apply(update_genebe_prediction_time, axis=1)

# Export the dataframe to CSV
combined_df.to_csv("updated_results_with_genebe.csv", index=False)

# Display the first few rows of the dataframe
combined_df.head()

[32m2024-07-09 15:47:54.523[0m | [34m[1mDEBUG   [0m | [36msrc.auto_acmg[0m:[36m__init__[0m:[36m70[0m - [34m[1mAutoACMG initialized with variant: 4-113568536-G-GA and genome release: GenomeRelease.GRCh37[0m
[32m2024-07-09 15:47:54.524[0m | [34m[1mDEBUG   [0m | [36msrc.auto_acmg[0m:[36mresolve_variant[0m:[36m89[0m - [34m[1mResolving variant: 4-113568536-G-GA[0m
[32m2024-07-09 15:47:54.525[0m | [34m[1mDEBUG   [0m | [36msrc.auto_acmg[0m:[36mresolve_variant[0m:[36m93[0m - [34m[1mResolved sequence variant: 4-113568536-G-GA[0m
[32m2024-07-09 15:47:54.793[0m | [34m[1mDEBUG   [0m | [36msrc.auto_acmg[0m:[36m__init__[0m:[36m70[0m - [34m[1mAutoACMG initialized with variant: NM_004360.3:c.1085delT and genome release: GenomeRelease.GRCh37[0m
[32m2024-07-09 15:47:54.794[0m | [34m[1mDEBUG   [0m | [36msrc.auto_acmg[0m:[36mresolve_variant[0m:[36m89[0m - [34m[1mResolving variant: NM_004360.3:c.1085delT[0m
[32m2024-07-09 15:47:54.794[

Unnamed: 0,Variant,Expected Criteria,AutoACMG Criteria,AutoACMG Prediction time,AutoACMG True Positives,AutoACMG False Negatives,AutoACMG False Positives,Intervar Criteria,Intervar Prediction time,Intervar True Positives,...,Intervar False Positives,Comment,AutoACMG Full Response,Intervar Full Response,Genebe Criteria,Genebe True Positives,Genebe False Positives,Genebe False Negatives,Genebe Prediction time,Genebe Full Response
0,4-113568536-G-GA,PVS1,PVS1;PM1;PM2,3.329291,PVS1,PM1;PM2,,,0.0,,...,,Gene: LARP7,"{'pvs1': {'name': 'PVS1', 'prediction': <AutoA...",,,,,PVS1,0.268924,"{'variants': [{'chr': '4', 'pos': 113568536, '..."
1,NM_004360.3:c.1085delT,PVS1,PVS1;PM1,1.597754,PVS1,PM1,,,0.0,,...,,Gene: CDH1; p.Val362GlyfsTer31; From Recommend...,"{'pvs1': {'name': 'PVS1', 'prediction': <AutoA...",,"PM2_Supporting,PS4_Supporting,PM5_Supporting,PVS1",,"PM2_Supporting,PS4_Supporting,PM5_Supporting,PVS1",PVS1,0.307956,"{'variants': [{'chr': '16', 'pos': 68846112, '..."
2,NM_000152.4:c.525delT,PVS1,PVS1;PM1;PM2,1.649675,PVS1,PM1;PM2,,,0.0,,...,,Gene: GAA; p.Glu176ArgfsTer45; From Recommenda...,"{'pvs1': {'name': 'PVS1', 'prediction': <AutoA...",,"PVS1,PM2,PM3,PP4",,"PVS1,PM2,PM3,PP4",PVS1,0.361843,"{'variants': [{'chr': '17', 'pos': 78078908, '..."
3,NM_000152.4:c.1987delC,PVS1,PVS1;PM1,1.531021,PVS1,PM1,,,0.0,,...,,Gene: GAA; p.Gln663SerfsTer33; From Recommenda...,"{'pvs1': {'name': 'PVS1', 'prediction': <AutoA...",,"PVS1,PM2,PP5_Moderate",,"PVS1,PM2,PP5_Moderate",PVS1,0.339899,"{'variants': [{'chr': '17', 'pos': 78086770, '..."
4,NM_000152.4:c.2706delG,PVS1,PVS1;PM1,1.397128,PVS1,PM1,,,0.0,,...,,Gene: GAA; p.Lys903ArgfsTer2; From Recommendat...,"{'pvs1': {'name': 'PVS1', 'prediction': <AutoA...",,"PVS1,PP4_Moderate,PM3,PM2_Supporting",,"PVS1,PP4_Moderate,PM3,PM2_Supporting",PVS1,0.344012,"{'variants': [{'chr': '17', 'pos': 78092510, '..."


In [16]:
combined_df.head()

Unnamed: 0,Variant,Expected Criteria,AutoACMG Criteria,AutoACMG Prediction time,AutoACMG True Positives,AutoACMG False Negatives,AutoACMG False Positives,Intervar Criteria,Intervar Prediction time,Intervar True Positives,...,Intervar False Positives,Comment,AutoACMG Full Response,Intervar Full Response,Genebe Criteria,Genebe True Positives,Genebe False Positives,Genebe False Negatives,Genebe Prediction time,Genebe Full Response
0,4-113568536-G-GA,PVS1,PVS1;PM1;PM2,3.329291,PVS1,PM1;PM2,,,0.0,,...,,Gene: LARP7,"{'pvs1': {'name': 'PVS1', 'prediction': <AutoA...",,,,,PVS1,0.268924,"{'variants': [{'chr': '4', 'pos': 113568536, '..."
1,NM_004360.3:c.1085delT,PVS1,PVS1;PM1,1.597754,PVS1,PM1,,,0.0,,...,,Gene: CDH1; p.Val362GlyfsTer31; From Recommend...,"{'pvs1': {'name': 'PVS1', 'prediction': <AutoA...",,"PM2_Supporting,PS4_Supporting,PM5_Supporting,PVS1",,"PM2_Supporting,PS4_Supporting,PM5_Supporting,PVS1",PVS1,0.307956,"{'variants': [{'chr': '16', 'pos': 68846112, '..."
2,NM_000152.4:c.525delT,PVS1,PVS1;PM1;PM2,1.649675,PVS1,PM1;PM2,,,0.0,,...,,Gene: GAA; p.Glu176ArgfsTer45; From Recommenda...,"{'pvs1': {'name': 'PVS1', 'prediction': <AutoA...",,"PVS1,PM2,PM3,PP4",,"PVS1,PM2,PM3,PP4",PVS1,0.361843,"{'variants': [{'chr': '17', 'pos': 78078908, '..."
3,NM_000152.4:c.1987delC,PVS1,PVS1;PM1,1.531021,PVS1,PM1,,,0.0,,...,,Gene: GAA; p.Gln663SerfsTer33; From Recommenda...,"{'pvs1': {'name': 'PVS1', 'prediction': <AutoA...",,"PVS1,PM2,PP5_Moderate",,"PVS1,PM2,PP5_Moderate",PVS1,0.339899,"{'variants': [{'chr': '17', 'pos': 78086770, '..."
4,NM_000152.4:c.2706delG,PVS1,PVS1;PM1,1.397128,PVS1,PM1,,,0.0,,...,,Gene: GAA; p.Lys903ArgfsTer2; From Recommendat...,"{'pvs1': {'name': 'PVS1', 'prediction': <AutoA...",,"PVS1,PP4_Moderate,PM3,PM2_Supporting",,"PVS1,PP4_Moderate,PM3,PM2_Supporting",PVS1,0.344012,"{'variants': [{'chr': '17', 'pos': 78092510, '..."


In [20]:
# Define the list of criteria to remove
not_implemented = [
    "PS2", "PS3", "PS4", "PM3", "PM6",
    "PP1", "PP4", "PP5", "BS3", "BS4",
    "BP2", "BP5", "BP6"
]

# Function to remove not_implemented criteria from a given criteria string
def filter_criteria(criteria_str):
    criteria_list = criteria_str.split(';')
    filtered_criteria = [c for c in criteria_list if c not in not_implemented]
    return ';'.join(filtered_criteria)

# Apply the function to the 'AutoACMG Criteria' and 'AutoACMG True Positives' columns
combined_df['AutoACMG Criteria'] = combined_df['AutoACMG Criteria'].apply(filter_criteria)
combined_df['AutoACMG True Positives'] = combined_df['AutoACMG True Positives'].apply(filter_criteria)


In [21]:
combined_df.to_csv("final.csv", index=False)