##### imports

In [5]:
import pandas as pd
import os
import requests
import time
import pprint
from urllib.parse import quote_plus
import re
import json
from tqdm import tqdm



### DFCI_2014_PES study

In [6]:
# get data
#TODO import as objects from harvester
## add parameter to skip first 4 lines in patient and study sample data

init_mut_df = pd.read_csv('/Users/costellh/repos/metakb/hmc_notebooks/es_dfarber_broad_2014_extracted/es_dfarber_broad_2014/data_mutations.txt', sep='\t')
init_study_meta = pd.read_csv('/Users/costellh/repos/metakb/hmc_notebooks/es_dfarber_broad_2014_extracted/es_dfarber_broad_2014/meta_study.txt', sep='\t')
init_patient_df = pd.read_csv('/Users/costellh/repos/metakb/hmc_notebooks/es_dfarber_broad_2014_extracted/es_dfarber_broad_2014/data_clinical_patient.txt', sep='\t', skiprows=4)
init_sample_df = pd.read_csv('/Users/costellh/repos/metakb/hmc_notebooks/es_dfarber_broad_2014_extracted/es_dfarber_broad_2014/data_clinical_sample.txt', sep='\t', skiprows=4)


#### Variant data: subset columns, check for duplicates 

In [7]:
# clean variant data

# subset for necessary columns
mut_df = init_mut_df.filter(['Hugo_Symbol',
                        'Chromosome',
                        'Start_Position', 
                        'End_Position',
                        'Consequence',
                        'Variant_Classification',
                        'Variant_Type',
                        'Reference_Allele',
                        'Tumor_Seq_Allele2',
                        'Tumor_Sample_Barcode',
                        'Sequence_Source',
                        'HGVSc',
                        'HGVSp',
                        'HGVSp_Short',
                        'Transcript_ID',
                        'RefSeq',
                        'Protein_position'
                       ], axis=1)


# Strip whitespace and retry
mut_df.columns = mut_df.columns.str.strip()
mut_df = mut_df.rename(columns={'Tumor_Sample_Barcode': 'SAMPLE_ID'})

# Check duplicate count
num_duplicates = mut_df.duplicated().sum()
print(f"Number of duplicate rows : {num_duplicates}")

# TODO check duplicates for sanity check
# print duplicates (excluding first instance)
if num_duplicates > 0:
    print("\nDuplicate rows (excluding first instance):")
    print(mut_df[mut_df.duplicated()])

# # print full duplicate groups (including the first occurrences)
#     print("\nAll rows involved in duplication:")
#     print(mut_df[mut_df.duplicated(keep=False)])

# save duplicate rows to file
    dupes = mut_df[mut_df.duplicated(keep=False)]
    mut_df.to_csv('mut_dupes.csv', index=False)
    

    
# remove duplicates, but keep first occurrence
    mut_df = mut_df.drop_duplicates()
    print(f"\nDataFrame shape after removing duplicates: {mut_df.shape}")
else:
    print("No duplicate rows found.")

    
# print(mut_df.columns)
# # print(mut_df.head())

Number of duplicate rows : 495

Duplicate rows (excluding first instance):
      Hugo_Symbol Chromosome  Start_Position  End_Position  \
720         ABCA1          9       107607765     107607765   
730         ABCA3         16         2338066       2338066   
783         ABCC9         12        21954066      21954066   
813         ACACA         17        35640173      35640173   
829          ACHE          7       100490251     100490251   
...           ...        ...             ...           ...   
15134       STAG2          X       123179197     123179197   
15155     ZDHHC15          X        74742823      74742824   
15191       HUWE1          X        53579734      53579734   
15216     SHROOM4          X        50350700      50350700   
15218        SOX3          X       139586714     139586714   

              Consequence Variant_Classification Variant_Type  \
720      missense_variant      Missense_Mutation          SNP   
730      missense_variant      Missense_Mutation  

#### Patient data: subset columns, check for duplicates

In [8]:
# clean patient data
# print(patient_df.shape)
# print(patient_df.columns)

# subset data
# subset for necessary columns
patient_df = init_patient_df.filter(['PATIENT_ID',
                        'AGE',
                        'SEX', 
                        'ETHNICITY',
                        'Consequence'
                       ], axis=1)

print(patient_df.columns)

# Check duplicate count
num_duplicates = patient_df.duplicated().sum()
print(f"Number of duplicate rows : {num_duplicates}")

# print duplicates (excluding first instance)
if num_duplicates > 0:
    print("\nDuplicate rows (excluding first instance):")
    print(patient_df[patient_df.duplicated()])

# # print full duplicate groups (including the first occurrences)
#     print("\nAll rows involved in duplication:")
#     print(mut_df[mut_df.duplicated(keep=False)])

# remove duplicates, but keep first occurrence
    patient_df = patient_df.drop_duplicates()
    print(f"\nDataFrame shape after removing duplicates: {patient_df.shape}")
else:
    print("No duplicate rows found.")

Index(['PATIENT_ID', 'AGE', 'SEX', 'ETHNICITY'], dtype='object')
Number of duplicate rows : 0
No duplicate rows found.


#### Sample data: subset columns, check for duplicates

In [9]:
# clean sample data

# print(sample_df.shape)
# print(sample_df.columns)
# print(sample_df.head())


# subset data
# subset for necessary columns
sample_df = init_sample_df.filter(['PATIENT_ID',
                                     'SAMPLE_ID',
                                     'SAMPLE_CLASS',
                                     'ONCOTREE_CODE',
                                     'CANCER_TYPE',
                                     'CANCER_TYPE_DETAILED',
                                     'TMB_NONSYNONYMOUS'
                                    ], axis=1)


print(sample_df.columns)


# Check duplicate count
num_duplicates = sample_df.duplicated().sum()
print(f"Number of duplicate rows : {num_duplicates}")

# print duplicates (excluding first instance)
if num_duplicates > 0:
    print("\nDuplicate rows (excluding first instance):")
    print(sample_df[sample_df.duplicated()])

# # print full duplicate groups (including the first occurrences)
#     print("\nAll rows involved in duplication:")
#     print(mut_df[mut_df.duplicated(keep=False)])

# remove duplicates, but keep first occurrence
    sample_df = sample_df.drop_duplicates()
    print(f"\nDataFrame shape after removing duplicates: {sample_df.shape}")
else:
    print("No duplicate rows found.")

Index(['PATIENT_ID', 'SAMPLE_ID', 'SAMPLE_CLASS', 'ONCOTREE_CODE',
       'CANCER_TYPE', 'CANCER_TYPE_DETAILED', 'TMB_NONSYNONYMOUS'],
      dtype='object')
Number of duplicate rows : 0
No duplicate rows found.


#### Combine dataframes

In [10]:
# combine dataframes
## TODO: redo so that all columns are available and that records with no samples will be there. "left outer join"? will get NAs.

init_combined_df = mut_df.merge(sample_df, on='SAMPLE_ID', how='left')

# print(mut_df.columns)
# print(mut_df.shape)

# print(sample_df.columns)
# print(sample_df.shape)

# print(init_combined_df.columns)
# print(init_combined_df.shape)

#add patient_df
combined_df = init_combined_df.merge(patient_df, on='PATIENT_ID', how='left')

# print(patient_df.columns)
# print(patient_df.shape)

# print(combined_df.columns)
# print(combined_df.shape)


#### Add STUDY_ID column

In [11]:

# add column for study id
study_id = init_study_meta.iloc[0, 0]
study_id = study_id.replace('cancer_study_identifier: ', '')
# study_id
combined_df['STUDY_ID'] = study_id


#### Check for (and remove) duplicates

In [12]:


# Check duplicate count
num_duplicates = combined_df.duplicated().sum()
print(f"Number of duplicate rows : {num_duplicates}")

# print duplicates (excluding first instance)
if num_duplicates > 0:
    print("\nDuplicate rows (excluding first instance):")
    print(combined_df[combined_df.duplicated()])

# # print full duplicate groups (including the first occurrences)
#     print("\nAll rows involved in duplication:")
#     print(mut_df[mut_df.duplicated(keep=False)])

# remove duplicates, but keep first occurrence
    combined_df = combined_df.drop_duplicates()
    print(f"\nDataFrame shape after removing duplicates: {combined_df.shape}")
else:
    print("No duplicate rows found.")



Number of duplicate rows : 0
No duplicate rows found.


#### Remove data from cell lines

In [13]:

# remove cell lines

original_shape = combined_df.shape
print(f"Original shape: {original_shape}")

#lines to remove
removed_df = combined_df[combined_df['SAMPLE_CLASS'] == 'Cell line']

# remove cell lines
filtered_df = combined_df[combined_df['SAMPLE_CLASS'] != 'Cell line']

# calculate how many rows were removed
rows_removed = original_shape[0] - filtered_df.shape[0]
print(f"Removed {rows_removed} rows where SAMPLE_CLASS == 'Cell line'")

# print new shape
print(f"New shape: {filtered_df.shape}")

# reassign df
combined_df = filtered_df

removed_df.to_csv('cell_lines_removed.csv', index=False)
removed_df.value_counts("SAMPLE_CLASS")


Original shape: (14737, 27)
Removed 3123 rows where SAMPLE_CLASS == 'Cell line'
New shape: (11614, 27)


SAMPLE_CLASS
Cell line    3123
Name: count, dtype: int64

#### Write value counts for NaNs

In [14]:
# combined_df.isna().sum()

# Hugo_Symbol                   0
# Chromosome                    0
# Start_Position                0
# End_Position                  0
# Consequence                 124 - some of the Variant_Classification=Silent have no consequence described
# Variant_Classification        0
# Variant_Type                  0
# Reference_Allele              0
# Tumor_Seq_Allele2             0
# SAMPLE_ID                     0
# Sequence_Source               0
# HGVSc                       136 - some of the Variant_Classification=Silent and all of the Variant_Classification=3'Flank and 5'Flank
# HGVSp                       346 
# Transcript_ID               124 - some of the Variant_Classification=Silent have no Transcript_ID described
# RefSeq                     1338
# Protein_position            334
# Gnomad_Notation               0
# PATIENT_ID                    0
# SAMPLE_CLASS                  0
# ONCOTREE_CODE                 0
# CANCER_TYPE                   0
# CANCER_TYPE_DETAILED          0
# TMB_NONSYNONYMOUS             0
# AGE                        4745 - some ages undisclosed
# SEX                           0
# ETHNICITY                 10900 - many patients' ethnicities undisclosed
# STUDY_ID                      0


# define output folder (will create it if needed)
output_dir = "value_counts_by_column"
os.makedirs(output_dir, exist_ok=True)

# loop through each column
for col in combined_df.columns:
    # build filename
    filename = f"value_counts_{col}.txt"
    filepath = os.path.join(output_dir, filename)
    
    # write counts to file
    with open(filepath, "w") as f:
        f.write(f"Value counts for column: {col}\n\n")
        f.write(combined_df[col].value_counts(dropna=False).to_string())
        f.write("\n")

print(f"✅ Value counts written for {len(combined_df.columns)} columns to folder: {output_dir}")


✅ Value counts written for 27 columns to folder: value_counts_by_column


#### Replace missing patient, sample data with "No_Data"

In [15]:
#filling in NaNs - AGE, ETHNICITY, Consequence
# TODO add consequence with annotation

cols_to_fill = ['Consequence', 'AGE', 'ETHNICITY']
fill_value = "No_Data"

for col in cols_to_fill:
    combined_df[col] = combined_df[col].fillna(fill_value)

combined_df.isna().sum()

Hugo_Symbol                  0
Chromosome                   0
Start_Position               0
End_Position                 0
Consequence                  0
Variant_Classification       0
Variant_Type                 0
Reference_Allele             0
Tumor_Seq_Allele2            0
SAMPLE_ID                    0
Sequence_Source              0
HGVSc                      136
HGVSp                      346
HGVSp_Short                136
Transcript_ID              124
RefSeq                    1338
Protein_position           334
PATIENT_ID                   0
SAMPLE_CLASS                 0
ONCOTREE_CODE                0
CANCER_TYPE                  0
CANCER_TYPE_DETAILED         0
TMB_NONSYNONYMOUS            0
AGE                          0
SEX                          0
ETHNICITY                    0
STUDY_ID                     0
dtype: int64

#### Construct GnomAD variant ID column

In [16]:
# construct Gnomad variant ID column
combined_df["temp_Gnomad_Notation"] = combined_df.apply(
    lambda row: f"{row['Chromosome']}-{row['Start_Position']}-{row['Reference_Allele']}-{row['Tumor_Seq_Allele2']}",
    axis=1
)

#### Correcting Chromosome 23 samples to X or Y

##### Write initial combined_df to file

In [17]:
combined_df.to_csv('output0.csv', index=False)

##### Set test variables and REST API variables

In [18]:
BASE_URL = "https://normalize.cancervariants.org/variation/"
# HEADERS = {"Accept": "application/json"}
HEADERS = {
    "Accept": "application/json",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"  # mimics a browser
}
# variant = "23-2408485-G-C"
# gene = "ZBED1"

PATTERN = re.compile(r'^23-')        # anchored ^ so only the chromosome prefix is substituted


##### FUNCTION: Flag rows with Chrom23

In [19]:
def flag_rows_chrom_23(df):
    """
    Create "Chrom_23" column, True for those with Chromosome = 23

    Parameters
    ----------
    df : pd.DataFrame
        Must contain column 'Chromosome'.
    
    Returns
    -------
    dataframe
    """
    df["Chrom_23"] = False
    # print(combined_df.head)
    df["Chrom_23"] = df["Chromosome"].astype(str).str.strip().eq("23")
    df.loc[df["Chromosome"] == 23, "Chrom_23"] = True
    # print(combined_df["Chrom_23"].value_counts())
    # print(combined_df["Chromosome"].value_counts())
    return df

combined_df = flag_rows_chrom_23(combined_df)

combined_df.to_csv("output_flag_chrom_23.csv", index=False)

##### FUNCTION: change female chrom23 to X

In [20]:
# print(combined_df["Chromosome"].value_counts())

def chr23_female(df):
    """
    Convert Chromosome 23 to 'X' for rows where SEX is female.
    
    Parameters
    ----------
    df : pd.DataFrame
        Must contain columns 'Chromosome' and 'SEX'.
    
    Returns
    -------
    dataframe
    """
    # Ensure we’re comparing like with like
    chr_col = df["Chromosome"].astype(str).str.strip()
    sex_col = df["SEX"].astype(str).str.upper().str.strip()   # handles 'F', 'f', 'Female', etc.
    
    mask = (chr_col == "23") & (sex_col.str.startswith("F"))
    df.loc[mask, "Chromosome"] = "X"
    return df

combined_df = chr23_female(combined_df)

# print(combined_df["Chromosome"].value_counts())

combined_df.to_csv('output1_femaleX.csv', index=False)

##### FUNCTION: Add cols for Chr23_X and Chr23_Y, fill with false

In [21]:
def add_cols_chrom_23_male(df):
    """
    Create "Chr23_X and Chr23_Y" columns, fill with false

    Parameters
    ----------
    df : pd.DataFrame
        Must contain column 'Chromosome'.
    
    Returns
    -------
    dataframe
    """
    df["Chr23_X"] = False
    df["Chr23_Y"] = False
    # df["Chrom_23"] = df["Chromosome"].astype(str).str.strip().eq("23")
    # df.loc[df["Chromosome"] == 23, "Chrom_23"] = True
    # print(combined_df["Chr23_X"].value_counts())
    # print(combined_df["Chr23_Y"].value_counts())
    return df

# combined_df = add_cols_chrom_23_male(combined_df)

# combined_df.to_csv('output2_new_chr23_boolean_cols.csv', index=False)

##### Setting chromosome 23 variant notations as a list

In [66]:
# chrom_23_list = combined_df.loc[
#     combined_df["temp_Gnomad_Notation"].str.startswith("23-", na=False),
#     "temp_Gnomad_Notation"
# ].tolist()

# with open("chrom_23_list.json", "w") as f:
#     json.dump(chrom_23_list, f, indent=2)

# variant = chrom_23_list

# print(chrom_23_list[1])

23-129273827-C-T


##### FUNCTION: Adjust GnomAD variant to accept X

In [67]:
def chr23_to_X(variant: list[str]) -> list[str]:
    """Convert any '23-' prefix in a list of variants to 'X-'."""
    return [PATTERN.sub('X-', v) if isinstance(v, str) else v for v in variant]
    

##### FUNCTION: Adjust GnomAD variant to accept Y

In [68]:
def chr23_to_Y(variant: list[str]) -> list[str]:
    """Convert any '23-' prefix in a list of variants to 'X-'."""
    return [PATTERN.sub('Y-', v) if isinstance(v, str) else v for v in variant]

##### FUNCTION: Test tokenization

In [69]:
# def test_tokenization(variant):
#     """Fetch gene from VICC variation normalizer"""
#     url = f"{BASE_URL}normalize?q={variant}"
#     response = requests.get(url, headers=HEADERS)
#     if response.status_code == 200:
#         return response.json()
#     else:
#         print(f"Error {response.status_code}: {response.text}")
#         return None

In [70]:

# # --- Query and collect results ---
# results = []

# for variant in variant:
#     try:
#         url = f"{BASE_URL}?q={variant}&assembly=GRCh37"
#         response = requests.get(url, headers=HEADERS)
#         if response.status_code == 200:
#             data = response.json()
#             results.append({
#                 "original_variant": variant,
#                 "response": json.dumps(data)  # store raw JSON as string
#             })
#         else:
#             results.append({
#                 "original_variant": variant,
#                 "response": f"Error {response.status_code}: {response.text}"
#             })
#     except Exception as e:
#         results.append({
#             "original_variant": variant,
#             "response": f"Exception: {str(e)}"
#         })
    
#     time.sleep(0.5)  # polite rate-limiting

# # --- Save to CSV ---
# results_df = pd.DataFrame(results)
# results_df.to_csv("normalized_variants_output.csv", index=False)
# print("Saved results to normalized_variants_output.csv")


In [76]:

def test_tokenization(variant, output_csv="normalized_variants_output.csv", delay=0.5):
    """
    Fetch normalized variant info from VICC API for each variant in list.
    
    Parameters
    ----------
    variant_list : list of str
        List of GnomAD-style variants (e.g., '23-2408485-G-C').
    output_csv : str
        Filename for the output CSV.
    delay : float
        Seconds to wait between API requests (default 0.5).
    
    Returns
    -------
    pd.DataFrame
        DataFrame with original variant and raw JSON string response.
    """
    results = []

    for v in variant:
        url = f"{BASE_URL}normalize?q={v}"

        try:
            response = requests.get(url, headers=HEADERS)
            # if response.status_code == 200:
            data = response.json()
            results.append({
                "variant": v,
                "response": json.dumps(data)  # store raw JSON as string
            })
                
            # else:
            #     results.append({
            #         "variant": v,
            #         "response": f"Error {response.status_code}: {response.text}"
            #     })
        except Exception as e:
            results.append({
                "variant": v,
                "response": f"Exception: {str(e)}"
            })
        
        time.sleep(delay)

    # return results
    print(type(results))

#     if response.status_code == 200:
#         return response.json()
#     else:
#         print(f"Error {response.status_code}: {response.text}")
#         return None

    API_response_df = pd.DataFrame(results)
    API_response_df.to_csv(output_csv, index=False)
    print(f"✅ Saved {len(results)} results to {output_csv}")
    return results
    # return API_response_df

##### FUNCTION: Check if variant on X

In [77]:
def check_for_x_variant(df, variant):
    variant_x = chr23_to_X(variant)
    x_r       = test_tokenization(variant_x)

    # if "variation" not in x_r:
    #     return df

    if x_r is None or "variation" not in x_r:   # <- ✅ Add `x_r is None` check here
        return df

    x_symbols = [
        gene["symbol"]
        for ext in x_r["variation"].get("extensions", [])
        if ext.get("name") == "mane_genes"
        for gene in ext.get("value", [])
        if "symbol" in gene
    ]

    if x_symbols:
        if "Chr23_X" not in df.columns:
            df["Chr23_X"] = False
        df.loc[df["Hugo_Symbol"].isin(x_symbols), "Chr23_X"] = True
        print("X symbols:", x_symbols)

    return df


##### FUNCTION: Check if variant on Y

In [78]:
def check_for_y_variant(df, variant):
    variant_y = chr23_to_Y(variant)
    y_r       = test_tokenization(variant_y)

    # if "variation" not in y_r:          # guard against 422 / 404 replies
    #     return df

    if y_r is None or "variation" not in y_r:   # <- ✅ Add `y_r is None` check here
        return df

    y_symbols = [
        gene["symbol"]
        for ext in y_r["variation"].get("extensions", [])
        if ext.get("name") == "mane_genes"
        for gene in ext.get("value", [])
        if "symbol" in gene
    ]

    if y_symbols:
        if "Chr23_Y" not in df.columns:
            df["Chr23_Y"] = False
        df.loc[df["Hugo_Symbol"].isin(y_symbols), "Chr23_Y"] = True
        print("Y symbols:", y_symbols)

    print(variant_y)

    return df


##### FUNCTION: Master function for dealing with male chrom23 

In [79]:
def chr23_male(df, variant):
    df = add_cols_chrom_23_male(df)          # prep chromosome-23 columns

    df = check_for_x_variant(df, variant)    # pass **both** args
    df = check_for_y_variant(df, variant)

    df.to_csv("output2_new_chr23_boolean_cols.csv", index=False)
    return df


##### RUN: driver function

In [80]:
result_df = chr23_male(combined_df, variant)

print(result_df["Chr23_X"].value_counts(dropna=False))
print(result_df["Chr23_Y"].value_counts(dropna=False))

result_df.to_csv("output_post_23_BOOLEAN.csv", index=False)

#tqdm

<class 'list'>
✅ Saved 124 results to normalized_variants_output.csv
<class 'list'>
✅ Saved 124 results to normalized_variants_output.csv
Chr23_X
False    11614
Name: count, dtype: int64
Chr23_Y
False    11614
Name: count, dtype: int64


##### FUNCTION: Reassign male chrom 23s

In [42]:
def correct_male_chrom23(df):
    # Initialize ambig_chrom with a default value
    df["ambig_chrom"] = "non-ambiguous"

    # Limit all logic to Chromosome 23 rows
    chr23_mask = df["Chromosome"] == 23

    # X-only: Chr23_X == True, Chr23_Y == False
    mask_x = chr23_mask & (df["Chr23_X"] == True) & (df["Chr23_Y"] == False)
    df.loc[mask_x, "Chromosome"] = "X"

    # Y-only: Chr23_Y == True, Chr23_X == False
    mask_y = chr23_mask & (df["Chr23_Y"] == True) & (df["Chr23_X"] == False)
    df.loc[mask_y, "Chromosome"] = "Y"

    # Both X and Y → ambiguous
    mask_xy = chr23_mask & (df["Chr23_X"] == True) & (df["Chr23_Y"] == True)
    df.loc[mask_xy, "ambig_chrom"] = "XY"

    # Neither X nor Y → ambiguous
    mask_neither = chr23_mask & (df["Chr23_X"] == False) & (df["Chr23_Y"] == False)
    df.loc[mask_neither, "ambig_chrom"] = "neither"

    # Reconfirm and correct values where ambig_chrom is non-ambiguous
    nonambig_mask = chr23_mask & (df["ambig_chrom"] == "non-ambiguous")
    df.loc[nonambig_mask & (df["Chr23_X"] == True), "Chromosome"] = "X"
    df.loc[nonambig_mask & (df["Chr23_Y"] == True), "Chromosome"] = "Y"

    return df


##### RUN: Correct male 23s and see if any ambiguous chromosome 23 values exist

In [43]:
correct_male_chrom23(result_df)

print(result_df["ambig_chrom"].value_counts())

result_df.to_csv("output_post_23_correction.csv", index=False)

ambig_chrom
non-ambiguous    11614
Name: count, dtype: int64


##### Next step?

In [155]:

# make "variables" a list from the df where chrom=23 and sex=M


#check for ambiguous chromosomes
# if none, create Gnomad column and write gnomad notations in it.
#delete temp gnomad column

#If there are ambiguous chromosomes, stop analysis
#if XY or None, stop to look through manually

# check gene symbols against each other
#write df

#remove temporary columns

#get missing p dots!

#consolidate driver function

#Need function that tkes gnomad notation and puts it through normalizer, then takes response and fashions it into the test fixture
#put response into columns "focus variant members", etc. 

#figure out other info for object - cohort, frequency, etc. 








In [156]:
# remove variant dupes per patient

# find duplicated (PATIENT_ID, Gnomad_Notation) pairs
dupe_mask = combined_df.duplicated(subset=["PATIENT_ID", "Gnomad_Notation"], keep="first")
# new DataFrame with the duplicated rows
patient_variant_dupes = combined_df[dupe_mask]
# remove those rows from the original DataFrame
combined_df_cleaned = combined_df[~dupe_mask]
# write removed rows to file
patient_variant_dupes.to_csv("patient_variant_dupes.csv", index=False)
# print the number of rows removed
print(f"Removed {patient_variant_dupes.shape[0]} rows with duplicated Gnomad_Notation per PATIENT_ID.")
# reassign dataframe:
combined_df = combined_df_cleaned

KeyError: Index(['Gnomad_Notation'], dtype='object')