##### imports

In [1761]:
import pandas as pd
import os
import requests
import time
import pprint
from urllib.parse import quote_plus
import re
import json



### DFCI_2014_PES study

In [1762]:
# get data
#TODO import as objects from harvester
## add parameter to skip first 4 lines in patient and study sample data

init_mut_df = pd.read_csv('/Users/costellh/repos/metakb/hmc_notebooks/es_dfarber_broad_2014_extracted/es_dfarber_broad_2014/data_mutations.txt', sep='\t')
init_study_meta = pd.read_csv('/Users/costellh/repos/metakb/hmc_notebooks/es_dfarber_broad_2014_extracted/es_dfarber_broad_2014/meta_study.txt', sep='\t')
init_patient_df = pd.read_csv('/Users/costellh/repos/metakb/hmc_notebooks/es_dfarber_broad_2014_extracted/es_dfarber_broad_2014/data_clinical_patient.txt', sep='\t', skiprows=4)
init_sample_df = pd.read_csv('/Users/costellh/repos/metakb/hmc_notebooks/es_dfarber_broad_2014_extracted/es_dfarber_broad_2014/data_clinical_sample.txt', sep='\t', skiprows=4)


#### Variant data: subset columns, check for duplicates 

In [1763]:
# clean variant data

# subset for necessary columns
mut_df = init_mut_df.filter(['Hugo_Symbol',
                        'Chromosome',
                        'Start_Position', 
                        'End_Position',
                        'Consequence',
                        'Variant_Classification',
                        'Variant_Type',
                        'Reference_Allele',
                        'Tumor_Seq_Allele2',
                        'Tumor_Sample_Barcode',
                        'Sequence_Source',
                        'HGVSc',
                        'HGVSp',
                        'HGVSp_Short',
                        'Transcript_ID',
                        'RefSeq',
                        'Protein_position'
                       ], axis=1)


# Strip whitespace and retry
mut_df.columns = mut_df.columns.str.strip()
mut_df = mut_df.rename(columns={'Tumor_Sample_Barcode': 'SAMPLE_ID'})

# Check duplicate count
num_duplicates = mut_df.duplicated().sum()
print(f"Number of duplicate rows : {num_duplicates}")

# TODO check duplicates for sanity check
# print duplicates (excluding first instance)
if num_duplicates > 0:
    print("\nDuplicate rows (excluding first instance):")
    print(mut_df[mut_df.duplicated()])

# # print full duplicate groups (including the first occurrences)
#     print("\nAll rows involved in duplication:")
#     print(mut_df[mut_df.duplicated(keep=False)])

# save duplicate rows to file
    dupes = mut_df[mut_df.duplicated(keep=False)]
    mut_df.to_csv('mut_dupes.csv', index=False)
    

    
# remove duplicates, but keep first occurrence
    mut_df = mut_df.drop_duplicates()
    print(f"\nDataFrame shape after removing duplicates: {mut_df.shape}")
else:
    print("No duplicate rows found.")

    
# print(mut_df.columns)
# # print(mut_df.head())

Number of duplicate rows : 495

Duplicate rows (excluding first instance):
      Hugo_Symbol Chromosome  Start_Position  End_Position  \
720         ABCA1          9       107607765     107607765   
730         ABCA3         16         2338066       2338066   
783         ABCC9         12        21954066      21954066   
813         ACACA         17        35640173      35640173   
829          ACHE          7       100490251     100490251   
...           ...        ...             ...           ...   
15134       STAG2          X       123179197     123179197   
15155     ZDHHC15          X        74742823      74742824   
15191       HUWE1          X        53579734      53579734   
15216     SHROOM4          X        50350700      50350700   
15218        SOX3          X       139586714     139586714   

              Consequence Variant_Classification Variant_Type  \
720      missense_variant      Missense_Mutation          SNP   
730      missense_variant      Missense_Mutation  

#### Patient data: subset columns, check for duplicates

In [1764]:
# clean patient data
# print(patient_df.shape)
# print(patient_df.columns)

# subset data
# subset for necessary columns
patient_df = init_patient_df.filter(['PATIENT_ID',
                        'AGE',
                        'SEX', 
                        'ETHNICITY',
                        'Consequence'
                       ], axis=1)

print(patient_df.columns)

# Check duplicate count
num_duplicates = patient_df.duplicated().sum()
print(f"Number of duplicate rows : {num_duplicates}")

# print duplicates (excluding first instance)
if num_duplicates > 0:
    print("\nDuplicate rows (excluding first instance):")
    print(patient_df[patient_df.duplicated()])

# # print full duplicate groups (including the first occurrences)
#     print("\nAll rows involved in duplication:")
#     print(mut_df[mut_df.duplicated(keep=False)])

# remove duplicates, but keep first occurrence
    patient_df = patient_df.drop_duplicates()
    print(f"\nDataFrame shape after removing duplicates: {patient_df.shape}")
else:
    print("No duplicate rows found.")

Index(['PATIENT_ID', 'AGE', 'SEX', 'ETHNICITY'], dtype='object')
Number of duplicate rows : 0
No duplicate rows found.


#### Sample data: subset columns, check for duplicates

In [1765]:
# clean sample data

# print(sample_df.shape)
# print(sample_df.columns)
# print(sample_df.head())


# subset data
# subset for necessary columns
sample_df = init_sample_df.filter(['PATIENT_ID',
                                     'SAMPLE_ID',
                                     'SAMPLE_CLASS',
                                     'ONCOTREE_CODE',
                                     'CANCER_TYPE',
                                     'CANCER_TYPE_DETAILED',
                                     'TMB_NONSYNONYMOUS'
                                    ], axis=1)


print(sample_df.columns)


# Check duplicate count
num_duplicates = sample_df.duplicated().sum()
print(f"Number of duplicate rows : {num_duplicates}")

# print duplicates (excluding first instance)
if num_duplicates > 0:
    print("\nDuplicate rows (excluding first instance):")
    print(sample_df[sample_df.duplicated()])

# # print full duplicate groups (including the first occurrences)
#     print("\nAll rows involved in duplication:")
#     print(mut_df[mut_df.duplicated(keep=False)])

# remove duplicates, but keep first occurrence
    sample_df = sample_df.drop_duplicates()
    print(f"\nDataFrame shape after removing duplicates: {sample_df.shape}")
else:
    print("No duplicate rows found.")

Index(['PATIENT_ID', 'SAMPLE_ID', 'SAMPLE_CLASS', 'ONCOTREE_CODE',
       'CANCER_TYPE', 'CANCER_TYPE_DETAILED', 'TMB_NONSYNONYMOUS'],
      dtype='object')
Number of duplicate rows : 0
No duplicate rows found.


#### Combine dataframes

In [1766]:
# combine dataframes

init_combined_df = mut_df.merge(sample_df, on='SAMPLE_ID', how='left')

# print(mut_df.columns)
# print(mut_df.shape)

# print(sample_df.columns)
# print(sample_df.shape)

# print(init_combined_df.columns)
# print(init_combined_df.shape)

#add patient_df
combined_df = init_combined_df.merge(patient_df, on='PATIENT_ID', how='left')

# print(patient_df.columns)
# print(patient_df.shape)

# print(combined_df.columns)
# print(combined_df.shape)


#### Add STUDY_ID column

In [1767]:

# add column for study id
study_id = init_study_meta.iloc[0, 0]
study_id = study_id.replace('cancer_study_identifier: ', '')
# study_id
combined_df['STUDY_ID'] = study_id


#### Check for (and remove) duplicates

In [1768]:


# Check duplicate count
num_duplicates = combined_df.duplicated().sum()
print(f"Number of duplicate rows : {num_duplicates}")

# print duplicates (excluding first instance)
if num_duplicates > 0:
    print("\nDuplicate rows (excluding first instance):")
    print(combined_df[combined_df.duplicated()])

# # print full duplicate groups (including the first occurrences)
#     print("\nAll rows involved in duplication:")
#     print(mut_df[mut_df.duplicated(keep=False)])

# remove duplicates, but keep first occurrence
    combined_df = combined_df.drop_duplicates()
    print(f"\nDataFrame shape after removing duplicates: {combined_df.shape}")
else:
    print("No duplicate rows found.")



Number of duplicate rows : 0
No duplicate rows found.


#### Remove data from cell lines

In [1769]:

# remove cell lines

original_shape = combined_df.shape
print(f"Original shape: {original_shape}")

#lines to remove
removed_df = combined_df[combined_df['SAMPLE_CLASS'] == 'Cell line']

# remove cell lines
filtered_df = combined_df[combined_df['SAMPLE_CLASS'] != 'Cell line']

# calculate how many rows were removed
rows_removed = original_shape[0] - filtered_df.shape[0]
print(f"Removed {rows_removed} rows where SAMPLE_CLASS == 'Cell line'")

# print new shape
print(f"New shape: {filtered_df.shape}")

# reassign df
combined_df = filtered_df

removed_df.to_csv('cell_lines_removed.csv', index=False)
removed_df.value_counts("SAMPLE_CLASS")


Original shape: (14737, 27)
Removed 3123 rows where SAMPLE_CLASS == 'Cell line'
New shape: (11614, 27)


SAMPLE_CLASS
Cell line    3123
Name: count, dtype: int64

#### Write value counts for NaNs

In [1770]:
# combined_df.isna().sum()

# Hugo_Symbol                   0
# Chromosome                    0
# Start_Position                0
# End_Position                  0
# Consequence                 124 - some of the Variant_Classification=Silent have no consequence described
# Variant_Classification        0
# Variant_Type                  0
# Reference_Allele              0
# Tumor_Seq_Allele2             0
# SAMPLE_ID                     0
# Sequence_Source               0
# HGVSc                       136 - some of the Variant_Classification=Silent and all of the Variant_Classification=3'Flank and 5'Flank
# HGVSp                       346 
# Transcript_ID               124 - some of the Variant_Classification=Silent have no Transcript_ID described
# RefSeq                     1338
# Protein_position            334
# Gnomad_Notation               0
# PATIENT_ID                    0
# SAMPLE_CLASS                  0
# ONCOTREE_CODE                 0
# CANCER_TYPE                   0
# CANCER_TYPE_DETAILED          0
# TMB_NONSYNONYMOUS             0
# AGE                        4745 - some ages undisclosed
# SEX                           0
# ETHNICITY                 10900 - many patients' ethnicities undisclosed
# STUDY_ID                      0


# define output folder (will create it if needed)
output_dir = "value_counts_by_column"
os.makedirs(output_dir, exist_ok=True)

# loop through each column
for col in combined_df.columns:
    # build filename
    filename = f"value_counts_{col}.txt"
    filepath = os.path.join(output_dir, filename)
    
    # write counts to file
    with open(filepath, "w") as f:
        f.write(f"Value counts for column: {col}\n\n")
        f.write(combined_df[col].value_counts(dropna=False).to_string())
        f.write("\n")

print(f"✅ Value counts written for {len(combined_df.columns)} columns to folder: {output_dir}")


✅ Value counts written for 27 columns to folder: value_counts_by_column


#### Replace missing patient, sample data with "No_Data"

In [1771]:
#filling in NaNs - AGE, ETHNICITY, Consequence
# TODO add consequence with annotation

cols_to_fill = ['Consequence', 'AGE', 'ETHNICITY']
fill_value = "No_Data"

for col in cols_to_fill:
    combined_df[col] = combined_df[col].fillna(fill_value)

combined_df.isna().sum()

Hugo_Symbol                  0
Chromosome                   0
Start_Position               0
End_Position                 0
Consequence                  0
Variant_Classification       0
Variant_Type                 0
Reference_Allele             0
Tumor_Seq_Allele2            0
SAMPLE_ID                    0
Sequence_Source              0
HGVSc                      136
HGVSp                      346
HGVSp_Short                136
Transcript_ID              124
RefSeq                    1338
Protein_position           334
PATIENT_ID                   0
SAMPLE_CLASS                 0
ONCOTREE_CODE                0
CANCER_TYPE                  0
CANCER_TYPE_DETAILED         0
TMB_NONSYNONYMOUS            0
AGE                          0
SEX                          0
ETHNICITY                    0
STUDY_ID                     0
dtype: int64

#### Construct TEMP GnomAD variant ID column

In [1772]:
# construct Gnomad variant ID column
combined_df["temp_Gnomad_Notation"] = combined_df.apply(
    lambda row: f"{row['Chromosome']}-{row['Start_Position']}-{row['Reference_Allele']}-{row['Tumor_Seq_Allele2']}",
    axis=1
)

#### Correcting Chromosome 23 samples to X or Y

##### Write initial combined_df to file

In [1773]:
combined_df.to_csv('output0.csv', index=False)

##### Set test variables and REST API variables

In [1774]:
# BASE_URL = "https://normalize.cancervariants.org/variation/"
# # HEADERS = {"Accept": "application/json"}
# HEADERS = {
#     "Accept": "application/json",
#     "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"  # mimics a browser
# }
# variant = "23-2408485-G-C"
# gene = "ZBED1"

PATTERN = re.compile(r'^23-')        # anchored ^ so only the chromosome prefix is substituted



##### FUNCTION: Flag rows with Chrom23

In [1775]:
def flag_rows_chrom_23(df):
    """
    Create "Chrom_23" column, True for those with Chromosome = 23

    Parameters
    ----------
    df : pd.DataFrame
        Must contain column 'Chromosome'.
    
    Returns
    -------
    dataframe
    """
    df["Chrom_23"] = False
    # print(combined_df.head)
    df["Chrom_23"] = df["Chromosome"].astype(str).str.strip().eq("23")
    df.loc[df["Chromosome"] == 23, "Chrom_23"] = True
    # print(combined_df["Chrom_23"].value_counts())
    # print(combined_df["Chromosome"].value_counts())
    
    return df

combined_df = flag_rows_chrom_23(combined_df)

combined_df.to_csv("output_flag_chrom_23.csv", index=False)

##### FUNCTION: change female chrom23 to X

In [1776]:
# print(combined_df["Chromosome"].value_counts())

def chr23_female(df):
    """
    Convert Chromosome 23 to 'X' for rows where SEX is female.
    
    Parameters
    ----------
    df : pd.DataFrame
        Must contain columns 'Chromosome' and 'SEX'.
    
    Returns
    -------
    dataframe
    """
    # Ensure we’re comparing like with like
    chr_col = df["Chromosome"].astype(str).str.strip()
    sex_col = df["SEX"].astype(str).str.upper().str.strip()   # handles 'F', 'f', 'Female', etc.
    
    mask = (chr_col == "23") & (sex_col.str.startswith("F"))
    df.loc[mask, "Chromosome"] = "X"
    return df

combined_df = chr23_female(combined_df)

# print(combined_df["Chromosome"].value_counts())

combined_df.to_csv('output1_femaleX.csv', index=False)

##### FUNCTION: Add cols for Chr23_X and Chr23_Y, fill with false

In [1777]:
def add_cols_chrom_23_male(df):
    """
    Create "Chr23_X and Chr23_Y" columns, fill with false

    Parameters
    ----------
    df : pd.DataFrame
        Must contain column 'Chromosome'.
    
    Returns
    -------
    dataframe
    """
    df["Chr23_X"] = False
    df["Chr23_Y"] = False
    # df["Chrom_23"] = df["Chromosome"].astype(str).str.strip().eq("23")
    # df.loc[df["Chromosome"] == 23, "Chrom_23"] = True
    # print(combined_df["Chr23_X"].value_counts())
    # print(combined_df["Chr23_Y"].value_counts())
    return df

combined_df = add_cols_chrom_23_male(combined_df)
combined_df.to_csv('output2_new_chr23_boolean_cols.csv', index=False)

##### FUNCTION: Adjust GnomAD variant to accept X

In [1778]:
def chr23_to_X(variant: str) -> str:
    """Convert a single '23-' prefix in a variant string to 'X-'."""
    return PATTERN.sub('X-', variant) if isinstance(variant, str) else variant

x_variant = chr23_to_X(variant)
# print(x_variant)

##### FUNCTION: Adjust GnomAD variant to accept Y

In [1779]:
def chr23_to_Y(variant: str) -> str:
    """Convert a '23-' prefix in a single variant string to 'Y-'."""
    return PATTERN.sub('Y-', variant) if isinstance(variant, str) else variant

y_variant = chr23_to_Y(variant)
# print(y_variant)

##### FUNCTION: Test tokenization

In [1780]:
def test_variant_tokenization(variant: str, delay=0.5):
    """
    Fetch normalized variant info from VICC API for a single variant string.

    Parameters
    ----------
    variant : str
        A GnomAD-style variant (e.g., '23-2408485-G-C').
    delay : float
        Seconds to wait between API requests (default 0.5).

    Returns
    -------
    pd.DataFrame
        DataFrame with original variant and raw JSON string response.
    """
    
    # BASE_URL = "https://normalize.cancervariants.org/variation/"
    BASE_URL = "http://localhost:8001/variation/"
    HEADERS = {
        "Accept": "application/json",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"  # mimics a browser
        }
    
    results = []

    # url = f"{BASE_URL}normalize?q={variant}"
    url = f"{BASE_URL}normalize?q={variant}&hgvs_dup_del_mode=default&input_assembly=GRCh37"

    try:
        response = requests.get(url, headers=HEADERS)
        if response.status_code == 200:
            data = response.json()
            results.append({
                "variant": variant,
                "response": json.dumps(data)
            })
        else:
            results.append({
                "variant": variant,
                "response": f"Error {response.status_code}: {response.text}"
            })
    except Exception as e:
        results.append({
            "variant": variant,
            "response": f"Exception: {str(e)}"
        })

    time.sleep(delay)

    return pd.DataFrame(results)


##### FUNCTION: Check if variant on X

In [1781]:
def check_for_x_variant(df, variant):
    # Convert the variant to X-style format (e.g., "23-..." → "X-...")
    variant_x = chr23_to_X(variant)

    # Query the API and get a one-row DataFrame
    x_df = test_variant_tokenization(variant_x)  # returns a one-row DataFrame

    # Extract and parse JSON string from the response
    raw_response = x_df.loc[0, "response"]

    try:
        parsed_response = json.loads(raw_response)
    except json.JSONDecodeError:
        print(f"❌ Failed to parse JSON for: {variant_x}")
        return df

    # Ensure 'variation' is in the parsed response
    if "variation" not in parsed_response:
        print(f"⚠️ 'variation' key not found for: {variant_x}")
        return df

    # Try to extract hgnc_id if available
    hgnc_id = None
    try:
        hgnc_id = parsed_response["variation"]["extensions"][0]["value"][0]["hgnc_id"]
        print(f"✅ Extracted HGNC ID for {variant_x}: {hgnc_id}")
    except (KeyError, IndexError, TypeError):
        print(f"⚠️ No HGNC ID found for: {variant_x}")

    # Initialize columns if needed
    if "x_hgnc_id" not in df.columns:
        df["x_hgnc_id"] = "no_value"
    if "Chr23_X" not in df.columns:
        df["Chr23_X"] = False

    # Reconstruct variant string from each row to match normalized variant_x
    reconstructed = (
        "X-" +
        df["Start_Position"].astype(str).str.strip() + "-" +
        df["Reference_Allele"].astype(str).str.strip() + "-" +
        df["Tumor_Seq_Allele2"].astype(str).str.strip()
    )

    # Mask: match rows on reconstructed X-variant and Chromosome == 23
    chrom_col = df["Chromosome"].astype(str).str.strip()
    mask = (chrom_col == "23") & (reconstructed == variant_x)

    print(f"🧪 Matched {mask.sum()} row(s) for {variant_x}")

    # Set Chr23_X = True
    df.loc[mask, "Chr23_X"] = True

    # Set x_hgnc_id if available
    if hgnc_id is not None:
        df.loc[mask, "x_hgnc_id"] = hgnc_id

    return df






##### FUNCTION: Check if variant on Y

In [1782]:
def check_for_y_variant(df, variant):
    # Convert the variant to Y-style format (e.g., "23-..." → "Y-...")
    variant_y = chr23_to_Y(variant)

    # Query the API and get a one-row DataFrame
    y_df = test_variant_tokenization(variant_y)  # returns a one-row DataFrame

    # Extract and parse JSON string from the response
    raw_response = y_df.loc[0, "response"]

    try:
        parsed_response = json.loads(raw_response)
    except json.JSONDecodeError:
        print(f"❌ Failed to parse JSON for: {variant_y}")
        return df

    # Ensure 'variation' is in the parsed response
    if "variation" not in parsed_response:
        print(f"⚠️ 'variation' key not found for: {variant_y}")
        return df

    # Try to extract hgnc_id if available
    hgnc_id = None
    try:
        hgnc_id = parsed_response["variation"]["extensions"][0]["value"][0]["hgnc_id"]
        print(f"✅ Extracted HGNC ID for {variant_y}: {hgnc_id}")
    except (KeyError, IndexError, TypeError):
        print(f"⚠️ No HGNC ID found for: {variant_y}")

    # Initialize columns if needed
    if "y_hgnc_id" not in df.columns:
        df["y_hgnc_id"] = "no_value"
    if "Chr23_Y" not in df.columns:
        df["Chr23_Y"] = False

    # Reconstruct variant string from each row to match normalized variant_y
    reconstructed = (
        "Y-" +
        df["Start_Position"].astype(str).str.strip() + "-" +
        df["Reference_Allele"].astype(str).str.strip() + "-" +
        df["Tumor_Seq_Allele2"].astype(str).str.strip()
    )

    # Mask: match rows on reconstructed Y-variant and Chromosome == 23
    chrom_col = df["Chromosome"].astype(str).str.strip()
    mask = (chrom_col == "23") & (reconstructed == variant_y)

    print(f"🧪 Matched {mask.sum()} row(s) for {variant_y}")

    # Set Chr23_Y = True
    df.loc[mask, "Chr23_Y"] = True

    # Set y_hgnc_id if available
    if hgnc_id is not None:
        df.loc[mask, "y_hgnc_id"] = hgnc_id

    return df



##### FUNCTION: Master function for dealing with male chrom23 

In [1783]:
def chr23_male(df, variant):
    # df = add_cols_chrom_23_male(df)          # prep chromosome-23 columns
    if "Chr23_X" not in df.columns:
        df["Chr23_X"] = False
    if "Chr23_Y" not in df.columns:
        df["Chr23_Y"] = False
    df = check_for_x_variant(df, variant)    # pass **both** args
    df = check_for_y_variant(df, variant)

    df.to_csv("output2_new_chr23_boolean_cols.csv", index=False)
    return df


##### Setting chromosome 23 variant notations as a list


In [1784]:

chrom_23_list = combined_df[
    (combined_df["SEX"].str.lower() == "male") &
    (combined_df["Chrom_23"] == True)
]["temp_Gnomad_Notation"].dropna().tolist()

print("Total variants in list:", len(chrom_23_list))
print("Sample of chrom_23_list:", chrom_23_list[:5])

# print(len(chrom_23_list))

# with open("chrom_23_list.json", "w") as f:
#     json.dump(chrom_23_list, f, indent=2)

Total variants in list: 53
Sample of chrom_23_list: ['23-70824012-C-T', '23-84258927-C-T', '23-153184693-C-T', '23-153175777-G-A', '23-135772859-A-C']


##### Initialize x_hgnc_id and y_hgnc_id columns


In [1785]:
if "x_hgnc_id" not in combined_df.columns:
    result_df["x_hgnc_id"] = "no_value"
if "y_hgnc_id" not in combined_df.columns:
    result_df["y_hgnc_id"] = "no_value"

##### RUN: driver function

In [1786]:
# Start with a copy of your combined_df
result_df = combined_df.copy()

# Iterate through each chromosome 23 variant for male samples
for variant in chrom_23_list:
    print(f"▶️ Checking variant: {variant}")  # Add this line
    result_df = chr23_male(result_df, variant)

# Now all updates are preserved in result_df
print(result_df["Chr23_X"].value_counts(dropna=False))
print(result_df["Chr23_Y"].value_counts(dropna=False))

# Save the final output to CSV
result_df.to_csv("output_post_23_BOOLEAN.csv", index=False)

##TODO: THIS IS ONLY CHANGING ZNF449 TO TRUE FOR X
##TODO: SOME GENES SHOW ALIASES, SOME ARE WRONG GENES
##TODO: MUST TRUST THAT NORMALIZER IS DOING ITS JOB. WHEN A NEW SYMBOL COMES BACK, BUT THEY ARE ALIASES, NORMALIZER GIVES 
##TODO: USE GENEID INSTEAD OF GENE SYMBOL FOR TOKENIZATION TEST. DO VARIATION TESTER FOR GENE SYMBOL AND THEN GET GENE ID. THEN TEST VARIANT, GET GENE ID. IF THEY ARE THE SAME, RECORD.  
##TODO: why aren't some of the x variants showing up with hgnc IDs.  Like X-84258927-C-T

▶️ Checking variant: 23-70824012-C-T
✅ Extracted HGNC ID for X-70824012-C-T: 15805
🧪 Matched 1 row(s) for X-70824012-C-T
⚠️ 'variation' key not found for: Y-70824012-C-T
▶️ Checking variant: 23-84258927-C-T
✅ Extracted HGNC ID for X-84258927-C-T: 24009
🧪 Matched 1 row(s) for X-84258927-C-T
⚠️ 'variation' key not found for: Y-84258927-C-T
▶️ Checking variant: 23-153184693-C-T
✅ Extracted HGNC ID for X-153184693-C-T: 674
🧪 Matched 1 row(s) for X-153184693-C-T
⚠️ 'variation' key not found for: Y-153184693-C-T
▶️ Checking variant: 23-153175777-G-A
✅ Extracted HGNC ID for X-153175777-G-A: 674
🧪 Matched 1 row(s) for X-153175777-G-A
⚠️ 'variation' key not found for: Y-153175777-G-A
▶️ Checking variant: 23-135772859-A-C
✅ Extracted HGNC ID for X-135772859-A-C: 685
🧪 Matched 1 row(s) for X-135772859-A-C
⚠️ 'variation' key not found for: Y-135772859-A-C
▶️ Checking variant: 23-135827412-A-G
✅ Extracted HGNC ID for X-135827412-A-G: 685
🧪 Matched 1 row(s) for X-135827412-A-G
⚠️ 'variation' key not

##### FUNCTION: Reassign male chrom 23s

In [1787]:
def correct_male_chrom23(df):
    # Initialize ambig_chrom column
    df["ambig_chrom"] = "non-ambiguous"

    def update_row(row):
        if row.get("Chrom_23") is True and str(row.get("SEX", "")).strip().lower() == "male":
            if row["Chr23_X"] and not row["Chr23_Y"]:
                row["Chromosome"] = "X"
            elif row["Chr23_Y"] and not row["Chr23_X"]:
                row["Chromosome"] = "Y"
            elif row["Chr23_X"] and row["Chr23_Y"]:
                row["ambig_chrom"] = "XY"
            else:  # neither Chr23_X nor Chr23_Y is True
                row["ambig_chrom"] = "neither"
        return row

    # Apply corrections row-by-row
    df = df.apply(update_row, axis=1)

    # Check for ambiguous rows
    ambig_rows = df[df["ambig_chrom"].isin(["XY", "neither"])]
    if not ambig_rows.empty:
        print(f"⚠️ Found {len(ambig_rows)} row(s) with ambiguous chromosome identification.")
        print(ambig_rows[["temp_Gnomad_Notation", "Hugo_Symbol", "ambig_chrom"]].head())

    return df




##### RUN: Correct male 23s and see if any ambiguous chromosome 23 values exist

In [1788]:
result_df = correct_male_chrom23(result_df)

print(result_df["ambig_chrom"].value_counts())

result_df.to_csv("output_post_23_correction.csv", index=False)

# result_df 


⚠️ Found 2 row(s) with ambiguous chromosome identification.
      temp_Gnomad_Notation Hugo_Symbol ambig_chrom
2906        23-1428413-C-T      CSF2RA          XY
10778       23-8434397-C-T       VCX3B          XY
ambig_chrom
non-ambiguous    11612
XY                   2
Name: count, dtype: int64


##### FUNCTION: correct ambiguous chromosomes

In [1789]:
def resolve_ambiguous_chromosomes(df):
    """
    Resolve ambiguous Chr23 variants in male samples and flag problematic rows.

    - For Chrom_23 == True and SEX == male:
      - If both x_hgnc_id and y_hgnc_id have values → ⚠️ warn
      - If neither has values → ❌ warn
      - If ambig_chrom == "XY" or "neither", try to resolve Chromosome field
    """
    chrom23_male_mask = (
        (df["Chrom_23"] == True) &
        (df["SEX"].str.lower().str.strip() == "male")
    )

    for idx, row in df[chrom23_male_mask].iterrows():
        x_id = str(row.get("x_hgnc_id", "")).strip()
        y_id = str(row.get("y_hgnc_id", "")).strip()
        ambig_status = row.get("ambig_chrom", "not_set")

        # ⚠️ Raise general warnings, regardless of ambig_chrom
        if x_id != "no_value" and y_id != "no_value":
            print(f"⚠️ Row {idx}: Both x_hgnc_id and y_hgnc_id have values. Manual check recommended.")
        elif x_id == "no_value" and y_id == "no_value":
            print(f"❌ Row {idx}: Neither x_hgnc_id nor y_hgnc_id has a value.")

        # ✅ Try resolving ambiguous chromosomes
        if ambig_status in ["XY", "neither"]:
            if x_id != "no_value" and y_id == "no_value":
                df.at[idx, "Chromosome"] = "X"
                df.at[idx, "ambig_chrom"] = "resolved_X"
                print(f"✅ Resolved index {idx} to Chromosome X")
            elif y_id != "no_value" and x_id == "no_value":
                df.at[idx, "Chromosome"] = "Y"
                df.at[idx, "ambig_chrom"] = "resolved_Y"
                print(f"✅ Resolved index {idx} to Chromosome Y")
            elif x_id != "no_value" and y_id != "no_value":
                print(f"⚠️ Index {idx}: Ambiguous Chromosome with both HGNC IDs present.")
            else:
                print(f"❌ Index {idx}: Ambiguous Chromosome with no HGNC ID found.")

    return df




##### RUN: resolve ambiguous chromosomes

In [1790]:
resolve_ambiguous_chromosomes(result_df)

✅ Resolved index 2906 to Chromosome X
❌ Row 6805: Neither x_hgnc_id nor y_hgnc_id has a value.
❌ Row 9877: Neither x_hgnc_id nor y_hgnc_id has a value.
✅ Resolved index 10778 to Chromosome X
❌ Row 10942: Neither x_hgnc_id nor y_hgnc_id has a value.


Unnamed: 0,Hugo_Symbol,Chromosome,Start_Position,End_Position,Consequence,Variant_Classification,Variant_Type,Reference_Allele,Tumor_Seq_Allele2,SAMPLE_ID,...,SEX,ETHNICITY,STUDY_ID,temp_Gnomad_Notation,Chrom_23,Chr23_X,Chr23_Y,x_hgnc_id,y_hgnc_id,ambig_chrom
0,ETFDH,4,159603535,159603535,missense_variant,Missense_Mutation,SNP,G,C,SJDES004,...,Male,White/Europe,es_dfarber_broad_2014,4-159603535-G-C,False,False,False,no_value,no_value,non-ambiguous
1,GJB4,1,35227334,35227334,missense_variant,Missense_Mutation,SNP,G,A,SJDES004,...,Male,White/Europe,es_dfarber_broad_2014,1-35227334-G-A,False,False,False,no_value,no_value,non-ambiguous
2,MUC5B,11,1272679,1272679,missense_variant,Missense_Mutation,SNP,G,A,SJDES004,...,Male,White/Europe,es_dfarber_broad_2014,11-1272679-G-A,False,False,False,no_value,no_value,non-ambiguous
3,PCDHAC1,5,140307515,140307515,synonymous_variant,Silent,SNP,G,A,SJDES004,...,Male,White/Europe,es_dfarber_broad_2014,5-140307515-G-A,False,False,False,no_value,no_value,non-ambiguous
4,YAP1,11,102080254,102080254,missense_variant,Missense_Mutation,SNP,C,T,SJDES004,...,Male,White/Europe,es_dfarber_broad_2014,11-102080254-C-T,False,False,False,no_value,no_value,non-ambiguous
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14667,ZNF41,X,47306909,47306909,missense_variant,Missense_Mutation,SNP,T,C,CHEWS017,...,Male,No_Data,es_dfarber_broad_2014,X-47306909-T-C,False,False,False,no_value,no_value,non-ambiguous
14668,ZNF645,X,22292037,22292037,missense_variant,Missense_Mutation,SNP,G,A,CHEWS002,...,Male,No_Data,es_dfarber_broad_2014,X-22292037-G-A,False,False,False,no_value,no_value,non-ambiguous
14669,ZRSR2,X,15841117,15841117,missense_variant,Missense_Mutation,SNP,C,T,01-P034,...,Male,No_Data,es_dfarber_broad_2014,X-15841117-C-T,False,False,False,no_value,no_value,non-ambiguous
14670,ZRSR2,X,15841120,15841120,missense_variant,Missense_Mutation,SNP,C,T,01-P131,...,Female,No_Data,es_dfarber_broad_2014,X-15841120-C-T,False,False,False,no_value,no_value,non-ambiguous


##### Create Gnomad_Notation column and populate.

In [1791]:
# construct Gnomad variant ID column
result_df["Gnomad_Notation"] = result_df.apply(
    lambda row: f"{row['Chromosome']}-{row['Start_Position']}-{row['Reference_Allele']}-{row['Tumor_Seq_Allele2']}",
    axis=1
)

##### FUNCTION: test gene tokenization


In [1792]:
def test_gene_tokenization(gene: str, delay=0.5):
    BASE_URL = "https://normalize.cancervariants.org/gene/"
    HEADERS = {
        "Accept": "application/json",
        "User-Agent": "Mozilla/5.0"
    }

    results = []

    url = f"{BASE_URL}normalize?q={gene}"

    try:
        response = requests.get(url, headers=HEADERS)
        if response.status_code == 200:
            data = response.json()
            results.append({
                "gene": gene,
                "response": json.dumps(data)
            })
        else:
            results.append({
                "gene": gene,
                "response": f"Error {response.status_code}: {response.text}"
            })
    except Exception as e:
        results.append({
            "gene": gene,
            "response": f"Exception: {str(e)}"
        })

    time.sleep(delay)
    return pd.DataFrame(results)


##### FUNCTION: test gene tokenization

In [None]:

def update_gene_hgnc_id_inplace(df):
    for idx, row in df.iterrows():
        if row.get("Chrom_23") and str(row.get("SEX", "")).strip().lower() == "male":
            try:
                result = test_gene_tokenization(row["Hugo_Symbol"])
                full_id = result["gene"]["id"]  # e.g., "normalize.gene.hgnc:447"
                hgnc_id = full_id.split(":")[-1]  # Extract "447"
                df.at[idx, "gene_hgnc_id"] = hgnc_id
            except Exception as e:
                print(f"⚠️ Could not extract HGNC ID for {row['Hugo_Symbol']}: {e}")

    print(df.head())
    return df


##### Setting genes as list


In [1805]:

# Ensure column exists and is initialized to "untested"
if "gene_hgnc_id" not in result_df.columns:
    result_df["gene_hgnc_id"] = "untested"

gene_list = result_df[
    (result_df["SEX"].str.lower() == "male") &
    (result_df["Chrom_23"] == True)
]["Hugo_Symbol"].dropna().tolist()

print("Total genes in list:", len(gene_list))
print("Sample of gene_list:", gene_list[:5])

Total genes in list: 53
Sample of gene_list: ['ACRC', 'APOOL', 'ARHGAP4', 'ARHGAP4', 'ARHGEF6']


##### RUN: Gene tokenization

In [1806]:
# Start with a copy of your combined_df
result_post_23_df = result_df.copy()

def populate_gene_hgnc_col(gene_list, df):
    # Add the column once
    if "gene_hgnc_id" not in df.columns:
        df["gene_hgnc_id"] = None

    for gene in gene_list:
        print(f"▶️ Checking gene: {gene}")
        gene_df = test_gene_tokenization(gene)

        # Extract and parse JSON string from the response
        raw_response = gene_df.loc[0, "response"]
        try:
            parsed_response = json.loads(raw_response)
        except json.JSONDecodeError:
            print(f"❌ Failed to parse JSON for: {gene}")
            continue

        if "gene" not in parsed_response:
            print(f"⚠️ 'gene' key not found for: {gene}")
            continue

        try:
            hgnc_id = parsed_response["gene"]["id"].split(":")[-1]
            print(f"✅ Extracted HGNC ID for {gene}: {hgnc_id}")
        except (KeyError, IndexError, TypeError):
            print(f"⚠️ No HGNC ID found for: {gene}")
            continue

        # Apply to matching rows
        df.loc[
            (df["Chrom_23"] == True) & 
            (df["SEX"].str.strip().str.lower() == "male") & 
            (df["Hugo_Symbol"].str.strip() == gene.strip()),
            "gene_hgnc_id"
        ] = hgnc_id

    return df




result_post_23_df = populate_gene_hgnc_col(gene_list, result_post_23_df)
print(result_post_23_df["gene_hgnc_id"].value_counts(dropna=False))
result_post_23_df.to_csv("output_post_gene_norm.csv", index=False)



▶️ Checking gene: ACRC
✅ Extracted HGNC ID for ACRC: 15805
▶️ Checking gene: APOOL
✅ Extracted HGNC ID for APOOL: 24009
▶️ Checking gene: ARHGAP4
✅ Extracted HGNC ID for ARHGAP4: 674
▶️ Checking gene: ARHGAP4
✅ Extracted HGNC ID for ARHGAP4: 674
▶️ Checking gene: ARHGEF6
✅ Extracted HGNC ID for ARHGEF6: 685
▶️ Checking gene: ARHGEF6
✅ Extracted HGNC ID for ARHGEF6: 685
▶️ Checking gene: BEND2
✅ Extracted HGNC ID for BEND2: 28509
▶️ Checking gene: BMX
✅ Extracted HGNC ID for BMX: 1079
▶️ Checking gene: CSF2RA
✅ Extracted HGNC ID for CSF2RA: 2435
▶️ Checking gene: DMD
✅ Extracted HGNC ID for DMD: 2928
▶️ Checking gene: ERCC6L
✅ Extracted HGNC ID for ERCC6L: 20794
▶️ Checking gene: GJB1
✅ Extracted HGNC ID for GJB1: 4283
▶️ Checking gene: GUCY2F
✅ Extracted HGNC ID for GUCY2F: 4691
▶️ Checking gene: HAUS7
✅ Extracted HGNC ID for HAUS7: 32979
▶️ Checking gene: HCFC1
✅ Extracted HGNC ID for HCFC1: 4839
▶️ Checking gene: HUWE1
✅ Extracted HGNC ID for HUWE1: 30892
▶️ Checking gene: KDM5C
✅ Ex

##### FUNCTION: check hgnc_id matches

In [1807]:
def validate_gene_hgnc_match(df):
    """
    Compare gene_hgnc_id to x_hgnc_id or y_hgnc_id for Chromosome 23 variants in male samples.
    
    Adds a column 'hgnc_id_match':
        - "untested" (default)
        - set to matching hgnc_id if match is found
        - "no_match" if mismatch is found

    Prints warnings for mismatches.
    """
    # Initialize column
    if "hgnc_id_match" not in df.columns:
        df["hgnc_id_match"] = "untested"

    # Define mask for male Chr23 variants
    mask = (df["Chrom_23"] == True) & (df["SEX"].str.strip().str.lower() == "male")

    for idx, row in df[mask].iterrows():
        gene_id = str(row.get("gene_hgnc_id", "")).strip()
        chrom = str(row.get("Chromosome", "")).strip()
        x_id = str(row.get("x_hgnc_id", "")).strip()
        y_id = str(row.get("y_hgnc_id", "")).strip()

        if chrom == "X":
            if gene_id == x_id and gene_id != "no_value":
                df.at[idx, "hgnc_id_match"] = gene_id
            else:
                df.at[idx, "hgnc_id_match"] = "no_match"
                print(f"❌ Row {idx}: X chromosome mismatch. gene_hgnc_id={gene_id}, x_hgnc_id={x_id}")
        elif chrom == "Y":
            if gene_id == y_id and gene_id != "no_value":
                df.at[idx, "hgnc_id_match"] = gene_id
            else:
                df.at[idx, "hgnc_id_match"] = "no_match"
                print(f"❌ Row {idx}: Y chromosome mismatch. gene_hgnc_id={gene_id}, y_hgnc_id={y_id}")

    return df


##### RUN: validate hgnc ID matches

In [1808]:
post_validation_df = validate_gene_hgnc_match(result_post_23_df)
post_validation_df.to_csv("output_post_validation.csv", index=False)

❌ Row 6805: X chromosome mismatch. gene_hgnc_id=19374, x_hgnc_id=no_value
❌ Row 9877: X chromosome mismatch. gene_hgnc_id=24553, x_hgnc_id=no_value
❌ Row 10942: X chromosome mismatch. gene_hgnc_id=29237, x_hgnc_id=no_value


#### Function: fill in missing values in df

##### FUNCTION: Query API for HGVSp

In [1811]:
#check NAs 
# post_validation_df.isna().sum()
variant = "X-70824012-C-T"
def retrieve_HGVSp_value(variant: str, delay=0.5):
    BASE_URL = "http://localhost:8001/variation/"
    HEADERS = {
        "Accept": "application/json",
        "User-Agent": "Mozilla/5.0"
    }

    results = []

    url = f"{BASE_URL}gnomad_vcf_to_protein?q={variant}"

    try:
        response = requests.get(url, headers=HEADERS)
        if response.status_code == 200:
            data = response.json()
            results.append({
                "variant": variant,
                "response": json.dumps(data)
            })
        else:
            results.append({
                "variant": variant,
                "response": f"Error {response.status_code}: {response.text}"
            })
    except Exception as e:
        results.append({
            "variant": variant,
            "response": f"Exception: {str(e)}"
        })

    time.sleep(delay)
    return pd.DataFrame(results)

r = retrieve_HGVSp_value(variant)
r


Unnamed: 0,variant,response
0,X-70824012-C-T,"{""warnings"": [], ""service_meta_"": {""name"": ""va..."


##### FUNCTION: update HGVSp in place for missing values

In [None]:

def update_HGVSp_inplace(df):
    for idx, row in df.iterrows():
        if row.get("Chrom_23") and str(row.get("SEX", "")).strip().lower() == "male":
            try:
                result = test_gene_tokenization(row["Hugo_Symbol"])
                full_id = result["gene"]["id"]  # e.g., "normalize.gene.hgnc:447"
                hgnc_id = full_id.split(":")[-1]  # Extract "447"
                df.at[idx, "gene_hgnc_id"] = hgnc_id
            except Exception as e:
                print(f"⚠️ Could not extract HGNC ID for {row['Hugo_Symbol']}: {e}")

    print(df.head())
    return df

##### Next step?

In [None]:




#remove temporary columns

#get missing p dots!

#consolidate driver function

#Need function that tkes gnomad notation and puts it through normalizer, then takes response and fashions it into the test fixture
#put response into columns "focus variant members", etc. 

#figure out other info for object - cohort, frequency, etc. 








In [515]:
# remove variant dupes per patient

# find duplicated (PATIENT_ID, Gnomad_Notation) pairs
dupe_mask = combined_df.duplicated(subset=["PATIENT_ID", "Gnomad_Notation"], keep="first")
# new DataFrame with the duplicated rows
patient_variant_dupes = combined_df[dupe_mask]
# remove those rows from the original DataFrame
combined_df_cleaned = combined_df[~dupe_mask]
# write removed rows to file
patient_variant_dupes.to_csv("patient_variant_dupes.csv", index=False)
# print the number of rows removed
print(f"Removed {patient_variant_dupes.shape[0]} rows with duplicated Gnomad_Notation per PATIENT_ID.")
# reassign dataframe:
combined_df = combined_df_cleaned

KeyError: Index(['Gnomad_Notation'], dtype='object')