# Processing the Reasoning Trace Data and Adding in Nucleotides

In [None]:
cd kegg_data

In [2]:
import json
from Bio import SeqIO

In [3]:
!mkdir 'processed_variants 1450 with seqs'

mkdir: processed_variants 1450 with seqs: File exists


In [4]:
# Merge each variant's nucleotide sequences into its processed reasoning-trace
# JSON (KEGG_1 .. KEGG_1449) and write the combined record to the
# "with seqs" directory.
for variant_number in range(1, 1450):
    # Reasoning-trace JSON produced for this variant.
    with open(f'processed_variants first 700/KEGG_{variant_number}_processed.json', 'r') as trace_handle:
        record = json.load(trace_handle)

    # The FASTA file is read positionally: record 0 is taken as the reference
    # sequence and record 1 as the variant sequence.
    fasta_records = list(SeqIO.parse(f"nt_seq/KEGG_{variant_number}.txt", "fasta"))
    reference_seq = fasta_records[0].seq
    variant_seq = fasta_records[1].seq

    # Store both sequences as plain strings so they are JSON-serializable.
    for field_name, sequence in (
        ("reference_sequence", reference_seq),
        ("variant_sequence", variant_seq),
    ):
        record[field_name] = str(sequence)

    # Persist the enriched record under a new file name.
    with open(f'processed_variants 1450 with seqs/KEGG_{variant_number}_with_seqs.json', 'w') as merged_handle:
        json.dump(record, merged_handle, indent=2)

# Creating the Final KEGG SFT and RL Dataset

# Final KEGG Dataset Creation

This section creates the final machine learning dataset by combining variant data with sequences and generating structured question-answer pairs for biological reasoning tasks.

In [None]:
cd kegg_data

In [2]:
import pandas as pd
import json
import ast

In [3]:
variant_data = pd.read_csv("final_network_with_variant.tsv", sep='\t')
variant_data

Unnamed: 0,Var_ID,Network,Entry,Source,ID,TranscriptID,NucChange,Chr,Start,End,...,Network Expanded,Pathway,Class,Disease,Gene,Variant_Name,Variant_Gene,Variant_Gene Info,Variant_Type,Disease_Names
0,KEGG_1,N00073,1019v2,ClinVar,16929,NC_000012.12,,12,57751646,57751646,...,"((595,894,896)+1019v2) -> 5925 // (1869,1870,1...",{'hsa05218': 'Melanoma'},"{'nt06268': 'Melanoma', 'nt06230': 'Cell cycle...",{'H00038': 'Melanoma is a form of skin cancer ...,"{'595': 'CCND1; cyclin D1', '894': 'CCND2; cyc...",CDK4 mutation,CDK4,cyclin dependent kinase 4 [KO:K02089],,{'H00038': 'Melanoma'}
1,KEGG_2,N00073,1019v2,dbSNP,rs104894340,NC_000012.12,,12,57751646,57751646,...,"((595,894,896)+1019v2) -> 5925 // (1869,1870,1...",{'hsa05218': 'Melanoma'},"{'nt06268': 'Melanoma', 'nt06230': 'Cell cycle...",{'H00038': 'Melanoma is a form of skin cancer ...,"{'595': 'CCND1; cyclin D1', '894': 'CCND2; cyc...",CDK4 mutation,CDK4,cyclin dependent kinase 4 [KO:K02089],,{'H00038': 'Melanoma'}
2,KEGG_3,N00073,1019v2,dbSNP,rs104894340,NC_000012.12,,12,57751646,57751646,...,"((595,894,896)+1019v2) -> 5925 // (1869,1870,1...",{'hsa05218': 'Melanoma'},"{'nt06268': 'Melanoma', 'nt06230': 'Cell cycle...",{'H00038': 'Melanoma is a form of skin cancer ...,"{'595': 'CCND1; cyclin D1', '894': 'CCND2; cyc...",CDK4 mutation,CDK4,cyclin dependent kinase 4 [KO:K02089],,{'H00038': 'Melanoma'}
3,KEGG_4,N00073,1019v2,ClinVar,16928,NC_000012.12,,12,57751647,57751647,...,"((595,894,896)+1019v2) -> 5925 // (1869,1870,1...",{'hsa05218': 'Melanoma'},"{'nt06268': 'Melanoma', 'nt06230': 'Cell cycle...",{'H00038': 'Melanoma is a form of skin cancer ...,"{'595': 'CCND1; cyclin D1', '894': 'CCND2; cyc...",CDK4 mutation,CDK4,cyclin dependent kinase 4 [KO:K02089],,{'H00038': 'Melanoma'}
4,KEGG_5,N00073,1019v2,dbSNP,rs11547328,NC_000012.12,,12,57751647,57751647,...,"((595,894,896)+1019v2) -> 5925 // (1869,1870,1...",{'hsa05218': 'Melanoma'},"{'nt06268': 'Melanoma', 'nt06230': 'Cell cycle...",{'H00038': 'Melanoma is a form of skin cancer ...,"{'595': 'CCND1; cyclin D1', '894': 'CCND2; cyc...",CDK4 mutation,CDK4,cyclin dependent kinase 4 [KO:K02089],,{'H00038': 'Melanoma'}
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1444,KEGG_1445,N00244,9817v1,COSM,6196635,ENST00000393623.6,c.706G>T,19,10492196,10492196,...,"9817v1 // 4780 => (3162,1728,119391,221357,293...",{'hsa05225': 'Hepatocellular carcinoma'},"{'nt06263': 'Hepatocellular carcinoma', 'nt062...",{'H00048': 'Hepatocellular carcinoma (HCC) is ...,{'9817': 'KEAP1; kelch like ECH associated pro...,KEAP1 mutation,KEAP1,kelch like ECH associated protein 1 [KO:K10456],,{'H00048': 'Hepatocellular carcinoma;'}
1445,KEGG_1446,N00244,9817v1,COSM,6196637,ENST00000393623.6,c.548A>G,19,10499486,10499486,...,"9817v1 // 4780 => (3162,1728,119391,221357,293...",{'hsa05225': 'Hepatocellular carcinoma'},"{'nt06263': 'Hepatocellular carcinoma', 'nt062...",{'H00048': 'Hepatocellular carcinoma (HCC) is ...,{'9817': 'KEAP1; kelch like ECH associated pro...,KEAP1 mutation,KEAP1,kelch like ECH associated protein 1 [KO:K10456],,{'H00048': 'Hepatocellular carcinoma;'}
1446,KEGG_1447,N00258,999v2,COSM,4766271,ENST00000621016.4,c.662A>G,16,68808823,68808823,...,"999v2 // 1499 -> (6932,83439,6934,51176) => (4...",{'hsa05226': 'Gastric cancer'},"{'nt06261': 'Gastric cancer', 'nt06215': 'WNT ...","{'H00018': ""Gastric cancer (GC) is one of the ...","{'999': 'CDH1; cadherin 1', '1499': 'CTNNB1; c...",CDH1 mutation,CDH1,cadherin 1 [KO:K05689],,{'H00018': 'Gastric cancer'}
1447,KEGG_1448,N00258,999v2,COSM,4766211,ENST00000621016.4,c.755T>G,16,68810264,68810264,...,"999v2 // 1499 -> (6932,83439,6934,51176) => (4...",{'hsa05226': 'Gastric cancer'},"{'nt06261': 'Gastric cancer', 'nt06215': 'WNT ...","{'H00018': ""Gastric cancer (GC) is one of the ...","{'999': 'CDH1; cadherin 1', '1499': 'CTNNB1; c...",CDH1 mutation,CDH1,cadherin 1 [KO:K05689],,{'H00018': 'Gastric cancer'}


In [9]:
variant_data.iloc[1]['Var_ID']

'KEGG_2'

In [5]:
!mkdir final_data

mkdir: final_data: File exists


In [None]:
import os
import json
import ast

# Prefer the project's CONFIG module when it is importable; otherwise fall back
# to inline defaults that mirror the paths and range used elsewhere in this
# notebook, so the cell still runs on a fresh kernel without a CONFIG.py.
try:
    from CONFIG import CONFIG
except ImportError:
    CONFIG = {
        'variants_with_seqs_dir': 'processed_variants 1450 with seqs',
        'final_data_dir': 'final_data',
        'variant_range': (1, 1450),
    }


def first_present(mapping, *keys):
    """Return ``mapping[key]`` for the first of ``keys`` that is present.

    Used to tolerate inconsistent key casing (e.g. 'chromosome' vs 'Chromosome').

    Raises:
        KeyError: if none of ``keys`` is in ``mapping``.
    """
    for key in keys:
        if key in mapping:
            return mapping[key]
    raise KeyError(f"none of {keys} present")


def build_question(chromosome, network, genes, variant_gene):
    """Format the question text for one variant.

    Args:
        chromosome: chromosome value taken from the record's raw data.
        network: network definition taken from the record's raw data.
        genes: iterable of gene description strings; joined with ' | '.
        variant_gene: name of the gene carrying the variant.

    Returns:
        The multi-line question string stored under the 'question' key.
    """
    gene_list_joined = ' | '.join(genes)
    return (
        f"Chromosome Number: {chromosome}\n"
        f"Network Definition of the pathway: {network}\n"
        f"Genes in the pathway: {gene_list_joined}\n\n"
        f"Given this context, what is the biological effect of this "
        f"{variant_gene} allele, specifically what disease does this contribute to?"
    )


# Create final dataset with question-answer pairs
variants_with_seqs_dir = CONFIG['variants_with_seqs_dir']
final_data_dir = CONFIG['final_data_dir']
start_idx, end_idx = CONFIG['variant_range']

print("Creating final dataset with Q&A pairs...")
print(f"Input: {variants_with_seqs_dir}")
print(f"Output: {final_data_dir}")
print(f"Processing range: {start_idx} to {end_idx}")

processed_count = 0
error_count = 0
missing_count = 0  # subset of error_count: input files that do not exist

for i in range(start_idx, end_idx):
    input_file = f'{variants_with_seqs_dir}/KEGG_{i}_with_seqs.json'
    try:
        # Load the JSON file with sequences
        if not os.path.exists(input_file):
            error_count += 1
            missing_count += 1
            continue

        with open(input_file, 'r') as file:
            data = json.load(file)

        # Resolve each field independently so a record that mixes key casings
        # (e.g. 'chromosome' together with 'Network') is still usable.
        try:
            raw_data = data['raw_data']
            chromosome = first_present(raw_data, 'chromosome', 'Chromosome')
            network = first_present(raw_data, 'network', 'Network')
        except KeyError:
            print(f"[Warning] Missing chromosome/network data in {input_file}")
            error_count += 1
            continue

        # Extract gene information from the variant table loaded in an earlier cell.
        try:
            row = variant_data.iloc[i - 1]
            # The lookup is positional (row i-1 <-> KEGG_i); confirm the row really
            # belongs to this variant instead of silently pairing the wrong genes.
            if row['Var_ID'] != f'KEGG_{i}':
                raise ValueError(f"row {i - 1} holds {row['Var_ID']}, expected KEGG_{i}")
            # The 'Gene' column stores a dict literal as text; its values are the gene descriptions.
            gene_list = list(ast.literal_eval(row['Gene']).values())
            variant_gene = row['Variant_Gene']
            question = build_question(chromosome, network, gene_list, variant_gene)
        except (KeyError, IndexError, ValueError, SyntaxError, TypeError) as e:
            # SyntaxError: malformed dict literal; TypeError: non-string gene values in the join.
            print(f"[Warning] Gene information error for {input_file}: {e}")
            error_count += 1
            continue

        # Append the record's existing 'answer' value (empty string if absent) as the
        # last reasoning step. This has to happen before 'answer' is overwritten with
        # the disease label below.
        if 'reasoning' in data and 'reasoning_steps' in data['reasoning']:
            data['reasoning']['reasoning_steps'].append(data.get('answer', ''))

        # The new answer is the first disease label attached to the reasoning block.
        try:
            answer = data['reasoning']['labels']['disease'][0]
        except (KeyError, IndexError):
            print(f"[Warning] Missing disease label in {input_file}")
            error_count += 1
            continue

        data['question'] = question
        data['answer'] = answer

        # Clean up fields that are not part of the final sample
        if 'reasoning' in data:
            for key in ['variant_id', 'hgvs', 'labels']:
                data['reasoning'].pop(key, None)
        data.pop('raw_data', None)

        # Save to final data directory
        output_file = f'{final_data_dir}/KEGG_{i}_with_seqs.json'
        with open(output_file, 'w') as out_file:
            json.dump(data, out_file, indent=2)

        processed_count += 1

        if processed_count % 100 == 0:
            print(f"Created {processed_count} Q&A pairs...")

    except Exception as e:
        # Last-resort guard so one bad record does not abort the whole run.
        print(f"[Error] Failed to process variant {i}: {str(e)}")
        error_count += 1

print("✅ Final dataset creation complete:")
print(f"  Successfully processed: {processed_count}")
print(f"  Errors encountered: {error_count}")
print(f"  Missing input files: {missing_count}")
print(f"  Output directory: {final_data_dir}")

# Fixing Disease Labels

In [None]:
cd kegg_data

In [2]:
import json

In [None]:
import os
import json

# Location of the final Q&A JSON files and the KEGG numbers to scan
# (the end of the range is exclusive).
CONFIG = dict(final_data_dir='final_data', variant_range=(1, 1450))

final_data_dir = CONFIG['final_data_dir']
start_idx, end_idx = CONFIG['variant_range']

print("Extracting disease labels for standardization...")

# Collect the raw 'answer' of every file that exists and contains one; the
# distinct values are what the hand-written label mapping has to cover.
disease = []
processed_count = 0

for kegg_number in range(start_idx, end_idx):
    input_file = f'{final_data_dir}/KEGG_{kegg_number}_with_seqs.json'
    try:
        if not os.path.exists(input_file):
            continue
        with open(input_file, 'r') as handle:
            record = json.load(handle)
        if 'answer' not in record:
            continue
        disease.append(record['answer'])
        processed_count += 1
    except Exception as err:
        # Best effort: report the unreadable file and keep scanning.
        print(f"[Warning] Could not process {input_file}: {err}")

print(f"✅ Extracted {len(disease)} disease labels from {processed_count} files")
print(f"Unique diseases: {len(set(disease))}")

In [13]:
# Hand-written mapping from raw disease labels (keys) to one standardized
# disease name (values). Labels that differ only in capitalization, a trailing
# abbreviation such as "(AML)", or a subtype/onset qualifier share the same value.
new_disease = {'Acute Myeloid Leukemia (AML)' : "Acute Myeloid Leukemia",
 'Acute myeloid leukemia (AML)' : "Acute Myeloid Leukemia",
 'Adenine Phosphoribosyltransferase Deficiency (APRTD)' : "Adenine Phosphoribosyltransferase Deficiency",
 'Adenine phosphoribosyltransferase deficiency (APRTD)' : "Adenine Phosphoribosyltransferase Deficiency",
 "Alzheimer's disease" : "Alzheimer's disease",
 "Alzheimer's disease (AD)" : "Alzheimer's disease",
 'Amyotrophic Lateral Sclerosis (ALS)' : "Amyotrophic Lateral Sclerosis",
 'Amyotrophic lateral sclerosis (ALS)' : "Amyotrophic Lateral Sclerosis",
 'Basal Cell Carcinoma (BCC)' : "Basal Cell Carcinoma",
 'Basal cell carcinoma' : "Basal Cell Carcinoma",
 'Basal cell carcinoma (BCC)' : "Basal Cell Carcinoma",
 'Chronic Myeloid Leukemia (CML)' : "Chronic Myeloid Leukemia",
 'Chronic myeloid leukemia (CML)' : "Chronic Myeloid Leukemia",
 'Clear cell Renal Cell Carcinoma (ccRCC)' : "Clear cell Renal Cell Carcinoma",
 'Clear cell renal cell carcinoma' : "Clear cell Renal Cell Carcinoma",
 'Clear cell renal cell carcinoma (ccRCC)' : "Clear cell Renal Cell Carcinoma",
 'Colorectal cancer' : "Colorectal cancer",
 'Colorectal cancer (CRC)' : "Colorectal cancer",
 'Cushing syndrome' : "Cushing syndrome",
 "Early-onset Alzheimer's disease" : "Alzheimer's disease",
 "Early-onset familial Alzheimer's disease" : "Alzheimer's disease",
 "Early-onset familial Alzheimer's disease (FAD)" : "Alzheimer's disease",
 'Familial Creutzfeldt-Jakob Disease' : "Creutzfeldt-Jakob Disease",
 'Familial Creutzfeldt-Jakob Disease (fCJD)' : "Creutzfeldt-Jakob Disease",
 'Familial Creutzfeldt-Jakob disease' : "Creutzfeldt-Jakob Disease",
 'Familial Creutzfeldt-Jakob disease (fCJD)' : "Creutzfeldt-Jakob Disease",
 "Familial Early-Onset Alzheimer's Disease" : "Alzheimer's disease",
 'Familial Isolated Pituitary Adenoma (FIPA)' : "Pituitary Adenoma",
 "Familial early-onset Alzheimer's disease" : "Alzheimer's disease",
 "Familial early-onset Alzheimer's disease (FAD)" : "Alzheimer's disease",
 'Familial isolated pituitary adenoma (FIPA)' : "Pituitary Adenoma",
 'Gastric cancer' : "Gastric cancer",
 'Gaucher disease' : "Gaucher disease",
 'Glioblastoma multiforme' : "Glioblastoma multiforme",
 'Glioblastoma multiforme (GBM)' : "Glioblastoma multiforme",
 'Hepatocellular carcinoma' : "Hepatocellular carcinoma",
 'Hepatocellular carcinoma (HCC)' : "Hepatocellular carcinoma",
 'Huntington disease' : "Huntington's disease",
 'Huntington disease (HD)' : "Huntington's disease",
 "Huntington's disease" : "Huntington's disease",
 "Huntington's disease (HD)" : "Huntington's disease",
 'Lesch-Nyhan syndrome' : "Lesch-Nyhan syndrome",
 'Melanoma' : "Melanoma",
 'Melanoma (H00038)' : "Melanoma",
 'Methylmalonic aciduria and homocystinuria (MAHC)' : "Methylmalonic aciduria and homocystinuria",
 'Multiple Endocrine Neoplasia type 1 (MEN1)' : "Multiple Endocrine Neoplasia type 1",
 'N-acetylglutamate synthase (NAGS) deficiency' : "N-acetylglutamate synthase deficiency",
 'Non-small cell lung cancer' : "Non-small cell lung cancer",
 'Non-small cell lung cancer (NSCLC)' : "Non-small cell lung cancer",
 'Non-small-cell lung cancer' : "Non-small cell lung cancer",
 'Non-small-cell lung cancer (NSCLC)' : "Non-small cell lung cancer",
 'Pancreatic ductal adenocarcinoma' : "Pancreatic ductal adenocarcinoma",
 'Papillary Renal Cell Carcinoma' : "Papillary Renal Cell Carcinoma",
 'Papillary renal cell carcinoma' : "Papillary Renal Cell Carcinoma",
 'Papillary thyroid carcinoma' : "Papillary thyroid carcinoma",
 'Papillary thyroid carcinoma (PTC)' : "Papillary thyroid carcinoma",
 "Parkinson's Disease" : "Parkinson's Disease",
 "Parkinson's disease" : "Parkinson's Disease",
 "Parkinson's disease (PD)" : "Parkinson's Disease",
 'Pituitary adenoma' : "Pituitary Adenoma",
 'Primary Aldosteronism' : "Primary Aldosteronism",
 'Primary aldosteronism' : "Primary Aldosteronism",
 'Prion disease' : "Prion disease",
 'Prion diseases' : "Prion disease",
 'Prostate cancer' : "Prostate cancer",
 'Renal cell cancer (RCC)' : "Renal cell carcinoma",
 'Renal cell carcinoma' : "Renal cell carcinoma",
 'Renal cell carcinoma (RCC)' : "Renal cell carcinoma",
 'Robinow syndrome' : "Robinow syndrome",
 'Sphingolipidoses' : "Sphingolipidoses",
 'Sphingolipidosis' : "Sphingolipidoses",
 'Spinocerebellar Ataxia (SCA)' : "Spinocerebellar Ataxia",
 'Spinocerebellar Ataxia (SCA19/22)' : "Spinocerebellar Ataxia",
 'Spinocerebellar Ataxia Type 1 (SCA1)' : "Spinocerebellar Ataxia",
 'Spinocerebellar Ataxia Type 13 (SCA13)' : "Spinocerebellar Ataxia",
 'Spinocerebellar Ataxia Type 14 (SCA14)' : "Spinocerebellar Ataxia",
 'Spinocerebellar Ataxia Type 15 (SCA15)' : "Spinocerebellar Ataxia",
 'Spinocerebellar Ataxia Type 2 (SCA2)' : "Spinocerebellar Ataxia",
 'Spinocerebellar Ataxia Type 3' : "Spinocerebellar Ataxia",
 'Spinocerebellar Ataxia Type 3 (SCA3)' : "Spinocerebellar Ataxia",
 'Spinocerebellar Ataxia Type 5 (SCA5)' : "Spinocerebellar Ataxia",
 'Spinocerebellar Ataxia type 13 (SCA13)' : "Spinocerebellar Ataxia",
 'Spinocerebellar Ataxia type 6 (SCA6)' : "Spinocerebellar Ataxia",
 'Spinocerebellar ataxia' : "Spinocerebellar Ataxia",
 'Spinocerebellar ataxia (SCA)' : "Spinocerebellar Ataxia",
 'Spinocerebellar ataxia (SCA19/22)' : "Spinocerebellar Ataxia",
 'Spinocerebellar ataxia type 1 (SCA1)' : "Spinocerebellar Ataxia",
 'Spinocerebellar ataxia type 19 (SCA19)' : "Spinocerebellar Ataxia",
 'Spinocerebellar ataxia type 19/22 (SCA19/22)' : "Spinocerebellar Ataxia",
 'Spinocerebellar ataxia type 2 (SCA2)' : "Spinocerebellar Ataxia",
 'Spinocerebellar ataxia type 3' : "Spinocerebellar Ataxia",
 'Spinocerebellar ataxia type 3 (SCA3)' : "Spinocerebellar Ataxia",
 'Spinocerebellar ataxia type 5 (SCA5)' : "Spinocerebellar Ataxia",
 'Spinocerebellar ataxia type 6 (SCA6)' : "Spinocerebellar Ataxia",
 'Thyroid cancer' : "Thyroid cancer",
 'Thyroid dyshormonogenesis' : "Thyroid dyshormonogenesis",
 'Urothelial carcinoma' : "Urothelial carcinoma",
 'von Hippel-Lindau syndrome' : "von Hippel-Lindau syndrome"}

In [19]:
!mkdir final_data_fix

In [None]:
import json
import os

# CONFIG parameters
CONFIG = {
    'final_data_dir': 'final_data',
    'final_data_fix_dir': 'final_data_fix',
    'variant_range': (1, 1450)
}

# `new_disease` (raw label -> standardized label) is the mapping defined in the
# cell above. It is deliberately NOT redefined here: assigning a placeholder
# dict at this point would replace the real mapping, so no label would match
# and every file would be copied through unstandardized.

# Standardize disease labels using the mapping dictionary
final_data_dir = CONFIG['final_data_dir']
final_data_fix_dir = CONFIG['final_data_fix_dir']
start_idx, end_idx = CONFIG['variant_range']

print("Applying disease label standardization...")
print(f"Input: {final_data_dir}")
print(f"Output: {final_data_fix_dir}")

processed_count = 0
error_count = 0

for i in range(start_idx, end_idx):
    input_file = f'{final_data_dir}/KEGG_{i}_with_seqs.json'
    try:
        # Variants without a final Q&A file are skipped without counting as errors.
        if not os.path.exists(input_file):
            continue

        with open(input_file, 'r') as file:
            data = json.load(file)

        # Label as written by the dataset-creation step ('' when absent).
        original_label = data.get('answer', '')

        # Apply standardization if a mapping exists; otherwise warn and keep
        # the original label so the sample is still written out.
        if original_label in new_disease:
            data['answer'] = new_disease[original_label]
        else:
            print(f"[Warning] No mapping found for disease: {original_label}")

        # Save to standardized directory
        output_file = f'{final_data_fix_dir}/KEGG_{i}_with_seqs.json'
        with open(output_file, 'w') as out_file:
            json.dump(data, out_file, indent=2)

        processed_count += 1

        if processed_count % 100 == 0:
            print(f"Standardized {processed_count} disease labels...")

    except Exception as e:
        print(f"[Error] Failed to process {input_file}: {str(e)}")
        error_count += 1

print("✅ Disease label standardization complete:")
print(f"  Successfully processed: {processed_count}")
print(f"  Errors encountered: {error_count}")

In [None]:
import os
import shutil

# Remove original final_data directory so the standardized version can replace it
final_data_dir = CONFIG['final_data_dir']
final_data_fix_dir = CONFIG['final_data_fix_dir']

# Safety check: only delete the originals once a non-empty standardized copy
# exists. Without it, re-running this cell after the rename step (or after a
# failed standardization run) would destroy the only copy of the dataset.
if not os.path.isdir(final_data_fix_dir) or not os.listdir(final_data_fix_dir):
    print(f"Standardized directory missing or empty: {final_data_fix_dir}; keeping {final_data_dir}")
elif os.path.exists(final_data_dir):
    shutil.rmtree(final_data_dir)
    print(f"Removed original directory: {final_data_dir}")
else:
    print(f"Directory not found: {final_data_dir}")

In [None]:
import os

# CONFIG is the dict defined in the standardization cell of this section (it
# provides both 'final_data_dir' and 'final_data_fix_dir'); no extra import is
# needed, and importing it from a placeholder module would fail on a fresh run.

# Rename standardized directory to final_data
final_data_dir = CONFIG['final_data_dir']
final_data_fix_dir = CONFIG['final_data_fix_dir']

if os.path.exists(final_data_fix_dir):
    os.rename(final_data_fix_dir, final_data_dir)
    print(f"Renamed {final_data_fix_dir} to {final_data_dir}")
    print("✅ Final dataset with standardized disease labels is ready")
else:
    print(f"Directory not found: {final_data_fix_dir}")

In [None]:
import os
import json
from collections import Counter

# Verify standardized disease labels.
# CONFIG ('final_data_dir', 'variant_range') is expected from an earlier cell.
final_data_dir = CONFIG['final_data_dir']
start_idx, end_idx = CONFIG['variant_range']

print("Verifying standardized disease labels...")

disease = []
for i in range(start_idx, end_idx):
    input_file = f'{final_data_dir}/KEGG_{i}_with_seqs.json'
    try:
        if os.path.exists(input_file):
            with open(input_file, 'r') as file:
                data = json.load(file)

            if 'answer' in data:
                disease.append(data['answer'])

    except Exception as e:
        print(f"[Warning] Could not verify {input_file}: {str(e)}")

# Count occurrences so "Top 10" really means the ten most frequent labels;
# slicing a set would return ten arbitrary labels in unspecified order.
label_counts = Counter(disease)

print("✅ Verification complete:")
print(f"  Total disease labels: {len(disease)}")
print(f"  Unique diseases: {len(label_counts)}")
print(f"  Top 10 diseases: {label_counts.most_common(10)}")

# Saving the KEGG Task to the WangLab Hugging Face

In [None]:
cd ../../bioR_tasks

In [2]:
mkdir kegg_variant

In [None]:
cp ../BioReason/data/kegg_data/final_data/* kegg_variant/

# Creating the Nt Variant Database

In [None]:
cd kegg_data

In [None]:
from Bio import SeqIO
import pandas as pd
import json
import os
from pathlib import Path

# Optional: Uncomment if you want to use HuggingFace datasets
# from datasets import load_dataset, Dataset, DatasetDict

print("Imports loaded for nucleotide database creation")

In [None]:
# Load variant data for nucleotide database creation.
# The CONFIG dicts defined inline in this notebook carry no 'network_data_file'
# entry, so fall back to the TSV that the dataset-creation section reads.
network_file = CONFIG.get('network_data_file', 'final_network_with_variant.tsv')
variant_data = pd.read_csv(network_file, sep='\t')
print(f"✅ Loaded variant data: {len(variant_data)} entries")
variant_data.head()

Unnamed: 0,Var_ID,Network,Entry,Source,ID,TranscriptID,NucChange,Chr,Start,End,...,Network Expanded,Pathway,Class,Disease,Gene,Variant_Name,Variant_Gene,Variant_Gene Info,Variant_Type,Disease_Names
0,KEGG_1,N00073,1019v2,ClinVar,16929,NC_000012.12,,12,57751646,57751646,...,"((595,894,896)+1019v2) -> 5925 // (1869,1870,1...",{'hsa05218': 'Melanoma'},"{'nt06268': 'Melanoma', 'nt06230': 'Cell cycle...",{'H00038': 'Melanoma is a form of skin cancer ...,"{'595': 'CCND1; cyclin D1', '894': 'CCND2; cyc...",CDK4 mutation,CDK4,cyclin dependent kinase 4 [KO:K02089],,{'H00038': 'Melanoma'}
1,KEGG_2,N00073,1019v2,dbSNP,rs104894340,NC_000012.12,,12,57751646,57751646,...,"((595,894,896)+1019v2) -> 5925 // (1869,1870,1...",{'hsa05218': 'Melanoma'},"{'nt06268': 'Melanoma', 'nt06230': 'Cell cycle...",{'H00038': 'Melanoma is a form of skin cancer ...,"{'595': 'CCND1; cyclin D1', '894': 'CCND2; cyc...",CDK4 mutation,CDK4,cyclin dependent kinase 4 [KO:K02089],,{'H00038': 'Melanoma'}
2,KEGG_3,N00073,1019v2,dbSNP,rs104894340,NC_000012.12,,12,57751646,57751646,...,"((595,894,896)+1019v2) -> 5925 // (1869,1870,1...",{'hsa05218': 'Melanoma'},"{'nt06268': 'Melanoma', 'nt06230': 'Cell cycle...",{'H00038': 'Melanoma is a form of skin cancer ...,"{'595': 'CCND1; cyclin D1', '894': 'CCND2; cyc...",CDK4 mutation,CDK4,cyclin dependent kinase 4 [KO:K02089],,{'H00038': 'Melanoma'}
3,KEGG_4,N00073,1019v2,ClinVar,16928,NC_000012.12,,12,57751647,57751647,...,"((595,894,896)+1019v2) -> 5925 // (1869,1870,1...",{'hsa05218': 'Melanoma'},"{'nt06268': 'Melanoma', 'nt06230': 'Cell cycle...",{'H00038': 'Melanoma is a form of skin cancer ...,"{'595': 'CCND1; cyclin D1', '894': 'CCND2; cyc...",CDK4 mutation,CDK4,cyclin dependent kinase 4 [KO:K02089],,{'H00038': 'Melanoma'}
4,KEGG_5,N00073,1019v2,dbSNP,rs11547328,NC_000012.12,,12,57751647,57751647,...,"((595,894,896)+1019v2) -> 5925 // (1869,1870,1...",{'hsa05218': 'Melanoma'},"{'nt06268': 'Melanoma', 'nt06230': 'Cell cycle...",{'H00038': 'Melanoma is a form of skin cancer ...,"{'595': 'CCND1; cyclin D1', '894': 'CCND2; cyc...",CDK4 mutation,CDK4,cyclin dependent kinase 4 [KO:K02089],,{'H00038': 'Melanoma'}
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1444,KEGG_1445,N00244,9817v1,COSM,6196635,ENST00000393623.6,c.706G>T,19,10492196,10492196,...,"9817v1 // 4780 => (3162,1728,119391,221357,293...",{'hsa05225': 'Hepatocellular carcinoma'},"{'nt06263': 'Hepatocellular carcinoma', 'nt062...",{'H00048': 'Hepatocellular carcinoma (HCC) is ...,{'9817': 'KEAP1; kelch like ECH associated pro...,KEAP1 mutation,KEAP1,kelch like ECH associated protein 1 [KO:K10456],,{'H00048': 'Hepatocellular carcinoma;'}
1445,KEGG_1446,N00244,9817v1,COSM,6196637,ENST00000393623.6,c.548A>G,19,10499486,10499486,...,"9817v1 // 4780 => (3162,1728,119391,221357,293...",{'hsa05225': 'Hepatocellular carcinoma'},"{'nt06263': 'Hepatocellular carcinoma', 'nt062...",{'H00048': 'Hepatocellular carcinoma (HCC) is ...,{'9817': 'KEAP1; kelch like ECH associated pro...,KEAP1 mutation,KEAP1,kelch like ECH associated protein 1 [KO:K10456],,{'H00048': 'Hepatocellular carcinoma;'}
1446,KEGG_1447,N00258,999v2,COSM,4766271,ENST00000621016.4,c.662A>G,16,68808823,68808823,...,"999v2 // 1499 -> (6932,83439,6934,51176) => (4...",{'hsa05226': 'Gastric cancer'},"{'nt06261': 'Gastric cancer', 'nt06215': 'WNT ...","{'H00018': ""Gastric cancer (GC) is one of the ...","{'999': 'CDH1; cadherin 1', '1499': 'CTNNB1; c...",CDH1 mutation,CDH1,cadherin 1 [KO:K05689],,{'H00018': 'Gastric cancer'}
1447,KEGG_1448,N00258,999v2,COSM,4766211,ENST00000621016.4,c.755T>G,16,68810264,68810264,...,"999v2 // 1499 -> (6932,83439,6934,51176) => (4...",{'hsa05226': 'Gastric cancer'},"{'nt06261': 'Gastric cancer', 'nt06215': 'WNT ...","{'H00018': ""Gastric cancer (GC) is one of the ...","{'999': 'CDH1; cadherin 1', '1499': 'CTNNB1; c...",CDH1 mutation,CDH1,cadherin 1 [KO:K05689],,{'H00018': 'Gastric cancer'}


In [6]:
len(variant_data)

1449

In [7]:
variant_data.iloc[1]["Network"]

'N00073'

In [None]:
import os
from Bio import SeqIO

# Path of the reference genome FASTA, taken from the notebook configuration.
fasta_file = CONFIG['reference_fasta']

# Fail fast, with a hint about which setting to change, when the FASTA is missing.
reference_available = os.path.exists(fasta_file)
if not reference_available:
    for message in (
        f"❌ Reference genome file not found: {fasta_file}",
        "Please update CONFIG['reference_fasta'] with correct path",
    ):
        print(message)
    raise FileNotFoundError(f"Reference genome not found: {fasta_file}")

# Parse every FASTA record into an in-memory dict (SeqIO.to_dict's default
# key is the record id), so later cells can look sequences up by identifier.
fasta_records = SeqIO.parse(fasta_file, "fasta")
record_dict = SeqIO.to_dict(fasta_records)
print(f"✅ Loaded reference genome: {len(record_dict)} sequences")

In [None]:
# Chromosome lookup table supplied by the configuration; report its size and
# the keys it offers so a missing chromosome is easy to spot.
chromosome_dictionary = CONFIG['chromosome_dictionary']
chromosome_count = len(chromosome_dictionary)
available_chromosomes = list(chromosome_dictionary.keys())
print(f"✅ Chromosome mapping loaded: {chromosome_count} chromosomes")
print("Available chromosomes:", available_chromosomes)

### Verify that each reference allele matches the reference genome at its recorded position

In [None]:
# Verify reference sequences (alternative implementation): for every variant,
# compare the recorded reference allele with the reference genome at the
# recorded position and write one status line per variant to a report file.
chromosome_dictionary = CONFIG['chromosome_dictionary']
verification_file = "verification_alt.txt"

print("Starting alternative sequence verification...")
print(f"Results will be saved to: {verification_file}")

# The report lines contain non-ASCII status symbols, so pin the encoding to
# UTF-8 instead of relying on the platform default (which cannot encode them
# on some systems and would abort the run mid-file).
with open(verification_file, "w", encoding="utf-8") as f:
    for i in range(len(variant_data)):
        try:
            # ---- Input ----
            row = variant_data.iloc[i]  # fetch the row once instead of per field
            chromosome_id = chromosome_dictionary[str(row['Chr'])]
            # Start is shifted down by one only for ENST entries; other entries
            # are used unchanged.
            # NOTE(review): presumably the non-ENST Start values are already
            # 0-based — confirm against the data source.
            if row['TranscriptID'][:4] == "ENST":
                start = row['Start'] - 1
            else:
                start = row['Start']
            reference_allele = row['RefAllele']
            end = start + len(reference_allele)

            chrom_seq = record_dict[chromosome_id].seq

            # Python slices are 0-based and end-exclusive.
            genomic_ref = chrom_seq[start:end]

            if genomic_ref.upper() != reference_allele.upper():
                f.write(f"⚠️ Warning: Entry number {i} with variant {row['ID']} expected '{reference_allele}', but found '{genomic_ref}'\n")
            else:
                f.write(f"✅ Verified: {chromosome_id}:{start}-{end} → '{reference_allele}' matches genome\n")

        except Exception as e:
            # Record the failure in the report and continue with the next variant.
            f.write(f"❌ Error verifying variant {i}: {str(e)}\n")

        if (i + 1) % 200 == 0:
            print(f"Verified {i + 1}/{len(variant_data)} variants...")

print(f"✅ Alternative verification complete. Results: {verification_file}")

## Read in the `final_data` JSON files

In [None]:
import re
import os
import json
import pandas as pd
from pathlib import Path

# Load every KEGG_<n>_with_seqs JSON file and stack them into one DataFrame.

# Directory holding the JSON files; abort early if it is missing
json_dir = CONFIG['final_data_dir']
if not os.path.exists(json_dir):
    print(f"❌ JSON directory not found: {json_dir}")
    print("Please ensure previous processing steps completed successfully")
    raise FileNotFoundError(f"Directory not found: {json_dir}")

print(f"Processing JSON files from: {json_dir}")

# Per-file frames are collected here and concatenated once at the end
df_list = []
processed_count = 0

for filename in os.listdir(json_dir):
    # Skip anything that is not a JSON file
    if not filename.endswith(".json"):
        continue

    # Skip JSON files that do not follow the KEGG_<number>_with_seqs naming scheme
    match = re.search(r"(KEGG_\d+)_with_seqs", filename)
    if match is None:
        continue

    kegg_id = match.group(1)  # e.g. 'KEGG_854'
    file_path = os.path.join(json_dir, filename)

    try:
        with open(file_path, 'r') as f:
            data = json.load(f)

        # Flatten nested keys into dotted column names, then tag the frame with
        # the full KEGG ID and its numeric part (temp_ID, used later for sorting)
        df = pd.json_normalize(data).assign(
            ID=kegg_id,
            temp_ID=int(kegg_id[len("KEGG_"):]),
        )
        df_list.append(df)
        processed_count += 1

        if processed_count % 100 == 0:
            print(f"Processed {processed_count} JSON files...")

    except Exception as e:
        # A bad file is reported and skipped rather than stopping the whole load
        print(f"[Warning] Could not process {filename}: {str(e)}")

# Build the combined frame (or an empty one when nothing was loaded)
if not df_list:
    print("❌ No JSON files found or processed successfully")
    combined_df = pd.DataFrame()
else:
    combined_df = pd.concat(df_list, ignore_index=True)
    print(f"✅ Combined {len(df_list)} JSON files into DataFrame")
    print(f"Total samples: {len(combined_df)}")

# Display the result
print("No data to display") if combined_df.empty else combined_df.head()

Unnamed: 0,question,answer,reference_sequence,variant_sequence,reasoning.reasoning_steps,ID,temp_ID
0,Chromosome Number: 20\nNetwork Definition of t...,Creutzfeldt-Jakob Disease,AATAAGCACAGAGGAAATAACATAATATCTCAAGTAGCTGTAACTG...,AATAAGCACAGAGGAAATAACATAATATCTCAAGTAGCTGTAACTG...,[Step 1: The variant is an insertion in the PR...,KEGG_854,854
1,Chromosome Number: 20\nNetwork Definition of t...,Creutzfeldt-Jakob Disease,AATAAGCACAGAGGAAATAACATAATATCTCAAGTAGCTGTAACTG...,AATAAGCACAGAGGAAATAACATAATATCTCAAGTAGCTGTAACTG...,[Step 1: The variant is a deletion of 47 nucle...,KEGG_841,841
2,Chromosome Number: 21\nNetwork Definition of t...,Alzheimer's disease,GCTGAAACTAACATACTAGCAATCTGGAAAGGCAATATAAAATATA...,GCTGAAACTAACATACTAGCAATCTGGAAAGGCAATATAAAATATA...,[Step 1: The TC>GA mutation in the APP gene on...,KEGG_468,468
3,Chromosome Number: 1\nNetwork Definition of th...,Primary Aldosteronism,AATGTTATAAAAGTAAATAGTCTAGGGATGTCTTATTTCCAGATAA...,AATGTTATAAAAGTAAATAGTCTAGGGATGTCTTATTTCCAGATAA...,[Step 1: The variant KEGG_635 is a 15-nucleoti...,KEGG_635,635
4,Chromosome Number: 14\nNetwork Definition of t...,Spinocerebellar Ataxia,TCATTAACTAAATGAAGGTCTACATTTAGGCAGTTTGTAATTTTGG...,TCATTAACTAAATGAAGGTCTACATTTAGGCAGTTTGTAATTTTGG...,[Step 1: The variant is a trinucleotide repeat...,KEGG_620,620
...,...,...,...,...,...,...,...
1444,Chromosome Number: 6\nNetwork Definition of th...,Spinocerebellar Ataxia,gaCGTATACATTAAATGTGTACAGTTTTTGTATATCAATTAGAAGT...,gaCGTATACATTAAATGTGTACAGTTTTTGTATATCAATTAGAAGT...,[Step 1: The variant KEGG_286 is an A>G substi...,KEGG_286,286
1445,Chromosome Number: 6\nNetwork Definition of th...,Spinocerebellar Ataxia,TTTTCaagattataaaatatgaaatgtcaAAATATTACCTTCATCA...,TTTTCaagattataaaatatgaaatgtcaAAATATTACCTTCATCA...,[Step 1: The variant is a single cytosine (C) ...,KEGG_293,293
1446,Chromosome Number: 12\nNetwork Definition of t...,Pituitary Adenoma,GTGGACAAGATGTGGCTAAGAAAACAAGCTACACATCAAGCTCATC...,GTGGACAAGATGTGGCTAAGAAAACAAGCTACACATCAAGCTCATC...,[Step 1: The variant is a 20-nucleotide duplic...,KEGG_7,7
1447,Chromosome Number: 11\nNetwork Definition of t...,Spinocerebellar Ataxia,ATCTGTGGGTTCTTTTAAATAGGCCAGATTTTATCCTAAAGGTAAG...,ATCTGTGGGTTCTTTTAAATAGGCCAGATTTTATCCTAAAGGTAAG...,[Step 1: The variant KEGG_1285 is an A>G subst...,KEGG_1285,1285


In [30]:
# Order rows by the numeric KEGG number, flatten the reasoning column name,
# and discard the helper sort key.
combined_df = (
    combined_df
    .sort_values(by=['temp_ID'])
    .rename(columns={"reasoning.reasoning_steps": "reasoning"})
    .drop(columns=['temp_ID'])
)

In [33]:
# Put ID first, keep only the six dataset columns, and renumber rows from 0.
column_order = ['ID', 'question', 'answer', 'reference_sequence', 'variant_sequence', 'reasoning']
combined_df = combined_df[column_order].reset_index(drop=True)

In [34]:
# Show the combined frame after sorting, column selection, and index reset.
combined_df

Unnamed: 0,ID,question,answer,reference_sequence,variant_sequence,reasoning
0,KEGG_1,Chromosome Number: 12\nNetwork Definition of t...,Melanoma,gcttgagcccaggagttctagatcagcctgggcaagcaagaccttg...,gcttgagcccaggagttctagatcagcctgggcaagcaagaccttg...,[Step 1: The C>T mutation at position 57751646...
1,KEGG_2,Chromosome Number: 12\nNetwork Definition of t...,Melanoma,gcttgagcccaggagttctagatcagcctgggcaagcaagaccttg...,gcttgagcccaggagttctagatcagcctgggcaagcaagaccttg...,[Step 1: The C>A mutation at position 57751646...
2,KEGG_3,Chromosome Number: 12\nNetwork Definition of t...,Melanoma,gcttgagcccaggagttctagatcagcctgggcaagcaagaccttg...,gcttgagcccaggagttctagatcagcctgggcaagcaagaccttg...,[Step 1: The C>G mutation at position 57751646...
3,KEGG_4,Chromosome Number: 12\nNetwork Definition of t...,Melanoma,cttgagcccaggagttctagatcagcctgggcaagcaagaccttgt...,cttgagcccaggagttctagatcagcctgggcaagcaagaccttgt...,[Step 1: The G>A mutation at position 57751647...
4,KEGG_5,Chromosome Number: 12\nNetwork Definition of t...,Melanoma,cttgagcccaggagttctagatcagcctgggcaagcaagaccttgt...,cttgagcccaggagttctagatcagcctgggcaagcaagaccttgt...,[Step 1: The G>C mutation at position 57751647...
...,...,...,...,...,...,...
1444,KEGG_1445,Chromosome Number: 19\nNetwork Definition of t...,Hepatocellular carcinoma,gagctgagatcatgccactgcactccaacctgggcaacagagcgag...,gagctgagatcatgccactgcactccaacctgggcaacagagcgag...,[Step 1: The variant is a C>A substitution at ...
1445,KEGG_1446,Chromosome Number: 19\nNetwork Definition of t...,Hepatocellular carcinoma,TGAAGGGTAGTACTGCCTCATAGGACAGTTGGGAACAGTCAATGAT...,TGAAGGGTAGTACTGCCTCATAGGACAGTTGGGAACAGTCAATGAT...,[Step 1: The variant is a T>C substitution at ...
1446,KEGG_1447,Chromosome Number: 16\nNetwork Definition of t...,Gastric cancer,CAAACACAGGATGTAGAGGGCAGAGAGCATaggtgtgtgcgcatgt...,CAAACACAGGATGTAGAGGGCAGAGAGCATaggtgtgtgcgcatgt...,[Step 1: The variant KEGG_1447 represents an A...
1447,KEGG_1448,Chromosome Number: 16\nNetwork Definition of t...,Gastric cancer,GATATATATAATTTGTCATTGATAAGAGAATGTGTCATTAAATTCA...,GATATATATAATTTGTCATTGATAAGAGAATGTGTCATTAAATTCA...,[Step 1: The variant KEGG_1448 is a T>G substi...


### Applying the mutation and saving the reference and variant sequences with a 1000 nt window

In [42]:
# Character count of the first row's reference_sequence.
len(combined_df.iloc[0]['reference_sequence'])

4001

In [44]:
# Work on a copy so that combined_df itself is not modified by later edits.
KEGG_2000 = combined_df.copy()

In [49]:
# Look up the ID stored at row label 1.
KEGG_2000.at[1,'ID']

'KEGG_2'

In [None]:
# Generate reference / variant sequences with the configured flanking window.
# Row i of KEGG_2000 is paired with row i of variant_data (by position), which
# supplies the chromosome, start coordinate, RefAllele and AltAllele.
chromosome_dictionary = CONFIG['chromosome_dictionary']
window = CONFIG['sequence_window']

print(f"Generating sequences with {window}bp windows...")
KEGG_2000 = combined_df.copy()

# The pairing is purely positional, so differing row counts mean some variants
# may be matched with the wrong coordinates (or hit an index error below).
if len(KEGG_2000) != len(variant_data):
    print(f"[Warning] KEGG_2000 has {len(KEGG_2000)} rows but variant_data has {len(variant_data)}; rows are paired by position")

for i in range(len(KEGG_2000)):
    try:
        row = variant_data.iloc[i]

        chromosome_id = chromosome_dictionary[str(row['Chr'])]
        # Rows whose TranscriptID begins with "ENST" have their start shifted down by one
        if row['TranscriptID'][:4] == "ENST":
            start = row['Start'] - 1
        else:
            start = row['Start']
        reference_allele = row['RefAllele']
        variant_allele = row['AltAllele']

        del_len = len(reference_allele)
        end = del_len + start

        chrom_seq = record_dict[chromosome_id].seq

        # Extract region: `window` bases on each side, clamped at the chromosome start
        region_start = max(0, start - window)
        region_end = end + window
        ref_seq = chrom_seq[region_start:region_end]

        # Position of the variant inside ref_seq. This equals `window` except when
        # the left flank was clamped to 0, in which case fewer bases precede it.
        offset = start - region_start

        # AltAllele holds the literal marker "deletion" when the reference allele
        # is simply removed; otherwise it holds the bases to insert in its place.
        replacement = "" if variant_allele == "deletion" else variant_allele

        # Apply mutation: left flank + replacement + right flank
        mutated_seq = ref_seq[:offset] + replacement + ref_seq[offset + del_len:]

        KEGG_2000.at[i, 'reference_sequence'] = str(ref_seq)
        KEGG_2000.at[i, 'variant_sequence'] = str(mutated_seq)

        if (i + 1) % 100 == 0:
            print(f"Generated sequences for {i + 1}/{len(KEGG_2000)} variants...")

    except Exception as e:
        # On failure the row keeps the sequences copied from combined_df
        print(f"[Error] Failed to generate sequence for variant {i}: {str(e)}")

print(f"✅ Sequence generation complete for {window}bp windows")

In [64]:
# Display KEGG_2000 after the sequence-generation loop.
KEGG_2000

Unnamed: 0,ID,question,answer,reference_sequence,variant_sequence,reasoning
0,KEGG_1,Chromosome Number: 12\nNetwork Definition of t...,Melanoma,TTCAAGGTAGTCCAGGGTATGTGGGTCCCATACTTTCGACGAAACA...,TTCAAGGTAGTCCAGGGTATGTGGGTCCCATACTTTCGACGAAACA...,[Step 1: The C>T mutation at position 57751646...
1,KEGG_2,Chromosome Number: 12\nNetwork Definition of t...,Melanoma,TTCAAGGTAGTCCAGGGTATGTGGGTCCCATACTTTCGACGAAACA...,TTCAAGGTAGTCCAGGGTATGTGGGTCCCATACTTTCGACGAAACA...,[Step 1: The C>A mutation at position 57751646...
2,KEGG_3,Chromosome Number: 12\nNetwork Definition of t...,Melanoma,TTCAAGGTAGTCCAGGGTATGTGGGTCCCATACTTTCGACGAAACA...,TTCAAGGTAGTCCAGGGTATGTGGGTCCCATACTTTCGACGAAACA...,[Step 1: The C>G mutation at position 57751646...
3,KEGG_4,Chromosome Number: 12\nNetwork Definition of t...,Melanoma,TCAAGGTAGTCCAGGGTATGTGGGTCCCATACTTTCGACGAAACAT...,TCAAGGTAGTCCAGGGTATGTGGGTCCCATACTTTCGACGAAACAT...,[Step 1: The G>A mutation at position 57751647...
4,KEGG_5,Chromosome Number: 12\nNetwork Definition of t...,Melanoma,TCAAGGTAGTCCAGGGTATGTGGGTCCCATACTTTCGACGAAACAT...,TCAAGGTAGTCCAGGGTATGTGGGTCCCATACTTTCGACGAAACAT...,[Step 1: The G>C mutation at position 57751647...
...,...,...,...,...,...,...
1444,KEGG_1445,Chromosome Number: 19\nNetwork Definition of t...,Hepatocellular carcinoma,gcactccagcctgggcaacagagcaagagagacagggtcttactct...,gcactccagcctgggcaacagagcaagagagacagggtcttactct...,[Step 1: The variant is a C>A substitution at ...
1445,KEGG_1446,Chromosome Number: 19\nNetwork Definition of t...,Hepatocellular carcinoma,ctcccaaagtgctgggattacaggcgtgagccactgggccctgcCC...,ctcccaaagtgctgggattacaggcgtgagccactgggccctgcCC...,[Step 1: The variant is a T>C substitution at ...
1446,KEGG_1447,Chromosome Number: 16\nNetwork Definition of t...,Gastric cancer,ggctgggtgtggtggctcatgcctgtaatcccagcattttgggagg...,ggctgggtgtggtggctcatgcctgtaatcccagcattttgggagg...,[Step 1: The variant KEGG_1447 represents an A...
1447,KEGG_1448,Chromosome Number: 16\nNetwork Definition of t...,Gastric cancer,tttgagatagggtttcactctgtcacccaggctggaaccacaacct...,tttgagatagggtttcactctgtcacccaggctggaaccacaacct...,[Step 1: The variant KEGG_1448 is a T>G substi...


In [None]:
# Wrap the final frame as a HuggingFace DatasetDict when the library is
# installed; otherwise keep working with the plain pandas DataFrame.
try:
    from datasets import Dataset, DatasetDict

    # Single "train" split built directly from the pandas frame
    train_dataset = Dataset.from_pandas(KEGG_2000)
    dataset = DatasetDict({"train": train_dataset})

    print("✅ HuggingFace dataset created")
    use_hf_datasets = True

except ImportError:
    print("⚠️ HuggingFace datasets not available, using pandas only")
    # In the fallback path both names refer to the same DataFrame
    dataset = train_dataset = KEGG_2000
    use_hf_datasets = False

print(f"Final dataset contains {len(train_dataset)} samples")

In [69]:
# Display the `dataset` object built in the previous cell.
dataset

DatasetDict({
    train: Dataset({
        features: ['ID', 'question', 'answer', 'reference_sequence', 'variant_sequence', 'reasoning'],
        num_rows: 1449
    })
})

In [None]:
# Save final dataset locally instead of uploading to HuggingFace
# Users can upload to their own repositories if needed

output_file = "kegg_variant_dataset_final.parquet"
dataset_info_file = "dataset_info.json"

# train_dataset is a HuggingFace Dataset when `datasets` is installed and a
# pandas DataFrame otherwise. Only the former has `.column_names`, so resolve
# the feature list in a way that works for both.
if hasattr(train_dataset, "column_names"):
    feature_names = list(train_dataset.column_names)
else:
    feature_names = list(train_dataset.columns)

# Save dataset as Parquet for efficient storage (both object types provide to_parquet)
train_dataset.to_parquet(output_file)
print(f"✅ Dataset saved to: {output_file}")

# Save dataset information
dataset_info = {
    "name": "KEGG Variant Dataset",
    "description": "Genetic variants with biological reasoning for disease association",
    "total_samples": len(train_dataset),
    # Each sequence spans the window on both sides of the variant
    "sequence_length": f"~{CONFIG['sequence_window']*2}bp",
    "features": feature_names,
    # `disease` is only counted if an earlier cell happened to define it
    "diseases": len(set(disease)) if 'disease' in locals() else "Unknown",
    "created_by": "KEGG Data Processing Pipeline",
    "version": "1.0"
}

with open(dataset_info_file, 'w') as f:
    json.dump(dataset_info, f, indent=2)

print(f"✅ Dataset information saved to: {dataset_info_file}")
print(f"\nDataset ready for use:")
print(f"  - Main dataset: {output_file}")
print(f"  - Information: {dataset_info_file}")
print(f"  - Samples: {len(train_dataset)}")
print(f"  - Features: {feature_names}")

print("\n📝 To upload to HuggingFace Hub:")
print("dataset.push_to_hub('your-username/your-dataset-name')")

# KEGG Dataset with Alternative Window Size

This section demonstrates creating the dataset with different sequence window parameters.