In [None]:
# Configuration - Update these paths for your environment
import os
from pathlib import Path

# Create and navigate to kegg_data directory
data_dir = Path('kegg_data')
data_dir.mkdir(exist_ok=True)
os.chdir(data_dir)

# Configuration parameters
CONFIG = {
    # Output directories
    'network_dir': 'kegg_network',
    'variant_network_dir': 'network_variant', 
    'variant_info_dir': 'variant_info',
    
    # Reference data paths (update these to point to your reference files)
    'cosmic_fusion_data': 'data/Cosmic_Fusion_v101_GRCh38.tsv',  # Update path as needed
    'reference_genome': 'data/GRCh38_genomic.fna',  # Update path as needed
    
    # Processing parameters
    'num_threads': 4,  # Adjust based on your system
    'batch_size': 1000
}

# Create required directories
for dir_name in [CONFIG['network_dir'], CONFIG['variant_network_dir'], CONFIG['variant_info_dir']]:
    Path(dir_name).mkdir(exist_ok=True)

print(f"Working directory: {os.getcwd()}")
print("Configuration loaded. Directory structure created.")
print("\n📝 Update CONFIG dictionary above with your actual file paths for reference data")

# KEGG Data Processing Pipeline - Part 1: Data Retrieval and Network Analysis

## Overview

This notebook is the first part of a comprehensive KEGG (Kyoto Encyclopedia of Genes and Genomes) data processing pipeline for genetic variant analysis. It focuses on downloading and processing KEGG network data, disease associations, and variant information.

## What This Notebook Does

1. **KEGG Data Retrieval**: Downloads disease lists, network data, and pathway information from KEGG REST API
2. **Network Analysis**: Processes KEGG network files to identify reference vs disease networks
3. **Variant Extraction**: Identifies and extracts genetic variants from network data
4. **Data Filtering**: Cleans and filters variant information for downstream analysis
5. **Reference Data**: Processes genomic reference sequences and chromosome data

## Prerequisites

- Python 3.7+ with required packages (see requirements below)
- `kegg_pull` package for KEGG data retrieval
- `seqkit` for sequence processing
- Internet connection for KEGG API access
- Sufficient storage space (several GB for full dataset)

## Required Packages

```bash
pip install kegg-pull biopython pandas
```

## Directory Structure

This notebook expects and creates the following structure:
```
kegg_data/
├── kegg_diseases.txt
├── network_pathway.tsv
├── network_disease.tsv
├── kegg_network/
├── network_variant/
├── variant_info/
└── output files...
```

## Important Notes

- **Processing Time**: Full dataset processing can take several hours
- **Storage Requirements**: ~5-10GB of storage needed for complete dataset
- **API Limits**: KEGG REST API has rate limits; process may need pausing
- **Network Access**: Requires stable internet connection for data downloads

## Next Steps

After completing this notebook:
1. Run `KEGG_Data_2.ipynb` for variant information parsing
2. Run `KEGG_Data_3.ipynb` for final dataset creation with sequences

## Configuration

Set up paths and parameters for the data processing pipeline:

In [1]:
cd kegg_data

In [None]:
curl -s "https://rest.kegg.jp/list/disease" > kegg_diseases.txt

In [1]:
ls

KEGG_data.ipynb		classify.py		model.py
LICENSE			dataset.py		model_decoder.py
README.md		dna_classifier.py	playground.ipynb
baseline.py		finetune.py		requirements.txt
baseline_model.py	kegg_diseases.txt


In [1]:
curl -s "https://rest.kegg.jp/list/network" | wc -l

    1593


Use kegg_pull for retrieving KEGG data https://github.com/MoseleyBioinformaticsLab/kegg_pull

```python3 -m pip install kegg-pull```

In [1]:
kegg_pull -v

3.1.0


In [18]:
kegg_pull --full-help


Usage:
    kegg_pull -h | --help           Show this help message.
    kegg_pull -v | --version        Displays the package version.
    kegg_pull --full-help           Show the help message of all sub commands.
    kegg_pull pull ...              Pull, separate, and store an arbitrary number of KEGG entries to the local file system.
    kegg_pull entry-ids ...         Obtain a list of KEGG entry IDs.
    kegg_pull map ...               Obtain a mapping of entry IDs (KEGG or outside databases) to the IDs of related entries.
    kegg_pull pathway-organizer ... Creates a flattened version of a pathways Brite hierarchy.
    kegg_pull rest ...              Executes one of the KEGG REST API operations.

--------------------------------------------------------------------------------

Usage:
    kegg_pull pull -h | --help
    kegg_pull pull database <database> [--force-single-entry] [--multi-process] [--n-workers=<n-workers>] [--output=<output>] [--print] [--sep=<print-separator>] [--entry-

In [4]:
kegg_pull rest list network

N00001	EGF-EGFR-RAS-ERK signaling pathway
N00002	BCR-ABL fusion kinase to RAS-ERK signaling pathway
N00003	Mutation-activated KIT to RAS-ERK signaling pathway
N00004	Duplication or mutation-activated FLT3 to RAS-ERK signaling pathway
N00005	Mutation-activated MET to RAS-ERK signaling pathway
N00006	Amplified EGFR to RAS-ERK signaling pathway
N00007	EML4-ALK fusion kinase to RAS-ERK signaling pathway
N00008	RET fusion kinase to RAS-ERK signaling pathway
N00009	TRK fusion kinase to RAS-ERK signaling pathway
N00010	Mutation-inactivated PTCH1 to Hedgehog signaling pathway
N00011	Mutation-activated FGFR3 to RAS-ERK signaling pathway
N00012	Mutation-activated KRAS/NRAS to ERK signaling pathway
N00013	Mutation-activated BRAF to ERK signaling pathway
N00014	Mutation-activated EGFR to RAS-ERK signaling pathway
N00015	PDGF-PDGFR-RAS-ERK signaling pathway
N00016	PDGF-overexpression to RAS-ERK signaling pathway
N00017	Mutation-activated SMO to Hedgehog signaling pathway
N00018	Amplified PDGFR to R

In [12]:
#kegg_pull pull database network

# Pulling all nodes in the network database. Will download it to current working directory. 

In [17]:
kegg_pull rest info network

network          KEGG Network Database
ne               Release 114.0+/04-11, Apr 25
                 Kanehisa Laboratories
                 1,637 entries

linked db        pathway
                 ko
                 hsa
                 compound
                 variant
                 disease
                 pubmed



In [24]:
kegg_pull rest link network pathway | wc -l

    1415


In [42]:
kegg_pull rest link network pathway --output network_pathway.tsv

In [43]:
wc -l network_pathway.tsv

    1414 network_pathway.tsv


In [44]:
head network_pathway.tsv

path:hsa05225	ne:N00005
path:hsa05211	ne:N00005
path:hsa05223	ne:N00007
path:hsa05216	ne:N00009
path:hsa05210	ne:N00012
path:hsa05212	ne:N00012
path:hsa05226	ne:N00012
path:hsa05216	ne:N00012
path:hsa05221	ne:N00012
path:hsa05213	ne:N00012


In [27]:
kegg_pull rest link network disease | wc -l

    1306


In [45]:
kegg_pull rest link network disease --output network_disease.tsv

In [46]:
wc -l network_disease.tsv

    1305 network_disease.tsv


In [47]:
head network_disease.tsv

ds:H01489	ne:nt06018
ds:H01486	ne:nt06018
ds:H01488	ne:nt06018
ds:H01487	ne:nt06018
ds:H01127	ne:nt06018
ds:H01485	ne:nt06018
ds:H00216	ne:nt06019
ds:H02314	ne:nt06019
ds:H00259	ne:nt06019
ds:H01111	ne:nt06019


In [32]:
kegg_pull pull entry-ids H01489

100%|█████████████████████████████████████████████| 1/1 [00:01<00:00,  1.37s/it]


In [35]:
kegg_pull rest link disease pathway | wc -l

      76


In [36]:
kegg_pull rest link disease pathway | head -20

path:hsa05211	ds:H00021
path:hsa05110	ds:H00110
path:hsa05220	ds:H00004
path:hsa05210	ds:H00020
path:hsa05212	ds:H00019
path:hsa05217	ds:H00039
path:hsa05130	ds:H00277
path:hsa05130	ds:H00278
path:hsa05332	ds:H00084
path:hsa05132	ds:H00111
path:hsa05223	ds:H00014
path:hsa05135	ds:H00298
path:hsa05214	ds:H00042
path:hsa05221	ds:H00003
path:hsa05166	ds:H00009
path:hsa05226	ds:H00018
path:hsa05224	ds:H00031
path:hsa05216	ds:H00032
path:hsa05161	ds:H00412
path:hsa05144	ds:H00361


In [37]:
kegg_pull rest info pathway

pathway          KEGG Pathway Database
path             Release 114.0+/04-11, Apr 25
                 Kanehisa Laboratories
                 579 entries

linked db        module
                 ko
                 <org>
                 genome
                 compound
                 glycan
                 reaction
                 rclass
                 enzyme
                 network
                 disease
                 drug
                 pubmed



In [38]:
kegg_pull rest info disease

disease          KEGG Disease Database
ds               Release 114.0+/04-11, Apr 25
                 Kanehisa Laboratories
                 2,900 entries

linked db        pathway
                 brite
                 ko
                 hsa
                 genome
                 network
                 variant
                 drug
                 pubmed



In [48]:
kegg_pull rest list network --output kegg_network.txt

## Getting the number of reference vs disease networks

In [52]:
kegg_pull pull database network --output kegg_network

100%|███████████████████████████████████████| 1637/1637 [11:53<00:00,  2.30it/s]


In [53]:
# Output file
output="kegg_network_types.tsv"
> "$output"  # Clear or create the file

# Iterate over each .txt file in the kegg_network directory
for file in kegg_network/*.txt; do
    # Get the filename without path and extension
    base=$(basename "$file" .txt)

    # Extract the line containing TYPE
    type_line=$(grep "TYPE" "$file")

    # Extract the TYPE line and remove the word "TYPE" and any whitespace
    type_value=$(grep "^TYPE" "$file" | sed 's/TYPE[ \t]*//')

    # Write to the output file
    echo -e "${base}\t${type_value}" >> "$output"
done

In [59]:
kegg_pull pull entry-ids hsa_var:1950v1 --print

100%|█████████████████████████████████████████████| 1/1 [00:00<00:00,  1.17it/s]
hsa_var:1950v1
ENTRY       1950v1                      Variant
NAME        EGF overexpression
TYPE        Gain of function
GENE        EGF  epidermal growth factor [KO:K04357]
ORGANISM    hsa_var Human gene variants (Homo sapiens)
VARIATION   overexpression
NETWORK     nt06210  ERK signaling (cancer)
            nt06214  PI3K signaling (cancer)
            nt06260  Colorectal cancer
            nt06526  MAPK signaling
            nt06530  PI3K signaling
DISEASE     H00020  Colorectal cancer
REFERENCE   PMID:7912978
  AUTHORS   Hayashi Y, Widjono YW, Ohta K, Hanioka K, Obayashi C, Itoh K, Imai Y, Itoh H
  TITLE     Expression of EGF, EGF-receptor, p53, v-erb B and ras p21 in colorectal neoplasms by immunostaining paraffin-embedded tissues.
  JOURNAL   Pathol Int 44:124-30 (1994)
            DOI:10.1111/j.1440-1827.1994.tb01696.x
REFERENCE   PMID:15668269
  AUTHORS   Spano JP, Fagard R, Soria JC, Rixe O, Kha

In [63]:
kegg_pull rest get hsa:1950

ENTRY       1950              CDS       T01001
SYMBOL      EGF, HOMG4, URG
NAME        (RefSeq) epidermal growth factor
ORTHOLOGY   K04357  epidermal growth factor
ORGANISM    hsa  Homo sapiens (human)
PATHWAY     hsa01521  EGFR tyrosine kinase inhibitor resistance
            hsa04010  MAPK signaling pathway
            hsa04012  ErbB signaling pathway
            hsa04014  Ras signaling pathway
            hsa04015  Rap1 signaling pathway
            hsa04020  Calcium signaling pathway
            hsa04066  HIF-1 signaling pathway
            hsa04068  FoxO signaling pathway
            hsa04072  Phospholipase D signaling pathway
            hsa04151  PI3K-Akt signaling pathway
            hsa04510  Focal adhesion
            hsa04540  Gap junction
            hsa04630  JAK-STAT signaling pathway
            hsa04810  Regulation of actin cytoskeleton
            hsa05160  Hepatitis C
            hsa05165  Human papillomavirus infection
            hsa05200  Pathways in cancer
       

In [64]:
kegg_pull rest info hsa

T01001           Homo sapiens (human) KEGG Genes Database
hsa              Release 114.0+/04-11, Apr 25
                 Kanehisa Laboratories
                 24,685 entries

linked db        pathway
                 brite
                 module
                 ko
                 genome
                 enzyme
                 network
                 disease
                 drug
                 ncbi-geneid
                 ncbi-proteinid
                 uniprot



In [69]:
kegg_pull rest info variant

variant          KEGG Variant Database
hsa_var          Release 114.0+/04-12, Apr 25
                 Kanehisa Laboratories
                 1,536 entries

linked db        network
                 disease
                 drug
                 pubmed



In [70]:
kegg_pull rest list variant | head -20

10000v1	AKT3 mutation
10026v1	PIGK deficiency
10075v1	HUWE1 mutation
100v1	ADA deficiency
10111v1	RAD50 mutation
10133v1	OPTN mutation
10133v2	OPTN activating mutation
10157v1	AASS deficiency
10195v1	ALG3 deficiency
1019v1	CDK4 amplification
1019v2	CDK4 mutation
10243v1	GPHN deficiency
10274v1	STAG1 mutation
1027v1	CDKN1B loss
1027v2	CDKN1B reduced expression
1027v3	CDKN1B mutation
10280v1	SIGMAR1 mutation
10293v1	TRAIP mutation
10297v1	APC2 mutation
1029v1	CDKN2A deletion


## Subsetting data to the Variant set of the networks

In [71]:
mkdir network_variant

In [72]:
while read p; do
cp kegg_network/$p.txt network_variant/

done < network_variants.txt

In [73]:
ls network_variant/* | wc -l

     298


In [78]:
#!/bin/bash

output="gene_variants.tsv"
> "$output"  # Clear the output file

for file in network_variant/*.txt; do
    base=$(basename "$file" .txt)

    # Find and extract all matches of digits-v-digits
    grep -oE "[0-9]+v[0-9]+" "$file" | while read -r match; do
        echo -e "${base}\t${match}" >> "$output"
    done
done

In [80]:
sort gene_variants.tsv | uniq > temp.tsv && mv temp.tsv gene_variants.tsv

In [83]:
wc -l gene_variants.tsv

     328 gene_variants.tsv


In [98]:
cut -f 2 gene_variants.tsv > gene_variants.txt

In [99]:
sort gene_variants.txt | uniq > temp.tsv && mv temp.tsv gene_variants.txt

In [100]:
wc -l gene_variants.txt

     200 gene_variants.txt


In [104]:
sed -i '' 's/^/hsa_var:/' gene_variants.txt

In [88]:
mkdir variant_info

In [105]:
cat gene_variants.txt | kegg_pull pull entry-ids - --output=variant_info

100%|█████████████████████████████████████████| 200/200 [00:51<00:00,  3.85it/s]


In [11]:
cat variant_info/* > all_variants.txt

In [None]:
cp all_variants.txt all_variants_filtered.txt

### Switching to python

In [None]:
cd kegg_data

In [2]:
import re

def remove_references(text):
    # This regex matches 'REFERENCE' lines and all subsequent indented lines (those starting with 2+ spaces)
    cleaned_text = re.sub(r'REFERENCE\s+PMID:\d+\n(?: {2}.*\n)*', '', text)
    return cleaned_text

In [3]:
with open('all_variants_filtered.txt', 'r') as f:
    original_text = f.read()

cleaned_text = remove_references(original_text)

with open('all_variants_filtered.txt', 'w') as f:
    f.write(cleaned_text)

In [11]:
def remove_network(text):
    lines = text.split('\n')
    cleaned_lines = []
    skip_block = False

    for line in lines:
        if line.startswith("NETWORK"):
            skip_block = True
            continue
        if skip_block:
            if line.startswith(" ") or line.startswith("\t"):
                continue
            else:
                skip_block = False
        if not skip_block:
            cleaned_lines.append(line)

    return '\n'.join(cleaned_lines)


with open('all_variants_filtered.txt', 'r') as f:
    original_text = f.read()

cleaned_text = remove_network(original_text)

with open('all_variants_filtered.txt', 'w') as f:
    f.write(cleaned_text)

In [12]:
def remove_network(text):
    lines = text.split('\n')
    cleaned_lines = []
    skip_block = False

    for line in lines:
        if line.startswith("DISEASE"):
            skip_block = True
            continue
        if skip_block:
            if line.startswith(" ") or line.startswith("\t"):
                continue
            else:
                skip_block = False
        if not skip_block:
            cleaned_lines.append(line)

    return '\n'.join(cleaned_lines)


with open('all_variants_filtered.txt', 'r') as f:
    original_text = f.read()

cleaned_text = remove_network(original_text)

with open('all_variants_filtered.txt', 'w') as f:
    f.write(cleaned_text)

In [13]:
def remove_network(text):
    lines = text.split('\n')
    cleaned_lines = []
    skip_block = False

    for line in lines:
        if line.startswith("DRUG_TARGET"):
            skip_block = True
            continue
        if skip_block:
            if line.startswith(" ") or line.startswith("\t"):
                continue
            else:
                skip_block = False
        if not skip_block:
            cleaned_lines.append(line)

    return '\n'.join(cleaned_lines)


with open('all_variants_filtered.txt', 'r') as f:
    original_text = f.read()

cleaned_text = remove_network(original_text)

with open('all_variants_filtered.txt', 'w') as f:
    f.write(cleaned_text)

Chatgpt to parse out this file and give me a tsv with 3 columns. Entry, Source and ID

Source is which SNV database it is from. Omimvar or clinvar or dbsnp or cosm or dbvar or cosf

### switch back to bash

# Downloading all Variant Information

**Not using dbVar as it has been discontinued and most of the links to dbvar are bad** ClinVar is the alternate and holds all of the data

In [3]:
cd kegg_data

In [2]:
rm all_variants.txt
rm all_variants_filtered.txt

In [4]:
grep OmimVar parsed_variants.tsv | wc -l
grep ClinVar  parsed_variants.tsv | wc -l
grep dbSNP  parsed_variants.tsv | wc -l
grep COSM  parsed_variants.tsv | wc -l
grep dbVar  parsed_variants.tsv | wc -l
grep COSF parsed_variants.tsv | wc -l

      60
     235
     201
     202
      28
      87


### OmimVar

In [None]:
esearch -db clinvar -query "601556[mim]" | efetch -format docsum

From the output that you get, look for the variant ID in the output and then get that specific document summary

In [10]:
grep OmimVar parsed_variants.tsv | cut -f3 > Omim/OmimVar_id.txt

It is being really difficult to run this with a loop in bash, so just running it all manually like this

In [None]:
esearch -db clinvar -query "601978[mim]" | efetch -format docsum > Omim/601978.xml
esearch -db clinvar -query "602533[mim]" | efetch -format docsum > Omim/602533.xml
esearch -db clinvar -query "609007[mim]" | efetch -format docsum > Omim/609007.xml
esearch -db clinvar -query "111730[mim]" | efetch -format docsum > Omim/111730.xml
esearch -db clinvar -query "603448[mim]" | efetch -format docsum > Omim/603448.xml
esearch -db clinvar -query "608300[mim]" | efetch -format docsum > Omim/608300.xml
esearch -db clinvar -query "601143[mim]" | efetch -format docsum > Omim/601143.xml
esearch -db clinvar -query "614260[mim]" | efetch -format docsum > Omim/614260.xml
esearch -db clinvar -query "600543[mim]" | efetch -format docsum > Omim/600543.xml
esearch -db clinvar -query "605078[mim]" | efetch -format docsum > Omim/605078.xml
esearch -db clinvar -query "137070[mim]" | efetch -format docsum > Omim/137070.xml
esearch -db clinvar -query "211100[mim]" | efetch -format docsum > Omim/211100.xml
esearch -db clinvar -query "182100[mim]" | efetch -format docsum > Omim/182100.xml
esearch -db clinvar -query "111100[mim]" | efetch -format docsum > Omim/111100.xml
esearch -db clinvar -query "189980[mim]" | efetch -format docsum > Omim/189980.xml
esearch -db clinvar -query "606463[mim]" | efetch -format docsum > Omim/606463.xml
esearch -db clinvar -query "600429[mim]" | efetch -format docsum > Omim/600429.xml
esearch -db clinvar -query "603371[mim]" | efetch -format docsum > Omim/603371.xml
esearch -db clinvar -query "613109[mim]" | efetch -format docsum > Omim/613109.xml
esearch -db clinvar -query "604834[mim]" | efetch -format docsum > Omim/604834.xml
esearch -db clinvar -query "604473[mim]" | efetch -format docsum > Omim/604473.xml
esearch -db clinvar -query "300264[mim]" | efetch -format docsum > Omim/300264.xml
esearch -db clinvar -query "613004[mim]" | efetch -format docsum > Omim/613004.xml
esearch -db clinvar -query "308000[mim]" | efetch -format docsum > Omim/308000.xml
esearch -db clinvar -query "104760[mim]" | efetch -format docsum > Omim/104760.xml
esearch -db clinvar -query "102600[mim]" | efetch -format docsum > Omim/102600.xml
esearch -db clinvar -query "176264[mim]" | efetch -format docsum > Omim/176264.xml
esearch -db clinvar -query "605411[mim]" | efetch -format docsum > Omim/605411.xml
esearch -db clinvar -query "600734[mim]" | efetch -format docsum > Omim/600734.xml
esearch -db clinvar -query "607047[mim]" | efetch -format docsum > Omim/607047.xml
esearch -db clinvar -query "176763[mim]" | efetch -format docsum > Omim/176763.xml
esearch -db clinvar -query "602544[mim]" | efetch -format docsum > Omim/602544.xml
esearch -db clinvar -query "131340[mim]" | efetch -format docsum > Omim/131340.xml
esearch -db clinvar -query "176610[mim]" | efetch -format docsum > Omim/176610.xml
esearch -db clinvar -query "607922[mim]" | efetch -format docsum > Omim/607922.xml
esearch -db clinvar -query "176640[mim]" | efetch -format docsum > Omim/176640.xml
esearch -db clinvar -query "176801[mim]" | efetch -format docsum > Omim/176801.xml
esearch -db clinvar -query "104311[mim]" | efetch -format docsum > Omim/104311.xml
esearch -db clinvar -query "600759[mim]" | efetch -format docsum > Omim/600759.xml
esearch -db clinvar -query "601556[mim]" | efetch -format docsum > Omim/601556.xml
esearch -db clinvar -query "601517[mim]" | efetch -format docsum > Omim/601517.xml
esearch -db clinvar -query "612895[mim]" | efetch -format docsum > Omim/612895.xml
esearch -db clinvar -query "608309[mim]" | efetch -format docsum > Omim/608309.xml
esearch -db clinvar -query "163890[mim]" | efetch -format docsum > Omim/163890.xml
esearch -db clinvar -query "147450[mim]" | efetch -format docsum > Omim/147450.xml
esearch -db clinvar -query "604985[mim]" | efetch -format docsum > Omim/604985.xml
esearch -db clinvar -query "606765[mim]" | efetch -format docsum > Omim/606765.xml
esearch -db clinvar -query "602345[mim]" | efetch -format docsum > Omim/602345.xml
esearch -db clinvar -query "191110[mim]" | efetch -format docsum > Omim/191110.xml
esearch -db clinvar -query "191342[mim]" | efetch -format docsum > Omim/191342.xml
esearch -db clinvar -query "601023[mim]" | efetch -format docsum > Omim/601023.xml
esearch -db clinvar -query "608537[mim]" | efetch -format docsum > Omim/608537.xml
esearch -db clinvar -query "601011[mim]" | efetch -format docsum > Omim/601011.xml
esearch -db clinvar -query "114206[mim]" | efetch -format docsum > Omim/114206.xml
esearch -db clinvar -query "603094[mim]" | efetch -format docsum > Omim/603094.xml
esearch -db clinvar -query "601530[mim]" | efetch -format docsum > Omim/601530.xml
esearch -db clinvar -query "607904[mim]" | efetch -format docsum > Omim/607904.xml
esearch -db clinvar -query "605704[mim]" | efetch -format docsum > Omim/605704.xml

In [23]:
wc -l Omim/OmimVar_id.txt

      58 Omim/OmimVar_id.txt


In [25]:
while read p; do
if test -f Omim/$p.xml; then
  echo "$p exists."
else
    echo "$p does not exist."
fi
done < Omim/OmimVar_id.txt

601978 exists.
602533 exists.
609007 exists.
111730 exists.
603448 exists.
608300 exists.
601143 exists.
614260 exists.
600543 exists.
605078 exists.
137070 exists.
211100 exists.
182100 exists.
111100 exists.
189980 exists.
606463 exists.
600429 exists.
603371 exists.
613109 exists.
604834 exists.
604473 exists.
300264 exists.
613004 exists.
308000 exists.
104760 exists.
102600 exists.
176264 exists.
605411 exists.
600734 exists.
607047 exists.
176763 exists.
602544 exists.
131340 exists.
176610 exists.
607922 exists.
176640 exists.
176801 exists.
104311 exists.
600759 exists.
601556 exists.
601517 exists.
612895 exists.
608309 exists.
163890 exists.
147450 exists.
604985 exists.
606765 exists.
602345 exists.
191110 exists.
191342 exists.
601023 exists.
608537 exists.
601011 exists.
114206 exists.
603094 exists.
601530 exists.
607904 exists.
605704 exists.


Switch to python

In [None]:
cd kegg_data

In [28]:
import xml.etree.ElementTree as ET
import re

In [60]:
import xml.etree.ElementTree as ET

def extract_linked_ids(xml_path, target_omim_prefix, outfile):
    tree = ET.parse(xml_path)
    root = tree.getroot()

    for variation_xrefs in root.iter('variation_xrefs'):
        block = []
        matched_omim_id = None

        for xref in variation_xrefs.findall('variation_xref'):
            db_source = xref.findtext('db_source')
            db_id = xref.findtext('db_id')

            if db_source and db_id:
                if db_source == "OMIM" and db_id.startswith(target_omim_prefix):
                    matched_omim_id = db_id
                block.append((db_source, db_id))

        if matched_omim_id:
            outfile.write(f"OMIM ID found: {matched_omim_id}\n")
            for source, id_ in block:
                if source != "OMIM":
                    outfile.write(f"{source}:{id_}\n")
            outfile.write("\n")  # Blank line between blocks

In [61]:
# Load OMIM IDs from file into a list
with open("Omim/OmimVar_id.txt", "r") as f:
    omim_ids = [line.strip() for line in f if line.strip()]

# Optional: print first few IDs
print("Loaded OMIM IDs:", omim_ids[:5])

Loaded OMIM IDs: ['601978', '602533', '609007', '111730', '603448']


In [62]:
import os
import re

# There were issues with some XMLs being malformed. So editing the problematic ones to make one common root.
problematic_ids = [
    "609007", "601143", "604985", "608537", "601011", "114206", "607904"
]

input_folder = "Omim"
output_folder = "Omim_fixed"
os.makedirs(output_folder, exist_ok=True)

for omim_id in problematic_ids:
    input_file = os.path.join(input_folder, f"{omim_id}.xml")
    output_file = os.path.join(output_folder, f"{omim_id}.xml")

    with open(input_file, "r") as f:
        xml_content = f.read()

    # Remove leading/trailing whitespace
    xml_content = xml_content.strip()

    # Remove any existing XML declaration or DOCTYPE lines
    xml_content = re.sub(r'<\?xml[^>]+\?>', '', xml_content)
    xml_content = re.sub(r'<!DOCTYPE[^>]*>', '', xml_content)

    # Wrap content in <root> and insert declarations at the top
    fixed_xml = (
        '<?xml version="1.0" encoding="UTF-8"?>\n'
        '<!DOCTYPE root>\n'
        '<root>\n'
        f'{xml_content.strip()}\n'
        '</root>'
    )

    # Write the fixed file
    with open(output_file, "w") as f:
        f.write(fixed_xml)

    print(f"✅ Fixed: {omim_id} → saved to {output_file}")

✅ Fixed: 609007 → saved to Omim_fixed/609007.xml
✅ Fixed: 601143 → saved to Omim_fixed/601143.xml
✅ Fixed: 604985 → saved to Omim_fixed/604985.xml
✅ Fixed: 608537 → saved to Omim_fixed/608537.xml
✅ Fixed: 601011 → saved to Omim_fixed/601011.xml
✅ Fixed: 114206 → saved to Omim_fixed/114206.xml
✅ Fixed: 607904 → saved to Omim_fixed/607904.xml


Iterating over all XMLs and parsing them

In [63]:
good_ids = [id for id in omim_ids if id not in problematic_ids]

In [64]:
print(len(good_ids), len(problematic_ids))

51 7


In [65]:
for id in good_ids:
    with open(f'Omim/{id}_parsed.txt', "w") as f:
        try:
            extract_linked_ids(f'Omim/{id}.xml', id, f)
        except:
            print(id)
            break
            
for id in problematic_ids:
    with open(f'Omim/{id}_parsed.txt', "w") as f:
        try:
            extract_linked_ids(f'Omim_fixed/{id}.xml', id, f)
        except:
            print(id)
            break

In [None]:
while read p; do
if test -f Omim/"$p"_parsed.txt; then
  echo "$p exists."
else
    echo "$p does not exist."
fi
done < Omim/OmimVar_id.txt

In [14]:
cat Omim/*_parsed.txt > Omim_parsed.txt

In [15]:
sed -i '' '/^ClinGen/d' Omim_parsed.txt
sed -i '' '/^UniProtKB/d' Omim_parsed.txt
sed -i '' '/^ClinVar/d' Omim_parsed.txt
sed -i '' '/^dbVar/d' Omim_parsed.txt
sed -i '' '/^Genetic/d' Omim_parsed.txt
sed -i '' '/^LOVD/d' Omim_parsed.txt

In [13]:
#!/bin/bash

input_file="Omim_parsed.txt"       # Your input file
output_file="Omim_parsed.tsv"     # Output TSV file

# Write header
echo -e "omim_id\tdbsnp_id" > "$output_file"

# Initialize variables
omim_id=""
dbsnp_id=""

# Read the file line-by-line
while IFS= read -r line || [ -n "$line" ]; do
    # If it's an OMIM line
    if [[ $line == OMIM\ ID\ found:* ]]; then
        # If we had a previous OMIM without dbSNP, write it now
        if [[ -n $omim_id ]]; then
            echo -e "${omim_id}\t${dbsnp_id}" >> "$output_file"
        fi
        omim_id="${line#OMIM ID found: }"
        dbsnp_id=""  # Reset dbSNP
    elif [[ $line == dbSNP:* ]]; then
        dbsnp_id="${line#dbSNP:}"
    fi
done < "$input_file"

# Write the last record
if [[ -n $omim_id ]]; then
    echo -e "${omim_id}\t${dbsnp_id}" >> "$output_file"
fi

echo "✅ Parsed file into $output_file"

✅ Parsed file into Omim_parsed.tsv


Adding 624 dbSNP IDs to the dbSNP file for retrieval

### ClinVar

In [1]:
cd kegg_data

In [55]:
esearch -db clinvar -query 17584 | efetch -format docsum

<?xml version="1.0" encoding="UTF-8" ?>
<!DOCTYPE DocumentSummarySet>
<DocumentSummarySet status="OK">
  <DbBuild>Build250414-1300.1</DbBuild>
  <DocumentSummary>
    <Id>17584</Id>
    <obj_type>single nucleotide variant</obj_type>
    <accession>VCV000017584</accession>
    <accession_version>VCV000017584.5</accession_version>
    <title>NM_001904.4(CTNNB1):c.101G&gt;A (p.Gly34Glu)</title>
    <variation_set>
      <variation>
        <measure_id>32623</measure_id>
        <variation_xrefs>
          <variation_xref>
            <db_source>ClinGen</db_source>
            <db_id>CA127277</db_id>
          </variation_xref>
          <variation_xref>
            <db_source>UniProtKB</db_source>
            <db_id>P35222#VAR_017620</db_id>
          </variation_xref>
          <variation_xref>
            <db_source>OMIM</db_source>
            <db_id>116806.0008</db_id>
          </variation_xref>
          <variation_xref>
            <db_source>dbSNP</db_source>
            <db_id>28

In [2]:
grep ClinVar parsed_variants.tsv | cut -f3 > ClinVar/ClinVar_id.txt

In [12]:
wc -l ClinVar/ClinVar_id.txt

     232 ClinVar/ClinVar_id.txt


Saved all of the esearch queries to clinvar_esearch.sh . 232 of them

In [4]:
chmod +x ClinVar/clinvar_esearch.sh

In [13]:
wc -l ClinVar/clinvar_esearch.sh

     232 ClinVar/clinvar_esearch.sh


In [None]:
ClinVar/./clinvar_esearch.sh

In [18]:
while read p; do
[ -s ClinVar/$p.xml ] || echo "$p is empty"
done < ClinVar/ClinVar_id.txt

376308 is empty
376242 is empty
376235 is empty
376233 is empty
375895 is empty
376282 is empty
376280 is empty
396706 is empty
375971 is empty
376068 is empty
376728 is empty
160870 is empty
376464 is empty
376461 is empty
375873 is empty
376220 is empty
375871 is empty
375872 is empty
376221 is empty
376069 is empty


There are 20 XMLs as seen above that have been deleted so I cannot access them

In [16]:
esearch -db clinvar -query 177620 | efetch -format docsum > ClinVar/177620.xml

In [19]:
while read p; do
[ -s ClinVar/$p.xml ] || rm ClinVar/$p.xml
done < ClinVar/ClinVar_id.txt

In [23]:
sed -i '' '/^376308$/d' ClinVar/ClinVar_id.txt
sed -i '' '/^376242$/d' ClinVar/ClinVar_id.txt
sed -i '' '/^376235$/d' ClinVar/ClinVar_id.txt
sed -i '' '/^376233$/d' ClinVar/ClinVar_id.txt
sed -i '' '/^375895$/d' ClinVar/ClinVar_id.txt
sed -i '' '/^376282$/d' ClinVar/ClinVar_id.txt
sed -i '' '/^376280$/d' ClinVar/ClinVar_id.txt
sed -i '' '/^396706$/d' ClinVar/ClinVar_id.txt
sed -i '' '/^375971$/d' ClinVar/ClinVar_id.txt
sed -i '' '/^376068$/d' ClinVar/ClinVar_id.txt
sed -i '' '/^376728$/d' ClinVar/ClinVar_id.txt
sed -i '' '/^160870$/d' ClinVar/ClinVar_id.txt
sed -i '' '/^376464$/d' ClinVar/ClinVar_id.txt
sed -i '' '/^376461$/d' ClinVar/ClinVar_id.txt
sed -i '' '/^375873$/d' ClinVar/ClinVar_id.txt
sed -i '' '/^376220$/d' ClinVar/ClinVar_id.txt
sed -i '' '/^375871$/d' ClinVar/ClinVar_id.txt
sed -i '' '/^375872$/d' ClinVar/ClinVar_id.txt
sed -i '' '/^376221$/d' ClinVar/ClinVar_id.txt
sed -i '' '/^376069$/d' ClinVar/ClinVar_id.txt

In [24]:
wc -l ClinVar/ClinVar_id.txt

     212 ClinVar/ClinVar_id.txt


In [21]:
ls ClinVar | wc -l

     214


214 is good and checks out. 214 - 2 = 212 which is how many ids we have

In [None]:
cd kegg_data

In [2]:
import xml.etree.ElementTree as ET
import os

# Paths
id_file = "ClinVar/ClinVar_id.txt"
input_folder = "ClinVar"
output_file = "ClinVar_parsed_output.tsv"

# Read all IDs from the input file
with open(id_file, "r") as f:
    clinvar_ids = [line.strip() for line in f if line.strip()]

# Prepare output file
with open(output_file, "w") as out:
    # Write header
    out.write("ClinVar_ID\tseq_id\tposition\tref\talt\n")

    for cid in clinvar_ids:
        xml_path = os.path.join(input_folder, f"{cid}.xml")
        if not os.path.exists(xml_path):
            print(f"⚠️ File not found: {xml_path}")
            continue

        try:
            # Parse XML
            tree = ET.parse(xml_path)
            root = tree.getroot()

            # Find all canonical_spdi tags
            for spdi in root.iter("canonical_spdi"):
                text = spdi.text
                if text and ":" in text:
                    parts = text.split(":")
                    if len(parts) == 4:
                        seq_id, pos, ref, alt = parts
                        out.write(f"{cid}\t{seq_id}\t{pos}\t{ref}\t{alt}\n")

        except ET.ParseError as e:
            print(f"❌ Parse error in {cid}.xml: {e}")

In [4]:
parsed = ['16928','16929','183391','183393','183395','8823','420108','9409','376307','220711','376310','376305','376303','77637','376384','233484','127526','182409','376306','182423','17577','17576','17580','17587','17588','17579','17583','17582','376231','17584','376232','17589','17578','376228','177620','16609','45263','16613','16339','16359','16332','16333','16342','16348','16276','16273','16272','375972','16274','15933','15934','15935','15936','801','184937','802','12602','12613','35554','180848','160364','376033','219296','9834','9381','39571','39572','14801','13860','13863','13852','12582','12583','12580','12578','16677','16685','16686','16688','186141','13881','13882','13886','13883','376126','13888','13889','13890','162466','162468','162465','375876','13901','13900','373003','39648','73058','375874','177778','162469','162470','5286','225431','225433','225434','225432','31944','13655','13652','13653','13659','91945','12674','164995','13244','13245','13246','13247','13251','13250','13249','409162','418436','7829','427590','187657','7814','7837','7836','7838','7833','189486','428256','186396','404151','375958','7813','189403','185200','189484','7815','92828','189448','9511','9512','13087','428681','13919','13911','38629','37102','13951','8117','8118','13961','375941','12511','213936','217016','12374','12356','12366','12347','12365','43594','12364','12355','127819','376570','12372','2216','43604','93326','2223','417961','14464','6390','41166','41209','4893','4886','4892','161992','161993','161995']

In [14]:
len(parsed)

185

In [5]:
remaining = [id for id in clinvar_ids if id not in parsed]

In [8]:
!mkdir ClinVar_remaining

In [11]:
import os
import shutil 
# Paths
source_dir = "ClinVar"
dest_dir = "ClinVar_remaining"

# Ensure destination folder exists
os.makedirs(dest_dir, exist_ok=True)

# Iterate and copy files
for clinvar_id in remaining:
    src = os.path.join(source_dir, f"{clinvar_id}.xml")
    dst = os.path.join(dest_dir, f"{clinvar_id}.xml")

    if os.path.exists(src):
        shutil.copy(src, dst)
        print(f"✅ Copied: {clinvar_id}.xml")
    else:
        print(f"⚠️ Missing: {clinvar_id}.xml")

✅ Copied: 268075.xml
✅ Copied: 150740.xml
✅ Copied: 59680.xml
✅ Copied: 59682.xml
✅ Copied: 148363.xml
✅ Copied: 58696.xml
✅ Copied: 57282.xml
✅ Copied: 59782.xml
✅ Copied: 153718.xml
✅ Copied: 148679.xml
✅ Copied: 16270.xml
✅ Copied: 59715.xml
✅ Copied: 394884.xml
✅ Copied: 153231.xml
✅ Copied: 151754.xml
✅ Copied: 149554.xml
✅ Copied: 153441.xml
✅ Copied: 148269.xml
✅ Copied: 57074.xml
✅ Copied: 394609.xml
✅ Copied: 58030.xml
✅ Copied: 58029.xml
✅ Copied: 58028.xml
✅ Copied: 441904.xml
✅ Copied: 146814.xml
✅ Copied: 144406.xml
✅ Copied: 57042.xml


In [12]:
!ls Clinvar_remaining | wc -l

      27


In [13]:
!cat Clinvar_remaining/* > Clinvar_remaining/all_remaining_variants.xml

They are all copy number gain variations. Nothing that I can do for this project. So we will stick with our 185 parsed

In [15]:
!wc -l ClinVar_parsed_output.tsv

     186 ClinVar_parsed_output.tsv


In [None]:
rm -r Clinvar_remaining

### dbSNP

Have to get the variants from OmimVar

In [1]:
cd kegg_data

In [2]:
esearch -db snp -query rs1131690863 | efetch -format docsum

<?xml version="1.0" encoding="UTF-8" ?>
<!DOCTYPE DocumentSummarySet>
<DocumentSummarySet status="OK">
  <DbBuild>Build250306-1408.1</DbBuild>
  <DocumentSummary>
    <Id>1131690863</Id>
    <SNP_ID>1131690863</SNP_ID>
    <GLOBAL_SAMPLESIZE>0</GLOBAL_SAMPLESIZE>
    <CLINICAL_SIGNIFICANCE>uncertain-significance,pathogenic</CLINICAL_SIGNIFICANCE>
    <GENES>
      <GENE_E>
        <NAME>RB1</NAME>
        <GENE_ID>5925</GENE_ID>
      </GENE_E>
      <GENE_E>
        <NAME>LOC112268118</NAME>
        <GENE_ID>112268118</GENE_ID>
      </GENE_E>
    </GENES>
    <ACC>NC_000013.11</ACC>
    <CHR>13</CHR>
    <HANDLE>EVA,CSS-BFX,CLINVAR</HANDLE>
    <SPDI>NC_000013.11:48362846:C:A,NC_000013.11:48362846:C:G,NC_000013.11:48362846:C:T</SPDI>
    <FXN_CLASS>coding_sequence_variant,stop_gained,500B_downstream_variant,synonymous_variant,missense_variant,downstream_transcript_variant</FXN_CLASS>
    <VALIDATED>by-cluster</VALIDATED>
    <DOCSUM>HGVS=NC_000013.11:g.48362847C&gt;A,NC_000013.11:g.4

In [3]:
mkdir dbSNP

In [16]:
grep dbSNP parsed_variants.tsv | cut -f3 > dbSNP/dbSNP_id.txt

In [17]:
wc -l dbSNP/dbSNP_id.txt

     201 dbSNP/dbSNP_id.txt


Added from Omim and removed repeats

In [19]:
wc -l dbSNP/dbSNP_id.txt

     761 dbSNP/dbSNP_id.txt


In [21]:
# Saved the scripts to download all 761
chmod +x dbSNP/dbSNP_search.sh

In [24]:
dbSNP/./dbSNP_search.sh

(B[m[31m[1m[7m ERROR: (B[m[31m[1m Missing -db argument(B[m
(B[m[31m[1m[7m ERROR: (B[m[31m[1m Missing -db argument(B[m


In [26]:
while read p; do
if ! test -f dbSNP/"$p".xml; then
    echo "$p does not exist."
fi
done < dbSNP/dbSNP_id.txt

In [27]:
while read p; do
[ -s dbSNP/"$p".xml ] || echo "$p is empty"
done < dbSNP/dbSNP_id.txt

rs121908237 is empty
rs137852480 is empty
rs13785281 is empty


In [29]:
esearch -db snp -query rs121908237 | efetch -format docsum > dbSNP/rs121908237.xml

In [30]:
esearch -db snp -query rs137852480 | efetch -format docsum > dbSNP/rs137852480.xml

In [31]:
esearch -db snp -query rs13785281 | efetch -format docsum > dbSNP/rs13785281.xml

In [32]:
while read p; do
[ -s dbSNP/"$p".xml ] || echo "$p is empty"
done < dbSNP/dbSNP_id.txt

rs13785281 is empty


rs13785281 is not found and is removed from the id file

In [33]:
wc -l dbSNP/dbSNP_id.txt

     760 dbSNP/dbSNP_id.txt


In [35]:
while read -r p; do
    if ! grep -q SPDI "dbSNP/$p.xml"; then
        echo "$p"
    fi
done < dbSNP/dbSNP_id.txt

No output means every file has SPDI. yay

In [None]:
cd kegg_data

In [20]:
import os
import xml.etree.ElementTree as ET

# File paths
input_ids_file = "dbSNP/dbSNP_id.txt"
input_folder = "dbSNP"
output_file = "dbSNP_output.tsv"

# Read SNP IDs
with open(input_ids_file, "r") as f:
    dbsnp_ids = [line.strip() for line in f if line.strip()]

# Open TSV output file
with open(output_file, "w") as out:
    out.write("dbsnp_id\tsequence_id\tposition\tref\talt\n")

    for dbsnp_id in dbsnp_ids:
        xml_path = os.path.join(input_folder, f"{dbsnp_id}.xml")

        if not os.path.exists(xml_path):
            print(f"⚠️ Missing: {xml_path}")
            continue

        try:
            tree = ET.parse(xml_path)
            root = tree.getroot()

            for spdi in root.iter("SPDI"):
                if spdi.text:
                    spdi_items = spdi.text.strip().split(",")
                    for item in spdi_items:
                        parts = item.strip().split(":")
                        if len(parts) == 4:
                            seq_id, pos, ref, alt = parts
                            out.write(f"{dbsnp_id}\t{seq_id}\t{pos}\t{ref}\t{alt}\n")
                        else:
                            print(f"⚠️ Invalid SPDI format in {dbsnp_id}: {item}")

        except ET.ParseError as e:
            print(f"❌ Parse error in {dbsnp_id}.xml: {e}")
        except Exception as e:
            print(f"❌ Unexpected error in {dbsnp_id}.xml: {e}")

Removed the duplicate lines and am left with 1408 mutations

In [23]:
!wc -l dbSNP_output.tsv

    1409 dbSNP_output.tsv


### COSM

In [2]:
cd kegg_data

In [3]:
mkdir COSM

In [32]:
grep COSM parsed_variants.tsv | cut -f 3 > COSM/COSM_ids.txt

In [33]:
wc -l COSM/COSM_ids.txt

     202 COSM/COSM_ids.txt


In [34]:
head COSM/COSM_ids.txt

1677139
1989836
12523
13800
12475
12504
12506
13281
12512
12476


In [7]:
while read id; do
curl --silent "https://rest.ensembl.org/variation/human/"$id"?content-type=application/json" > COSM/"$id".txt
done < COSM/COSM_ids.txt

In [8]:
ls COSM/* | wc -l

     203


download the COSM database from here https://cancer.sanger.ac.uk/cosmic/download/cosmic/v101/completetargetedscreensmutanttsv

In [51]:
sed 's/$/\t/' COSM/COSM_ids.txt > COSM/COSM_ids_tab.txt

In [53]:
grep -F -f COSM/COSM_ids_tab.txt Cosmic_CompleteTargetedScreensMutant_v101_GRCh38.tsv > COSM_matched.tsv




In [54]:
wc -l COSM_matched.tsv

  372391 COSM_matched.tsv


In [55]:
cut -f 8 COSM_matched.tsv > COSM_matched_id.txt
sort -u COSM_matched_id.txt > COSM_matched_id_unique.txt
wc -l COSM_matched_id_unique.txt

     160 COSM_matched_id_unique.txt


In [61]:
while read -r p; do
    if ! grep -q $p COSM_matched_id_unique.txt; then
        echo "$p"
    fi
done < COSM/COSM_ids.txt

COSM12475
COSM12506
COSM12512
COSM13766
COSM13786
COSM13675
COSM13224
COSM13723
COSM13474
COSM12505
COSM785
COSM238553
COSM5564006
COSM5015793
COSM1673476
COSM6196669
COSM878
COSM965
COSM4766182


In [62]:
echo 'COSM12475
COSM12506
COSM12512
COSM13766
COSM13786
COSM13675
COSM13224
COSM13723
COSM13474
COSM12505
COSM785
COSM238553
COSM5564006
COSM5015793
COSM1673476
COSM6196669
COSM878
COSM965
COSM4766182' > COSM_unmatched_id.txt

In [63]:
wc -l COSM_unmatched_id.txt

      19 COSM_unmatched_id.txt


In [69]:
sed 's/$/\t/' COSM_unmatched_id.txt > COSM_unmatched_tab_id.txt

In [66]:
grep -F -f COSM_unmatched_tab_id.txt Cosmic_CompleteTargetedScreensMutant_v101_GRCh37.tsv > COSM_unmatched.tsv




In [71]:
wc -l COSM_unmatched.tsv

     207 COSM_unmatched.tsv


In [72]:
cut -f 8 COSM_unmatched.tsv > COSM_unmatched_id_parsed.txt
sort -u COSM_unmatched_id_parsed.txt > COSM_unmatched_id_unique.txt
wc -l COSM_unmatched_id_unique.txt

       5 COSM_unmatched_id_unique.txt


**Removing the COSM unmatched IDs from the text file**

In [122]:
rm COSM/COSM_total_parsed.tsv

In [123]:
cat COSM/COSM_matched.tsv >> COSM/COSM_total_parsed.tsv
cat COSM/COSM_unmatched.tsv >> COSM/COSM_total_parsed.tsv

In [124]:
cp COSM/COSM_ids.txt COSM/COSM_ids_final.txt

In [125]:
while read -r p; do
    if ! grep -q $p COSM/COSM_total_parsed.tsv; then
        echo $p
        sed -i '' '/'$p'/d' COSM/COSM_ids_final.txt
    fi
done < COSM/COSM_ids.txt

COSM12475
COSM12506
COSM12512
COSM13766
COSM13675
COSM13224
COSM13474
COSM238553
COSM5564006
COSM5015793
COSM1673476
COSM6196669
COSM5159
COSM5313
COSM5154
COSM5105
COSM5204
COSM5141
COSM5283
COSM5079
COSM5046
COSM86063
COSM5142
COSM5322
COSM23625
COSM3736941
COSM5052
COSM1167954
COSM5143
COSM5119
COSM5148
COSM861
COSM878
COSM859
COSM860
COSM862
COSM864
COSM965
COSM1237919
COSM13152
COSM33076
COSM17983
COSM25676
COSM17855
COSM142849
COSM4387483
COSM4766182


In [128]:
wc -l COSM/COSM_ids_final.txt

     132 COSM/COSM_ids_final.txt


In [129]:
cut -f 8 COSM/COSM_total_parsed.tsv > COSM/COSM_total_parsed_id.txt
sort -u COSM/COSM_total_parsed_id.txt > COSM/COSM_total_parsed_id_unique.txt
wc -l COSM/COSM_total_parsed_id_unique.txt

rm COSM/COSM_total_parsed_id.txt
rm COSM/COSM_total_parsed_id_unique.txt

     166 COSM/COSM_total_parsed_id_unique.txt


### Parsing the Matched TSV File

Got it into excel and deleting columns that don't matter

In [130]:
wc -l COSM/COSM_total_parsed.tsv

    1140 COSM/COSM_total_parsed.tsv


### COSF

download the COSF database from here https://cancer.sanger.ac.uk/cosmic/download/cosmic/v101/fusion

In [91]:
grep COSF parsed_variants.tsv | cut -f 3 > COSF/cosf_ids_temp.txt

In [92]:
sort -u COSF/cosf_ids_temp.txt > COSF/cosf_ids_temp_uniq.txt

In [94]:
rm COSF/cosf_ids.txt

In [95]:
while read p; do
echo COSF$p >> COSF/cosf_ids.txt

done < COSF/cosf_ids_temp_uniq.txt

In [96]:
rm COSF/cosf_ids_temp.txt
rm COSF/cosf_ids_temp_uniq.txt

In [97]:
head COSF/cosf_ids.txt

COSF121
COSF1216
COSF1220
COSF1224
COSF1231
COSF125
COSF1271
COSF128
COSF1319
COSF1320


In [98]:
wc -l COSF/cosf_ids.txt

      65 COSF/cosf_ids.txt


In [None]:
cat Cosmic_Fusion_v101_GRCh38.tsv >> COSF/Cosmic_Fusion.tsv

In [107]:
du -h COSF/Cosmic_Fusion.tsv

 18M	COSF/Cosmic_Fusion.tsv


In [108]:
sed 's/$/\t/' COSF/cosf_ids.txt > COSF/cosf_ids_tab.txt

In [111]:
#!/bin/bash

# Paths (edit these as needed)
COSF_ID_FILE="COSF/cosf_ids_tab.txt"
COSMIC_TSV="COSF/Cosmic_Fusion.tsv"
OUTPUT_TSV="COSF/kegg_data_cosf.tsv"

# Header based on README
HEADER="COSMIC_SAMPLE_ID\tSAMPLE_NAME\tCOSMIC_PHENOTYPE_ID\tCOSMIC_FUSION_ID\tFUSION_SYNTAX\tFIVE_PRIME_CHROMOSOME\tFIVE_PRIME_STRAND\tFIVE_PRIME_TRANSCRIPT_ID\tFIVE_PRIME_GENE_SYMBOL\tFIVE_PRIME_LAST_OBSERVE_EXON\tFIVE_PRIME_GENOME_START_FROM\tFIVE_PRIME_GENOME_START_TO\tFIVE_PRIME_GENOME_STOP_FROM\tFIVE_PRIME_GENOME_STOP_TO\tTHREE_PRIME_CHROMOSOME\tTHREE_PRIME_STRAND\tTHREE_PRIME_TRANSCRIPT_ID\tTHREE_PRIME_GENE_SYMBOL\tTHREE_PRIME_FIRST_OBSERVE_EXON\tTHREE_PRIME_GENOME_START_FROM\tTHREE_PRIME_GENOME_START_TO\tTHREE_PRIME_GENOME_STOP_FROM\tTHREE_PRIME_GENOME_STOP_TO\tFUSION_TYPE\tPUBMED_PMID"

# Write header to output
echo -e "$HEADER" > "$OUTPUT_TSV"

grep -F -f $COSF_ID_FILE $COSMIC_TSV >> $OUTPUT_TSV

echo "✅ Extracted COSF entries saved to: $OUTPUT_TSV"

✅ Extracted COSF entries saved to: COSF/kegg_data_cosf.tsv


In [112]:
cut -f 4 COSF/kegg_data_cosf.tsv > COSF/kegg_data_cosf_parsed.txt
sort -u COSF/kegg_data_cosf_parsed.txt > COSF/kegg_data_cosf_parsed_uniq.txt
wc -l COSF/kegg_data_cosf_parsed_uniq.txt

rm COSF/kegg_data_cosf_parsed.txt

      29 COSF/kegg_data_cosf_parsed_uniq.txt


In [119]:
cp COSF/cosf_ids.txt COSF/cosf_ids_final.txt

In [120]:
while read -r p; do
    if ! grep -q $p COSF/kegg_data_cosf_parsed_uniq.txt; then
        echo $p
        sed -i '' '/'$p'/d' COSF/cosf_ids_final.txt
    fi
done < COSF/cosf_ids.txt

COSF1220
COSF1224
COSF125
COSF128
COSF1330
COSF1490
COSF154
COSF155
COSF166
COSF168
COSF1756
COSF1758
COSF1805
COSF187
COSF189
COSF1949
COSF1960
COSF2067
COSF2124
COSF218
COSF220
COSF2246
COSF2248
COSF248
COSF300
COSF302
COSF355
COSF356
COSF394
COSF396
COSF463
COSF501
COSF504
COSF528
COSF806
COSF808


In [121]:
wc -l COSF/cosf_ids_final.txt

      29 COSF/cosf_ids_final.txt


I was looking at the data and they don't give any proper ways to get the exact nt sequence, so I am leaving this out.

# Matching Variant and Nt sequence to each Network/Pathway

In [None]:
cd kegg_data

In [2]:
import pandas as pd

In [19]:
parsed_variants = pd.read_csv("parsed_variants.tsv", sep='\t')
parsed_variants

Unnamed: 0,ENTRY,Source,ID
0,10133v1,OmimVar,10133
1,1019v1,ClinVar,268075
2,1019v1,ClinVar,150740
3,1019v1,dbVar,nsv917029
4,1019v2,ClinVar,16928
...,...,...,...
783,9817v1,COSM,6196638
784,999v2,COSM,4766182
785,999v2,COSM,1379150
786,999v2,COSM,4766211


### ClinVar

In [20]:
clinvar_data = pd.read_csv("ClinVar_parsed_output.tsv",sep='\t')
clinvar_data = clinvar_data.rename(columns={"ClinVar_ID": "ID"})
clinvar_data['ID'] = clinvar_data['ID'].astype('string')
clinvar_data

Unnamed: 0,ID,seq_id,position,ref,alt
0,16928,NC_000012.12,57751647,G,A
1,16929,NC_000012.12,57751646,C,T
2,183391,NC_000012.12,12717896,CAGGCGGAGCACCCCAAGCC,CAGGCGGAGCACCCCAAGCCAGGCGGAGCACCCCAAGCC
3,183393,NC_000012.12,12718044,C,T
4,183395,NC_000012.12,12718210,CTCT,CT
...,...,...,...,...,...
180,4886,NC_000011.10,67483197,C,T
181,4892,NC_000011.10,67490803,C,A
182,161992,NC_000015.10,50490442,T,C
183,161993,NC_000015.10,50490443,C,G


In [21]:
# Ensure ClinVar_ID is treated as string to avoid dtype mismatch
clinvar_ids = clinvar_data["ID"].astype(str).unique()

missing_num = 0

# Iterate and print missing ClinVar IDs
for _, row in parsed_variants.iterrows():
    if row["Source"] == "ClinVar" and str(row["ID"]) not in clinvar_ids:
        missing_num+=1
print(f'Number of missing ClinVar variant is {missing_num}')

Number of missing ClinVar variant is 49


In [45]:
clinvar_final = parsed_variants.merge(clinvar_data, on='ID')
clinvar_final

Unnamed: 0,ENTRY,Source,ID,seq_id,position,ref,alt
0,1019v2,ClinVar,16928,NC_000012.12,57751647,G,A
1,1019v2,ClinVar,16929,NC_000012.12,57751646,C,T
2,1027v3,ClinVar,183391,NC_000012.12,12717896,CAGGCGGAGCACCCCAAGCC,CAGGCGGAGCACCCCAAGCCAGGCGGAGCACCCCAAGCC
3,1027v3,ClinVar,183393,NC_000012.12,12718044,C,T
4,1027v3,ClinVar,183395,NC_000012.12,12718210,CTCT,CT
...,...,...,...,...,...,...,...
180,9049v1,ClinVar,4886,NC_000011.10,67483197,C,T
181,9049v1,ClinVar,4892,NC_000011.10,67490803,C,A
182,9101v1,ClinVar,161992,NC_000015.10,50490442,T,C
183,9101v1,ClinVar,161993,NC_000015.10,50490443,C,G


### dbSNP

In [54]:
dbsnp_data = pd.read_csv("dbSNP_output.tsv",sep='\t')
dbsnp_data = dbsnp_data.rename(columns={"True Id": "ID","sequence_id":'seq_id'})
dbsnp_data['ID'] = dbsnp_data['ID'].astype('string')
dbsnp_data

Unnamed: 0,ID,dbsnp_id,seq_id,position,ref,alt
0,104311,rs661,NC_000014.9,73217224,G,A
1,104311,rs661,NC_000014.9,73217224,G,T
2,606463,rs364897,NC_000001.11,155238214,T,A
3,606463,rs364897,NC_000001.11,155238214,T,C
4,606463,rs368060,NC_000001.11,155235216,C,G
...,...,...,...,...,...,...
1403,rs672601307,rs672601307,NC_000015.10,50490442,T,C
1404,rs672601308,rs672601308,NC_000015.10,50490443,C,G
1405,rs672601308,rs672601308,NC_000015.10,50490443,C,T
1406,rs672601311,rs672601311,NC_000015.10,50490449,C,G


In [55]:
# Ensure ClinVar_ID is treated as string to avoid dtype mismatch
dbsnp_data_ids = dbsnp_data["ID"].astype(str).unique()

missing_num = 0

# Iterate and print missing ClinVar IDs
for _, row in parsed_variants.iterrows():
    if (row["Source"] == "dbSNP" or row["Source"] == "OmimVar") and str(row["ID"]) not in clinvar_ids:
        missing_num+=1
print(f'Number of missing dbSNP and OmimVar variant is {missing_num}')

Number of missing dbSNP and OmimVar variant is 244


In [56]:
dbsnp_final = parsed_variants.merge(dbsnp_data, on='ID')
dbsnp_final

Unnamed: 0,ENTRY,Source,ID,dbsnp_id,seq_id,position,ref,alt
0,1019v2,dbSNP,rs11547328,rs11547328,NC_000012.12,57751647,G,A
1,1019v2,dbSNP,rs11547328,rs11547328,NC_000012.12,57751647,G,C
2,1019v2,dbSNP,rs11547328,rs11547328,NC_000012.12,57751647,G,T
3,1019v2,dbSNP,rs104894340,rs104894340,NC_000012.12,57751646,C,A
4,1019v2,dbSNP,rs104894340,rs104894340,NC_000012.12,57751646,C,G
...,...,...,...,...,...,...,...,...
1417,9101v1,dbSNP,rs672601311,rs672601311,NC_000015.10,50490449,C,G
1418,9101v1,dbSNP,rs672601311,rs672601311,NC_000015.10,50490449,C,T
1419,9217v1,OmimVar,605704,rs74315431,NC_000020.11,58418317,C,T
1420,9217v1,OmimVar,605704,rs281875284,NC_000020.11,58418288,C,G


### COSM

In [40]:
cosm_data = pd.read_csv("COSM/COSM_total_parsed.tsv",sep='\t')
cosm_data['ID'] = cosm_data['COSMID'].str[4:]
cosm_data

Unnamed: 0,Gene,TranscriptID,COSMID,NucChange,AAChange,Chr,Start,End,Strand,RefAllele,AltAllele,ID
0,CTNNB1,ENST00000643031.1,COSM5692,c.134C>A,p.S45Y,3,41224646,41224646,+,C,A,5692
1,CTNNB1,ENST00000642248.1,COSM5689,c.134C>G,p.S45C,3,41224646,41224646,+,C,G,5689
2,CDKN2A,ENST00000579755.1,COSM13508,c.375G>A,p.G125=,9,21971027,21971027,-,C,T,13508
3,CTNNB1,ENST00000396183.7,COSM5681,c.95A>G,p.D32G,3,41224607,41224607,+,A,G,5681
4,CDKN2A,ENST00000530628.2,COSM13807,c.389G>T,p.G130V,9,21971013,21971013,-,C,A,13807
...,...,...,...,...,...,...,...,...,...,...,...,...
1134,CDKN2A,ENST00000579755.1,COSM13723,c.308G>A,p.G103E,9,21971093,21971093,-,C,T,13723
1135,CDKN2A,ENST00000578845.2,COSM13723,c.112G>A,p.G38S,9,21971093,21971093,-,C,T,13723
1136,CDKN2A,ENST00000579122.1,COSM12505,c.59C>A,p.A20E,9,21974768,21974768,-,G,T,12505
1137,FLT3,ENST00000380982.4,COSM785,c.2503G>C,p.D835H,13,28592642,28592642,-,C,G,785


In [41]:
# Ensure ClinVar_ID is treated as string to avoid dtype mismatch
cosm_data_ids = cosm_data["ID"].astype(str).unique()

missing_num = 0

# Iterate and print missing ClinVar IDs
for _, row in parsed_variants.iterrows():
    if row["Source"] == "COSM"  and str(row["ID"]) not in clinvar_ids:
        missing_num+=1
print(f'Number of missing COSM variant is {missing_num}')

Number of missing COSM variant is 202


In [47]:
cosm_final = parsed_variants.merge(cosm_data, on='ID')
cosm_final

Unnamed: 0,ENTRY,Source,ID,Gene,TranscriptID,COSMID,NucChange,AAChange,Chr,Start,End,Strand,RefAllele,AltAllele
0,1019v2,COSM,1677139,CDK4,ENST00000312990.10,COSM1677139,c.70C>T,p.R24C,12,57751648,57751648,-,G,A
1,1019v2,COSM,1677139,CDK4,ENST00000549606.5,COSM1677139,c.-158+527C>T,p.?,12,57751648,57751648,-,G,A
2,1019v2,COSM,1677139,CDK4,ENST00000257904.10,COSM1677139,c.70C>T,p.R24C,12,57751648,57751648,-,G,A
3,1019v2,COSM,1989836,CDK4,ENST00000312990.10,COSM1989836,c.71G>A,p.R24H,12,57751647,57751647,-,C,T
4,1019v2,COSM,1989836,CDK4,ENST00000549606.5,COSM1989836,c.-158+528G>A,p.?,12,57751647,57751647,-,C,T
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1134,999v2,COSM,4766271,CDH1,ENST00000612417.4,COSM4766271,c.662A>G,p.D221G,16,68808823,68808823,+,A,G
1135,999v2,COSM,4766271,CDH1,ENST00000611625.4,COSM4766271,c.662A>G,p.D221G,16,68808823,68808823,+,A,G
1136,999v2,COSM,4766271,CDH1,ENST00000422392.6,COSM4766271,c.662A>G,p.D221G,16,68808823,68808823,+,A,G
1137,999v2,COSM,4766271,CDH1,ENST00000621016.4,COSM4766271,c.662A>G,p.D221G,16,68808823,68808823,+,A,G


## Combining them together

In [48]:
clinvar_final

Unnamed: 0,ENTRY,Source,ID,seq_id,position,ref,alt
0,1019v2,ClinVar,16928,NC_000012.12,57751647,G,A
1,1019v2,ClinVar,16929,NC_000012.12,57751646,C,T
2,1027v3,ClinVar,183391,NC_000012.12,12717896,CAGGCGGAGCACCCCAAGCC,CAGGCGGAGCACCCCAAGCCAGGCGGAGCACCCCAAGCC
3,1027v3,ClinVar,183393,NC_000012.12,12718044,C,T
4,1027v3,ClinVar,183395,NC_000012.12,12718210,CTCT,CT
...,...,...,...,...,...,...,...
180,9049v1,ClinVar,4886,NC_000011.10,67483197,C,T
181,9049v1,ClinVar,4892,NC_000011.10,67490803,C,A
182,9101v1,ClinVar,161992,NC_000015.10,50490442,T,C
183,9101v1,ClinVar,161993,NC_000015.10,50490443,C,G


In [57]:
dbsnp_final = dbsnp_final.drop(columns=['dbsnp_id'])
dbsnp_final

Unnamed: 0,ENTRY,Source,ID,seq_id,position,ref,alt
0,1019v2,dbSNP,rs11547328,NC_000012.12,57751647,G,A
1,1019v2,dbSNP,rs11547328,NC_000012.12,57751647,G,C
2,1019v2,dbSNP,rs11547328,NC_000012.12,57751647,G,T
3,1019v2,dbSNP,rs104894340,NC_000012.12,57751646,C,A
4,1019v2,dbSNP,rs104894340,NC_000012.12,57751646,C,G
...,...,...,...,...,...,...,...
1417,9101v1,dbSNP,rs672601311,NC_000015.10,50490449,C,G
1418,9101v1,dbSNP,rs672601311,NC_000015.10,50490449,C,T
1419,9217v1,OmimVar,605704,NC_000020.11,58418317,C,T
1420,9217v1,OmimVar,605704,NC_000020.11,58418288,C,G


In [59]:
clinvar_dbsnp = pd.concat([clinvar_final, dbsnp_final])
clinvar_dbsnp

Unnamed: 0,ENTRY,Source,ID,seq_id,position,ref,alt
0,1019v2,ClinVar,16928,NC_000012.12,57751647,G,A
1,1019v2,ClinVar,16929,NC_000012.12,57751646,C,T
2,1027v3,ClinVar,183391,NC_000012.12,12717896,CAGGCGGAGCACCCCAAGCC,CAGGCGGAGCACCCCAAGCCAGGCGGAGCACCCCAAGCC
3,1027v3,ClinVar,183393,NC_000012.12,12718044,C,T
4,1027v3,ClinVar,183395,NC_000012.12,12718210,CTCT,CT
...,...,...,...,...,...,...,...
1417,9101v1,dbSNP,rs672601311,NC_000015.10,50490449,C,G
1418,9101v1,dbSNP,rs672601311,NC_000015.10,50490449,C,T
1419,9217v1,OmimVar,605704,NC_000020.11,58418317,C,T
1420,9217v1,OmimVar,605704,NC_000020.11,58418288,C,G


In [66]:
clinvar_dbsnp = clinvar_dbsnp.rename(columns={"seq_id":"TranscriptID","position":"Start","ref":"RefAllele","alt":"AltAllele"})

In [67]:
clinvar_dbsnp["End"] = clinvar_dbsnp["Start"]

In [72]:
clinvar_dbsnp['Chr'] = clinvar_dbsnp['TranscriptID'].str[7:9].astype(int)

In [74]:
clinvar_dbsnp = clinvar_dbsnp[['ENTRY', 'Source', 'ID', 'TranscriptID','Chr', 'Start', 'End','RefAllele','AltAllele']]

In [75]:
clinvar_dbsnp

Unnamed: 0,ENTRY,Source,ID,TranscriptID,Chr,Start,End,RefAllele,AltAllele
0,1019v2,ClinVar,16928,NC_000012.12,12,57751647,57751647,G,A
1,1019v2,ClinVar,16929,NC_000012.12,12,57751646,57751646,C,T
2,1027v3,ClinVar,183391,NC_000012.12,12,12717896,12717896,CAGGCGGAGCACCCCAAGCC,CAGGCGGAGCACCCCAAGCCAGGCGGAGCACCCCAAGCC
3,1027v3,ClinVar,183393,NC_000012.12,12,12718044,12718044,C,T
4,1027v3,ClinVar,183395,NC_000012.12,12,12718210,12718210,CTCT,CT
...,...,...,...,...,...,...,...,...,...
1417,9101v1,dbSNP,rs672601311,NC_000015.10,15,50490449,50490449,C,G
1418,9101v1,dbSNP,rs672601311,NC_000015.10,15,50490449,50490449,C,T
1419,9217v1,OmimVar,605704,NC_000020.11,20,58418317,58418317,C,T
1420,9217v1,OmimVar,605704,NC_000020.11,20,58418288,58418288,C,G


In [78]:
cosm_final = cosm_final.drop(columns={"Gene","COSMID","AAChange","Strand"})
cosm_final

Unnamed: 0,ENTRY,Source,ID,TranscriptID,NucChange,Chr,Start,End,RefAllele,AltAllele
0,1019v2,COSM,1677139,ENST00000312990.10,c.70C>T,12,57751648,57751648,G,A
1,1019v2,COSM,1677139,ENST00000549606.5,c.-158+527C>T,12,57751648,57751648,G,A
2,1019v2,COSM,1677139,ENST00000257904.10,c.70C>T,12,57751648,57751648,G,A
3,1019v2,COSM,1989836,ENST00000312990.10,c.71G>A,12,57751647,57751647,C,T
4,1019v2,COSM,1989836,ENST00000549606.5,c.-158+528G>A,12,57751647,57751647,C,T
...,...,...,...,...,...,...,...,...,...,...
1134,999v2,COSM,4766271,ENST00000612417.4,c.662A>G,16,68808823,68808823,A,G
1135,999v2,COSM,4766271,ENST00000611625.4,c.662A>G,16,68808823,68808823,A,G
1136,999v2,COSM,4766271,ENST00000422392.6,c.662A>G,16,68808823,68808823,A,G
1137,999v2,COSM,4766271,ENST00000621016.4,c.662A>G,16,68808823,68808823,A,G


**Final Concatenation**

In [80]:
final_data = pd.concat([cosm_final,clinvar_dbsnp])

In [81]:
final_data

Unnamed: 0,ENTRY,Source,ID,TranscriptID,NucChange,Chr,Start,End,RefAllele,AltAllele
0,1019v2,COSM,1677139,ENST00000312990.10,c.70C>T,12,57751648,57751648,G,A
1,1019v2,COSM,1677139,ENST00000549606.5,c.-158+527C>T,12,57751648,57751648,G,A
2,1019v2,COSM,1677139,ENST00000257904.10,c.70C>T,12,57751648,57751648,G,A
3,1019v2,COSM,1989836,ENST00000312990.10,c.71G>A,12,57751647,57751647,C,T
4,1019v2,COSM,1989836,ENST00000549606.5,c.-158+528G>A,12,57751647,57751647,C,T
...,...,...,...,...,...,...,...,...,...,...
1417,9101v1,dbSNP,rs672601311,NC_000015.10,,15,50490449,50490449,C,G
1418,9101v1,dbSNP,rs672601311,NC_000015.10,,15,50490449,50490449,C,T
1419,9217v1,OmimVar,605704,NC_000020.11,,20,58418317,58418317,C,T
1420,9217v1,OmimVar,605704,NC_000020.11,,20,58418288,58418288,C,G


In [82]:
final_data.to_csv("all_variant_data.tsv",sep='\t',index=False, header=True)

In Excel removed duplicates based on the same Variant ID, Chromosome number, ref allele and alt allele

After removing 1 lines from manual inspection, I am left with 761 variants and their associated variant ids

# Variant ID to Network

In [83]:
gene_variant = pd.read_csv("gene_variants.tsv", sep='\t', names=['Network','ENTRY'])

In [84]:
gene_variant

Unnamed: 0,Network,ENTRY
0,N00002,25v1
1,N00002,25v2
2,N00003,3815v1
3,N00004,2322v1
4,N00004,2322v2
...,...,...
323,N01714,2760v1
324,N01809,5052v1
325,N01873,7428v3
326,N01876,3084v1


In [86]:
all_variant_data = pd.read_csv("all_variant_data.csv")

In [87]:
all_variant_data

Unnamed: 0,ENTRY,Source,ID,TranscriptID,NucChange,Chr,Start,End,RefAllele,AltAllele
0,1019v2,ClinVar,16929,NC_000012.12,,12,57751646,57751646,C,T
1,1019v2,dbSNP,rs104894340,NC_000012.12,,12,57751646,57751646,C,A
2,1019v2,dbSNP,rs104894340,NC_000012.12,,12,57751646,57751646,C,G
3,1019v2,ClinVar,16928,NC_000012.12,,12,57751647,57751647,G,A
4,1019v2,dbSNP,rs11547328,NC_000012.12,,12,57751647,57751647,G,C
...,...,...,...,...,...,...,...,...,...,...
756,9817v1,COSM,6196635,ENST00000393623.6,c.706G>T,19,10492196,10492196,C,A
757,9817v1,COSM,6196637,ENST00000393623.6,c.548A>G,19,10499486,10499486,T,C
758,999v2,COSM,4766271,ENST00000621016.4,c.662A>G,16,68808823,68808823,A,G
759,999v2,COSM,4766211,ENST00000621016.4,c.755T>G,16,68810264,68810264,T,G


In [89]:
variant_data_together_wo_nt = all_variant_data.merge(gene_variant, on="ENTRY")

In [90]:
variant_data_together_wo_nt.to_csv("variant_data_together_wo_nt.tsv", sep='\t',index=False, header=True)

# Parsing Unique Networks and getting Gene Pathway

In [1]:
cd kegg_data

In [2]:
cut -f 1 variant_data_together_wo_nt.tsv > network_variant_data.txt
sort -u network_variant_data.txt > network_variant_data_unique.txt
sed -i '' '/Network/d' network_variant_data_unique.txt
wc -l network_variant_data_unique.txt

     182 network_variant_data_unique.txt


In [3]:
while read p; do
    if ! grep -q ENTRY network_variant/$p.txt; then
        echo "$p"
    fi
done < network_variant_data_unique.txt

In [4]:
while read p; do
    if ! grep -q NAME network_variant/$p.txt; then
        echo "$p"
    fi
done < network_variant_data_unique.txt

In [5]:
while read p; do
    if ! grep -q DEFINITION network_variant/$p.txt; then
        echo "$p"
    fi
done < network_variant_data_unique.txt

In [6]:
while read p; do
    if ! grep -q EXPANDED network_variant/$p.txt; then
        echo "$p"
    fi
done < network_variant_data_unique.txt

In [7]:
while read p; do
    if ! grep -q PATHWAY network_variant/$p.txt; then
        echo "$p"
    fi
done < network_variant_data_unique.txt

N00302
N00303
N00304
N00305
N00600
N00643
N00679
N00789
N01064
N01065
N01419
N01422
N01444
N01714


In [8]:
while read p; do
    if ! grep -q CLASS network_variant/$p.txt; then
        echo "$p"
    fi
done < network_variant_data_unique.txt

In [9]:
while read p; do
    if ! grep -q DISEASE network_variant/$p.txt; then
        echo "$p"
    fi
done < network_variant_data_unique.txt

N01683
N01689
N01697
N01698
N01699
N01700
N01702
N01704
N01714


In [10]:
while read p; do
    if ! grep -q GENE network_variant/$p.txt; then
        echo "$p"
    fi
done < network_variant_data_unique.txt

In [11]:
sed -i '' '/N01683/d' network_variant_data_unique.txt
sed -i '' '/N01689/d' network_variant_data_unique.txt
sed -i '' '/N01697/d' network_variant_data_unique.txt
sed -i '' '/N01698/d' network_variant_data_unique.txt
sed -i '' '/N01699/d' network_variant_data_unique.txt
sed -i '' '/N01700/d' network_variant_data_unique.txt
sed -i '' '/N01702/d' network_variant_data_unique.txt
sed -i '' '/N01704/d' network_variant_data_unique.txt
sed -i '' '/N01714/d' network_variant_data_unique.txt

Networks without a disease tag and thus without a ground truth paragraph

In [12]:
wc -l network_variant_data_unique.txt

     173 network_variant_data_unique.txt


**Switch to python**

In [None]:
cd kegg_data

In [2]:
import pandas as pd
import re

In [90]:
# Define column structure
network_info = pd.DataFrame(columns=["Entry", "Name", "Definition", "Expanded", "Pathway", "Class", "Disease", "Gene"])

In [91]:
# Read all variant IDs
with open('network_variant_data_unique.txt', 'r') as f:
    network_var_id = [line.strip() for line in f if line.strip()]

# Function to extract single-line values (handles leading whitespace too)
def get_single_line_value(lines, key):
    for line in lines:
        if line.lstrip().startswith(key):
            return line.split(key, 1)[-1].strip()
    return ""

# Function to extract multiline values that follow a key line (indented lines)
def get_multiline_values(lines, key):
    values = []
    recording = False
    for i, line in enumerate(lines):
        if line.startswith(key):
            # Capture first line's content after the key
            initial_value = line[len(key):].strip()
            if initial_value:
                values.append(initial_value)
            recording = True
            continue
        if recording:
            if re.match(r'^\s{2,}', line):  # line starts with 2+ spaces
                values.append(line.strip())
            else:
                break  # stop when indentation breaks
    return "| ".join(values)

# Process each network_variant file
for variant_id in network_var_id:
    file_path = f'network_variant/{variant_id}.txt'

    try:
        with open(file_path, 'r') as f:
            lines = f.readlines()

        row = {
            "Entry": variant_id,
            "Name": get_single_line_value(lines, "NAME"),
            "Definition": get_single_line_value(lines, "DEFINITION"),
            "Expanded": get_single_line_value(lines, "EXPANDED"),
            "Pathway": get_multiline_values(lines, "PATHWAY"),
            "Class": get_multiline_values(lines, "CLASS"),
            "Disease": get_multiline_values(lines, "DISEASE"),
            "Gene": get_multiline_values(lines, "GENE")
        }

        network_info = pd.concat([network_info, pd.DataFrame([row])], ignore_index=True)

    except FileNotFoundError:
        print(f"[Warning] File not found: {file_path}")

In [92]:
network_info = network_info.set_index('Entry')

In [93]:
no_pathway = ["N00302","N00303","N00304","N00305","N00600","N00643","N00679","N00789","N01064","N01065","N01419","N01422","N01444"]
for id in no_pathway:
    network_info.at[id, 'Pathway'] = pd.NA

In [94]:
network_info = network_info.reset_index()

In [95]:
# Columns to process
cols_to_clean = ["Pathway", "Class", "Disease","Gene"]

def extract_data(cell):
    if pd.isna(cell):
        return cell  # Leave NaN as is
    gene_dict = {}
    for part in cell.split("|"):
        tokens = part.strip().split()
        if len(tokens) >= 2:
            gene_dict[tokens[0]] = ' '.join(tokens[1:])
        elif len(tokens) == 1:
            gene_dict[tokens[0]] = ""
    return gene_dict

# Apply the transformation to each column
for col in cols_to_clean:
    network_info[col] = network_info[col].apply(extract_data)

In [96]:
network_info

Unnamed: 0,Entry,Name,Definition,Expanded,Pathway,Class,Disease,Gene
0,N00002,BCR-ABL fusion kinase to RAS-ERK signaling pat...,BCR-ABL -> GRB2 -> SOS -> RAS -> RAF -> MEK ->...,"(25v1,25v2) -> 2885 -> (6654,6655) -> (3265,38...",{'hsa05220': 'Chronic myeloid leukemia'},"{'nt06276': 'Chronic myeloid leukemia', 'nt062...",{'H00004': 'Chronic myeloid leukemia'},"{'25': 'ABL1; ABL proto-oncogene 1, non-recept..."
1,N00003,Mutation-activated KIT to RAS-ERK signaling pa...,KIT* -> GRB2 -> SOS -> RAS -> RAF -> MEK -> ERK,"3815v1 -> 2885 -> (6654,6655) -> (3265,3845,48...",{'hsa05221': 'Acute myeloid leukemia'},"{'nt06275': 'Acute myeloid leukemia', 'nt06210...",{'H00003': 'Acute myeloid leukemia'},{'3815': 'KIT; KIT proto-oncogene receptor tyr...
2,N00004,Duplication or mutation-activated FLT3 to RAS-...,FLT3* -> GRB2 -> SOS -> RAS -> RAF -> MEK -> ERK,"(2322v2,2322v1) -> 2885 -> (6654,6655) -> (326...",{'hsa05221': 'Acute myeloid leukemia'},"{'nt06275': 'Acute myeloid leukemia', 'nt06210...",{'H00003': 'Acute myeloid leukemia'},{'2322': 'FLT3; fms related tyrosine kinase 3'...
3,N00005,Mutation-activated MET to RAS-ERK signaling pa...,MET* -> GRB2 -> SOS -> RAS -> RAF -> MEK -> ER...,"4233v1 -> 2885 -> (6654,6655) -> (3265,3845,48...","{'hsa05225': 'Hepatocellular carcinoma', 'hsa0...","{'nt06263': 'Hepatocellular carcinoma', 'nt062...","{'H00048': 'Hepatocellular carcinoma', 'H00021...","{'4233': 'MET; MET proto-oncogene, receptor ty..."
4,N00007,EML4-ALK fusion kinase to RAS-ERK signaling pa...,EML4-ALK -> RAS -> RAF -> MEK -> ERK -> CCND1,"(238v1,238v2) -> (3265,3845,4893) -> (369,673,...",{'hsa05223': 'Non-small cell lung cancer'},"{'nt06266': 'Non-small cell lung cancer', 'nt0...",{'H00014': 'Non-small cell lung cancer'},"{'238': 'ALK; ALK receptor tyrosine kinase', '..."
...,...,...,...,...,...,...,...,...
168,N01422,HPRT1 deficiency in purine salvage pathway,"(Hypoxanthine,Guanine) // HPRT1*","(C00262,C00242) // 3251v1",,{'nt06027': 'Purine salvage pathway'},{'H00194': 'Lesch-Nyhan syndrome'},{'3251': 'HPRT1; hypoxanthine phosphoribosyltr...
169,N01444,NXN mutation to WNT5A-ROR signaling pathway,NXN* -| DVL,"64359v1 -| (1855,1856,1857)",,{'nt06505': 'WNT signaling'},{'H00485': 'Robinow syndrome'},"{'64359': 'NXN; nucleoredoxin', '1855': 'DVL1;..."
170,N01809,Mutation-caused epigenetic silencing of MMACHC,PRDX1* =| MMACHC,5052v1 =| 25974,{'hsa04980': 'Cobalamin transport and metaboli...,{'nt06538': 'Cobalamin transport and metabolism'},{'H02221': 'Methylmalonic aciduria and homocys...,"{'5052': 'PRDX1; peroxiredoxin 1', '25974': 'M..."
171,N01873,VHL mutation to HIF-2 signaling pathway,(VHL*+RBX1+ELOC+ELOB+CUL2) // EPAS1 == ARNT =>...,(7428v3+9978+6921+6923+8453) // 2034 == 405 =>...,{'hsa05211': 'Renal cell carcinoma'},{'nt06542': 'HIF signaling'},"{'H00021': 'Renal cell carcinoma', 'H00559': '...",{'7428': 'VHL; von Hippel-Lindau tumor suppres...


In [97]:
network_info.to_csv("network_variant_final_info.tsv",sep='\t', header=True, index=False)

In [98]:
all_disease_keys = []

for disease in network_info['Disease']:
    if isinstance(disease, dict):
        all_disease_keys.extend(disease.keys())

unique_disease_keys = sorted(set(all_disease_keys))
print(unique_disease_keys)

['H00003', 'H00004', 'H00013', 'H00014', 'H00018', 'H00019', 'H00020', 'H00021', 'H00022', 'H00024', 'H00026', 'H00031', 'H00032', 'H00033', 'H00034', 'H00038', 'H00039', 'H00042', 'H00048', 'H00056', 'H00057', 'H00058', 'H00059', 'H00061', 'H00063', 'H00126', 'H00135', 'H00194', 'H00195', 'H00246', 'H00247', 'H00251', 'H00260', 'H00423', 'H00485', 'H00559', 'H01032', 'H01102', 'H01398', 'H01431', 'H01522', 'H01603', 'H02049', 'H02221']


In [73]:
import subprocess

disease_dict = {}

for disease in unique_disease_keys:
    try:
        # Run the shell command and capture output
        result = subprocess.run(
            f"kegg_pull rest get {disease} | grep DESCRIPTION",
            shell=True,
            capture_output=True,
            text=True
        )
        # Save the stdout (if grep found something)
        if result.stdout:
            disease_dict[disease] = result.stdout.strip()
        else:
            disease_dict[disease] = None  # or "DESCRIPTION not found"
    except Exception as e:
        disease_dict[disease] = f"Error: {str(e)}"

In [99]:
disease_dict

{'H00003': 'DESCRIPTION Acute myeloid leukemia (AML) is a disease that is characterized by uncontrolled proliferation of clonal neoplastic cells and accumulation in the bone marrow of blasts with an impaired differentiation program. AML accounts for approximately 80% of all adult leukemias and remains the most common cause of leukemia death. Two major types of genetic events have been described that are crucial for leukemic transformation. A proposed necessary first event is disordered cell growth and upregulation of cell survival genes. The most common of these activating events were observed in the RTK Flt3, in N-Ras and K-Ras, in Kit, and sporadically in other RTKs. Alterations in myeloid transcription factors governing hematopoietic differentiation provide second necessary event for leukemogenesis. Transcription factor fusion proteins such as PML-RARalpha (in Acute promyelocytic leukemia, a subtype of AML), AML-ETO or PLZF-RARalpha block myeloid cell differentiation by repressing t

In [100]:
# Columns to process
cols_to_edit = ["Disease"]

def put_disease_data(cell):
    if pd.isna(cell):
        return cell  # Leave NaN as is
    gene_dict = {}
    for key in cell.keys():
        gene_dict[key] = disease_dict[key]
    return gene_dict

# Apply the transformation to each column
for col in cols_to_edit:
    network_info[col] = network_info[col].apply(put_disease_data)

In [101]:
network_info

Unnamed: 0,Entry,Name,Definition,Expanded,Pathway,Class,Disease,Gene
0,N00002,BCR-ABL fusion kinase to RAS-ERK signaling pat...,BCR-ABL -> GRB2 -> SOS -> RAS -> RAF -> MEK ->...,"(25v1,25v2) -> 2885 -> (6654,6655) -> (3265,38...",{'hsa05220': 'Chronic myeloid leukemia'},"{'nt06276': 'Chronic myeloid leukemia', 'nt062...",{'H00004': 'DESCRIPTION Chronic myeloid leukem...,"{'25': 'ABL1; ABL proto-oncogene 1, non-recept..."
1,N00003,Mutation-activated KIT to RAS-ERK signaling pa...,KIT* -> GRB2 -> SOS -> RAS -> RAF -> MEK -> ERK,"3815v1 -> 2885 -> (6654,6655) -> (3265,3845,48...",{'hsa05221': 'Acute myeloid leukemia'},"{'nt06275': 'Acute myeloid leukemia', 'nt06210...",{'H00003': 'DESCRIPTION Acute myeloid leukemia...,{'3815': 'KIT; KIT proto-oncogene receptor tyr...
2,N00004,Duplication or mutation-activated FLT3 to RAS-...,FLT3* -> GRB2 -> SOS -> RAS -> RAF -> MEK -> ERK,"(2322v2,2322v1) -> 2885 -> (6654,6655) -> (326...",{'hsa05221': 'Acute myeloid leukemia'},"{'nt06275': 'Acute myeloid leukemia', 'nt06210...",{'H00003': 'DESCRIPTION Acute myeloid leukemia...,{'2322': 'FLT3; fms related tyrosine kinase 3'...
3,N00005,Mutation-activated MET to RAS-ERK signaling pa...,MET* -> GRB2 -> SOS -> RAS -> RAF -> MEK -> ER...,"4233v1 -> 2885 -> (6654,6655) -> (3265,3845,48...","{'hsa05225': 'Hepatocellular carcinoma', 'hsa0...","{'nt06263': 'Hepatocellular carcinoma', 'nt062...",{'H00048': 'DESCRIPTION Hepatocellular carcino...,"{'4233': 'MET; MET proto-oncogene, receptor ty..."
4,N00007,EML4-ALK fusion kinase to RAS-ERK signaling pa...,EML4-ALK -> RAS -> RAF -> MEK -> ERK -> CCND1,"(238v1,238v2) -> (3265,3845,4893) -> (369,673,...",{'hsa05223': 'Non-small cell lung cancer'},"{'nt06266': 'Non-small cell lung cancer', 'nt0...",{'H00014': 'DESCRIPTION Lung cancer is a leadi...,"{'238': 'ALK; ALK receptor tyrosine kinase', '..."
...,...,...,...,...,...,...,...,...
168,N01422,HPRT1 deficiency in purine salvage pathway,"(Hypoxanthine,Guanine) // HPRT1*","(C00262,C00242) // 3251v1",,{'nt06027': 'Purine salvage pathway'},{'H00194': 'DESCRIPTION Deficiency of hypoxant...,{'3251': 'HPRT1; hypoxanthine phosphoribosyltr...
169,N01444,NXN mutation to WNT5A-ROR signaling pathway,NXN* -| DVL,"64359v1 -| (1855,1856,1857)",,{'nt06505': 'WNT signaling'},{'H00485': 'DESCRIPTION Robinow syndrome (RS) ...,"{'64359': 'NXN; nucleoredoxin', '1855': 'DVL1;..."
170,N01809,Mutation-caused epigenetic silencing of MMACHC,PRDX1* =| MMACHC,5052v1 =| 25974,{'hsa04980': 'Cobalamin transport and metaboli...,{'nt06538': 'Cobalamin transport and metabolism'},{'H02221': 'DESCRIPTION Methylmalonic aciduria...,"{'5052': 'PRDX1; peroxiredoxin 1', '25974': 'M..."
171,N01873,VHL mutation to HIF-2 signaling pathway,(VHL*+RBX1+ELOC+ELOB+CUL2) // EPAS1 == ARNT =>...,(7428v3+9978+6921+6923+8453) // 2034 == 405 =>...,{'hsa05211': 'Renal cell carcinoma'},{'nt06542': 'HIF signaling'},{'H00021': 'DESCRIPTION Renal cell cancer (RCC...,{'7428': 'VHL; von Hippel-Lindau tumor suppres...


In [104]:
network_info.to_csv("network_variant_final_info.tsv",sep='\t', header=True, index=False)

In [105]:
!sed -i '' 's/DESCRIPTION //g' network_variant_final_info.tsv

# Final Merge of Variant Data with Network Data

In [117]:
variant_data = pd.read_csv("variant_data_together_wo_nt.tsv", sep='\t')
network_info = pd.read_csv("network_variant_final_info.tsv",sep='\t')
network_info = network_info.rename(columns={"Entry":"Network", "Definition":"Network Definition","Expanded":"Network Expanded"})

In [118]:
variant_data

Unnamed: 0,Network,ENTRY,Source,ID,TranscriptID,NucChange,Chr,Start,End,RefAllele,AltAllele
0,N00073,1019v2,ClinVar,16929,NC_000012.12,,12,57751646,57751646,C,T
1,N00073,1019v2,dbSNP,rs104894340,NC_000012.12,,12,57751646,57751646,C,A
2,N00073,1019v2,dbSNP,rs104894340,NC_000012.12,,12,57751646,57751646,C,G
3,N00073,1019v2,ClinVar,16928,NC_000012.12,,12,57751647,57751647,G,A
4,N00073,1019v2,dbSNP,rs11547328,NC_000012.12,,12,57751647,57751647,G,C
...,...,...,...,...,...,...,...,...,...,...,...
1506,N00244,9817v1,COSM,6196635,ENST00000393623.6,c.706G>T,19,10492196,10492196,C,A
1507,N00244,9817v1,COSM,6196637,ENST00000393623.6,c.548A>G,19,10499486,10499486,T,C
1508,N00258,999v2,COSM,4766271,ENST00000621016.4,c.662A>G,16,68808823,68808823,A,G
1509,N00258,999v2,COSM,4766211,ENST00000621016.4,c.755T>G,16,68810264,68810264,T,G


In [119]:
network_info

Unnamed: 0,Network,Name,Network Definition,Network Expanded,Pathway,Class,Disease,Gene
0,N00002,BCR-ABL fusion kinase to RAS-ERK signaling pat...,BCR-ABL -> GRB2 -> SOS -> RAS -> RAF -> MEK ->...,"(25v1,25v2) -> 2885 -> (6654,6655) -> (3265,38...",{'hsa05220': 'Chronic myeloid leukemia'},"{'nt06276': 'Chronic myeloid leukemia', 'nt062...",{'H00004': 'Chronic myeloid leukemia (CML) is ...,"{'25': 'ABL1; ABL proto-oncogene 1, non-recept..."
1,N00003,Mutation-activated KIT to RAS-ERK signaling pa...,KIT* -> GRB2 -> SOS -> RAS -> RAF -> MEK -> ERK,"3815v1 -> 2885 -> (6654,6655) -> (3265,3845,48...",{'hsa05221': 'Acute myeloid leukemia'},"{'nt06275': 'Acute myeloid leukemia', 'nt06210...",{'H00003': 'Acute myeloid leukemia (AML) is a ...,{'3815': 'KIT; KIT proto-oncogene receptor tyr...
2,N00004,Duplication or mutation-activated FLT3 to RAS-...,FLT3* -> GRB2 -> SOS -> RAS -> RAF -> MEK -> ERK,"(2322v2,2322v1) -> 2885 -> (6654,6655) -> (326...",{'hsa05221': 'Acute myeloid leukemia'},"{'nt06275': 'Acute myeloid leukemia', 'nt06210...",{'H00003': 'Acute myeloid leukemia (AML) is a ...,{'2322': 'FLT3; fms related tyrosine kinase 3'...
3,N00005,Mutation-activated MET to RAS-ERK signaling pa...,MET* -> GRB2 -> SOS -> RAS -> RAF -> MEK -> ER...,"4233v1 -> 2885 -> (6654,6655) -> (3265,3845,48...","{'hsa05225': 'Hepatocellular carcinoma', 'hsa0...","{'nt06263': 'Hepatocellular carcinoma', 'nt062...",{'H00048': 'Hepatocellular carcinoma (HCC) is ...,"{'4233': 'MET; MET proto-oncogene, receptor ty..."
4,N00007,EML4-ALK fusion kinase to RAS-ERK signaling pa...,EML4-ALK -> RAS -> RAF -> MEK -> ERK -> CCND1,"(238v1,238v2) -> (3265,3845,4893) -> (369,673,...",{'hsa05223': 'Non-small cell lung cancer'},"{'nt06266': 'Non-small cell lung cancer', 'nt0...",{'H00014': 'Lung cancer is a leading cause of ...,"{'238': 'ALK; ALK receptor tyrosine kinase', '..."
...,...,...,...,...,...,...,...,...
168,N01422,HPRT1 deficiency in purine salvage pathway,"(Hypoxanthine,Guanine) // HPRT1*","(C00262,C00242) // 3251v1",,{'nt06027': 'Purine salvage pathway'},{'H00194': 'Deficiency of hypoxanthine-guanine...,{'3251': 'HPRT1; hypoxanthine phosphoribosyltr...
169,N01444,NXN mutation to WNT5A-ROR signaling pathway,NXN* -| DVL,"64359v1 -| (1855,1856,1857)",,{'nt06505': 'WNT signaling'},{'H00485': 'Robinow syndrome (RS) is a rare ge...,"{'64359': 'NXN; nucleoredoxin', '1855': 'DVL1;..."
170,N01809,Mutation-caused epigenetic silencing of MMACHC,PRDX1* =| MMACHC,5052v1 =| 25974,{'hsa04980': 'Cobalamin transport and metaboli...,{'nt06538': 'Cobalamin transport and metabolism'},{'H02221': 'Methylmalonic aciduria and homocys...,"{'5052': 'PRDX1; peroxiredoxin 1', '25974': 'M..."
171,N01873,VHL mutation to HIF-2 signaling pathway,(VHL*+RBX1+ELOC+ELOB+CUL2) // EPAS1 == ARNT =>...,(7428v3+9978+6921+6923+8453) // 2034 == 405 =>...,{'hsa05211': 'Renal cell carcinoma'},{'nt06542': 'HIF signaling'},{'H00021': 'Renal cell cancer (RCC) accounts f...,{'7428': 'VHL; von Hippel-Lindau tumor suppres...


In [125]:
final_data = variant_data.merge(network_info, on='Network')

In [126]:
final_data.to_csv("final_network_with_variant.tsv",sep='\t',header=True, index=False)

# Extracting Human Chromosomes

Downloaded the human genome from here https://www.ncbi.nlm.nih.gov/datasets/genome/GCF_000001405.26/

Got all the chromosomes and their ids that we have variants for

NC_000001.11
NC_000002.12
NC_000003.12
NC_000004.12
NC_000005.10
NC_000006.12
NC_000007.14
NC_000009.12
NC_000010.11
NC_000011.10
NC_000012.12
NC_000013.11
NC_000014.9
NC_000015.10
NC_000016.10
NC_000017.11
NC_000018.10
NC_000019.10
NC_000020.11
NC_000021.9
NC_000023.11


In [1]:
cd kegg_data

In [None]:
seqkit grep -r -n -f chromosomes.txt /ncbi_dataset/data/GCF_000001405.26/GCF_000001405.26_GRCh38_genomic.fna -o chromosomes.fasta

[INFO][0m 21 patterns loaded from file


In [3]:
seqkit stats chromosomes.fasta

file               format  type  num_seqs        sum_len     min_len        avg_len      max_len
chromosomes.fasta  FASTA   DNA         21  2,835,085,313  46,709,983  135,004,062.5  248,956,422


In [4]:
du -h chromosomes.fasta

2.7G	chromosomes.fasta


In [5]:
seqkit fx2tab chromosomes.fasta | cut -f1

NC_000001.11 Homo sapiens chromosome 1, GRCh38 Primary Assembly
NC_000002.12 Homo sapiens chromosome 2, GRCh38 Primary Assembly
NC_000003.12 Homo sapiens chromosome 3, GRCh38 Primary Assembly
NC_000004.12 Homo sapiens chromosome 4, GRCh38 Primary Assembly
NC_000005.10 Homo sapiens chromosome 5, GRCh38 Primary Assembly
NC_000006.12 Homo sapiens chromosome 6, GRCh38 Primary Assembly
NC_000007.14 Homo sapiens chromosome 7, GRCh38 Primary Assembly
NC_000009.12 Homo sapiens chromosome 9, GRCh38 Primary Assembly
NC_000010.11 Homo sapiens chromosome 10, GRCh38 Primary Assembly
NC_000011.10 Homo sapiens chromosome 11, GRCh38 Primary Assembly
NC_000012.12 Homo sapiens chromosome 12, GRCh38 Primary Assembly
NC_000013.11 Homo sapiens chromosome 13, GRCh38 Primary Assembly
NC_000014.9 Homo sapiens chromosome 14, GRCh38 Primary Assembly
NC_000015.10 Homo sapiens chromosome 15, GRCh38 Primary Assembly
NC_000016.10 Homo sapiens chromosome 16, GRCh38 Primary Assembly
NC_000017.11 Homo sapiens chromoso

# Creating the Nt Variant Database

In [None]:
cd kegg_data

In [2]:
from Bio import SeqIO
import pandas as pd

In [3]:
variant_data = pd.read_csv("final_network_with_variant.tsv", sep='\t')
variant_data

Unnamed: 0,Var_ID,Network,ENTRY,Source,ID,TranscriptID,NucChange,Chr,Start,End,RefAllele,AltAllele,Name,Network Definition,Network Expanded,Pathway,Class,Disease,Gene
0,KEGG_1,N00073,1019v2,ClinVar,16929,NC_000012.12,,12,57751646,57751646,C,T,Mutation-activated CDK4 to cell cycle G1/S,(CCND+CDK4*) -> RB1 // E2F,"((595,894,896)+1019v2) -> 5925 // (1869,1870,1...",{'hsa05218': 'Melanoma'},"{'nt06268': 'Melanoma', 'nt06230': 'Cell cycle...",{'H00038': 'Melanoma is a form of skin cancer ...,"{'595': 'CCND1; cyclin D1', '894': 'CCND2; cyc..."
1,KEGG_2,N00073,1019v2,dbSNP,rs104894340,NC_000012.12,,12,57751646,57751646,C,A,Mutation-activated CDK4 to cell cycle G1/S,(CCND+CDK4*) -> RB1 // E2F,"((595,894,896)+1019v2) -> 5925 // (1869,1870,1...",{'hsa05218': 'Melanoma'},"{'nt06268': 'Melanoma', 'nt06230': 'Cell cycle...",{'H00038': 'Melanoma is a form of skin cancer ...,"{'595': 'CCND1; cyclin D1', '894': 'CCND2; cyc..."
2,KEGG_3,N00073,1019v2,dbSNP,rs104894340,NC_000012.12,,12,57751646,57751646,C,G,Mutation-activated CDK4 to cell cycle G1/S,(CCND+CDK4*) -> RB1 // E2F,"((595,894,896)+1019v2) -> 5925 // (1869,1870,1...",{'hsa05218': 'Melanoma'},"{'nt06268': 'Melanoma', 'nt06230': 'Cell cycle...",{'H00038': 'Melanoma is a form of skin cancer ...,"{'595': 'CCND1; cyclin D1', '894': 'CCND2; cyc..."
3,KEGG_4,N00073,1019v2,ClinVar,16928,NC_000012.12,,12,57751647,57751647,G,A,Mutation-activated CDK4 to cell cycle G1/S,(CCND+CDK4*) -> RB1 // E2F,"((595,894,896)+1019v2) -> 5925 // (1869,1870,1...",{'hsa05218': 'Melanoma'},"{'nt06268': 'Melanoma', 'nt06230': 'Cell cycle...",{'H00038': 'Melanoma is a form of skin cancer ...,"{'595': 'CCND1; cyclin D1', '894': 'CCND2; cyc..."
4,KEGG_5,N00073,1019v2,dbSNP,rs11547328,NC_000012.12,,12,57751647,57751647,G,C,Mutation-activated CDK4 to cell cycle G1/S,(CCND+CDK4*) -> RB1 // E2F,"((595,894,896)+1019v2) -> 5925 // (1869,1870,1...",{'hsa05218': 'Melanoma'},"{'nt06268': 'Melanoma', 'nt06230': 'Cell cycle...",{'H00038': 'Melanoma is a form of skin cancer ...,"{'595': 'CCND1; cyclin D1', '894': 'CCND2; cyc..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1444,KEGG_1445,N00244,9817v1,COSM,6196635,ENST00000393623.6,c.706G>T,19,10492196,10492196,C,A,Mutation-inactivated KEAP1 to KEAP1-NRF2 signa...,"KEAP1* // NRF2 => (HMOX1,NQO1,GST,TXNRD1)","9817v1 // 4780 => (3162,1728,119391,221357,293...",{'hsa05225': 'Hepatocellular carcinoma'},"{'nt06263': 'Hepatocellular carcinoma', 'nt062...",{'H00048': 'Hepatocellular carcinoma (HCC) is ...,{'9817': 'KEAP1; kelch like ECH associated pro...
1445,KEGG_1446,N00244,9817v1,COSM,6196637,ENST00000393623.6,c.548A>G,19,10499486,10499486,T,C,Mutation-inactivated KEAP1 to KEAP1-NRF2 signa...,"KEAP1* // NRF2 => (HMOX1,NQO1,GST,TXNRD1)","9817v1 // 4780 => (3162,1728,119391,221357,293...",{'hsa05225': 'Hepatocellular carcinoma'},"{'nt06263': 'Hepatocellular carcinoma', 'nt062...",{'H00048': 'Hepatocellular carcinoma (HCC) is ...,{'9817': 'KEAP1; kelch like ECH associated pro...
1446,KEGG_1447,N00258,999v2,COSM,4766271,ENST00000621016.4,c.662A>G,16,68808823,68808823,A,G,Mutation-inactivated CDH1 to beta-catenin sign...,"CDH1* // CTNNB1 -> TCF/LEF => (MYC,CCND1)","999v2 // 1499 -> (6932,83439,6934,51176) => (4...",{'hsa05226': 'Gastric cancer'},"{'nt06261': 'Gastric cancer', 'nt06215': 'WNT ...","{'H00018': ""Gastric cancer (GC) is one of the ...","{'999': 'CDH1; cadherin 1', '1499': 'CTNNB1; c..."
1447,KEGG_1448,N00258,999v2,COSM,4766211,ENST00000621016.4,c.755T>G,16,68810264,68810264,T,G,Mutation-inactivated CDH1 to beta-catenin sign...,"CDH1* // CTNNB1 -> TCF/LEF => (MYC,CCND1)","999v2 // 1499 -> (6932,83439,6934,51176) => (4...",{'hsa05226': 'Gastric cancer'},"{'nt06261': 'Gastric cancer', 'nt06215': 'WNT ...","{'H00018': ""Gastric cancer (GC) is one of the ...","{'999': 'CDH1; cadherin 1', '1499': 'CTNNB1; c..."


In [4]:
len(variant_data)

1449

In [5]:
variant_data.iloc[1]["Network"]

'N00073'

In [6]:
fasta_file = "chromosomes.fasta"
record_dict = SeqIO.to_dict(SeqIO.parse(fasta_file, "fasta"))

In [7]:
chromosome_dictionary = {
    "1": "NC_000001.11",
    "2": "NC_000002.12",
    "3": "NC_000003.12",
    "4": "NC_000004.12",
    "5": "NC_000005.10",
    "6": "NC_000006.12",
    "7": "NC_000007.14",
    "9": "NC_000009.12",
    "10": "NC_000010.11",
    "11": "NC_000011.10",
    "12": "NC_000012.12",
    "13": "NC_000013.11",
    "14": "NC_000014.9",
    "15": "NC_000015.10",
    "16": "NC_000016.10",
    "17": "NC_000017.11",
    "18": "NC_000018.10",
    "19": "NC_000019.10",
    "20": "NC_000020.11",
    "21": "NC_000021.9",
    "23": "NC_000023.11"
}

### Verification that the reference is present at the exact position I have in my data

In [8]:
with open("verification.txt", "w") as f:
    for i in range(len(variant_data)):
        # ---- Input ----
        chromosome_id = chromosome_dictionary[str(variant_data.iloc[i]['Chr'])]
        if (variant_data.iloc[i]['TranscriptID'][:4] == "ENST"):
            start = variant_data.iloc[i]['Start'] - 1
        else:
            start = variant_data.iloc[i]['Start']
        reference_allele = variant_data.iloc[i]['RefAllele']
        end = len(reference_allele) + start

        chrom_seq = record_dict[chromosome_id].seq

        # Adjust for 0-based indexing in Python
        genomic_ref = chrom_seq[start: start + len(reference_allele)]

        if genomic_ref.upper() != reference_allele.upper():
            f.write(f"⚠️ Warning: Entry number {i} with variant {variant_data.iloc[i]['ID']} expected '{reference_allele}', but found '{genomic_ref}'\n")
        else:
            f.write(f"✅ Verified: {chromosome_id}:{start}-{end} → '{reference_allele}' matches genome\n")

In [9]:
mkdir nt_seq

### Performing the mutation and saving the reference and variant allele with a 1000 nt window

In [18]:
for i in range(len(variant_data)):
    with open(f"nt_seq/{variant_data.iloc[i]['Var_ID']}.txt", "w") as f:
        # ---- Input ----
        chromosome_id = chromosome_dictionary[str(variant_data.iloc[i]['Chr'])]
        if (variant_data.iloc[i]['TranscriptID'][:4] == "ENST"):
            start = variant_data.iloc[i]['Start'] - 1
        else:
            start = variant_data.iloc[i]['Start']
        reference_allele = variant_data.iloc[i]['RefAllele']
        variant_allele = variant_data.iloc[i]['AltAllele']

        end = len(reference_allele) + start
        window = 1000
        
        chrom_seq = record_dict[chromosome_id].seq

        # Extract region
        region_start = max(0, start - window)
        region_end = end + window

        ref_seq = chrom_seq[region_start:region_end]
    
        if (variant_allele == "deletion"):
            # Apply mutation
            mutated_seq = ref_seq[:window] + variant_allele + ref_seq[window + len(reference_allele):]
    
            f.write(f">{variant_data.iloc[i]['ID']}_reference_{reference_allele}\n")
            f.write(f"{ref_seq}\n")
            f.write(f">{variant_data.iloc[i]['ID']}_variant_{variant_allele}\n")
            f.write(f"{mutated_seq}\n")
        else:
            del_len = len(reference_allele)
            # Apply mutation
            mutated_seq = ref_seq[:window] + ref_seq[window + del_len:]
    
            f.write(f">{variant_data.iloc[i]['ID']}_reference_{reference_allele}\n")
            f.write(f"{ref_seq}\n")
            f.write(f">{variant_data.iloc[i]['ID']}_variant_{variant_allele}\n")
            f.write(f"{mutated_seq}\n")