In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# install pysam to work with vcf files
!pip install pysam

Collecting pysam
  Downloading pysam-0.22.0-cp310-cp310-manylinux_2_28_x86_64.whl (21.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.9/21.9 MB[0m [31m36.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pysam
Successfully installed pysam-0.22.0


In [3]:
import pysam
import os
import pandas as pd

In [None]:
# step 3: OMIM database

import pandas as pd

# Specify the file path
file_path = "/content/drive/MyDrive/Mode of Inheritance Task/Mode of Inheritance Task files/databases/OMIM_genemap2_chr17.txt"

# Read the text file into a DataFrame, skipping initial rows starting with #
omim_data = pd.read_csv(file_path, delimiter='\t', comment='#')


In [None]:
# Display the DataFrame
omim_data.head(1)

Unnamed: 0,Chromosome,Genomic Position Start,Genomic Position End,Cyto Location,Computed Cyto Location,MIM Number,Gene/Locus And Other Related Symbols,Gene Name,Approved Gene Symbol,Entrez Gene ID,Ensembl Gene ID,Comments,Phenotypes,Mouse Gene Symbol/ID
0,chr17,1,10800000,17p13,,608631,ASPG2,"Asperger syndrome, susceptibility to, 2",,431711.0,,breakpoints between CHRNE and GP1BA,"{Asperger syndrome susceptibility 2}, 608631 (...",


In [None]:
# preprocess OMIM database: filter some columns and add MOI column based on Phenotypes
filtered_columns = ['Genomic Position Start', 'Genomic Position End', 'Gene/Locus And Other Related Symbols', 'Gene Name', 'Phenotypes']

filtered_omim_data = omim_data[filtered_columns]

filtered_omim_data.loc[:, 'MOI'] = ''

for i, phenotype in enumerate(filtered_omim_data['Phenotypes']):
    MOI_list = []
    if "dominant" in str(phenotype).lower() and "recessive" in str(phenotype).lower():
      MOI_list.append('AD/AR')
    elif "dominant" in str(phenotype).lower():
      MOI_list.append('AD')
    elif "recessive" in str(phenotype).lower():
      MOI_list.append('AR')
    else:
      MOI_list.append('')
    filtered_omim_data.loc[i, 'MOI'] = ', '.join(MOI_list)


output_file_path = "/content/drive/MyDrive/Mode of Inheritance Task/Mode of Inheritance Task files/preprocessed databases/filtered_omim_data.csv"

filtered_omim_data.to_csv(output_file_path, index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_omim_data.loc[:, 'MOI'] = ''


In [None]:
# CDG database

# Specify the file path
file_path = "/content/drive/MyDrive/Mode of Inheritance Task/Mode of Inheritance Task files/databases/CGD.txt.gz"

# Read the text file into a DataFrame, skipping initial rows starting with #
CGD_data = pd.read_csv(file_path, delimiter='\t')


In [None]:
# Display the DataFrame
CGD_data.head(3)

Unnamed: 0,#GENE,HGNC ID,ENTREZ GENE ID,CONDITION,INHERITANCE,AGE GROUP,ALLELIC CONDITIONS,MANIFESTATION CATEGORIES,INTERVENTION CATEGORIES,COMMENTS,INTERVENTION/RATIONALE,REFERENCES
0,A2M,7,2,Alpha-2-macroglobulin deficiency,AD,,,General,General,Variants have been implicated in pulmonary dis...,The clinical consequences of variants are unclear,94459; 2475424; 1370808
1,A2ML1,23336,144568,"Otitis media, susceptibility to",AD,Pediatric,,Allergy/Immunology/Infectious,Allergy/Immunology/Infectious,,Individuals may have increased susceptibility ...,26121085
2,A4GALT,18149,53947,"Blood group, P1PK system",BG,Pediatric,,Hematologic,Hematologic,,Variants associated with a blood group may be ...,10993874


In [4]:
!pip install biopython

Collecting biopython
  Downloading biopython-1.83-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: biopython
Successfully installed biopython-1.83


In [None]:
# note: the next cell takes about 50 minutes to run

In [None]:
import pandas as pd
from Bio import Entrez

file_path = "/content/drive/MyDrive/Mode of Inheritance Task/Mode of Inheritance Task files/databases/CGD.txt.gz"

CGD_data = pd.read_csv(file_path, delimiter='\t')

# Entrez email address (required by NCBI)
Entrez.email = "alihajisadeghian250@gmail.com"

start_pos_list = []
end_pos_list = []

import time

# Function to retrieve gene information with retry mechanism
def retrieve_gene_info_with_retry(entrez_gene_id, max_retries=5, delay=1):
    num_retries = 0
    while num_retries < max_retries:
        try:
            handle = Entrez.efetch(db="gene", id=entrez_gene_id, rettype="gb", retmode="text")
            gene_record = handle.read()
            handle.close()
            return gene_record
        except Exception as e:
            print("Error retrieving gene info for Entrez Gene ID:", entrez_gene_id)
            print("Error message:", e)
            print("Retrying...")
            num_retries += 1
            time.sleep(delay)
    return None

# Iterate over each row in CGD_data DataFrame
for index, row in CGD_data.iterrows():
    # Retrieve the ENTREZ GENE ID from the current row
    entrez_gene_id = str(row['ENTREZ GENE ID'])

    # Query Entrez Gene database to retrieve gene information with retry mechanism
    gene_record = retrieve_gene_info_with_retry(entrez_gene_id)

    # Check if gene_record is None (indicating an error occurred)
    if gene_record is None:
        # Append None values to start and end positions lists
        start_pos_list.append(None)
        end_pos_list.append(None)
        continue

    # Check if the gene is located on chromosome 17
    if "chromosome 17" in gene_record:
        # Extract start and end positions
        start_pos = int(gene_record.split('\n')[6].split('(')[1].split('..')[0])
        end_pos = int(gene_record.split('\n')[6].split('(')[1].split('..')[1].split(',')[0].split(')')[0])
    else:
        start_pos = None
        end_pos = None

    # Append start and end positions to lists
    start_pos_list.append(start_pos)
    end_pos_list.append(end_pos)

# Add new columns to CGD_data DataFrame
CGD_data['start_pos'] = start_pos_list
CGD_data['end_pos'] = end_pos_list

# Filter desired columns
filtered_CGD_data = CGD_data[['#GENE', 'ENTREZ GENE ID', 'INHERITANCE', 'start_pos', 'end_pos']]

# Save the filtered data to a new CSV file
filtered_file_path = "/content/drive/MyDrive/Mode of Inheritance Task/Mode of Inheritance Task files/preprocessed databases/filtered_CGD_data.csv"
filtered_CGD_data.to_csv(filtered_file_path, index=False)


In [None]:
import pandas as pd

# Load the filtered CGD data from the CSV file
filtered_file_path = "/content/drive/MyDrive/Mode of Inheritance Task/Mode of Inheritance Task files/preprocessed databases/filtered_CGD_data.csv"
filtered_CGD_data = pd.read_csv(filtered_file_path)

# Filter rows where both 'start_pos' and 'end_pos' columns have values
filtered_data_with_positions = filtered_CGD_data.dropna(subset=['start_pos', 'end_pos'])

# Save the filtered data to a new CSV file
filtered_data_with_positions.to_csv("/content/drive/MyDrive/Mode of Inheritance Task/Mode of Inheritance Task files/preprocessed databases/filtered_CGD_data_with_positions.csv", index=False)

print("Filtered CGD data with positions saved.")


Filtered CGD data with positions saved.


In [None]:
# let's start clingen
file_path = "/content/drive/MyDrive/Mode of Inheritance Task/Mode of Inheritance Task files/databases/Clingen-Gene-Disease-Summary-2024-02-26.csv"

clingen_data = pd.read_csv(file_path, header = 4, skiprows=[5])

In [None]:
clingen_data.head()

Unnamed: 0,GENE SYMBOL,GENE ID (HGNC),DISEASE LABEL,DISEASE ID (MONDO),MOI,SOP,CLASSIFICATION,ONLINE REPORT,CLASSIFICATION DATE,GCEP
0,A2ML1,HGNC:23336,Noonan syndrome,MONDO:0018997,AD,SOP5,Disputed,https://search.clinicalgenome.org/kb/gene-vali...,2018-06-07T16:00:00.000Z,RASopathy
1,AARS2,HGNC:21022,mitochondrial disease,MONDO:0044970,AR,SOP8,Definitive,https://search.clinicalgenome.org/kb/gene-vali...,2022-04-18T16:00:00.000Z,Mitochondrial Diseases
2,AASS,HGNC:17366,hyperlysinemia,MONDO:0009388,AR,SOP9,Definitive,https://search.clinicalgenome.org/kb/gene-vali...,2022-10-14T16:00:00.000Z,Aminoacidopathy
3,ABAT,HGNC:23,developmental and epileptic encephalopathy,MONDO:0100062,AR,SOP8,Moderate,https://search.clinicalgenome.org/kb/gene-vali...,2022-04-19T16:00:00.000Z,Epilepsy
4,ABCA4,HGNC:34,ABCA4-related retinopathy,MONDO:0800406,AR,SOP9,Definitive,https://search.clinicalgenome.org/kb/gene-vali...,2022-10-06T16:00:00.000Z,Retina


In [None]:
import pandas as pd
from Bio import Entrez
import time

file_path = "/content/drive/MyDrive/Mode of Inheritance Task/Mode of Inheritance Task files/databases/Clingen-Gene-Disease-Summary-2024-02-26.csv"

clingen_data = pd.read_csv(file_path, header = 4, skiprows=[5])

# Entrez email address (required by NCBI)
Entrez.email = "alihajisadeghian250@gmail.com"

start_pos_list = []
end_pos_list = []

# Function to retrieve gene information with retry mechanism
def retrieve_gene_info_with_retry_input_symbol(gene_symbol, max_retries=5, delay=1):
    num_retries = 0
    while num_retries < max_retries:
        try:
            handle = Entrez.esearch(db="gene", term=gene_symbol)
            record = Entrez.read(handle)
            gene_id = record['IdList'][0]
            handle = Entrez.efetch(db="gene", id=gene_id, rettype="gb", retmode="text")
            gene_record = handle.read()
            handle.close()
            return gene_record
        except Exception as e:
            print("Error retrieving gene info for gene symbol:", gene_symbol)
            print("Error message:", e)
            print("Retrying...")
            num_retries += 1
            time.sleep(delay)
    return None

# Iterate over each row in CGD_data DataFrame
for index, row in clingen_data.iterrows():

    # Query Entrez Gene database to retrieve gene information with retry mechanism
    gene_record = retrieve_gene_info_with_retry_input_symbol(clingen_data['GENE SYMBOL'][index])

    # Check if gene_record is None (indicating an error occurred)
    if gene_record is None:
        # Append None values to start and end positions lists
        start_pos_list.append(None)
        end_pos_list.append(None)
        continue

    # Check if the gene is located on chromosome 17
    if "chromosome 17" in gene_record:
        # Extract start and end positions
        start_pos = int(gene_record.split('\n')[6].split('(')[1].split('..')[0])
        end_pos = int(gene_record.split('\n')[6].split('(')[1].split('..')[1].split(',')[0].split(')')[0])
    else:
        start_pos = None
        end_pos = None

    # Append start and end positions to lists
    start_pos_list.append(start_pos)
    end_pos_list.append(end_pos)

# Add new columns to CGD_data DataFrame
clingen_data['start_pos'] = start_pos_list
clingen_data['end_pos'] = end_pos_list

# Filter desired columns
filtered_clingen_data = clingen_data[['GENE SYMBOL', 'MOI', 'start_pos', 'end_pos']]

# Save the filtered data to a new CSV file
filtered_file_path = "/content/drive/MyDrive/Mode of Inheritance Task/Mode of Inheritance Task files/preprocessed databases/filtered_clingen_data.csv"
filtered_clingen_data.to_csv(filtered_file_path, index=False)


Error retrieving gene info for gene symbol: BRAF
Error message: HTTP Error 400: Bad Request
Retrying...
Error retrieving gene info for gene symbol: CFL2
Error message: HTTP Error 400: Bad Request
Retrying...
Error retrieving gene info for gene symbol: CHEK1
Error message: HTTP Error 400: Bad Request
Retrying...
Error retrieving gene info for gene symbol: CPAMD8
Error message: HTTP Error 400: Bad Request
Retrying...
Error retrieving gene info for gene symbol: DNAH8
Error message: HTTP Error 400: Bad Request
Retrying...
Error retrieving gene info for gene symbol: DOCK8
Error message: HTTP Error 400: Bad Request
Retrying...
Error retrieving gene info for gene symbol: F5
Error message: HTTP Error 400: Bad Request
Retrying...
Error retrieving gene info for gene symbol: FGD4
Error message: HTTP Error 400: Bad Request
Retrying...
Error retrieving gene info for gene symbol: FXYD2
Error message: HTTP Error 400: Bad Request
Retrying...
Error retrieving gene info for gene symbol: GALNT2
Error mes

In [None]:
import pandas as pd

# Load the filtered clingen data from the CSV file
filtered_file_path = "/content/drive/MyDrive/Mode of Inheritance Task/Mode of Inheritance Task files/preprocessed databases/filtered_clingen_data.csv"
filtered_clingen_data = pd.read_csv(filtered_file_path)

# Filter rows where both 'start_pos' and 'end_pos' columns have values
filtered_data_with_positions = filtered_clingen_data.dropna(subset=['start_pos', 'end_pos'])

# Save the filtered data to a new CSV file
filtered_data_with_positions.to_csv("/content/drive/MyDrive/Mode of Inheritance Task/Mode of Inheritance Task files/preprocessed databases/filtered_clingen_data_with_positions.csv", index=False)

print("Filtered clingen data with positions saved.")


Filtered clingen data with positions saved.


In [5]:
# gene2phenotype: 1
# CancerG2p
import pandas as pd
file_path = '/content/drive/MyDrive/Mode of Inheritance Task/Mode of Inheritance Task files/databases/gene2phenotype/CancerG2P_26_2_2024.csv.gz'
cancer_data = pd.read_csv(file_path)

In [9]:
# CancerG2p
# preprocess database: filter some columns and add MOI column based on disease name
filtered_columns = ['gene symbol', 'disease name']

filtered_cancer_data = cancer_data[filtered_columns]

filtered_cancer_data.loc[:, 'MOI'] = ''

for i, name in enumerate(filtered_cancer_data['disease name']):
    MOI_list = []
    if "dominant" in str(name).lower() and "recessive" in str(name).lower():
      MOI_list.append('AD/AR')
    elif "dominant" in str(name).lower():
      MOI_list.append('AD')
    elif "recessive" in str(name).lower():
      MOI_list.append('AR')
    else:
      MOI_list.append('')
    filtered_cancer_data.loc[i, 'MOI'] = ', '.join(MOI_list)

filtered_cancer_data = filtered_cancer_data[filtered_cancer_data['MOI'] != '']
output_file_path = "/content/drive/MyDrive/Mode of Inheritance Task/Mode of Inheritance Task files/preprocessed databases/filtered_cancer_data.csv"

filtered_cancer_data.to_csv(output_file_path, index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_cancer_data.loc[:, 'MOI'] = ''


In [11]:
# pos in cancerG2p
import pandas as pd
from Bio import Entrez
import time

file_path = "/content/drive/MyDrive/Mode of Inheritance Task/Mode of Inheritance Task files/preprocessed databases/filtered_cancer_data.csv"

filtered_cancer_data = pd.read_csv(file_path)

# Entrez email address (required by NCBI)
Entrez.email = "alihajisadeghian250@gmail.com"

start_pos_list = []
end_pos_list = []

# Function to retrieve gene information with retry mechanism
def retrieve_gene_info_with_retry_input_symbol(gene_symbol, max_retries=5, delay=1):
    num_retries = 0
    while num_retries < max_retries:
        try:
            handle = Entrez.esearch(db="gene", term=gene_symbol)
            record = Entrez.read(handle)
            gene_id = record['IdList'][0]
            handle = Entrez.efetch(db="gene", id=gene_id, rettype="gb", retmode="text")
            gene_record = handle.read()
            handle.close()
            return gene_record
        except Exception as e:
            print("Error retrieving gene info for gene symbol:", gene_symbol)
            print("Error message:", e)
            print("Retrying...")
            num_retries += 1
            time.sleep(delay)
    return None

# Iterate over each row in CGD_data DataFrame
for index, row in filtered_cancer_data.iterrows():

    # Query Entrez Gene database to retrieve gene information with retry mechanism
    gene_record = retrieve_gene_info_with_retry_input_symbol(filtered_cancer_data['gene symbol'][index])

    # Check if gene_record is None (indicating an error occurred)
    if gene_record is None:
        # Append None values to start and end positions lists
        start_pos_list.append(None)
        end_pos_list.append(None)
        continue

    # Check if the gene is located on chromosome 17
    if "chromosome 17" in gene_record:
        # Extract start and end positions
        start_pos = int(gene_record.split('\n')[6].split('(')[1].split('..')[0])
        end_pos = int(gene_record.split('\n')[6].split('(')[1].split('..')[1].split(',')[0].split(')')[0])
    else:
        start_pos = None
        end_pos = None

    # Append start and end positions to lists
    start_pos_list.append(start_pos)
    end_pos_list.append(end_pos)

# Add new columns to CGD_data DataFrame
filtered_cancer_data['start_pos'] = start_pos_list
filtered_cancer_data['end_pos'] = end_pos_list

# Filter rows where both 'start_pos' and 'end_pos' columns have values
filtered_cancer_data_with_positions = filtered_cancer_data.dropna(subset=['start_pos', 'end_pos'])

# Save the filtered data to a new CSV file
filtered_file_path_with_position = "/content/drive/MyDrive/Mode of Inheritance Task/Mode of Inheritance Task files/preprocessed databases/filtered_cancer_data_with_position.csv"
filtered_cancer_data_with_positions.to_csv(filtered_file_path_with_position, index=False)


Error retrieving gene info for gene symbol: PARN
Error message: HTTP Error 400: Bad Request
Retrying...


In [14]:
# gene2phenotype: 2
# CardiacG2p
import pandas as pd
file_path = '/content/drive/MyDrive/Mode of Inheritance Task/Mode of Inheritance Task files/databases/gene2phenotype/CardiacG2P_26_2_2024.csv.gz'
cardiac_data = pd.read_csv(file_path)

In [16]:
# CardiacG2p
# preprocess database: filter some columns and add MOI column based on disease name
filtered_columns = ['gene symbol', 'comments']

filtered_cardiac_data = cardiac_data[filtered_columns]

filtered_cardiac_data.loc[:, 'MOI'] = ''

for i, comment in enumerate(filtered_cardiac_data['comments']):
    MOI_list = []
    if "dominant" in str(comment).lower() and "recessive" in str(comment).lower():
      MOI_list.append('AD/AR')
    elif "dominant" in str(comment).lower():
      MOI_list.append('AD')
    elif "recessive" in str(comment).lower():
      MOI_list.append('AR')
    else:
      MOI_list.append('')
    filtered_cardiac_data.loc[i, 'MOI'] = ', '.join(MOI_list)

filtered_cardiac_data = filtered_cardiac_data[filtered_cardiac_data['MOI'] != '']
output_file_path = "/content/drive/MyDrive/Mode of Inheritance Task/Mode of Inheritance Task files/preprocessed databases/filtered_cardiac_data.csv"

filtered_cardiac_data.to_csv(output_file_path, index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_cardiac_data.loc[:, 'MOI'] = ''


In [17]:
# pos in cardiacG2p
import pandas as pd
from Bio import Entrez
import time

file_path = "/content/drive/MyDrive/Mode of Inheritance Task/Mode of Inheritance Task files/preprocessed databases/filtered_cardiac_data.csv"

filtered_cardiac_data = pd.read_csv(file_path)

# Entrez email address (required by NCBI)
Entrez.email = "alihajisadeghian250@gmail.com"

start_pos_list = []
end_pos_list = []

# Function to retrieve gene information with retry mechanism
def retrieve_gene_info_with_retry_input_symbol(gene_symbol, max_retries=5, delay=1):
    num_retries = 0
    while num_retries < max_retries:
        try:
            handle = Entrez.esearch(db="gene", term=gene_symbol)
            record = Entrez.read(handle)
            gene_id = record['IdList'][0]
            handle = Entrez.efetch(db="gene", id=gene_id, rettype="gb", retmode="text")
            gene_record = handle.read()
            handle.close()
            return gene_record
        except Exception as e:
            print("Error retrieving gene info for gene symbol:", gene_symbol)
            print("Error message:", e)
            print("Retrying...")
            num_retries += 1
            time.sleep(delay)
    return None

# Iterate over each row in CGD_data DataFrame
for index, row in filtered_cardiac_data.iterrows():

    # Query Entrez Gene database to retrieve gene information with retry mechanism
    gene_record = retrieve_gene_info_with_retry_input_symbol(filtered_cardiac_data['gene symbol'][index])

    # Check if gene_record is None (indicating an error occurred)
    if gene_record is None:
        # Append None values to start and end positions lists
        start_pos_list.append(None)
        end_pos_list.append(None)
        continue

    # Check if the gene is located on chromosome 17
    if "chromosome 17" in gene_record:
        # Extract start and end positions
        start_pos = int(gene_record.split('\n')[6].split('(')[1].split('..')[0])
        end_pos = int(gene_record.split('\n')[6].split('(')[1].split('..')[1].split(',')[0].split(')')[0])
    else:
        start_pos = None
        end_pos = None

    # Append start and end positions to lists
    start_pos_list.append(start_pos)
    end_pos_list.append(end_pos)

# Add new columns to CGD_data DataFrame
filtered_cardiac_data['start_pos'] = start_pos_list
filtered_cardiac_data['end_pos'] = end_pos_list

# Filter rows where both 'start_pos' and 'end_pos' columns have values
filtered_cardiac_data_with_positions = filtered_cardiac_data.dropna(subset=['start_pos', 'end_pos'])

# Save the filtered data to a new CSV file
filtered_file_path_with_position = "/content/drive/MyDrive/Mode of Inheritance Task/Mode of Inheritance Task files/preprocessed databases/filtered_cardiac_data_with_position.csv"
filtered_cardiac_data_with_positions.to_csv(filtered_file_path_with_position, index=False)


Error retrieving gene info for gene symbol: PKP2
Error message: HTTP Error 400: Bad Request
Retrying...


In [20]:
# gene2phenotype: 3
# DDG2P
import pandas as pd
file_path = '/content/drive/MyDrive/Mode of Inheritance Task/Mode of Inheritance Task files/databases/gene2phenotype/DDG2P_26_2_2024.csv.gz'
DDG_data = pd.read_csv(file_path)

In [22]:
# DDG2P
# preprocess database: filter some columns and add MOI column based on disease name
filtered_columns = ['gene symbol', 'disease name']

filtered_DDG_data = DDG_data[filtered_columns]

filtered_DDG_data.loc[:, 'MOI'] = ''

for i, name in enumerate(filtered_DDG_data['disease name']):
    MOI_list = []
    if "dominant" in str(name).lower() and "recessive" in str(name).lower():
      MOI_list.append('AD/AR')
    elif "dominant" in str(name).lower():
      MOI_list.append('AD')
    elif "recessive" in str(name).lower():
      MOI_list.append('AR')
    else:
      MOI_list.append('')
    filtered_DDG_data.loc[i, 'MOI'] = ', '.join(MOI_list)

filtered_DDG_data = filtered_DDG_data[filtered_DDG_data['MOI'] != '']
output_file_path = "/content/drive/MyDrive/Mode of Inheritance Task/Mode of Inheritance Task files/preprocessed databases/filtered_DDG_data.csv"

filtered_DDG_data.to_csv(output_file_path, index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_DDG_data.loc[:, 'MOI'] = ''


In [23]:
# pos in cardiacG2p
import pandas as pd
from Bio import Entrez
import time

file_path = "/content/drive/MyDrive/Mode of Inheritance Task/Mode of Inheritance Task files/preprocessed databases/filtered_DDG_data.csv"

filtered_DDG_data = pd.read_csv(file_path)

# Entrez email address (required by NCBI)
Entrez.email = "alihajisadeghian250@gmail.com"

start_pos_list = []
end_pos_list = []

# Function to retrieve gene information with retry mechanism
def retrieve_gene_info_with_retry_input_symbol(gene_symbol, max_retries=5, delay=1):
    num_retries = 0
    while num_retries < max_retries:
        try:
            handle = Entrez.esearch(db="gene", term=gene_symbol)
            record = Entrez.read(handle)
            gene_id = record['IdList'][0]
            handle = Entrez.efetch(db="gene", id=gene_id, rettype="gb", retmode="text")
            gene_record = handle.read()
            handle.close()
            return gene_record
        except Exception as e:
            print("Error retrieving gene info for gene symbol:", gene_symbol)
            print("Error message:", e)
            print("Retrying...")
            num_retries += 1
            time.sleep(delay)
    return None

# Iterate over each row in CGD_data DataFrame
for index, row in filtered_DDG_data.iterrows():

    # Query Entrez Gene database to retrieve gene information with retry mechanism
    gene_record = retrieve_gene_info_with_retry_input_symbol(filtered_DDG_data['gene symbol'][index])

    # Check if gene_record is None (indicating an error occurred)
    if gene_record is None:
        # Append None values to start and end positions lists
        start_pos_list.append(None)
        end_pos_list.append(None)
        continue

    # Check if the gene is located on chromosome 17
    if "chromosome 17" in gene_record:
        # Extract start and end positions
        start_pos = int(gene_record.split('\n')[6].split('(')[1].split('..')[0])
        end_pos = int(gene_record.split('\n')[6].split('(')[1].split('..')[1].split(',')[0].split(')')[0])
    else:
        start_pos = None
        end_pos = None

    # Append start and end positions to lists
    start_pos_list.append(start_pos)
    end_pos_list.append(end_pos)

# Add new columns to CGD_data DataFrame
filtered_DDG_data['start_pos'] = start_pos_list
filtered_DDG_data['end_pos'] = end_pos_list

# Filter rows where both 'start_pos' and 'end_pos' columns have values
filtered_DDG_data_with_positions = filtered_DDG_data.dropna(subset=['start_pos', 'end_pos'])

# Save the filtered data to a new CSV file
filtered_file_path_with_position = "/content/drive/MyDrive/Mode of Inheritance Task/Mode of Inheritance Task files/preprocessed databases/filtered_DDG_data_with_position.csv"
filtered_DDG_data_with_positions.to_csv(filtered_file_path_with_position, index=False)


Error retrieving gene info for gene symbol: SLC25A38
Error message: HTTP Error 400: Bad Request
Retrying...
Error retrieving gene info for gene symbol: RGS7
Error message: HTTP Error 400: Bad Request
Retrying...
Error retrieving gene info for gene symbol: ST14
Error message: HTTP Error 400: Bad Request
Retrying...
Error retrieving gene info for gene symbol: TUBG1
Error message: HTTP Error 400: Bad Request
Retrying...


In [26]:
# gene2phenotype: 4
# Eye2P
import pandas as pd
file_path = '/content/drive/MyDrive/Mode of Inheritance Task/Mode of Inheritance Task files/databases/gene2phenotype/EyeG2P_26_2_2024.csv.gz'
Eye_data = pd.read_csv(file_path)

In [28]:
# Eye2P
# preprocess database: filter some columns and add MOI column based on disease name
filtered_columns = ['gene symbol', 'disease name']

filtered_Eye_data = Eye_data[filtered_columns]

filtered_Eye_data.loc[:, 'MOI'] = ''

for i, name in enumerate(filtered_Eye_data['disease name']):
    MOI_list = []
    if "dominant" in str(name).lower() and "recessive" in str(name).lower():
      MOI_list.append('AD/AR')
    elif "dominant" in str(name).lower():
      MOI_list.append('AD')
    elif "recessive" in str(name).lower():
      MOI_list.append('AR')
    else:
      MOI_list.append('')
    filtered_Eye_data.loc[i, 'MOI'] = ', '.join(MOI_list)

filtered_Eye_data = filtered_Eye_data[filtered_Eye_data['MOI'] != '']
output_file_path = "/content/drive/MyDrive/Mode of Inheritance Task/Mode of Inheritance Task files/preprocessed databases/filtered_Eye_data.csv"

filtered_Eye_data.to_csv(output_file_path, index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_Eye_data.loc[:, 'MOI'] = ''


In [30]:
# pos in Eye2P
import pandas as pd
from Bio import Entrez
import time

file_path = "/content/drive/MyDrive/Mode of Inheritance Task/Mode of Inheritance Task files/preprocessed databases/filtered_Eye_data.csv"

filtered_Eye_data = pd.read_csv(file_path)

# Entrez email address (required by NCBI)
Entrez.email = "alihajisadeghian250@gmail.com"

start_pos_list = []
end_pos_list = []

# Function to retrieve gene information with retry mechanism
def retrieve_gene_info_with_retry_input_symbol(gene_symbol, max_retries=5, delay=1):
    num_retries = 0
    while num_retries < max_retries:
        try:
            handle = Entrez.esearch(db="gene", term=gene_symbol)
            record = Entrez.read(handle)
            gene_id = record['IdList'][0]
            handle = Entrez.efetch(db="gene", id=gene_id, rettype="gb", retmode="text")
            gene_record = handle.read()
            handle.close()
            return gene_record
        except Exception as e:
            print("Error retrieving gene info for gene symbol:", gene_symbol)
            print("Error message:", e)
            print("Retrying...")
            num_retries += 1
            time.sleep(delay)
    return None

# Iterate over each row in CGD_data DataFrame
for index, row in filtered_Eye_data.iterrows():

    # Query Entrez Gene database to retrieve gene information with retry mechanism
    gene_record = retrieve_gene_info_with_retry_input_symbol(filtered_Eye_data['gene symbol'][index])

    # Check if gene_record is None (indicating an error occurred)
    if gene_record is None:
        # Append None values to start and end positions lists
        start_pos_list.append(None)
        end_pos_list.append(None)
        continue

    # Check if the gene is located on chromosome 17
    if "chromosome 17" in gene_record:
        # Extract start and end positions
        start_pos = int(gene_record.split('\n')[6].split('(')[1].split('..')[0])
        end_pos = int(gene_record.split('\n')[6].split('(')[1].split('..')[1].split(',')[0].split(')')[0])
    else:
        start_pos = None
        end_pos = None

    # Append start and end positions to lists
    start_pos_list.append(start_pos)
    end_pos_list.append(end_pos)

# Add new columns to CGD_data DataFrame
filtered_Eye_data['start_pos'] = start_pos_list
filtered_Eye_data['end_pos'] = end_pos_list

# Filter rows where both 'start_pos' and 'end_pos' columns have values
filtered_Eye_data_with_positions = filtered_Eye_data.dropna(subset=['start_pos', 'end_pos'])

# Save the filtered data to a new CSV file
filtered_file_path_with_position = "/content/drive/MyDrive/Mode of Inheritance Task/Mode of Inheritance Task files/preprocessed databases/filtered_Eye_data_with_position.csv"
filtered_Eye_data_with_positions.to_csv(filtered_file_path_with_position, index=False)


Error retrieving gene info for gene symbol: TRPM1
Error message: HTTP Error 400: Bad Request
Retrying...


In [31]:
# gene2phenotype: 5
# skeletal
import pandas as pd
file_path = '/content/drive/MyDrive/Mode of Inheritance Task/Mode of Inheritance Task files/databases/gene2phenotype/SkeletalG2P_26_2_2024.csv.gz'
skeletal_data = pd.read_csv(file_path)

In [32]:
# skeletal
# preprocess database: filter some columns and add MOI column based on disease name
filtered_columns = ['gene symbol', 'disease name']

filtered_skeletal_data = skeletal_data[filtered_columns]

filtered_skeletal_data.loc[:, 'MOI'] = ''

for i, name in enumerate(filtered_skeletal_data['disease name']):
    MOI_list = []
    if "dominant" in str(name).lower() and "recessive" in str(name).lower():
      MOI_list.append('AD/AR')
    elif "dominant" in str(name).lower():
      MOI_list.append('AD')
    elif "recessive" in str(name).lower():
      MOI_list.append('AR')
    else:
      MOI_list.append('')
    filtered_skeletal_data.loc[i, 'MOI'] = ', '.join(MOI_list)

filtered_skeletal_data = filtered_skeletal_data[filtered_skeletal_data['MOI'] != '']
output_file_path = "/content/drive/MyDrive/Mode of Inheritance Task/Mode of Inheritance Task files/preprocessed databases/filtered_skeletal_data.csv"

filtered_skeletal_data.to_csv(output_file_path, index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_skeletal_data.loc[:, 'MOI'] = ''


In [33]:
# pos in skeletal
import pandas as pd
from Bio import Entrez
import time

file_path = "/content/drive/MyDrive/Mode of Inheritance Task/Mode of Inheritance Task files/preprocessed databases/filtered_skeletal_data.csv"

filtered_skeletal_data = pd.read_csv(file_path)

# Entrez email address (required by NCBI)
Entrez.email = "alihajisadeghian250@gmail.com"

start_pos_list = []
end_pos_list = []

# Function to retrieve gene information with retry mechanism
def retrieve_gene_info_with_retry_input_symbol(gene_symbol, max_retries=5, delay=1):
    num_retries = 0
    while num_retries < max_retries:
        try:
            handle = Entrez.esearch(db="gene", term=gene_symbol)
            record = Entrez.read(handle)
            gene_id = record['IdList'][0]
            handle = Entrez.efetch(db="gene", id=gene_id, rettype="gb", retmode="text")
            gene_record = handle.read()
            handle.close()
            return gene_record
        except Exception as e:
            print("Error retrieving gene info for gene symbol:", gene_symbol)
            print("Error message:", e)
            print("Retrying...")
            num_retries += 1
            time.sleep(delay)
    return None

# Iterate over each row in CGD_data DataFrame
for index, row in filtered_skeletal_data.iterrows():

    # Query Entrez Gene database to retrieve gene information with retry mechanism
    gene_record = retrieve_gene_info_with_retry_input_symbol(filtered_skeletal_data['gene symbol'][index])

    # Check if gene_record is None (indicating an error occurred)
    if gene_record is None:
        # Append None values to start and end positions lists
        start_pos_list.append(None)
        end_pos_list.append(None)
        continue

    # Check if the gene is located on chromosome 17
    if "chromosome 17" in gene_record:
        # Extract start and end positions
        start_pos = int(gene_record.split('\n')[6].split('(')[1].split('..')[0])
        end_pos = int(gene_record.split('\n')[6].split('(')[1].split('..')[1].split(',')[0].split(')')[0])
    else:
        start_pos = None
        end_pos = None

    # Append start and end positions to lists
    start_pos_list.append(start_pos)
    end_pos_list.append(end_pos)

# Add new columns to CGD_data DataFrame
filtered_skeletal_data['start_pos'] = start_pos_list
filtered_skeletal_data['end_pos'] = end_pos_list

# Filter rows where both 'start_pos' and 'end_pos' columns have values
filtered_skeletal_data_with_positions = filtered_skeletal_data.dropna(subset=['start_pos', 'end_pos'])

# Save the filtered data to a new CSV file
filtered_file_path_with_position = "/content/drive/MyDrive/Mode of Inheritance Task/Mode of Inheritance Task files/preprocessed databases/filtered_skeletal_data_with_position.csv"
filtered_skeletal_data_with_positions.to_csv(filtered_file_path_with_position, index=False)


In [35]:
# gene2phenotype: 6
# skin
import pandas as pd
file_path = '/content/drive/MyDrive/Mode of Inheritance Task/Mode of Inheritance Task files/databases/gene2phenotype/SkinG2P_26_2_2024.csv.gz'
skin_data = pd.read_csv(file_path)

In [36]:
# skin
# preprocess database: filter some columns and add MOI column based on disease name
filtered_columns = ['gene symbol', 'disease name']

filtered_skin_data = skin_data[filtered_columns]

filtered_skin_data.loc[:, 'MOI'] = ''

for i, name in enumerate(filtered_skin_data['disease name']):
    MOI_list = []
    if "dominant" in str(name).lower() and "recessive" in str(name).lower():
      MOI_list.append('AD/AR')
    elif "dominant" in str(name).lower():
      MOI_list.append('AD')
    elif "recessive" in str(name).lower():
      MOI_list.append('AR')
    else:
      MOI_list.append('')
    filtered_skin_data.loc[i, 'MOI'] = ', '.join(MOI_list)

filtered_skin_data = filtered_skin_data[filtered_skin_data['MOI'] != '']
output_file_path = "/content/drive/MyDrive/Mode of Inheritance Task/Mode of Inheritance Task files/preprocessed databases/filtered_skin_data.csv"

filtered_skin_data.to_csv(output_file_path, index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_skin_data.loc[:, 'MOI'] = ''


In [37]:
# pos in skin
import pandas as pd
from Bio import Entrez
import time

file_path = "/content/drive/MyDrive/Mode of Inheritance Task/Mode of Inheritance Task files/preprocessed databases/filtered_skin_data.csv"

filtered_skin_data = pd.read_csv(file_path)

# Entrez email address (required by NCBI)
Entrez.email = "alihajisadeghian250@gmail.com"

start_pos_list = []
end_pos_list = []

# Function to retrieve gene information with retry mechanism
def retrieve_gene_info_with_retry_input_symbol(gene_symbol, max_retries=5, delay=1):
    num_retries = 0
    while num_retries < max_retries:
        try:
            handle = Entrez.esearch(db="gene", term=gene_symbol)
            record = Entrez.read(handle)
            gene_id = record['IdList'][0]
            handle = Entrez.efetch(db="gene", id=gene_id, rettype="gb", retmode="text")
            gene_record = handle.read()
            handle.close()
            return gene_record
        except Exception as e:
            print("Error retrieving gene info for gene symbol:", gene_symbol)
            print("Error message:", e)
            print("Retrying...")
            num_retries += 1
            time.sleep(delay)
    return None

# Iterate over each row in CGD_data DataFrame
for index, row in filtered_skin_data.iterrows():

    # Query Entrez Gene database to retrieve gene information with retry mechanism
    gene_record = retrieve_gene_info_with_retry_input_symbol(filtered_skin_data['gene symbol'][index])

    # Check if gene_record is None (indicating an error occurred)
    if gene_record is None:
        # Append None values to start and end positions lists
        start_pos_list.append(None)
        end_pos_list.append(None)
        continue

    # Check if the gene is located on chromosome 17
    if "chromosome 17" in gene_record:
        # Extract start and end positions
        start_pos = int(gene_record.split('\n')[6].split('(')[1].split('..')[0])
        end_pos = int(gene_record.split('\n')[6].split('(')[1].split('..')[1].split(',')[0].split(')')[0])
    else:
        start_pos = None
        end_pos = None

    # Append start and end positions to lists
    start_pos_list.append(start_pos)
    end_pos_list.append(end_pos)

# Add new columns to CGD_data DataFrame
filtered_skin_data['start_pos'] = start_pos_list
filtered_skin_data['end_pos'] = end_pos_list

# Filter rows where both 'start_pos' and 'end_pos' columns have values
filtered_skin_data_with_positions = filtered_skin_data.dropna(subset=['start_pos', 'end_pos'])

# Save the filtered data to a new CSV file
filtered_file_path_with_position = "/content/drive/MyDrive/Mode of Inheritance Task/Mode of Inheritance Task files/preprocessed databases/filtered_skin_data_with_position.csv"
filtered_skin_data_with_positions.to_csv(filtered_file_path_with_position, index=False)


In [None]:
# temp
vcf_directory = '/content/drive/MyDrive/Mode of Inheritance Task/Mode of Inheritance Task files/vcf_files'
ID_list = []
pos_list = []
for filename in os.listdir(vcf_directory):
  vcf_path = os.path.join(vcf_directory, filename)
  vcf_file = pysam.VariantFile(vcf_path)
  for variant in vcf_file:
    if variant.pos == 4898816:
      print(variant)
  vcf_file.close()

chr17	4898816	chr17_4898816_C_G	C	G	2335.6	.	AC=1;AF=0.5;AN=2;BaseQRankSum=-0.09;DP=226;ExcessHet=3.0103;FS=1.022;MLEAC=1;MLEAF=0.5;MQ=60;MQRankSum=0;QD=10.33;ReadPosRankSum=1.425;SOR=0.777	GT:AD:DP:GQ:PL	0/1:117,109:226:99:2343,0,2512

chr17	4898816	chr17_4898816_C_G	C	G	1647.6	.	AC=1;AF=0.5;AN=2;BaseQRankSum=-1.012;DP=147;ExcessHet=3.0103;FS=0;MLEAC=1;MLEAF=0.5;MQ=60;MQRankSum=0;QD=11.21;ReadPosRankSum=0.455;SOR=0.719	GT:AD:DP:GQ:PL	0/1:73,74:147:99:1655,0,1696



In [None]:
# step 1 : save ID and pos of all varients in a list
vcf_directory = '/content/drive/MyDrive/Mode of Inheritance Task/Mode of Inheritance Task files/vcf_files'
ID_list = []
pos_list = []
for filename in os.listdir(vcf_directory):
  vcf_path = os.path.join(vcf_directory, filename)
  vcf_file = pysam.VariantFile(vcf_path)
  for variant in vcf_file:
    ID_list.append(variant.id)
    pos_list.append(variant.pos)
  vcf_file.close()


In [None]:
# step 2 : create a csv file temp_output.csv

csv_data = {'ID': ID_list,
            'pos': pos_list,

            'symbol_OMIM': [None for i in range(len(ID_list))],
            'MOI_OMIM': [None for i in range(len(ID_list))],

            'symbol_CGD': [None for i in range(len(ID_list))],
            'MOI_CGD': [None for i in range(len(ID_list))],

            'symbol_ClinGen': [None for i in range(len(ID_list))],
            'MOI_ClinGen': [None for i in range(len(ID_list))],

            'symbol_gene2phenotype': [None for i in range(len(ID_list))],
            'MOI_gene2phenotype': [None for i in range(len(ID_list))],

            'symbol_GenCC': [None for i in range(len(ID_list))],
            'MOI_GenCC': [None for i in range(len(ID_list))],

            'P_AD': [None for i in range(len(ID_list))]}

df = pd.DataFrame(csv_data)

df.to_csv('/content/drive/MyDrive/Mode of Inheritance Task/Mode of Inheritance Task files/temp_output.csv', index=False)

In [None]:
len(pos_list)

22554

In [None]:
gene_interval_list = [(omim_data['Genomic Position Start'][i], omim_data['Genomic Position End'][i]) for i in range(len(omim_data))]

In [None]:
len(gene_interval_list)

1067

In [None]:
len(pos_list)

22554

In [None]:
df = pd.read_csv('/content/drive/MyDrive/Mode of Inheritance Task/Mode of Inheritance Task files/temp_output_updated.csv')

In [None]:
df[df['pos'] == 4898816]

Unnamed: 0,ID,pos,symbol_OMIM,MOI_OMIM,symbol_CGD,MOI_CGD,symbol_ClinGen,MOI_ClinGen,symbol_gene2phenotype,MOI_gene2phenotype,symbol_GenCC,MOI_GenCC,P_AD
10173,chr17_4898816_C_G,4898816,,AD/AR,,,,,,,,,
16964,chr17_4898816_C_G,4898816,,AD/AR,,,,,,,,,


In [None]:
indexes = [index for index, value in enumerate(pos_list) if value == 4898816]
print(indexes)

[10173, 16964]


In [None]:
len(set(pos_list))

5737

In [None]:
pos_index_dict = {}
for pos in pos_list:
  index_list = []
  for i ,(start, end) in enumerate(gene_interval_list):
    if start <= pos <= end:
      index_list.append(i)
  pos_index_dict[pos] = index_list

In [None]:
len(pos_index_dict)

5737

In [None]:
count = 0
for pos in pos_index_dict.keys():
  if count <= 10:
    print(pos,': ',pos_index_dict[pos])
    count += 1

137603 :  [0, 1, 2, 3, 4, 5, 6, 7, 8, 1065, 1066]
138213 :  [0, 1, 2, 3, 4, 5, 6, 7, 8, 1065, 1066]
261904 :  [0, 1, 2, 3, 4, 5, 6, 7, 8, 10, 1065, 1066]
321346 :  [0, 1, 2, 3, 4, 5, 6, 7, 8, 10, 1065, 1066]
410351 :  [0, 1, 2, 3, 4, 5, 6, 7, 8, 1065, 1066]
413503 :  [0, 1, 2, 3, 4, 5, 6, 7, 8, 1065, 1066]
562535 :  [0, 1, 2, 3, 4, 5, 6, 7, 8, 12, 1065, 1066]
562753 :  [0, 1, 2, 3, 4, 5, 6, 7, 8, 12, 1065, 1066]
733080 :  [0, 1, 2, 3, 4, 5, 6, 7, 8, 13, 1065, 1066]
744946 :  [0, 1, 2, 3, 4, 5, 6, 7, 8, 14, 1065, 1066]
745827 :  [0, 1, 2, 3, 4, 5, 6, 7, 8, 14, 1065, 1066]


In [None]:
pos_index_list = []
for pos in pos_list:
    index_list = []
    for i, (start, end) in enumerate(gene_interval_list):
        if start <= pos <= end:
            index_list.append(i)
    pos_index_list.append((pos, index_list))


In [None]:
len(pos_index_list)

22554

In [None]:
count = 10
for i, (pos, index_list) in enumerate(pos_index_list):
  if i < count:
    print(pos, ' : ', index_list)

137603  :  [0, 1, 2, 3, 4, 5, 6, 7, 8, 1065, 1066]
138213  :  [0, 1, 2, 3, 4, 5, 6, 7, 8, 1065, 1066]
261904  :  [0, 1, 2, 3, 4, 5, 6, 7, 8, 10, 1065, 1066]
321346  :  [0, 1, 2, 3, 4, 5, 6, 7, 8, 10, 1065, 1066]
410351  :  [0, 1, 2, 3, 4, 5, 6, 7, 8, 1065, 1066]
413503  :  [0, 1, 2, 3, 4, 5, 6, 7, 8, 1065, 1066]
562535  :  [0, 1, 2, 3, 4, 5, 6, 7, 8, 12, 1065, 1066]
562753  :  [0, 1, 2, 3, 4, 5, 6, 7, 8, 12, 1065, 1066]
733080  :  [0, 1, 2, 3, 4, 5, 6, 7, 8, 13, 1065, 1066]
744946  :  [0, 1, 2, 3, 4, 5, 6, 7, 8, 14, 1065, 1066]


In [None]:
pos_phenotype_dict = {}
for pos in pos_list:
  phenotype_list = []
  for index in pos_index_dict[pos]:
    phenotype_list.append(omim_data['Phenotypes'][index])
  pos_phenotype_dict[pos] = phenotype_list

In [None]:
len(pos_phenotype_dict)

5737

In [None]:
count = 0
for pos in pos_phenotype_dict.keys():
  if count <= 10:
    print(pos,': ',pos_phenotype_dict[pos])
    count += 1

137603 :  ['{Asperger syndrome susceptibility 2}, 608631 (2), Multifactorial, Isolated cases', 'Breast cancer (1)', 'Cataract 24, anterior polar, 601202 (2), Autosomal dominant', 'Dowling-Degos disease 3, 615674 (2), Autosomal dominant', 'Chromosome 17p13.3 duplication syndrome, 613215 (4)', 'Miller-Dieker lissencephaly syndrome, 247200 (4), Autosomal dominant', 'Nail disorder, nonsyndromic congenital, 7, 605779 (2), Autosomal dominant', 'Split-hand/foot malformation with long bone deficiency 3, 612576 (4), Autosomal dominant', '[Sex hormone-binding globulin circulating level QTL 1], 613498 (2)', nan, '{Opioid dependence, susceptibility to, 1}, 610064 (2)']
138213 :  ['{Asperger syndrome susceptibility 2}, 608631 (2), Multifactorial, Isolated cases', 'Breast cancer (1)', 'Cataract 24, anterior polar, 601202 (2), Autosomal dominant', 'Dowling-Degos disease 3, 615674 (2), Autosomal dominant', 'Chromosome 17p13.3 duplication syndrome, 613215 (4)', 'Miller-Dieker lissencephaly syndrome, 24

In [None]:
pos_phenotype_list = []
for (pos, index_list) in pos_index_list:
  phenotype_list = []
  for index in index_list:
    phenotype_list.append(omim_data['Phenotypes'][index])
  pos_phenotype_list.append((pos, phenotype_list))

In [None]:
len(pos_phenotype_list)

22554

In [None]:
count = 10
for i, (pos, phenotype_list) in enumerate(pos_phenotype_list):
  if i < count:
    print(pos, " : ", phenotype_list)

137603  :  ['{Asperger syndrome susceptibility 2}, 608631 (2), Multifactorial, Isolated cases', 'Breast cancer (1)', 'Cataract 24, anterior polar, 601202 (2), Autosomal dominant', 'Dowling-Degos disease 3, 615674 (2), Autosomal dominant', 'Chromosome 17p13.3 duplication syndrome, 613215 (4)', 'Miller-Dieker lissencephaly syndrome, 247200 (4), Autosomal dominant', 'Nail disorder, nonsyndromic congenital, 7, 605779 (2), Autosomal dominant', 'Split-hand/foot malformation with long bone deficiency 3, 612576 (4), Autosomal dominant', '[Sex hormone-binding globulin circulating level QTL 1], 613498 (2)', nan, '{Opioid dependence, susceptibility to, 1}, 610064 (2)']
138213  :  ['{Asperger syndrome susceptibility 2}, 608631 (2), Multifactorial, Isolated cases', 'Breast cancer (1)', 'Cataract 24, anterior polar, 601202 (2), Autosomal dominant', 'Dowling-Degos disease 3, 615674 (2), Autosomal dominant', 'Chromosome 17p13.3 duplication syndrome, 613215 (4)', 'Miller-Dieker lissencephaly syndrome, 

In [None]:
pos_MOIlist_dict = {}
for pos in pos_list:
  MOI_list = []
  for phenotype in pos_phenotype_dict[pos]:
    if "dominant" in str(phenotype).lower() and "recessive" in str(phenotype).lower():
      MOI_list.append('AD/AR')
    elif "dominant" in str(phenotype).lower():
      MOI_list.append('AD')
    elif "recessive" in str(phenotype).lower():
      MOI_list.append('AR')
    else:
      MOI_list.append('')
  pos_MOIlist_dict[pos] = MOI_list

In [None]:
len(pos_MOIlist_dict)

5737

In [None]:
count = 0
for pos in pos_MOIlist_dict.keys():
  if count <= 10:
    print(pos,': ',pos_MOIlist_dict[pos])
    count += 1

137603 :  ['', '', 'AD', 'AD', '', 'AD', 'AD', 'AD', '', '', '']
138213 :  ['', '', 'AD', 'AD', '', 'AD', 'AD', 'AD', '', '', '']
261904 :  ['', '', 'AD', 'AD', '', 'AD', 'AD', 'AD', '', '', '', '']
321346 :  ['', '', 'AD', 'AD', '', 'AD', 'AD', 'AD', '', '', '', '']
410351 :  ['', '', 'AD', 'AD', '', 'AD', 'AD', 'AD', '', '', '']
413503 :  ['', '', 'AD', 'AD', '', 'AD', 'AD', 'AD', '', '', '']
562535 :  ['', '', 'AD', 'AD', '', 'AD', 'AD', 'AD', '', 'AR', '', '']
562753 :  ['', '', 'AD', 'AD', '', 'AD', 'AD', 'AD', '', 'AR', '', '']
733080 :  ['', '', 'AD', 'AD', '', 'AD', 'AD', 'AD', '', '', '', '']
744946 :  ['', '', 'AD', 'AD', '', 'AD', 'AD', 'AD', '', 'AR', '', '']
745827 :  ['', '', 'AD', 'AD', '', 'AD', 'AD', 'AD', '', 'AR', '', '']


In [None]:
pos_MOIlist_list = []
for (pos, phenotype_list) in pos_phenotype_list:
  MOI_list = []
  for phenotype in phenotype_list:
    if "dominant" in str(phenotype).lower() and "recessive" in str(phenotype).lower():
      MOI_list.append('AD/AR')
    elif "dominant" in str(phenotype).lower():
      MOI_list.append('AD')
    elif "recessive" in str(phenotype).lower():
      MOI_list.append('AR')
    else:
      MOI_list.append('')
  pos_MOIlist_list.append((pos, MOI_list))

In [None]:
len(pos_MOIlist_list)

22554

In [None]:
count = 10
for i, (pos, MOIlist) in enumerate(pos_MOIlist_list):
  if i < count:
    print(pos, " : ", MOIlist)

137603  :  ['', '', 'AD', 'AD', '', 'AD', 'AD', 'AD', '', '', '']
138213  :  ['', '', 'AD', 'AD', '', 'AD', 'AD', 'AD', '', '', '']
261904  :  ['', '', 'AD', 'AD', '', 'AD', 'AD', 'AD', '', '', '', '']
321346  :  ['', '', 'AD', 'AD', '', 'AD', 'AD', 'AD', '', '', '', '']
410351  :  ['', '', 'AD', 'AD', '', 'AD', 'AD', 'AD', '', '', '']
413503  :  ['', '', 'AD', 'AD', '', 'AD', 'AD', 'AD', '', '', '']
562535  :  ['', '', 'AD', 'AD', '', 'AD', 'AD', 'AD', '', 'AR', '', '']
562753  :  ['', '', 'AD', 'AD', '', 'AD', 'AD', 'AD', '', 'AR', '', '']
733080  :  ['', '', 'AD', 'AD', '', 'AD', 'AD', 'AD', '', '', '', '']
744946  :  ['', '', 'AD', 'AD', '', 'AD', 'AD', 'AD', '', 'AR', '', '']


In [None]:
for pos in range(4898000, 4899965):
  if pos in pos_list:
    print(pos, ' : ', pos_MOIlist_dict[pos])

4898816  :  ['', 'AD', 'AD', 'AD', '', 'AR', 'AD/AR', '', '']
4898960  :  ['', 'AD', 'AD', 'AD', '', 'AR', 'AD/AR', '', '']
4899034  :  ['', 'AD', 'AD', 'AD', '', 'AR', 'AD/AR', '', '']
4899390  :  ['', 'AD', 'AD', 'AD', '', 'AR', 'AD/AR', '', '']
4899534  :  ['', 'AD', 'AD', 'AD', '', 'AR', 'AD/AR', '', '']


In [None]:
for position in range(4898000, 4899965):
  if position in pos_list:
    print(position, ' : ', [MOI_list for (pos, MOIlist) in pos_MOIlist_list if pos == position])

4898816  :  [['', '', '', '', 'AD', '', 'AR', '', '', '', '', ''], ['', '', '', '', 'AD', '', 'AR', '', '', '', '', '']]
4898960  :  [['', '', '', '', 'AD', '', 'AR', '', '', '', '', ''], ['', '', '', '', 'AD', '', 'AR', '', '', '', '', '']]
4899034  :  [['', '', '', '', 'AD', '', 'AR', '', '', '', '', ''], ['', '', '', '', 'AD', '', 'AR', '', '', '', '', '']]
4899390  :  [['', '', '', '', 'AD', '', 'AR', '', '', '', '', ''], ['', '', '', '', 'AD', '', 'AR', '', '', '', '', '']]
4899534  :  [['', '', '', '', 'AD', '', 'AR', '', '', '', '', '']]


In [None]:
omim_data['']

In [None]:
def determine_MOI(MOI_list):
    # Remove empty strings from MOI list
    filtered_MOI_list = [MOI for MOI in MOI_list if MOI]

    if not filtered_MOI_list:
        return 'Unknown'

    if len(set(filtered_MOI_list)) == 1:
        return filtered_MOI_list[0]

    # If both 'AD' and 'AR' present, assign 'AD/AR'
    if 'AD' in filtered_MOI_list and 'AR' in filtered_MOI_list:
        return 'AD/AR'

    # Handle conflicting MOIs
    return 'Mixed/Uncertain'

# Determine MOI for each variant

In [None]:
# Determine MOI for each variant
pos_MOI_dict = {}
for pos, MOI_list in pos_MOIlist_dict.items():
    MOI = determine_MOI(MOI_list)
    pos_MOI_dict[pos] = MOI

In [None]:
# Print variant MOIs
for pos, MOI in pos_MOI_dict.items():
    print(f"Variant at position {pos}: MOI = {MOI}")

In [None]:
for pos, MOI in pos_MOI_dict.items():
  if MOI == 'AR':
    print(pos)

27301794
27303362
27312362
27311633
27312303


In [None]:
import matplotlib.pyplot as plt

# Count occurrences of each MOI
MOIs_count = {'AD': 0, 'AR': 0, 'AD/AR': 0, 'Mixed/Uncertain': 0, 'Unknown': 0}

# Count occurrences of each MOI
for MOI in pos_MOI_dict.values():
    MOIs_count[MOI] += 1

# Convert counts to percentages
total_variants = sum(MOIs_count.values())
percentages = [count / total_variants * 100 for count in MOIs_count.values()]

# Define MOIs and colors
MOIs = list(MOIs_count.keys())
colors = ['lightblue', 'lightgreen', 'orange', 'lightcoral', 'lightgrey']

# Create figure and axis
fig, ax = plt.subplots()

# Plot circle
circle = plt.Circle((0, 0), 0.7, color='white')
ax.add_artist(circle)

# Plot pie chart
ax.pie(percentages, labels=MOIs, autopct='%1.1f%%', startangle=90, colors=colors)

# Equal aspect ratio ensures that pie is drawn as a circle
ax.axis('equal')

# Add title
plt.title('Percentage of Modes of Inheritance')

# Show plot
plt.show()


In [None]:
len(pos_list)

22554

In [None]:
len(pos_MOIlist_dict)

5737

In [None]:
len(pos_index_dict)

5737

In [None]:
# pos_symbollidt_dict
pos_symbollist_dict = {}
for pos in pos_list:
  symbollist = []
  MOI_list = pos_MOIlist_dict[pos]
  for i, MOI in enumerate(MOI_list):
    if MOI != '':
      index = pos_index_dict[pos][i]
      symbollist.append(omim_data['Gene/Locus And Other Related Symbols'][index])
  pos_symbollist_dict[pos] = symbollist

In [None]:
len(pos_symbollist_dict)

5737

In [None]:
count = 0
for pos in pos_symbollist_dict.keys():
  if count <= 10:
    print(pos,': ',pos_symbollist_dict[pos])
    count += 1

137603 :  ['CTRCT24, CTAA2', 'DDD3', 'MDLS, MDS, MDCR, DEL17p13.3, C17DELp13.3', 'NDNC7', 'SHFLD3, DUP17p13.3, C17DUPp13.3']
138213 :  ['CTRCT24, CTAA2', 'DDD3', 'MDLS, MDS, MDCR, DEL17p13.3, C17DELp13.3', 'NDNC7', 'SHFLD3, DUP17p13.3, C17DUPp13.3']
261904 :  ['CTRCT24, CTAA2', 'DDD3', 'MDLS, MDS, MDCR, DEL17p13.3, C17DELp13.3', 'NDNC7', 'SHFLD3, DUP17p13.3, C17DUPp13.3']
321346 :  ['CTRCT24, CTAA2', 'DDD3', 'MDLS, MDS, MDCR, DEL17p13.3, C17DELp13.3', 'NDNC7', 'SHFLD3, DUP17p13.3, C17DUPp13.3']
410351 :  ['CTRCT24, CTAA2', 'DDD3', 'MDLS, MDS, MDCR, DEL17p13.3, C17DELp13.3', 'NDNC7', 'SHFLD3, DUP17p13.3, C17DUPp13.3']
413503 :  ['CTRCT24, CTAA2', 'DDD3', 'MDLS, MDS, MDCR, DEL17p13.3, C17DELp13.3', 'NDNC7', 'SHFLD3, DUP17p13.3, C17DUPp13.3']
562535 :  ['CTRCT24, CTAA2', 'DDD3', 'MDLS, MDS, MDCR, DEL17p13.3, C17DELp13.3', 'NDNC7', 'SHFLD3, DUP17p13.3, C17DUPp13.3', 'VPS53, HCCS1, PCH2E']
562753 :  ['CTRCT24, CTAA2', 'DDD3', 'MDLS, MDS, MDCR, DEL17p13.3, C17DELp13.3', 'NDNC7', 'SHFLD3, DUP

In [None]:
count = 0
for pos in pos_MOI_dict.keys():
  if count <= 10:
    print(pos,': ',pos_MOI_dict[pos])
    count += 1

137603 :  AD
138213 :  AD
261904 :  AD
321346 :  AD
410351 :  AD
413503 :  AD
562535 :  AD/AR
562753 :  AD/AR
733080 :  AD
744946 :  AD/AR
745827 :  AD/AR


In [None]:
# adding OMIM database to temp_output csv file
df = pd.read_csv('/content/drive/MyDrive/Mode of Inheritance Task/Mode of Inheritance Task files/temp_output.csv')
for index, row in df.iterrows():
  pos = row['pos']
  df.at[index, 'MOI_OMIM'] = pos_MOI_dict[pos]
  df.at[index, 'symbol_OMIM'] = pos_symbollist_dict[pos]
df.to_csv('/content/drive/MyDrive/Mode of Inheritance Task/Mode of Inheritance Task files/temp_output_updated.csv', index=False)

ValueError: Must have equal len keys and value when setting with an iterable

In [None]:
!pip install biopython

Collecting biopython
  Downloading biopython-1.83-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: biopython
Successfully installed biopython-1.83


In [None]:
from Bio import Entrez

# Entrez email address (required by NCBI)
Entrez.email = "alihajisadeghian250@gmail.com"

# Entrez Gene ID of the gene you're interested in
entrez_gene_id = "2"

# Query Entrez Gene database to retrieve gene information
handle = Entrez.efetch(db="gene", id=entrez_gene_id, rettype="gb", retmode="text")
gene_record = handle.read()
handle.close()


In [None]:
gene_record

'\n1. A2M\nOfficial Symbol: A2M and Name: alpha-2-macroglobulin [Homo sapiens (human)]\nOther Aliases: A2MD, CPAMD5, FWP007, S863-7\nOther Designations: alpha-2-macroglobulin; C3 and PZP-like alpha-2-macroglobulin domain-containing protein 5; alpha-2-M\nChromosome: 12; Location: 12p13.31\nAnnotation: Chromosome 12 NC_000012.12 (9067708..9116229, complement)\nMIM: 103950\nID: 2\n\n'

In [None]:
chr = int(gene_record.split('\n')[5].split(' ')[1].split(';')[0])

In [None]:
start_pos = int(gene_record.split('\n')[6].split('(')[1].split('..')[0])

In [None]:
end_pos = int(gene_record.split('\n')[6].split('(')[1].split('..')[1].split(',')[0].split(')')[0])

In [None]:
chr

12

In [None]:
start_pos

9067708

In [None]:
end_pos

9116229