In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt 

The file was downloaded from ClinVar, a resource of the National Institutes of Health (NIH). ClinVar is a freely accessible, public archive of reports of the relationships among human variations and phenotypes, with supporting evidence. It facilitates access to and communication about the relationships asserted between human variation and observed health status, and the history of that interpretation.

Reading the file

In [2]:
# Load the ClinVar gene-condition table from the project's raw-data folder.
gene_condition_csv = '../Data/raw/gene_condition_source_id.csv'
file = pd.read_csv(gene_condition_csv)

In [3]:
# Display the loaded table to inspect its columns and a sample of its rows.
file

Unnamed: 0,GeneID,AssociatedGenes,RelatedGenes,ConceptID,DiseaseName,SourceName,SourceID,DiseaseMIM,LastUpdated
0,144568,A2ML1,,C1833692,Otitis media: susceptibility to,MONDO,MONDO:0008162,166760.0,Feb 16 2016
1,53947,A4GALT,,C3549485,Blood group: P1PK system,NCBI curation,,111400.0,Mar 10 2022
2,8086,AAAS,,C0271742,Glucocorticoid deficiency with achalasia,NCBI curation,,231550.0,Feb 16 2016
3,79719,AAGAB,,CN031225,Palmoplantar keratoderma: punctate type 1A,MONDO,MONDO:0007858,148600.0,May 21 2021
4,16,AARS1,,C2750090,Charcot-Marie-Tooth disease axonal type 2N,MONDO,MONDO:0013212,613287.0,Jan 21 2022
...,...,...,...,...,...,...,...,...,...
12305,130557,,ZNF513,C0035334,Retinitis pigmentosa,OMIM phenotypic series,PS268000,268000.0,16. Feb 16
12306,130557,,ZNF513,C0035334,Retinitis pigmentosa,MONDO,MONDO:0019200,268000.0,16. Feb 16
12307,22917,,ZP1,CN238505,Inherited oocyte maturation defect,MONDO,MONDO:0014769,,19. Apr 22
12308,7783,,ZP2,CN238505,Inherited oocyte maturation defect,MONDO,MONDO:0014769,,19. Apr 22


Grouping the records by GeneID and counting the distinct disease names for each gene, to get an overall idea of the genes I will be dealing with in this project

In [4]:
# For each gene (keyed by GeneID), count how many distinct disease names
# it is linked to in the table.
disease_names_by_gene = file.groupby('GeneID')['DiseaseName']
gene_counts = disease_names_by_gene.nunique()


Here I want to find out which gene in the dataset is linked to the largest number of diseases

In [5]:
# idxmax() returns the index label (the GeneID) holding the largest count.
max_gene_id = gene_counts.idxmax()

print(f"Gene ID with the maximum number of disease names: {max_gene_id}")

Gene ID with the maximum number of disease names: 1280


According to the previous finding, the gene ID linked to the maximum number of diseases is 1280. Based on that, I want to estimate the probability that a defect leading to a disease or disorder falls on gene ID 1280, out of all gene IDs.

In [None]:
# Number of distinct gene IDs in the table (a missing ID, if present,
# is counted as one value, exactly as unique() would report it).
total_gene_ids = file['GeneID'].nunique(dropna=False)

In [None]:
# Gene to analyse: the gene linked to the most distinct diseases.
# Taken from `max_gene_id` (computed in an earlier cell) instead of retyping
# the number from that cell's printed output, so the choice stays correct
# if the underlying table is refreshed.
gene_id = max_gene_id

In [None]:
# Number of table rows whose GeneID equals the selected gene.
matches_selected_gene = file['GeneID'] == gene_id
gene_id_count = int(matches_selected_gene.sum())

In [None]:
# Estimated probability that a randomly chosen gene-disease record involves
# the selected gene.
# `gene_id_count` counts rows, so the denominator must be the total number of
# rows as well. Dividing a row count by the number of *unique* genes mixes
# units and gives a ratio that is not bounded by 1.
total_records = len(file)
probability = gene_id_count / total_records

print("Probability of a record involving gene ID", gene_id, "out of all gene-disease records:", probability)

After calculating the probability of a defect in the gene ID responsible for the maximum number of diseases, the next step is to calculate the probability of getting a disease with a defect on an associated gene as well as on a related gene.

In [None]:
# Every row of the table is treated as one disease record.
total_diseases = file.shape[0]

# Rows that name an associated gene, and how many of them there are.
associated_genes_data = file[file['AssociatedGenes'].notna()]
diseases_with_associated_genes = associated_genes_data.shape[0]

# Rows that name a related gene, and how many of them there are.
related_genes_data = file[file['RelatedGenes'].notna()]
diseases_with_related_genes = related_genes_data.shape[0]

# Share of all rows that falls into each group.
probability_associated_genes = diseases_with_associated_genes / total_diseases
probability_related_genes = diseases_with_related_genes / total_diseases

print(f"Probability of getting a disease with associated genes: {probability_associated_genes}")
print(f"Probability of getting a disease with related genes: {probability_related_genes}")

Figuring out the most common disorder from the dataset and the genes associated with the disorder

In [None]:
# Tally how often each disease name appears, then pick the most frequent one.
disease_name_frequencies = file['DiseaseName'].value_counts()
most_common_disorder = disease_name_frequencies.idxmax()

print(f"The most common disorder is: {most_common_disorder}")

In [None]:
# Rows for the most common disorder.
# The name comes from `most_common_disorder` (computed in the previous cell)
# rather than a hard-coded 'Cardiomyopathy', so the selection and the printed
# labels always agree with what the data actually reports.
filtered_data = file[file['DiseaseName'] == most_common_disorder]

# Distinct, non-missing gene symbols in each gene column for that disorder.
associated_genes = filtered_data['AssociatedGenes'].dropna().unique().tolist()
related_genes = filtered_data['RelatedGenes'].dropna().unique().tolist()

print(f"Associated Genes for {most_common_disorder}:")
for gene in associated_genes:
    print(gene)

print(f"\nRelated Genes for {most_common_disorder}:")
for gene in related_genes:
    print(gene)

Finding out the top 5 gene IDs causing the maximum number of diseases and plotting them 

In [None]:
# Number of distinct disease names linked to each gene (indexed by GeneID).
gene_counts = file.groupby('GeneID')['DiseaseName'].nunique()

# The five genes with the highest counts, largest first. nlargest() states
# the intent directly and keeps tied genes in their original (GeneID) order.
top_5_gene_ids = gene_counts.nlargest(5)

print("Top 5 gene IDs causing the maximum number of diseases:")
# Dedicated loop names are used so this loop does not overwrite the
# notebook-level `gene_id` that the probability cells rely on.
for top_gene_id, disease_count in top_5_gene_ids.items():
    print("Gene ID:", top_gene_id, " | Number of Diseases:", disease_count)

In [None]:
import seaborn as sns

In [None]:
# Tabulate the top-5 series so seaborn can address the columns by name.
df = pd.DataFrame({
    'Gene ID': top_5_gene_ids.index,
    'Number of Diseases': top_5_gene_ids.values,
})

# Draw the bars on an explicitly created 10x6-inch figure/axes pair.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=df, x='Gene ID', y='Number of Diseases', color='orange', ax=ax)

# Label the axes and title the chart so the figure stands on its own.
ax.set_xlabel('Gene ID')
ax.set_ylabel('Number of Diseases')
ax.set_title('Top 5 Gene IDs Causing the Maximum Number of Diseases')

# Tighten the layout, then render.
fig.tight_layout()
plt.show()




Finding out the gene IDs and the names of the diseases and disorders associated with the top 5 diseases

In [None]:
# Collect every distinct gene ID present in the table.
unique_gene_ids = file['GeneID'].unique()

# Show only a short preview: printing one line per gene floods the notebook
# output and buries the narrative. The full array stays in `unique_gene_ids`.
# The loop uses its own name so the notebook-level `gene_id` chosen for the
# probability analysis is not overwritten.
preview_ids = unique_gene_ids[:10]

print("Unique Gene IDs:")
for preview_id in preview_ids:
    print(preview_id)
print("... showing the first", len(preview_ids), "of", len(unique_gene_ids), "unique gene IDs")

In [None]:
# Length of the array of distinct gene IDs.
num_unique_ids = unique_gene_ids.size

print(f"Number of Unique Gene IDs: {num_unique_ids}")

In [None]:
# The five disease names that occur on the most rows
# (value_counts() orders names by frequency, highest first).
top_5_diseases = file['DiseaseName'].value_counts().head(5).index.tolist()

# Rows belonging to any of those five diseases.
filtered_data = file[file['DiseaseName'].isin(top_5_diseases)]

# Distinct genes appearing on those rows.
gene_ids = filtered_data['GeneID'].unique()

print("Gene IDs associated with the top 5 disease names:")
# The loop variable is deliberately not called `gene_id`: that name holds the
# gene selected for the probability analysis and must not be overwritten here.
for top_disease_gene_id in gene_ids:
    print(top_disease_gene_id)

In [None]:
# The five disease names that occur on the most rows, and the rows for them.
top_5_diseases = file['DiseaseName'].value_counts().head(5).index.tolist()
filtered_data = file[file['DiseaseName'].isin(top_5_diseases)]

# Pair each row's gene ID with its disease name. Zipping the two columns
# yields the same (GeneID, DiseaseName) tuples as walking the frame with
# iterrows(), without building a Series object for every row.
gene_disease_list = list(zip(filtered_data['GeneID'], filtered_data['DiseaseName']))

print("Gene IDs and Associated Disease Names for the top 5 disease names:")
# `paired_gene_id` is used instead of `gene_id` so the notebook-level gene
# selected for the probability analysis is not overwritten.
for paired_gene_id, disease_name in gene_disease_list:
    print("Gene ID:", paired_gene_id, "| Disease Name:", disease_name)

Finding out whether the most frequently occurring disorder is on associated or related genes, which will help to figure out whether or not it is an acquired condition

In [None]:
# Rows for the most common disorder. The name is read from
# `most_common_disorder` (computed in an earlier cell) rather than hard-coded.
# The variable keeps its original name, `cardiomyopathy_data`, for
# compatibility with any later cells.
cardiomyopathy_data = file[file['DiseaseName'] == most_common_disorder]

# Count distinct genes, not rows: the table repeats the same gene/disease pair
# once per source (e.g. one row for MONDO and one for OMIM), so counting
# non-null rows would count such a gene several times.
# nunique() ignores missing values.
num_associated_genes = cardiomyopathy_data['AssociatedGenes'].nunique()
num_related_genes = cardiomyopathy_data['RelatedGenes'].nunique()

print(f"Number of associated genes for {most_common_disorder}:", num_associated_genes)
print(f"Number of related genes for {most_common_disorder}:", num_related_genes)
