In [1]:
import os
import duckdb

# Location of the local ClinVar DuckDB database (may start with "~").
clinvar_duckdb_loc = (
    "~/dev/2023/cpvt-database-cleanup/data/clinvar/clinvar.duckdb"
)

# Resolve "~" once and use the resolved path for BOTH the existence check and
# the connection, so the file that is checked is the file that gets opened.
clinvar_duckdb_path = os.path.expanduser(clinvar_duckdb_loc)

# Fail early: report the resolved path that was actually looked up.
if not os.path.exists(clinvar_duckdb_path):
    raise FileNotFoundError(
        f"File not found: {clinvar_duckdb_path}"
    )

# Connection used by the query cells below.
conn = duckdb.connect(clinvar_duckdb_path)

In [2]:
# Preview five rows of the `clinvar` table to see which columns it has.
preview_sql = """
SELECT *
FROM clinvar
LIMIT 5
"""
conn.execute(preview_sql).fetchdf()

Unnamed: 0,#AlleleID,Type,Name,GeneID,GeneSymbol,HGNC_ID,ClinicalSignificance,ClinSigSimple,LastEvaluated,RS# (dbSNP),...,VariationID,PositionVCF,ReferenceAlleleVCF,AlternateAlleleVCF,SomaticClinicalImpact,SomaticClinicalImpactLastEvaluated,ReviewStatusClinicalImpact,Oncogenicity,OncogenicityLastEvaluated,ReviewStatusOncogenicity
0,15041,Indel,NM_014855.3(AP5Z1):c.80_83delinsTGCTGTAAACTGTA...,9907,AP5Z1,HGNC:22197,Pathogenic,0,-,397704705,...,2,4820844,GGAT,TGCTGTAAACTGTAACTGTAAA,-,-,-,-,-,-
1,15041,Indel,NM_014855.3(AP5Z1):c.80_83delinsTGCTGTAAACTGTA...,9907,AP5Z1,HGNC:22197,Pathogenic,0,-,397704705,...,2,4781213,GGAT,TGCTGTAAACTGTAACTGTAAA,-,-,-,-,-,-
2,15042,Deletion,NM_014855.3(AP5Z1):c.1413_1426del (p.Leu473fs),9907,AP5Z1,HGNC:22197,Pathogenic,0,"Jun 29, 2010",397704709,...,3,4827360,GCTGCTGGACCTGCC,G,-,-,-,-,-,-
3,15042,Deletion,NM_014855.3(AP5Z1):c.1413_1426del (p.Leu473fs),9907,AP5Z1,HGNC:22197,Pathogenic,0,"Jun 29, 2010",397704709,...,3,4787729,GCTGCTGGACCTGCC,G,-,-,-,-,-,-
4,15043,single nucleotide variant,NM_014630.3(ZNF592):c.3136G>A (p.Gly1046Arg),9640,ZNF592,HGNC:28986,Uncertain significance,0,"Jun 29, 2015",150829393,...,4,85342440,G,A,-,-,-,-,-,-


In [3]:
# Count the rows of `clinvar` whose Assembly is GRCh38.
grch38_count_sql = """
SELECT COUNT(*)
FROM clinvar
WHERE Assembly = 'GRCh38'
"""
conn.execute(grch38_count_sql).fetchall()

[(2921042,)]

In [4]:
# As of 07/2024 there are 2,921,042 variants in the GRCh38 assembly

In [5]:
# Top 25 gene symbols by number of GRCh38 rows in `clinvar`.
# GeneSymbol is a secondary sort key so that genes tied on num_variants come
# back in the same order (and the same ones survive the LIMIT) on every run.
conn.execute(
    """
SELECT
    GeneSymbol,
    COUNT(*) AS num_variants
FROM clinvar
WHERE Assembly = 'GRCh38'
GROUP BY GeneSymbol
ORDER BY num_variants DESC, GeneSymbol
LIMIT 25
"""
).fetchdf()

Unnamed: 0,GeneSymbol,num_variants
0,TTN,31076
1,BRCA2,18561
2,ATM,16427
3,APC,14607
4,BRCA1,14241
5,NF1,14046
6,NEB,10870
7,TSC2,10630
8,MSH6,9128
9,POLE,9122


In [6]:
# Peek at five GRCh38 rows whose gene symbol is RYR2.
ryr2_preview_sql = """
SELECT *
FROM clinvar
WHERE Assembly = 'GRCh38'
AND GeneSymbol = 'RYR2'
LIMIT 5
"""
conn.execute(ryr2_preview_sql).df()

Unnamed: 0,#AlleleID,Type,Name,GeneID,GeneSymbol,HGNC_ID,ClinicalSignificance,ClinSigSimple,LastEvaluated,RS# (dbSNP),...,VariationID,PositionVCF,ReferenceAlleleVCF,AlternateAlleleVCF,SomaticClinicalImpact,SomaticClinicalImpactLastEvaluated,ReviewStatusClinicalImpact,Oncogenicity,OncogenicityLastEvaluated,ReviewStatusOncogenicity
0,27993,single nucleotide variant,NM_001035.3(RYR2):c.6737C>T (p.Ser2246Leu),6262,RYR2,HGNC:10484,Pathogenic,0,"Jun 17, 2023",121918597,...,12954,237634937,C,T,-,-,-,-,-,-
1,27994,single nucleotide variant,NM_001035.3(RYR2):c.7422G>C (p.Arg2474Ser),6262,RYR2,HGNC:10484,Pathogenic,0,"Jan 16, 2001",121918598,...,12955,237648523,G,C,-,-,-,-,-,-
2,27995,single nucleotide variant,NM_001035.3(RYR2):c.12312C>G (p.Asn4104Lys),6262,RYR2,HGNC:10484,Pathogenic,0,"Jan 16, 2001",121918599,...,12956,237784024,C,G,-,-,-,-,-,-
3,27996,single nucleotide variant,NM_001035.3(RYR2):c.13489C>T (p.Arg4497Cys),6262,RYR2,HGNC:10484,Pathogenic,0,"Oct 11, 2023",121918600,...,12957,237791441,C,T,-,-,-,-,-,-
4,27997,single nucleotide variant,NM_001035.3(RYR2):c.7157A>T (p.Asn2386Ile),6262,RYR2,HGNC:10484,Pathogenic,0,"Nov 07, 2022",121918601,...,12958,237640938,A,T,-,-,-,-,-,-


In [7]:
# Load the PhenotypeList column of every GRCh38 RYR2 row into a DataFrame.
ryr2_phenotypes_sql = """
SELECT PhenotypeList
FROM clinvar
WHERE Assembly = 'GRCh38'
AND GeneSymbol = 'RYR2'
"""
phenotypes = conn.execute(ryr2_phenotypes_sql).df()

phenotypes.head()

Unnamed: 0,PhenotypeList
0,Catecholaminergic polymorphic ventricular tach...
1,Catecholaminergic polymorphic ventricular tach...
2,Catecholaminergic polymorphic ventricular tach...
3,Catecholaminergic polymorphic ventricular tach...
4,Catecholaminergic polymorphic ventricular tach...


In [8]:
# A PhenotypeList entry can hold several phenotypes separated by "|" or ";".
# Split on either separator (a multi-character pattern is treated as a regex
# by pandas' str.split) and put one phenotype per row.
# explode() replaces the previous split(expand=True) + stack(): it avoids
# building a wide NaN-padded frame and does not depend on stack() silently
# dropping NaN, which the legacy stack implementation did and which is
# deprecated in recent pandas. dropna() removes rows with no PhenotypeList.
phenotypes2 = (
    phenotypes["PhenotypeList"]
    .str.split("[|;]")
    .explode()
    .dropna()
    .reset_index(drop=True)
)

# Number of occurrences of each individual phenotype, most frequent first.
phenotypes2.value_counts()

Catecholaminergic polymorphic ventricular tachycardia 1                                          5131
Cardiomyopathy                                                                                   2497
Catecholaminergic polymorphic ventricular tachycardia                                            2366
not provided                                                                                     1675
Cardiovascular phenotype                                                                         1673
not specified                                                                                     793
Arrhythmogenic right ventricular dysplasia 2                                                      540
Ventricular arrhythmias due to cardiac ryanodine receptor calcium release deficiency syndrome     229
RYR2-related disorder                                                                              81
Arrhythmogenic right ventricular cardiomyopathy                                   

In [9]:
# Same breakdown for the PhenotypeIDS column of the GRCh38 RYR2 rows.
phenotype_ids = conn.execute("""
SELECT PhenotypeIDS
FROM clinvar
WHERE Assembly = 'GRCh38'
AND GeneSymbol = 'RYR2'
""").df()

# Split each entry on "|" or ";" (regex character class), put one ID group per
# row with explode(), and count occurrences. explode() avoids the wide
# NaN-padded frame that split(expand=True) + stack() produced; value_counts()
# ignores missing values by default, so no explicit dropna() is needed here.
phenotype_ids["PhenotypeIDS"].str.split("[|;]").explode().value_counts()

MONDO:MONDO:0011484,MedGen:C1631597,OMIM:604772,Orphanet:3286                                                                                                                                                                                                                                                                                                                              5131
Human Phenotype Ontology:HP:0001638,MONDO:MONDO:0004994,MedGen:C0878544,Orphanet:167848                                                                                                                                                                                                                                                                                                    2497
MONDO:MONDO:0017990,MedGen:C5574922,OMIM:PS604772,Orphanet:3286                                                                                                                                                                         

In [10]:
# Number of GRCh38 rows in `clinvar` whose gene symbol is RYR2.
ryr2_count_sql = """
SELECT COUNT(*)
FROM clinvar
WHERE Assembly = 'GRCh38'
AND GeneSymbol = 'RYR2'
"""
conn.execute(ryr2_count_sql).fetchdf()

Unnamed: 0,count_star()
0,7906
