# Create OMIM inheritance map

In [1]:
# You will need to apply for access to OMIM's data downloads in order to replicate this process
# Download the gememap2.txt file from OMIM's registered data downloads
import pandas
genemap=pandas.read_csv('OMIM/genemap2.txt',sep='\t',low_memory=False,comment='#')
genemap=genemap.loc[:,['MIM Number','Approved Gene Symbol','Ensembl Gene ID','Phenotypes']]
genemap=genemap.loc[~genemap['Approved Gene Symbol'].isna(),:]
genemap=genemap.loc[~genemap['Phenotypes'].isna(),:]
genemap

Unnamed: 0,MIM Number,Approved Gene Symbol,Ensembl Gene ID,Phenotypes
24,147571,ISG15,ENSG00000187608,"Immunodeficiency 38, 616126 (3), Autosomal rec..."
25,103320,AGRN,ENSG00000188157,"Myasthenic syndrome, congenital, 8, with pre- ..."
30,600315,TNFRSF4,ENSG00000186827,"?Immunodeficiency 16, 615593 (3), Autosomal re..."
32,615291,B3GALT6,ENSG00000176022,"Ehlers-Danlos syndrome, spondylodysplastic typ..."
36,611354,INTS11,ENSG00000127054,Neurodevelopmental disorder with motor and lan...
...,...,...,...,...
18223,300777,TMLHE,ENSG00000185973,"{Autism, susceptibility to, X-linked 6}, 30087..."
18236,400020,SHOX,ENSG00000185960,"Short stature, idiopathic familial, 300582 (3)..."
18245,480000,SRY,ENSG00000184895,"46XY sex reversal 1, 400044 (3), Y-linked; 46X..."
18251,400033,TBL1Y,ENSG00000092377,"?Deafness, Y-linked 2, 400047 (3), Y-linked"


In [2]:
mimToGene=genemap.loc[:,['MIM Number','Approved Gene Symbol']]
phenotypes=pandas.DataFrame(genemap['Phenotypes'].str.split(';').tolist(), index=genemap['MIM Number'].values).stack().str.strip()
phenotypes

147571  0    Immunodeficiency 38, 616126 (3), Autosomal rec...
103320  0    Myasthenic syndrome, congenital, 8, with pre- ...
600315  0    ?Immunodeficiency 16, 615593 (3), Autosomal re...
615291  0    Ehlers-Danlos syndrome, spondylodysplastic typ...
        1    Spondyloepimetaphyseal dysplasia with joint la...
                                   ...                        
400020  2    Leri-Weill dyschondrosteosis, 127300 (3), Pseu...
480000  0            46XY sex reversal 1, 400044 (3), Y-linked
        1    46XX sex reversal 1, 400045 (4), X-linked domi...
400033  0          ?Deafness, Y-linked 2, 400047 (3), Y-linked
400005  0    Spermatogenic failure, Y-linked, 2, 415000 (3)...
Length: 7471, dtype: object

In [3]:
filter=((phenotypes.str[0]=="?") | (phenotypes.str[0]=="[") | (phenotypes.str[0]=="{"))
phenotypes2=phenotypes.loc[~filter,:]
phenotypes2
filter2=phenotypes2.str.contains(" [0-9]{6} \(")
phenotypes2=phenotypes2.loc[filter2,:]
phenotypes2

147571  0    Immunodeficiency 38, 616126 (3), Autosomal rec...
103320  0    Myasthenic syndrome, congenital, 8, with pre- ...
615291  0    Ehlers-Danlos syndrome, spondylodysplastic typ...
        1    Spondyloepimetaphyseal dysplasia with joint la...
        2    Al-Gazali syndrome, 609465 (3), Autosomal rece...
                                   ...                        
400020  1    Langer mesomelic dysplasia, 249700 (3), Pseudo...
        2    Leri-Weill dyschondrosteosis, 127300 (3), Pseu...
480000  0            46XY sex reversal 1, 400044 (3), Y-linked
        1    46XX sex reversal 1, 400045 (4), X-linked domi...
400005  0    Spermatogenic failure, Y-linked, 2, 415000 (3)...
Length: 5928, dtype: object

In [4]:
locusMIMs=phenotypes2.index.get_level_values(0)
phenotypeMIMs=phenotypes2.str.split("\([0-9]\)").str[0].str.split(',').str[-1].str.strip()
phenotypeInheritances=phenotypes2.str.split("\([0-9]\), ").str[1].str.strip()
mappingMethod=phenotypes2.str.split(" [0-9]{6} \(").str[1].str.split(')').str[0]

In [5]:
newMap=pandas.DataFrame({'geneMIM':locusMIMs,'phenoMIM':phenotypeMIMs.values,'inheritance':phenotypeInheritances.values,'mappingMethod':mappingMethod.values})
newMap

Unnamed: 0,geneMIM,phenoMIM,inheritance,mappingMethod
0,147571,616126,Autosomal recessive,3
1,103320,615120,Autosomal recessive,3
2,615291,615349,Autosomal recessive,3
3,615291,271640,Autosomal recessive,3
4,615291,609465,Autosomal recessive,3
...,...,...,...,...
5923,400020,249700,Pseudoautosomal recessive,3
5924,400020,127300,Pseudoautosomal dominant,3
5925,480000,400044,Y-linked,3
5926,480000,400045,X-linked dominant,4


In [6]:
newMap.to_csv('OMIM/geneToPhenoToInheritanceMap.txt',sep='\t',index=False)

In [7]:
newMap2=newMap.loc[((newMap['inheritance']=="Autosomal dominant") | (newMap['inheritance']=="Autosomal recessive")),:]
newMap3=newMap2.loc[newMap2['mappingMethod']=='3',:]
# add back in the gene names
mimToGene=mimToGene.rename(columns={'MIM Number':'geneMIM','Approved Symbol':'GeneSymbol'})
newMap4=newMap3.merge(mimToGene,how='inner',on='geneMIM')
newMap4.to_csv('OMIM/geneToPhenoToInheritanceMap_filtered.txt',sep='\t',index=False)
newMap4

Unnamed: 0,geneMIM,phenoMIM,inheritance,mappingMethod,Approved Gene Symbol
0,147571,616126,Autosomal recessive,3,ISG15
1,103320,615120,Autosomal recessive,3,AGRN
2,615291,615349,Autosomal recessive,3,B3GALT6
3,615291,271640,Autosomal recessive,3,B3GALT6
4,615291,609465,Autosomal recessive,3,B3GALT6
...,...,...,...,...,...
4939,604272,604377,Autosomal recessive,3,SCO2
4940,131222,603041,Autosomal recessive,3,TYMP
4941,612395,602541,Autosomal recessive,3,CHKB
4942,607574,250100,Autosomal recessive,3,ARSA


# Parse clinvar variants

## 2021.01

In [1]:
import pandas
inheritanceMap=pandas.read_csv('OMIM/geneToPhenoToInheritanceMap_filtered.txt',sep='\t',low_memory=False)
inheritanceMap=inheritanceMap.drop_duplicates(subset=['geneMIM','phenoMIM','inheritance'],keep='first')
inheritanceMap

Unnamed: 0,geneMIM,phenoMIM,inheritance,mappingMethod,Approved Gene Symbol
0,147571,616126,Autosomal recessive,3,ISG15
1,103320,615120,Autosomal recessive,3,AGRN
2,615291,615349,Autosomal recessive,3,B3GALT6
3,615291,271640,Autosomal recessive,3,B3GALT6
4,615291,609465,Autosomal recessive,3,B3GALT6
...,...,...,...,...,...
4939,604272,604377,Autosomal recessive,3,SCO2
4940,131222,603041,Autosomal recessive,3,TYMP
4941,612395,602541,Autosomal recessive,3,CHKB
4942,607574,250100,Autosomal recessive,3,ARSA


In [2]:
# ClinVar files downloaded from: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/archive/
clinvar=pandas.read_csv('clinvar/variant_summary_2021-01.txt',sep='\t',low_memory=False)
clinvar.loc[:,'Type'].value_counts()

single nucleotide variant    1408788
Deletion                       97092
Duplication                    42477
copy number loss               35139
copy number gain               34897
Microsatellite                 22640
Indel                          11130
Insertion                       8313
Variation                        780
Inversion                        717
Translocation                    307
protein only                     103
Complex                           62
fusion                             6
Tandem duplication                 1
Name: Type, dtype: int64

In [7]:
clinvar=clinvar.loc[clinvar['Assembly']=="GRCh37",:]
clinvar=clinvar.loc[((clinvar['Type']=="single nucleotide variant") | (clinvar['Type']=="Deletion") | (clinvar['Type']=="Duplication") | (clinvar['Type']=="Microsatellite") | (clinvar['Type']=="Indel") | (clinvar['Type']=="Insertion")),:]
benign=clinvar.loc[((clinvar['ClinicalSignificance']=="Benign") | (clinvar['ClinicalSignificance']=="Likely benign") | (clinvar['ClinicalSignificance']=="Benign/Likely benign")),:]
benign=benign.loc[~(benign['ReviewStatus']=="no assertion criteria provided"),['Chromosome','PositionVCF','ReferenceAlleleVCF','AlternateAlleleVCF']]
benign=benign.rename(columns={'PositionVCF':'Position','ReferenceAlleleVCF':'ReferenceAllele','AlternateAlleleVCF':'AlternateAllele'}).sort_values(by=['Chromosome','Position','ReferenceAllele','AlternateAllele'])
benign.to_csv('clinvar/clinvar_202101_oneToFourStarBenignVariants_locations.txt',sep='\t',index=False)
clinvar=clinvar.loc[((clinvar['ClinicalSignificance']=="Pathogenic") | (clinvar['ClinicalSignificance']=="Likely pathogenic") | (clinvar['ClinicalSignificance']=="Pathogenic/Likely pathogenic")),:]
clinvar=clinvar.loc[~(clinvar['OriginSimple']=="somatic"),:]
clinvar=clinvar.loc[clinvar['PhenotypeIDS'].str.contains("OMIM:"),:]

In [16]:
# get the relationships between clinvar allele IDs and omim phenotype ids 
clinvarOmimPheno=pandas.DataFrame(clinvar['PhenotypeIDS'].str.split(';').tolist(), index=clinvar['AlleleID'].values).stack()
clinvarOmimPheno=clinvarOmimPheno[clinvarOmimPheno.str.contains("OMIM:")]
clinvarAlleleIDs=clinvarOmimPheno.index.get_level_values(0)
clinvarAlleleOmimIDs=clinvarOmimPheno.str.split("OMIM:").str[1].str.split(",").str[0].str.strip()
clinvarAllelesToOmimIDs=pandas.DataFrame({'clinvarAlleleID':clinvarAlleleIDs,'phenoMIM':clinvarAlleleOmimIDs.values})

In [17]:
# select down to just the allele IDs for which we have at least one inheritance pattern in omim
clinvarAllelesWithInheritance=clinvarAllelesToOmimIDs.merge(inheritanceMap,how='inner',on='phenoMIM')
# select down to just the variants without conflicts in the inheritance
clinvarAllelesWithInheritance2=clinvarAllelesWithInheritance.drop_duplicates(subset=['clinvarAlleleID','inheritance'],keep='first')
clinvarAllelesWithInheritance3=clinvarAllelesWithInheritance2.drop_duplicates(subset='clinvarAlleleID',keep=False)
# get the original data on the alleles with omim inheritance
clinvar2=clinvar.merge(clinvarAllelesWithInheritance3,how='inner',left_on=['AlleleID'],right_on=['clinvarAlleleID'])
clinvar2.loc[:,'ReviewStatus'].value_counts()

criteria provided, single submitter                     29424
no assertion criteria provided                          20335
criteria provided, multiple submitters, no conflicts    10668
reviewed by expert panel                                 1171
practice guideline                                         14
Name: ReviewStatus, dtype: int64

In [18]:
clinvar3=clinvar2.loc[~(clinvar2['ReviewStatus']=="no assertion criteria provided"),:]
len(clinvar3.loc[clinvar3['inheritance']=='Autosomal recessive',:]) # this is the number of recessive variants we have now (but this includes coding and noncoding)

24054

In [19]:
len(clinvar3.loc[clinvar3['inheritance']=='Autosomal dominant',:]) # this is the number of dominant variants we have now (but this includes coding and noncoding)

17223

In [23]:
clinvar3AR=clinvar3.loc[clinvar3['inheritance']=='Autosomal recessive',['Chromosome','PositionVCF','ReferenceAlleleVCF','AlternateAlleleVCF']]
clinvar3AD=clinvar3.loc[clinvar3['inheritance']=='Autosomal dominant',['Chromosome','PositionVCF','ReferenceAlleleVCF','AlternateAlleleVCF']]
clinvar3AR=clinvar3AR.rename(columns={'PositionVCF':'Position','ReferenceAlleleVCF':'ReferenceAllele','AlternateAlleleVCF':'AlternateAllele'}).sort_values(by=['Chromosome','Position','ReferenceAllele','AlternateAllele'])
clinvar3AD=clinvar3AD.rename(columns={'PositionVCF':'Position','ReferenceAlleleVCF':'ReferenceAllele','AlternateAlleleVCF':'AlternateAllele'}).sort_values(by=['Chromosome','Position','ReferenceAllele','AlternateAllele'])
clinvar3AR=clinvar3AR.loc[clinvar3AR['Position']!=-1,:]
clinvar3AD=clinvar3AD.loc[clinvar3AD['Position']!=-1,:]
clinvar3AR.to_csv('clinvar/clinvar_202101_oneToFourStarPathogenicVariants_autosomalRecessive_locations.txt',sep='\t',index=False)
clinvar3AD.to_csv('clinvar/clinvar_202101_oneToFourStarPathogenicVariants_autosomalDominant_locations.txt',sep='\t',index=False)


## 2022.01

In [1]:
import pandas
inheritanceMap=pandas.read_csv('OMIM/geneToPhenoToInheritanceMap_filtered.txt',sep='\t',low_memory=False)
inheritanceMap=inheritanceMap.drop_duplicates(subset=['geneMIM','phenoMIM','inheritance'],keep='first')
inheritanceMap

Unnamed: 0,geneMIM,phenoMIM,inheritance,mappingMethod,Approved Gene Symbol
0,147571,616126,Autosomal recessive,3,ISG15
1,103320,615120,Autosomal recessive,3,AGRN
2,615291,615349,Autosomal recessive,3,B3GALT6
3,615291,271640,Autosomal recessive,3,B3GALT6
4,615291,609465,Autosomal recessive,3,B3GALT6
...,...,...,...,...,...
4939,604272,604377,Autosomal recessive,3,SCO2
4940,131222,603041,Autosomal recessive,3,TYMP
4941,612395,602541,Autosomal recessive,3,CHKB
4942,607574,250100,Autosomal recessive,3,ARSA


In [8]:
clinvar=pandas.read_csv('clinvar/variant_summary_2022-01.txt',sep='\t',low_memory=False)
clinvar.loc[:,'Type'].value_counts()

single nucleotide variant    1981494
Deletion                      134699
Duplication                    63166
Microsatellite                 36460
copy number loss               35471
copy number gain               35059
Indel                          13372
Insertion                      13252
Inversion                        973
Variation                        882
Translocation                    318
protein only                      99
Complex                           66
fusion                             6
Tandem duplication                 1
Name: Type, dtype: int64

In [9]:
clinvar=clinvar.loc[clinvar['Assembly']=="GRCh37",:]
clinvar=clinvar.loc[((clinvar['Type']=="single nucleotide variant") | (clinvar['Type']=="Deletion") | (clinvar['Type']=="Duplication") | (clinvar['Type']=="Microsatellite") | (clinvar['Type']=="Indel") | (clinvar['Type']=="Insertion")),:]
benign=clinvar.loc[((clinvar['ClinicalSignificance']=="Benign") | (clinvar['ClinicalSignificance']=="Likely benign") | (clinvar['ClinicalSignificance']=="Benign/Likely benign")),:]
benign=benign.loc[~(benign['ReviewStatus']=="no assertion criteria provided"),['Chromosome','PositionVCF','ReferenceAlleleVCF','AlternateAlleleVCF']]
benign=benign.rename(columns={'PositionVCF':'Position','ReferenceAlleleVCF':'ReferenceAllele','AlternateAlleleVCF':'AlternateAllele'}).sort_values(by=['Chromosome','Position','ReferenceAllele','AlternateAllele'])
benign.to_csv('clinvar/clinvar_202201_oneToFourStarBenignVariants_locations.txt',sep='\t',index=False)
clinvar=clinvar.loc[((clinvar['ClinicalSignificance']=="Pathogenic") | (clinvar['ClinicalSignificance']=="Likely pathogenic") | (clinvar['ClinicalSignificance']=="Pathogenic/Likely pathogenic")),:]
clinvar=clinvar.loc[~(clinvar['OriginSimple']=="somatic"),:]
clinvar=clinvar.loc[clinvar['PhenotypeIDS'].str.contains("OMIM:"),:]

In [4]:
# get the relationships between clinvar allele IDs and omim phenotype ids 
clinvarOmimPheno=pandas.DataFrame(clinvar['PhenotypeIDS'].str.split(';').tolist(), index=clinvar['AlleleID'].values).stack()
clinvarOmimPheno=clinvarOmimPheno[clinvarOmimPheno.str.contains("OMIM:")]
clinvarAlleleIDs=clinvarOmimPheno.index.get_level_values(0)
clinvarAlleleOmimIDs=clinvarOmimPheno.str.split("OMIM:").str[1].str.split(",").str[0].str.strip()
clinvarAllelesToOmimIDs=pandas.DataFrame({'clinvarAlleleID':clinvarAlleleIDs,'phenoMIM':clinvarAlleleOmimIDs.values})

In [5]:
# select down to just the allele IDs for which we have at least one inheritance pattern in omim
clinvarAllelesWithInheritance=clinvarAllelesToOmimIDs.merge(inheritanceMap,how='inner',on='phenoMIM')
# select down to just the variants without conflicts in the inheritance
clinvarAllelesWithInheritance2=clinvarAllelesWithInheritance.drop_duplicates(subset=['clinvarAlleleID','inheritance'],keep='first')
clinvarAllelesWithInheritance3=clinvarAllelesWithInheritance2.drop_duplicates(subset='clinvarAlleleID',keep=False)
# get the original data on the alleles with omim inheritance
clinvar2=clinvar.merge(clinvarAllelesWithInheritance3,how='inner',left_on=['AlleleID'],right_on=['clinvarAlleleID'])
clinvar2.loc[:,'ReviewStatus'].value_counts()

criteria provided, single submitter                     36429
no assertion criteria provided                          20868
criteria provided, multiple submitters, no conflicts    14025
reviewed by expert panel                                 1365
practice guideline                                         14
Name: ReviewStatus, dtype: int64

In [6]:
clinvar3=clinvar2.loc[~(clinvar2['ReviewStatus']=="no assertion criteria provided"),:]
len(clinvar3.loc[clinvar3['inheritance']=='Autosomal recessive',:]) # this is the number of recessive variants we have now (but this includes coding and noncoding)

29627

In [7]:
len(clinvar3.loc[clinvar3['inheritance']=='Autosomal dominant',:]) # this is the number of dominant variants we have now (but this includes coding and noncoding)

22206

In [8]:
clinvar3AR=clinvar3.loc[clinvar3['inheritance']=='Autosomal recessive',['Chromosome','PositionVCF','ReferenceAlleleVCF','AlternateAlleleVCF']]
clinvar3AD=clinvar3.loc[clinvar3['inheritance']=='Autosomal dominant',['Chromosome','PositionVCF','ReferenceAlleleVCF','AlternateAlleleVCF']]
clinvar3AR=clinvar3AR.rename(columns={'PositionVCF':'Position','ReferenceAlleleVCF':'ReferenceAllele','AlternateAlleleVCF':'AlternateAllele'}).sort_values(by=['Chromosome','Position','ReferenceAllele','AlternateAllele'])
clinvar3AD=clinvar3AD.rename(columns={'PositionVCF':'Position','ReferenceAlleleVCF':'ReferenceAllele','AlternateAlleleVCF':'AlternateAllele'}).sort_values(by=['Chromosome','Position','ReferenceAllele','AlternateAllele'])
clinvar3AR=clinvar3AR.loc[clinvar3AR['Position']!=-1,:]
clinvar3AD=clinvar3AD.loc[clinvar3AD['Position']!=-1,:]
clinvar3AR.to_csv('clinvar/clinvar_202201_oneToFourStarPathogenicVariants_autosomalRecessive_locations.txt',sep='\t',index=False)
clinvar3AD.to_csv('clinvar/clinvar_202201_oneToFourStarPathogenicVariants_autosomalDominant_locations.txt',sep='\t',index=False)


## 2023.01

In [1]:
import pandas
inheritanceMap=pandas.read_csv('OMIM/geneToPhenoToInheritanceMap_filtered.txt',sep='\t',low_memory=False)
inheritanceMap=inheritanceMap.drop_duplicates(subset=['geneMIM','phenoMIM','inheritance'],keep='first')
inheritanceMap

Unnamed: 0,geneMIM,phenoMIM,inheritance,mappingMethod,Approved Gene Symbol
0,147571,616126,Autosomal recessive,3,ISG15
1,103320,615120,Autosomal recessive,3,AGRN
2,615291,615349,Autosomal recessive,3,B3GALT6
3,615291,271640,Autosomal recessive,3,B3GALT6
4,615291,609465,Autosomal recessive,3,B3GALT6
...,...,...,...,...,...
4939,604272,604377,Autosomal recessive,3,SCO2
4940,131222,603041,Autosomal recessive,3,TYMP
4941,612395,602541,Autosomal recessive,3,CHKB
4942,607574,250100,Autosomal recessive,3,ARSA


In [10]:
clinvar=pandas.read_csv('clinvar/variant_summary_2023-01.txt',sep='\t',low_memory=False)
clinvar.loc[:,'Type'].value_counts()

single nucleotide variant    2831279
Deletion                      180902
Duplication                    83140
Microsatellite                 46908
copy number loss               37045
copy number gain               36661
Indel                          19521
Insertion                      17499
Inversion                       1638
Variation                        815
Translocation                    334
protein only                      95
Complex                           75
fusion                             6
Tandem duplication                 1
Name: Type, dtype: int64

In [11]:
clinvar=clinvar.loc[clinvar['Assembly']=="GRCh37",:]
clinvar=clinvar.loc[((clinvar['Type']=="single nucleotide variant") | (clinvar['Type']=="Deletion") | (clinvar['Type']=="Duplication") | (clinvar['Type']=="Microsatellite") | (clinvar['Type']=="Indel") | (clinvar['Type']=="Insertion")),:]
benign=clinvar.loc[((clinvar['ClinicalSignificance']=="Benign") | (clinvar['ClinicalSignificance']=="Likely benign") | (clinvar['ClinicalSignificance']=="Benign/Likely benign")),:]
benign=benign.loc[~(benign['ReviewStatus']=="no assertion criteria provided"),['Chromosome','PositionVCF','ReferenceAlleleVCF','AlternateAlleleVCF']]
benign=benign.rename(columns={'PositionVCF':'Position','ReferenceAlleleVCF':'ReferenceAllele','AlternateAlleleVCF':'AlternateAllele'}).sort_values(by=['Chromosome','Position','ReferenceAllele','AlternateAllele'])
benign.to_csv('clinvar/clinvar_202301_oneToFourStarBenignVariants_locations.txt',sep='\t',index=False)
clinvar=clinvar.loc[((clinvar['ClinicalSignificance']=="Pathogenic") | (clinvar['ClinicalSignificance']=="Likely pathogenic") | (clinvar['ClinicalSignificance']=="Pathogenic/Likely pathogenic")),:]
clinvar=clinvar.loc[~(clinvar['OriginSimple']=="somatic"),:]
clinvar=clinvar.loc[clinvar['PhenotypeIDS'].str.contains("OMIM:"),:]

In [4]:
# get the relationships between clinvar allele IDs and omim phenotype ids 
clinvarOmimPheno=pandas.DataFrame(clinvar['PhenotypeIDS'].str.split(';').tolist(), index=clinvar['AlleleID'].values).stack()
clinvarOmimPheno=clinvarOmimPheno[clinvarOmimPheno.str.contains("OMIM:")]
clinvarAlleleIDs=clinvarOmimPheno.index.get_level_values(0)
clinvarAlleleOmimIDs=clinvarOmimPheno.str.split("OMIM:").str[1].str.split(",").str[0].str.strip()
clinvarAllelesToOmimIDs=pandas.DataFrame({'clinvarAlleleID':clinvarAlleleIDs,'phenoMIM':clinvarAlleleOmimIDs.values})

In [5]:
# select down to just the allele IDs for which we have at least one inheritance pattern in omim
clinvarAllelesWithInheritance=clinvarAllelesToOmimIDs.merge(inheritanceMap,how='inner',on='phenoMIM')
# select down to just the variants without conflicts in the inheritance
clinvarAllelesWithInheritance2=clinvarAllelesWithInheritance.drop_duplicates(subset=['clinvarAlleleID','inheritance'],keep='first')
clinvarAllelesWithInheritance3=clinvarAllelesWithInheritance2.drop_duplicates(subset='clinvarAlleleID',keep=False)
# get the original data on the alleles with omim inheritance
clinvar2=clinvar.merge(clinvarAllelesWithInheritance3,how='inner',left_on=['AlleleID'],right_on=['clinvarAlleleID'])
clinvar2.loc[:,'ReviewStatus'].value_counts()

criteria provided, single submitter                     50647
criteria provided, multiple submitters, no conflicts    20714
no assertion criteria provided                          19974
reviewed by expert panel                                 1722
practice guideline                                          5
Name: ReviewStatus, dtype: int64

In [6]:
clinvar3=clinvar2.loc[~(clinvar2['ReviewStatus']=="no assertion criteria provided"),:]
len(clinvar3.loc[clinvar3['inheritance']=='Autosomal recessive',:]) # this is the number of recessive variants we have now (but this includes coding and noncoding)

43621

In [7]:
len(clinvar3.loc[clinvar3['inheritance']=='Autosomal dominant',:]) # this is the number of dominant variants we have now (but this includes coding and noncoding)

29467

In [8]:
clinvar3AR=clinvar3.loc[clinvar3['inheritance']=='Autosomal recessive',['Chromosome','PositionVCF','ReferenceAlleleVCF','AlternateAlleleVCF']]
clinvar3AD=clinvar3.loc[clinvar3['inheritance']=='Autosomal dominant',['Chromosome','PositionVCF','ReferenceAlleleVCF','AlternateAlleleVCF']]
clinvar3AR=clinvar3AR.rename(columns={'PositionVCF':'Position','ReferenceAlleleVCF':'ReferenceAllele','AlternateAlleleVCF':'AlternateAllele'}).sort_values(by=['Chromosome','Position','ReferenceAllele','AlternateAllele'])
clinvar3AD=clinvar3AD.rename(columns={'PositionVCF':'Position','ReferenceAlleleVCF':'ReferenceAllele','AlternateAlleleVCF':'AlternateAllele'}).sort_values(by=['Chromosome','Position','ReferenceAllele','AlternateAllele'])
clinvar3AR=clinvar3AR.loc[clinvar3AR['Position']!=-1,:]
clinvar3AD=clinvar3AD.loc[clinvar3AD['Position']!=-1,:]
clinvar3AR.to_csv('clinvar/clinvar_202301_oneToFourStarPathogenicVariants_autosomalRecessive_locations.txt',sep='\t',index=False)
clinvar3AD.to_csv('clinvar/clinvar_202301_oneToFourStarPathogenicVariants_autosomalDominant_locations.txt',sep='\t',index=False)


## 2023.10

In [1]:
import pandas
inheritanceMap=pandas.read_csv('OMIM/geneToPhenoToInheritanceMap_filtered.txt',sep='\t',low_memory=False)
inheritanceMap=inheritanceMap.drop_duplicates(subset=['geneMIM','phenoMIM','inheritance'],keep='first')
inheritanceMap

Unnamed: 0,geneMIM,phenoMIM,inheritance,mappingMethod,Approved Gene Symbol
0,147571,616126,Autosomal recessive,3,ISG15
1,103320,615120,Autosomal recessive,3,AGRN
2,615291,615349,Autosomal recessive,3,B3GALT6
3,615291,271640,Autosomal recessive,3,B3GALT6
4,615291,609465,Autosomal recessive,3,B3GALT6
...,...,...,...,...,...
4939,604272,604377,Autosomal recessive,3,SCO2
4940,131222,603041,Autosomal recessive,3,TYMP
4941,612395,602541,Autosomal recessive,3,CHKB
4942,607574,250100,Autosomal recessive,3,ARSA


In [12]:
clinvar=pandas.read_csv('clinvar/variant_summary_2023-10.txt',sep='\t',low_memory=False)
clinvar.loc[:,'Type'].value_counts()

single nucleotide variant    4128513
Deletion                      213368
Duplication                    98427
Microsatellite                 55030
copy number loss               37426
copy number gain               36802
Indel                          23738
Insertion                      19472
Inversion                       2219
Variation                        852
Translocation                    338
protein only                      95
Complex                           79
fusion                             6
Tandem duplication                 1
Name: Type, dtype: int64

In [13]:
clinvar=clinvar.loc[clinvar['Assembly']=="GRCh37",:]
clinvar=clinvar.loc[((clinvar['Type']=="single nucleotide variant") | (clinvar['Type']=="Deletion") | (clinvar['Type']=="Duplication") | (clinvar['Type']=="Microsatellite") | (clinvar['Type']=="Indel") | (clinvar['Type']=="Insertion")),:]
benign=clinvar.loc[((clinvar['ClinicalSignificance']=="Benign") | (clinvar['ClinicalSignificance']=="Likely benign") | (clinvar['ClinicalSignificance']=="Benign/Likely benign")),:]
benign=benign.loc[~(benign['ReviewStatus']=="no assertion criteria provided"),['Chromosome','PositionVCF','ReferenceAlleleVCF','AlternateAlleleVCF']]
benign=benign.rename(columns={'PositionVCF':'Position','ReferenceAlleleVCF':'ReferenceAllele','AlternateAlleleVCF':'AlternateAllele'}).sort_values(by=['Chromosome','Position','ReferenceAllele','AlternateAllele'])
benign.to_csv('clinvar/clinvar_202310_oneToFourStarBenignVariants_locations.txt',sep='\t',index=False)
clinvar=clinvar.loc[((clinvar['ClinicalSignificance']=="Pathogenic") | (clinvar['ClinicalSignificance']=="Likely pathogenic") | (clinvar['ClinicalSignificance']=="Pathogenic/Likely pathogenic")),:]
clinvar=clinvar.loc[~(clinvar['OriginSimple']=="somatic"),:]
clinvar=clinvar.loc[clinvar['PhenotypeIDS'].str.contains("OMIM:"),:]

In [4]:
# get the relationships between clinvar allele IDs and omim phenotype ids 
clinvarOmimPheno=pandas.DataFrame(clinvar['PhenotypeIDS'].str.split(';').tolist(), index=clinvar['AlleleID'].values).stack()
clinvarOmimPheno=clinvarOmimPheno[clinvarOmimPheno.str.contains("OMIM:")]
clinvarAlleleIDs=clinvarOmimPheno.index.get_level_values(0)
clinvarAlleleOmimIDs=clinvarOmimPheno.str.split("OMIM:").str[1].str.split(",").str[0].str.strip()
clinvarAllelesToOmimIDs=pandas.DataFrame({'clinvarAlleleID':clinvarAlleleIDs,'phenoMIM':clinvarAlleleOmimIDs.values})

In [5]:
# select down to just the allele IDs for which we have at least one inheritance pattern in omim
clinvarAllelesWithInheritance=clinvarAllelesToOmimIDs.merge(inheritanceMap,how='inner',on='phenoMIM')
# select down to just the variants without conflicts in the inheritance
clinvarAllelesWithInheritance2=clinvarAllelesWithInheritance.drop_duplicates(subset=['clinvarAlleleID','inheritance'],keep='first')
clinvarAllelesWithInheritance3=clinvarAllelesWithInheritance2.drop_duplicates(subset='clinvarAlleleID',keep=False)
# get the original data on the alleles with omim inheritance
clinvar2=clinvar.merge(clinvarAllelesWithInheritance3,how='inner',left_on=['AlleleID'],right_on=['clinvarAlleleID'])
clinvar2.loc[:,'ReviewStatus'].value_counts()

criteria provided, single submitter                     59536
criteria provided, multiple submitters, no conflicts    24712
no assertion criteria provided                          18453
reviewed by expert panel                                 1903
practice guideline                                          5
Name: ReviewStatus, dtype: int64

In [6]:
clinvar3=clinvar2.loc[~(clinvar2['ReviewStatus']=="no assertion criteria provided"),:]
len(clinvar3.loc[clinvar3['inheritance']=='Autosomal recessive',:]) # this is the number of recessive variants we have now (but this includes coding and noncoding)

51351

In [7]:
len(clinvar3.loc[clinvar3['inheritance']=='Autosomal dominant',:]) # this is the number of dominant variants we have now (but this includes coding and noncoding)

34805

In [8]:
clinvar3AR=clinvar3.loc[clinvar3['inheritance']=='Autosomal recessive',['Chromosome','PositionVCF','ReferenceAlleleVCF','AlternateAlleleVCF']]
clinvar3AD=clinvar3.loc[clinvar3['inheritance']=='Autosomal dominant',['Chromosome','PositionVCF','ReferenceAlleleVCF','AlternateAlleleVCF']]
clinvar3AR=clinvar3AR.rename(columns={'PositionVCF':'Position','ReferenceAlleleVCF':'ReferenceAllele','AlternateAlleleVCF':'AlternateAllele'}).sort_values(by=['Chromosome','Position','ReferenceAllele','AlternateAllele'])
clinvar3AD=clinvar3AD.rename(columns={'PositionVCF':'Position','ReferenceAlleleVCF':'ReferenceAllele','AlternateAlleleVCF':'AlternateAllele'}).sort_values(by=['Chromosome','Position','ReferenceAllele','AlternateAllele'])
clinvar3AR=clinvar3AR.loc[clinvar3AR['Position']!=-1,:]
clinvar3AD=clinvar3AD.loc[clinvar3AD['Position']!=-1,:]
clinvar3AR.to_csv('clinvar/clinvar_202310_oneToFourStarPathogenicVariants_autosomalRecessive_locations.txt',sep='\t',index=False)
clinvar3AD.to_csv('clinvar/clinvar_202310_oneToFourStarPathogenicVariants_autosomalDominant_locations.txt',sep='\t',index=False)


# Process Clinvar Variants with Annovar

In [14]:
import pandas
AD202101=pandas.read_csv('clinvar/clinvar_202101_oneToFourStarPathogenicVariants_autosomalDominant_locations.txt',sep='\t',low_memory=False)
AD202201=pandas.read_csv('clinvar/clinvar_202201_oneToFourStarPathogenicVariants_autosomalDominant_locations.txt',sep='\t',low_memory=False)
AD202301=pandas.read_csv('clinvar/clinvar_202301_oneToFourStarPathogenicVariants_autosomalDominant_locations.txt',sep='\t',low_memory=False)
AD202310=pandas.read_csv('clinvar/clinvar_202310_oneToFourStarPathogenicVariants_autosomalDominant_locations.txt',sep='\t',low_memory=False)

AD202310=AD202310.merge(AD202301,how='left',on=['Chromosome','Position','ReferenceAllele','AlternateAllele'],indicator=True)
AD202310=AD202310.loc[AD202310['_merge']=='left_only',['Chromosome','Position','ReferenceAllele','AlternateAllele']].reset_index(drop=True)
AD202310.to_csv('clinvar/clinvar_202310_oneToFourStarPathogenicVariants_autosomalDominant_locations_v2.txt',sep='\t',index=False)

AD202301=AD202301.merge(AD202201,how='left',on=['Chromosome','Position','ReferenceAllele','AlternateAllele'],indicator=True)
AD202301=AD202301.loc[AD202301['_merge']=='left_only',['Chromosome','Position','ReferenceAllele','AlternateAllele']].reset_index(drop=True)
AD202301.to_csv('clinvar/clinvar_202301_oneToFourStarPathogenicVariants_autosomalDominant_locations_v2.txt',sep='\t',index=False)

AD202201=AD202201.merge(AD202101,how='left',on=['Chromosome','Position','ReferenceAllele','AlternateAllele'],indicator=True)
AD202201=AD202201.loc[AD202201['_merge']=='left_only',['Chromosome','Position','ReferenceAllele','AlternateAllele']].reset_index(drop=True)
AD202201.to_csv('clinvar/clinvar_202201_oneToFourStarPathogenicVariants_autosomalDominant_locations_v2.txt',sep='\t',index=False)

AR202101=pandas.read_csv('clinvar/clinvar_202101_oneToFourStarPathogenicVariants_autosomalRecessive_locations.txt',sep='\t',low_memory=False)
AR202201=pandas.read_csv('clinvar/clinvar_202201_oneToFourStarPathogenicVariants_autosomalRecessive_locations.txt',sep='\t',low_memory=False)
AR202301=pandas.read_csv('clinvar/clinvar_202301_oneToFourStarPathogenicVariants_autosomalRecessive_locations.txt',sep='\t',low_memory=False)
AR202310=pandas.read_csv('clinvar/clinvar_202310_oneToFourStarPathogenicVariants_autosomalRecessive_locations.txt',sep='\t',low_memory=False)


AR202310=AR202310.merge(AR202301,how='left',on=['Chromosome','Position','ReferenceAllele','AlternateAllele'],indicator=True)
AR202310=AR202310.loc[AR202310['_merge']=='left_only',['Chromosome','Position','ReferenceAllele','AlternateAllele']].reset_index(drop=True)
AR202310.to_csv('clinvar/clinvar_202310_oneToFourStarPathogenicVariants_autosomalRecessive_locations_v2.txt',sep='\t',index=False)

AR202301=AR202301.merge(AR202201,how='left',on=['Chromosome','Position','ReferenceAllele','AlternateAllele'],indicator=True)
AR202301=AR202301.loc[AR202301['_merge']=='left_only',['Chromosome','Position','ReferenceAllele','AlternateAllele']].reset_index(drop=True)
AR202301.to_csv('clinvar/clinvar_202301_oneToFourStarPathogenicVariants_autosomalRecessive_locations_v2.txt',sep='\t',index=False)

AR202201=AR202201.merge(AR202101,how='left',on=['Chromosome','Position','ReferenceAllele','AlternateAllele'],indicator=True)
AR202201=AR202201.loc[AR202201['_merge']=='left_only',['Chromosome','Position','ReferenceAllele','AlternateAllele']].reset_index(drop=True)
AR202201.to_csv('clinvar/clinvar_202201_oneToFourStarPathogenicVariants_autosomalRecessive_locations_v2.txt',sep='\t',index=False)


In [None]:
# turn the files into VCFs and process with Annovar. I couldn't do this with the magics here, and instead had to run this block of code manually on the command line. 
%%bash 
cd clinvar
dos2unix clinvar_20*_locations_v2.txt

tail -n +2 clinvar_202201_oneToFourStarPathogenicVariants_autosomalRecessive_locations_v2.txt | awk -F'\t' -v OFS='\t' '{print $1,$2,".",$3,$4,".","PASS",".","GT","0/1"}' | cat ../header.vcf - > clinvar_202201_pathogenic_AR.vcf
tail -n +2 clinvar_202201_oneToFourStarPathogenicVariants_autosomalDominant_locations_v2.txt | awk -F'\t' -v OFS='\t' '{print $1,$2,".",$3,$4,".","PASS",".","GT","0/1"}' | cat ../header.vcf - > clinvar_202201_pathogenic_AD.vcf
tail -n +2 clinvar_202301_oneToFourStarPathogenicVariants_autosomalRecessive_locations_v2.txt | awk -F'\t' -v OFS='\t' '{print $1,$2,".",$3,$4,".","PASS",".","GT","0/1"}' | cat ../header.vcf - > clinvar_202301_pathogenic_AR.vcf
tail -n +2 clinvar_202301_oneToFourStarPathogenicVariants_autosomalDominant_locations_v2.txt | awk -F'\t' -v OFS='\t' '{print $1,$2,".",$3,$4,".","PASS",".","GT","0/1"}' | cat ../header.vcf - > clinvar_202301_pathogenic_AD.vcf
tail -n +2 clinvar_202310_oneToFourStarPathogenicVariants_autosomalRecessive_locations_v2.txt | awk -F'\t' -v OFS='\t' '{print $1,$2,".",$3,$4,".","PASS",".","GT","0/1"}' | cat ../header.vcf - > clinvar_202310_pathogenic_AR.vcf
tail -n +2 clinvar_202310_oneToFourStarPathogenicVariants_autosomalDominant_locations_v2.txt | awk -F'\t' -v OFS='\t' '{print $1,$2,".",$3,$4,".","PASS",".","GT","0/1"}' | cat ../header.vcf - > clinvar_202310_pathogenic_AD.vcf

../annovar/convert2annovar.pl -format vcf4 --keepindelref clinvar_202201_pathogenic_AR.vcf > clinvar_202201_pathogenic_AR.avinput
../annovar/annotate_variation.pl -dbtype wgEncodeGencodeBasicV33lift37 -buildver hg19 --exonicsplicing clinvar_202201_pathogenic_AR.avinput ../annovar/humandb/
../annovar/coding_change.pl clinvar_202201_pathogenic_AR.avinput.exonic_variant_function ../annovar/humandb/hg19_wgEncodeGencodeBasicV33lift37.txt ../annovar/humandb/hg19_wgEncodeGencodeBasicV33lift37Mrna.fa --includesnp --onlyAltering --alltranscript > clinvar_202201_pathogenic_AR.coding_changes.txt

../annovar/convert2annovar.pl -format vcf4 --keepindelref clinvar_202201_pathogenic_AD.vcf > clinvar_202201_pathogenic_AD.avinput
../annovar/annotate_variation.pl -dbtype wgEncodeGencodeBasicV33lift37 -buildver hg19 --exonicsplicing clinvar_202201_pathogenic_AD.avinput ../annovar/humandb/
../annovar/coding_change.pl clinvar_202201_pathogenic_AD.avinput.exonic_variant_function ../annovar/humandb/hg19_wgEncodeGencodeBasicV33lift37.txt ../annovar/humandb/hg19_wgEncodeGencodeBasicV33lift37Mrna.fa --includesnp --onlyAltering --alltranscript > clinvar_202201_pathogenic_AD.coding_changes.txt

../annovar/convert2annovar.pl -format vcf4 --keepindelref clinvar_202301_pathogenic_AR.vcf > clinvar_202301_pathogenic_AR.avinput
../annovar/annotate_variation.pl -dbtype wgEncodeGencodeBasicV33lift37 -buildver hg19 --exonicsplicing clinvar_202301_pathogenic_AR.avinput ../annovar/humandb/
../annovar/coding_change.pl clinvar_202301_pathogenic_AR.avinput.exonic_variant_function ../annovar/humandb/hg19_wgEncodeGencodeBasicV33lift37.txt ../annovar/humandb/hg19_wgEncodeGencodeBasicV33lift37Mrna.fa --includesnp --onlyAltering --alltranscript > clinvar_202301_pathogenic_AR.coding_changes.txt

../annovar/convert2annovar.pl -format vcf4 --keepindelref clinvar_202301_pathogenic_AD.vcf > clinvar_202301_pathogenic_AD.avinput
../annovar/annotate_variation.pl -dbtype wgEncodeGencodeBasicV33lift37 -buildver hg19 --exonicsplicing clinvar_202301_pathogenic_AD.avinput ../annovar/humandb/
../annovar/coding_change.pl clinvar_202301_pathogenic_AD.avinput.exonic_variant_function ../annovar/humandb/hg19_wgEncodeGencodeBasicV33lift37.txt ../annovar/humandb/hg19_wgEncodeGencodeBasicV33lift37Mrna.fa --includesnp --onlyAltering --alltranscript > clinvar_202301_pathogenic_AD.coding_changes.txt

../annovar/convert2annovar.pl -format vcf4 --keepindelref clinvar_202310_pathogenic_AR.vcf > clinvar_202310_pathogenic_AR.avinput
../annovar/annotate_variation.pl -dbtype wgEncodeGencodeBasicV33lift37 -buildver hg19 --exonicsplicing clinvar_202310_pathogenic_AR.avinput ../annovar/humandb/
../annovar/coding_change.pl clinvar_202310_pathogenic_AR.avinput.exonic_variant_function ../annovar/humandb/hg19_wgEncodeGencodeBasicV33lift37.txt ../annovar/humandb/hg19_wgEncodeGencodeBasicV33lift37Mrna.fa --includesnp --onlyAltering --alltranscript > clinvar_202310_pathogenic_AR.coding_changes.txt

../annovar/convert2annovar.pl -format vcf4 --keepindelref clinvar_202310_pathogenic_AD.vcf > clinvar_202310_pathogenic_AD.avinput
../annovar/annotate_variation.pl -dbtype wgEncodeGencodeBasicV33lift37 -buildver hg19 --exonicsplicing clinvar_202310_pathogenic_AD.avinput ../annovar/humandb/
../annovar/coding_change.pl clinvar_202310_pathogenic_AD.avinput.exonic_variant_function ../annovar/humandb/hg19_wgEncodeGencodeBasicV33lift37.txt ../annovar/humandb/hg19_wgEncodeGencodeBasicV33lift37Mrna.fa --includesnp --onlyAltering --alltranscript > clinvar_202310_pathogenic_AD.coding_changes.txt


In [None]:
%%bash

# re-make locations files
grep -v '^#' clinvar_202201_pathogenic_AR.vcf | cut -f 1,2,4,5 > clinvar_202201_pathogenic_AR_locations.txt
grep -v '^#' clinvar_202301_pathogenic_AR.vcf | cut -f 1,2,4,5 > clinvar_202301_pathogenic_AR_locations.txt
grep -v '^#' clinvar_202310_pathogenic_AR.vcf | cut -f 1,2,4,5 > clinvar_202310_pathogenic_AR_locations.txt
grep -v '^#' clinvar_202201_pathogenic_AD.vcf | cut -f 1,2,4,5 > clinvar_202201_pathogenic_AD_locations.txt
grep -v '^#' clinvar_202301_pathogenic_AD.vcf | cut -f 1,2,4,5 > clinvar_202301_pathogenic_AD_locations.txt
grep -v '^#' clinvar_202310_pathogenic_AD.vcf | cut -f 1,2,4,5 > clinvar_202310_pathogenic_AD_locations.txt

cd ../

# groom annovar outputs
python Maverick/InferenceScripts/groomAnnovarOutput.py --inputBase=testSets/clinvar_202201_pathogenic_AR
python Maverick/InferenceScripts/groomAnnovarOutput.py --inputBase=testSets/clinvar_202201_pathogenic_AD
python Maverick/InferenceScripts/groomAnnovarOutput.py --inputBase=testSets/clinvar_202301_pathogenic_AR
python Maverick/InferenceScripts/groomAnnovarOutput.py --inputBase=testSets/clinvar_202301_pathogenic_AD
python Maverick/InferenceScripts/groomAnnovarOutput.py --inputBase=testSets/clinvar_202310_pathogenic_AR
python Maverick/InferenceScripts/groomAnnovarOutput.py --inputBase=testSets/clinvar_202310_pathogenic_AD

# annotate variants
python Maverick/InferenceScripts/annotateVariants.py --inputBase=testSets/clinvar_202201_pathogenic_AR
python Maverick/InferenceScripts/annotateVariants.py --inputBase=testSets/clinvar_202201_pathogenic_AD
python Maverick/InferenceScripts/annotateVariants.py --inputBase=testSets/clinvar_202301_pathogenic_AR
python Maverick/InferenceScripts/annotateVariants.py --inputBase=testSets/clinvar_202301_pathogenic_AD
python Maverick/InferenceScripts/annotateVariants.py --inputBase=testSets/clinvar_202310_pathogenic_AR
python Maverick/InferenceScripts/annotateVariants.py --inputBase=testSets/clinvar_202310_pathogenic_AD


# Divide into testing sets

In [1]:
# Divide groups into previously-known and novel disease genes sets
import pandas
clinvar2021=pandas.read_csv('clinvar/variant_summary_2021-01.txt',sep='\t',low_memory=False)
clinvar2021=clinvar2021.loc[((clinvar2021['ClinicalSignificance']=="Pathogenic") | (clinvar2021['ClinicalSignificance']=="Likely pathogenic") | (clinvar2021['ClinicalSignificance']=="Pathogenic/Likely pathogenic")),:]
clinvar2021=clinvar2021.loc[~(clinvar2021['OriginSimple']=="somatic"),:]
clinvar2021=clinvar2021.loc[((clinvar2021['Type']=="single nucleotide variant") | (clinvar2021['Type']=="Deletion") | (clinvar2021['Type']=="Duplication") | (clinvar2021['Type']=="Microsatellite") | (clinvar2021['Type']=="Indel") | (clinvar2021['Type']=="Insertion")),:]
clinvar2021DiseaseGenes=clinvar2021.loc[:,'GeneSymbol'].drop_duplicates(keep='first')

clinvar2022=pandas.read_csv('clinvar/variant_summary_2022-01.txt',sep='\t',low_memory=False)
clinvar2022=clinvar2022.loc[((clinvar2022['ClinicalSignificance']=="Pathogenic") | (clinvar2021['ClinicalSignificance']=="Likely pathogenic") | (clinvar2021['ClinicalSignificance']=="Pathogenic/Likely pathogenic")),:]
clinvar2022=clinvar2022.loc[~(clinvar2022['OriginSimple']=="somatic"),:]
clinvar2022=clinvar2022.loc[((clinvar2022['Type']=="single nucleotide variant") | (clinvar2022['Type']=="Deletion") | (clinvar2022['Type']=="Duplication") | (clinvar2022['Type']=="Microsatellite") | (clinvar2022['Type']=="Indel") | (clinvar2022['Type']=="Insertion")),:]
clinvar2022DiseaseGenes=clinvar2022.loc[:,'GeneSymbol'].drop_duplicates(keep='first')

clinvar2023=pandas.read_csv('clinvar/variant_summary_2023-01.txt',sep='\t',low_memory=False)
clinvar2023=clinvar2023.loc[((clinvar2023['ClinicalSignificance']=="Pathogenic") | (clinvar2021['ClinicalSignificance']=="Likely pathogenic") | (clinvar2021['ClinicalSignificance']=="Pathogenic/Likely pathogenic")),:]
clinvar2023=clinvar2023.loc[~(clinvar2023['OriginSimple']=="somatic"),:]
clinvar2023=clinvar2023.loc[((clinvar2023['Type']=="single nucleotide variant") | (clinvar2023['Type']=="Deletion") | (clinvar2023['Type']=="Duplication") | (clinvar2023['Type']=="Microsatellite") | (clinvar2023['Type']=="Indel") | (clinvar2023['Type']=="Insertion")),:]
clinvar2023DiseaseGenes=clinvar2023.loc[:,'GeneSymbol'].drop_duplicates(keep='first')


In [2]:
clinvar2023DiseaseGenes

0               AP5Z1
6             FOXRED1
20                HFE
34              WDR35
42             ABHD12
              ...    
3246778    FTH1;BEST1
3251168         NPTX1
3251312       C3orf52
3251323         TNNC2
3251330          BUB1
Name: GeneSymbol, Length: 5880, dtype: object

In [3]:
clinvar2021DiseaseGenes

0            AP5Z1
6          FOXRED1
18             HFE
34           WDR35
42          ABHD12
            ...   
1656978     VPS35L
1656986      IKZF5
1656998     PRKACB
1657006     PRKACA
1662441     CFAP58
Name: GeneSymbol, Length: 5138, dtype: object

In [4]:
AR2022=pandas.read_csv('clinvar/clinvar_202201_pathogenic_AR.annotated.txt',sep='\t',low_memory=False)
AR2022Known=AR2022.loc[AR2022['geneName'].isin(clinvar2021DiseaseGenes.values),:].reset_index(drop=True)
AR2022Novel=AR2022.loc[~(AR2022['geneName'].isin(clinvar2021DiseaseGenes.values)),:].reset_index(drop=True)
AD2022=pandas.read_csv('clinvar/clinvar_202201_pathogenic_AD.annotated.txt',sep='\t',low_memory=False)
AD2022Known=AD2022.loc[AD2022['geneName'].isin(clinvar2021DiseaseGenes.values),:].reset_index(drop=True)
AD2022Novel=AD2022.loc[~(AD2022['geneName'].isin(clinvar2021DiseaseGenes.values)),:].reset_index(drop=True)
AR2022Known.to_csv('testSets/2021_autosomalRecessive_knownDiseaseGenes.txt',sep='\t',index=False)
AR2022Novel.to_csv('testSets/2021_autosomalRecessive_novelDiseaseGenes.txt',sep='\t',index=False)
AD2022Known.to_csv('testSets/2021_autosomalDominant_knownDiseaseGenes.txt',sep='\t',index=False)
AD2022Novel.to_csv('testSets/2021_autosomalDominant_novelDiseaseGenes.txt',sep='\t',index=False)

AR2023=pandas.read_csv('clinvar/clinvar_202301_pathogenic_AR.annotated.txt',sep='\t',low_memory=False)
AR2023Known=AR2023.loc[AR2023['geneName'].isin(clinvar2022DiseaseGenes.values),:].reset_index(drop=True)
AR2023Novel=AR2023.loc[~(AR2023['geneName'].isin(clinvar2022DiseaseGenes.values)),:].reset_index(drop=True)
AD2023=pandas.read_csv('clinvar/clinvar_202301_pathogenic_AD.annotated.txt',sep='\t',low_memory=False)
AD2023Known=AD2023.loc[AD2023['geneName'].isin(clinvar2022DiseaseGenes.values),:].reset_index(drop=True)
AD2023Novel=AD2023.loc[~(AD2023['geneName'].isin(clinvar2022DiseaseGenes.values)),:].reset_index(drop=True)
AR2023Known.to_csv('testSets/2022_autosomalRecessive_knownDiseaseGenes.txt',sep='\t',index=False)
AR2023Novel.to_csv('testSets/2022_autosomalRecessive_novelDiseaseGenes.txt',sep='\t',index=False)
AD2023Known.to_csv('testSets/2022_autosomalDominant_knownDiseaseGenes.txt',sep='\t',index=False)
AD2023Novel.to_csv('testSets/2022_autosomalDominant_novelDiseaseGenes.txt',sep='\t',index=False)

AR202310=pandas.read_csv('clinvar/clinvar_202310_pathogenic_AR.annotated.txt',sep='\t',low_memory=False)
AR202310Known=AR202310.loc[AR202310['geneName'].isin(clinvar2023DiseaseGenes.values),:].reset_index(drop=True)
AR202310Novel=AR202310.loc[~(AR202310['geneName'].isin(clinvar2023DiseaseGenes.values)),:].reset_index(drop=True)
AD202310=pandas.read_csv('clinvar/clinvar_202310_pathogenic_AD.annotated.txt',sep='\t',low_memory=False)
AD202310Known=AD202310.loc[AD202310['geneName'].isin(clinvar2023DiseaseGenes.values),:].reset_index(drop=True)
AD202310Novel=AD202310.loc[~(AD202310['geneName'].isin(clinvar2023DiseaseGenes.values)),:].reset_index(drop=True)
AR202310Known.to_csv('testSets/2023NineMonths_autosomalRecessive_knownDiseaseGenes.txt',sep='\t',index=False)
AR202310Novel.to_csv('testSets/2023NineMonths_autosomalRecessive_novelDiseaseGenes.txt',sep='\t',index=False)
AD202310Known.to_csv('testSets/2023NineMonths_autosomalDominant_knownDiseaseGenes.txt',sep='\t',index=False)
AD202310Novel.to_csv('testSets/2023NineMonths_autosomalDominant_novelDiseaseGenes.txt',sep='\t',index=False)


# Filter out entries that were zero star in earlier years and divide into test sets

In [8]:
import pandas
rec2021=pandas.read_csv('clinvar/clinvar_202101_oneToFourStarPathogenicVariants_autosomalDominant_locations.txt',sep='\t',low_memory=False)
dom2021=pandas.read_csv('clinvar/clinvar_202101_oneToFourStarPathogenicVariants_autosomalDominant_locations.txt',sep='\t',low_memory=False)

rec2022=pandas.read_csv('testSets/2022_autosomalRecessive_knownDiseaseGenes.txt',sep='\t',low_memory=False)
dom2022=pandas.read_csv('testSets/2022_autosomalDominant_knownDiseaseGenes.txt',sep='\t',low_memory=False)

rec2023=pandas.read_csv('testSets/2023NineMonths_autosomalRecessive_knownDiseaseGenes.txt',sep='\t',low_memory=False)
dom2023=pandas.read_csv('testSets/2023NineMonths_autosomalDominant_knownDiseaseGenes.txt',sep='\t',low_memory=False)

clinvar2020=pandas.read_csv('clinvar/variant_summary_2021-01.txt',sep='\t',low_memory=False)
clinvar2020=clinvar2020.loc[clinvar2020['Assembly']=="GRCh37",:]
clinvar2020=clinvar2020.loc[((clinvar2020['Type']=="single nucleotide variant") | (clinvar2020['Type']=="Deletion") | (clinvar2020['Type']=="Duplication") | (clinvar2020['Type']=="Microsatellite") | (clinvar2020['Type']=="Indel") | (clinvar2020['Type']=="Insertion")),:]
clinvar2020=clinvar2020.loc[((clinvar2020['ClinicalSignificance']=="Pathogenic") | (clinvar2020['ClinicalSignificance']=="Likely pathogenic") | (clinvar2020['ClinicalSignificance']=="Pathogenic/Likely pathogenic")),:]
clinvar2020=clinvar2020.loc[~(clinvar2020['OriginSimple']=="somatic"),:]

clinvar2021=pandas.read_csv('clinvar/variant_summary_2022-01.txt',sep='\t',low_memory=False)
clinvar2021=clinvar2021.loc[clinvar2021['Assembly']=="GRCh37",:]
clinvar2021=clinvar2021.loc[((clinvar2021['Type']=="single nucleotide variant") | (clinvar2021['Type']=="Deletion") | (clinvar2021['Type']=="Duplication") | (clinvar2021['Type']=="Microsatellite") | (clinvar2021['Type']=="Indel") | (clinvar2021['Type']=="Insertion")),:]
clinvar2021=clinvar2021.loc[((clinvar2021['ClinicalSignificance']=="Pathogenic") | (clinvar2021['ClinicalSignificance']=="Likely pathogenic") | (clinvar2021['ClinicalSignificance']=="Pathogenic/Likely pathogenic")),:]
clinvar2021=clinvar2021.loc[~(clinvar2021['OriginSimple']=="somatic"),:]

clinvar2022=pandas.read_csv('clinvar/variant_summary_2023-01.txt',sep='\t',low_memory=False)
clinvar2022=clinvar2022.loc[clinvar2022['Assembly']=="GRCh37",:]
clinvar2022=clinvar2022.loc[((clinvar2022['Type']=="single nucleotide variant") | (clinvar2022['Type']=="Deletion") | (clinvar2022['Type']=="Duplication") | (clinvar2022['Type']=="Microsatellite") | (clinvar2022['Type']=="Indel") | (clinvar2022['Type']=="Insertion")),:]
clinvar2022=clinvar2022.loc[((clinvar2022['ClinicalSignificance']=="Pathogenic") | (clinvar2022['ClinicalSignificance']=="Likely pathogenic") | (clinvar2022['ClinicalSignificance']=="Pathogenic/Likely pathogenic")),:]
clinvar2022=clinvar2022.loc[~(clinvar2022['OriginSimple']=="somatic"),:]


clinvar2020=clinvar2020.loc[:,['Chromosome','PositionVCF','ReferenceAlleleVCF','AlternateAlleleVCF']].rename(columns={'Chromosome':'hg19_chr','PositionVCF':'hg19_pos(1-based)','ReferenceAlleleVCF':'ref','AlternateAlleleVCF':'alt'})
clinvar2021=clinvar2021.loc[:,['Chromosome','PositionVCF','ReferenceAlleleVCF','AlternateAlleleVCF']].rename(columns={'Chromosome':'hg19_chr','PositionVCF':'hg19_pos(1-based)','ReferenceAlleleVCF':'ref','AlternateAlleleVCF':'alt'})
clinvar2022=clinvar2022.loc[:,['Chromosome','PositionVCF','ReferenceAlleleVCF','AlternateAlleleVCF']].rename(columns={'Chromosome':'hg19_chr','PositionVCF':'hg19_pos(1-based)','ReferenceAlleleVCF':'ref','AlternateAlleleVCF':'alt'})
clinvar2020=clinvar2020.loc[~(clinvar2020['hg19_chr'].isin(['X','Y','MT'])),:]
clinvar2020['hg19_chr']=clinvar2020.loc[:,'hg19_chr'].astype(int)
clinvar2021=clinvar2021.loc[~(clinvar2021['hg19_chr'].isin(['X','Y','MT'])),:]
clinvar2021['hg19_chr']=clinvar2021.loc[:,'hg19_chr'].astype(int)
clinvar2022=clinvar2022.loc[~(clinvar2022['hg19_chr'].isin(['X','Y','MT'])),:]
clinvar2022['hg19_chr']=clinvar2022.loc[:,'hg19_chr'].astype(int)

rec2021=rec2021.merge(clinvar2020,how='left',on=['hg19_chr','hg19_pos(1-based)','ref','alt'],indicator=True)
rec2021=rec2021.loc[rec2021['_merge']=='left_only',:].drop(columns='_merge').reset_index(drop=True)
rec2021.to_csv('testSets/2021_autosomalRecessive_notInClinvarAtAllBefore2021.txt',sep='\t',index=False)
dom2021=dom2021.merge(clinvar2020,how='left',on=['hg19_chr','hg19_pos(1-based)','ref','alt'],indicator=True)
dom2021=dom2021.loc[dom2021['_merge']=='left_only',:].drop(columns='_merge').reset_index(drop=True)
dom2021.to_csv('testSets/2021_autosomalDominant_notInClinvarAtAllBefore2021.txt',sep='\t',index=False)

rec2022=rec2022.merge(clinvar2021,how='left',on=['hg19_chr','hg19_pos(1-based)','ref','alt'],indicator=True)
rec2022=rec2022.loc[rec2022['_merge']=='left_only',:].drop(columns='_merge').reset_index(drop=True)
rec2022.to_csv('testSets/2022_autosomalRecessive_notInClinvarAtAllBefore2022.txt',sep='\t',index=False)
dom2022=dom2022.merge(clinvar2021,how='left',on=['hg19_chr','hg19_pos(1-based)','ref','alt'],indicator=True)
dom2022=dom2022.loc[dom2022['_merge']=='left_only',:].drop(columns='_merge').reset_index(drop=True)
dom2022.to_csv('testSets/2022_autosomalDominant_notInClinvarAtAllBefore2022.txt',sep='\t',index=False)

rec2023=rec2023.merge(clinvar2021,how='left',on=['hg19_chr','hg19_pos(1-based)','ref','alt'],indicator=True)
rec2023=rec2023.loc[rec2023['_merge']=='left_only',:].drop(columns='_merge').reset_index(drop=True)
rec2023.to_csv('testSets/2023_autosomalRecessive_notInClinvarAtAllBefore2023.txt',sep='\t',index=False)
dom2023=dom2023.merge(clinvar2021,how='left',on=['hg19_chr','hg19_pos(1-based)','ref','alt'],indicator=True)
dom2023=dom2023.loc[dom2023['_merge']=='left_only',:].drop(columns='_merge').reset_index(drop=True)
dom2023.to_csv('testSets/2023_autosomalDominant_notInClinvarAtAllBefore2023.txt',sep='\t',index=False)


In [3]:
clinvar2020

Unnamed: 0,hg19_chr,hg19_pos(1-based),ref,alt
0,7,4820844,GGAT,TGCTGTAAACTGTAACTGTAAA
2,7,4827360,GCTGCTGGACCTGCC,G
6,11,126145284,C,T
8,11,126147412,A,G
18,6,26091306,T,C
...,...,...,...,...
1662441,10,106163539,C,T
1662444,10,106163493,GT,G
1662446,10,106159139,C,T
1662448,10,106207473,C,A


In [5]:
rec2021

Unnamed: 0,varType,hg19_chr,hg19_pos(1-based),ref,alt,genotype,WildtypeSeq,AltSeq,ChangePos,TranscriptID,...,pRec,mis_z,lof_z,CCR,pext,gerp,GDI,RVIS_ExAC_0.1,RVIS_ExAC_0.05,RVIS_ExAC_0.01
0,frameshift substitution,1,976074,G,GC,het,MAGRSHPGPLRPLLPLLVVAACVLPGAGGTCPERALERREEEANVV...,MAGRSHPGPLRPLLPLLVVAACVLPGAGGTCPERALERREEEANVV...,182,ENST00000379370.7_2,...,1.000000,0.22552,6.01430,13.523968,0.48112,3.60,1338.60140,93.93,85.02,97.10
1,stopgain,1,979491,G,T,het,MAGRSHPGPLRPLLPLLVVAACVLPGAGGTCPERALERREEEANVV...,MAGRSHPGPLRPLLPLLVVAACVLPGAGGTCPERALERREEEANVV...,668,ENST00000379370.7_2,...,1.000000,0.22552,6.01430,0.000000,0.48112,4.60,1338.60140,93.93,85.02,97.10
2,stopgain,1,983248,C,T,het,MAGRSHPGPLRPLLPLLVVAACVLPGAGGTCPERALERREEEANVV...,MAGRSHPGPLRPLLPLLVVAACVLPGAGGTCPERALERREEEANVV...,1242,ENST00000379370.7_2,...,1.000000,0.22552,6.01430,13.132544,0.48112,3.10,1338.60140,93.93,85.02,97.10
3,frameshift substitution,1,983612,G,GGCCCCCC,het,MAGRSHPGPLRPLLPLLVVAACVLPGAGGTCPERALERREEEANVV...,MAGRSHPGPLRPLLPLLVVAACVLPGAGGTCPERALERREEEANVV...,1327,ENST00000379370.7_2,...,1.000000,0.22552,6.01430,0.000000,0.48112,2.66,1338.60140,93.93,85.02,97.10
4,nonsynonymous SNV,1,1167659,A,G,het,MKLLRRAWRRRAALGLGTLALCGAALLYLARCAAEPGDPRAMSGRS...,VKLLRRAWRRRAALGLGTLALCGAALLYLARCAAEPGDPRAMSGRS...,1,ENST00000379198.5_4,...,0.779350,1.26080,0.93213,,1.00000,2.56,720.15343,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4842,nonsynonymous SNV,22,51065458,C,G,het,MSMGAPRSLLLALAAGLAVARPPNIVLIFADDLGYGDLGCYGHPSS...,MSMGAPRSLLLALAAGLAVARPPNIVLIFADDLGYGDLGCYGHPSS...,163,ENST00000216124.10_3,...,0.087995,0.38013,0.39925,26.399607,0.53845,5.21,3502.02917,80.66,89.60,90.44
4843,nonsynonymous SNV,22,51065721,A,G,het,MSMGAPRSLLLALAAGLAVARPPNIVLIFADDLGYGDLGCYGHPSS...,MSMGAPRSLLLALAAGLAVARPPNIVLIFADDLGYGDLGCYGHPSS...,113,ENST00000216124.10_3,...,0.087995,0.38013,0.39925,0.000000,0.53845,4.74,3502.02917,80.66,89.60,90.44
4844,nonsynonymous SNV,22,51065766,G,A,het,MSMGAPRSLLLALAAGLAVARPPNIVLIFADDLGYGDLGCYGHPSS...,MSMGAPRSLLLALAAGLAVARPPNIVLIFADDLGYGDLGCYGHPSS...,98,ENST00000216124.10_3,...,0.087995,0.38013,0.39925,0.000000,0.53845,4.96,3502.02917,80.66,89.60,90.44
4845,nonsynonymous SNV,22,51065796,C,T,het,MSMGAPRSLLLALAAGLAVARPPNIVLIFADDLGYGDLGCYGHPSS...,MSMGAPRSLLLALAAGLAVARPPNIVLIFADDLGYGDLGCYGHPSS...,88,ENST00000216124.10_3,...,0.087995,0.38013,0.39925,49.347366,0.53845,4.99,3502.02917,80.66,89.60,90.44


# Parse clinvar variants 2017 - 2020


In [1]:
# ClinVar's variant summary archive files had some slightly different formatting pre-2021, so these files are handled separately from the more recent years.
import pandas
inheritanceMap=pandas.read_csv('OMIM/geneToPhenoToInheritanceMap_filtered.txt',sep='\t',low_memory=False)
inheritanceMap=inheritanceMap.drop_duplicates(subset=['geneMIM','phenoMIM','inheritance'],keep='first')
inheritanceMap

Unnamed: 0,geneMIM,phenoMIM,inheritance,mappingMethod,Approved Gene Symbol
0,147571,616126,Autosomal recessive,3,ISG15
1,103320,615120,Autosomal recessive,3,AGRN
2,615291,615349,Autosomal recessive,3,B3GALT6
3,615291,271640,Autosomal recessive,3,B3GALT6
4,615291,609465,Autosomal recessive,3,B3GALT6
...,...,...,...,...,...
4939,604272,604377,Autosomal recessive,3,SCO2
4940,131222,603041,Autosomal recessive,3,TYMP
4941,612395,602541,Autosomal recessive,3,CHKB
4942,607574,250100,Autosomal recessive,3,ARSA


## 2017

In [2]:
clinvar=pandas.read_csv('clinvar/variant_summary_2017-01.txt.gz',sep='\t',low_memory=False,compression='gzip')
clinvar.loc[:,'Type'].value_counts()

single nucleotide variant    428673
deletion                      40105
copy number gain              21793
copy number loss              20317
duplication                   16029
insertion                      4159
indel                          3367
undetermined variant            552
NT expansion                    403
Translocation                   254
protein only                     96
inversion                        52
complex                          38
short repeat                     12
fusion                            4
Name: Type, dtype: int64

In [3]:
clinvar=clinvar.rename(columns={'#AlleleID':'AlleleID'})
clinvar=clinvar.loc[clinvar['Assembly']=="GRCh37",:]
clinvar=clinvar.loc[((clinvar['Type']=="single nucleotide variant") | (clinvar['Type']=="deletion") | (clinvar['Type']=="duplication") | (clinvar['Type']=="insertion") | (clinvar['Type']=="indel") | (clinvar['Type']=="short repeat")),:]
clinvar=clinvar.loc[((clinvar['ClinicalSignificance']=="Pathogenic") | (clinvar['ClinicalSignificance']=="Likely pathogenic") | (clinvar['ClinicalSignificance']=="Likely pathogenic;Pathogenic")
                     | (clinvar['ClinicalSignificance']=='Pathogenic;not provided') | (clinvar['ClinicalSignificance']=='Likely pathogenic;not provided')),:]
clinvar=clinvar.loc[~(clinvar['OriginSimple']=="somatic"),:]
clinvar=clinvar.dropna(subset='PhenotypeIDS')
clinvar=clinvar.loc[clinvar['PhenotypeIDS'].str.contains("OMIM:"),:]

In [4]:
# get the relationships between clinvar allele IDs and omim phenotype ids 
clinvarOmimPheno=pandas.DataFrame(clinvar['PhenotypeIDS'].str.split(';').tolist(), index=clinvar['AlleleID'].values).stack()
clinvarOmimPheno=clinvarOmimPheno[clinvarOmimPheno.str.contains("OMIM:")]
clinvarAlleleIDs=clinvarOmimPheno.index.get_level_values(0)
clinvarAlleleOmimIDs=clinvarOmimPheno.str.split("OMIM:").str[1].str.split(",").str[0].str.strip()
clinvarAllelesToOmimIDs=pandas.DataFrame({'clinvarAlleleID':clinvarAlleleIDs,'phenoMIM':clinvarAlleleOmimIDs.values})

In [5]:
# select down to just the allele IDs for which we have at least one inheritance pattern in omim
clinvarAllelesWithInheritance=clinvarAllelesToOmimIDs.merge(inheritanceMap,how='inner',on='phenoMIM')
# select down to just the variants without conflicts in the inheritance
clinvarAllelesWithInheritance2=clinvarAllelesWithInheritance.drop_duplicates(subset=['clinvarAlleleID','inheritance'],keep='first')
clinvarAllelesWithInheritance3=clinvarAllelesWithInheritance2.drop_duplicates(subset='clinvarAlleleID',keep=False)
# get the original data on the alleles with omim inheritance
clinvar2=clinvar.merge(clinvarAllelesWithInheritance3,how='inner',left_on=['AlleleID'],right_on=['clinvarAlleleID'])
clinvar2.loc[:,'ReviewStatus'].value_counts()

no assertion criteria provided                          15723
criteria provided, single submitter                      6726
criteria provided, multiple submitters, no conflicts     1500
reviewed by expert panel                                  241
practice guideline                                         11
no assertion for the individual variant                     1
Name: ReviewStatus, dtype: int64

In [6]:
clinvar3=clinvar2.loc[~(clinvar2['ReviewStatus']=="no assertion criteria provided"),:]
clinvar3=clinvar3.loc[~(clinvar3['ReviewStatus']=="no assertion for the individual variant"),:]
len(clinvar3.loc[clinvar3['inheritance']=='Autosomal recessive',:]) # this is the number of recessive variants we have now (but this includes coding and noncoding)

4659

In [7]:
len(clinvar3.loc[clinvar3['inheritance']=='Autosomal dominant',:]) # this is the number of dominant variants we have now (but this includes coding and noncoding)

3819

In [8]:
clinvar3AR=clinvar3.loc[clinvar3['inheritance']=='Autosomal recessive',['Chromosome','Start','ReferenceAllele','AlternateAllele']]
clinvar3AD=clinvar3.loc[clinvar3['inheritance']=='Autosomal dominant',['Chromosome','Start','ReferenceAllele','AlternateAllele']]
clinvar3AR=clinvar3AR.rename(columns={'Start':'Position'}).sort_values(by=['Chromosome','Position','ReferenceAllele','AlternateAllele'])
clinvar3AD=clinvar3AD.rename(columns={'Start':'Position'}).sort_values(by=['Chromosome','Position','ReferenceAllele','AlternateAllele'])
clinvar3AR=clinvar3AR.dropna()
clinvar3AD=clinvar3AD.dropna()
clinvar3AR=clinvar3AR.loc[clinvar3AR['Position']!=-1,:]
clinvar3AD=clinvar3AD.loc[clinvar3AD['Position']!=-1,:]
clinvar3AR.to_csv('clinvar/clinvar_201701_oneToFourStarPathogenicVariants_autosomalRecessive_locations.txt',sep='\t',index=False)
clinvar3AD.to_csv('clinvar/clinvar_201701_oneToFourStarPathogenicVariants_autosomalDominant_locations.txt',sep='\t',index=False)


## 2018

In [33]:
clinvar=pandas.read_csv('clinvar/variant_summary_2018-01.txt.gz',sep='\t',low_memory=False,compression='gzip')
clinvar.loc[:,'Type'].value_counts()

single nucleotide variant    604440
deletion                      55105
copy number gain              27995
copy number loss              24529
duplication                   21396
insertion                      5638
indel                          5323
NT expansion                    408
undetermined variant            404
inversion                       217
Translocation                   200
protein only                     99
complex                          51
short repeat                     25
fusion                            4
Name: Type, dtype: int64

In [36]:
clinvar=clinvar.rename(columns={'#AlleleID':'AlleleID'})
clinvar=clinvar.loc[clinvar['Assembly']=="GRCh37",:]
clinvar=clinvar.loc[((clinvar['Type']=="single nucleotide variant") | (clinvar['Type']=="deletion") | (clinvar['Type']=="duplication") | (clinvar['Type']=="insertion") | (clinvar['Type']=="indel") | (clinvar['Type']=="short repeat")),:]
clinvar=clinvar.loc[((clinvar['ClinicalSignificance']=="Pathogenic") | (clinvar['ClinicalSignificance']=="Likely pathogenic") | (clinvar['ClinicalSignificance']=="Pathogenic/Likely pathogenic")),:]
clinvar=clinvar.loc[~(clinvar['OriginSimple']=="somatic"),:]
clinvar=clinvar.dropna(subset='PhenotypeIDS')
clinvar=clinvar.loc[clinvar['PhenotypeIDS'].str.contains("OMIM:"),:]

In [37]:
# get the relationships between clinvar allele IDs and omim phenotype ids 
clinvarOmimPheno=pandas.DataFrame(clinvar['PhenotypeIDS'].str.split(';').tolist(), index=clinvar['AlleleID'].values).stack()
clinvarOmimPheno=clinvarOmimPheno[clinvarOmimPheno.str.contains("OMIM:")]
clinvarAlleleIDs=clinvarOmimPheno.index.get_level_values(0)
clinvarAlleleOmimIDs=clinvarOmimPheno.str.split("OMIM:").str[1].str.split(",").str[0].str.strip()
clinvarAllelesToOmimIDs=pandas.DataFrame({'clinvarAlleleID':clinvarAlleleIDs,'phenoMIM':clinvarAlleleOmimIDs.values})

In [38]:
# select down to just the allele IDs for which we have at least one inheritance pattern in omim
clinvarAllelesWithInheritance=clinvarAllelesToOmimIDs.merge(inheritanceMap,how='inner',on='phenoMIM')
# select down to just the variants without conflicts in the inheritance
clinvarAllelesWithInheritance2=clinvarAllelesWithInheritance.drop_duplicates(subset=['clinvarAlleleID','inheritance'],keep='first')
clinvarAllelesWithInheritance3=clinvarAllelesWithInheritance2.drop_duplicates(subset='clinvarAlleleID',keep=False)
# get the original data on the alleles with omim inheritance
clinvar2=clinvar.merge(clinvarAllelesWithInheritance3,how='inner',left_on=['AlleleID'],right_on=['clinvarAlleleID'])
clinvar2.loc[:,'ReviewStatus'].value_counts()

no assertion criteria provided                          16821
criteria provided, single submitter                     11219
criteria provided, multiple submitters, no conflicts     3428
reviewed by expert panel                                  319
no assertion for the individual variant                    32
practice guideline                                         11
Name: ReviewStatus, dtype: int64

In [39]:
clinvar3=clinvar2.loc[~(clinvar2['ReviewStatus']=="no assertion criteria provided"),:]
clinvar3=clinvar3.loc[~(clinvar3['ReviewStatus']=="no assertion for the individual variant"),:]
len(clinvar3.loc[clinvar3['inheritance']=='Autosomal recessive',:]) # this is the number of recessive variants we have now (but this includes coding and noncoding)

8441

In [40]:
len(clinvar3.loc[clinvar3['inheritance']=='Autosomal dominant',:]) # this is the number of dominant variants we have now (but this includes coding and noncoding)

6536

In [41]:
clinvar3AR=clinvar3.loc[clinvar3['inheritance']=='Autosomal recessive',['Chromosome','Start','ReferenceAllele','AlternateAllele']]
clinvar3AD=clinvar3.loc[clinvar3['inheritance']=='Autosomal dominant',['Chromosome','Start','ReferenceAllele','AlternateAllele']]
clinvar3AR=clinvar3AR.rename(columns={'Start':'Position'}).sort_values(by=['Chromosome','Position','ReferenceAllele','AlternateAllele'])
clinvar3AD=clinvar3AD.rename(columns={'Start':'Position'}).sort_values(by=['Chromosome','Position','ReferenceAllele','AlternateAllele'])
clinvar3AR=clinvar3AR.dropna()
clinvar3AD=clinvar3AD.dropna()
clinvar3AR=clinvar3AR.loc[clinvar3AR['Position']!=-1,:]
clinvar3AD=clinvar3AD.loc[clinvar3AD['Position']!=-1,:]
clinvar3AR.to_csv('clinvar/clinvar_201801_oneToFourStarPathogenicVariants_autosomalRecessive_locations.txt',sep='\t',index=False)
clinvar3AD.to_csv('clinvar/clinvar_201801_oneToFourStarPathogenicVariants_autosomalDominant_locations.txt',sep='\t',index=False)


## 2019

In [42]:
clinvar=pandas.read_csv('clinvar/variant_summary_2019-01.txt.gz',sep='\t',low_memory=False,compression='gzip')
clinvar.loc[:,'Type'].value_counts()

single nucleotide variant    776360
deletion                      74369
copy number gain              29573
duplication                   29087
copy number loss              26180
insertion                      7506
indel                          7456
NT expansion                    407
undetermined variant            401
inversion                       292
Translocation                   283
protein only                     98
complex                          56
short repeat                     41
fusion                            4
Name: Type, dtype: int64

In [43]:
clinvar=clinvar.rename(columns={'#AlleleID':'AlleleID'})
clinvar=clinvar.loc[clinvar['Assembly']=="GRCh37",:]
clinvar=clinvar.loc[((clinvar['Type']=="single nucleotide variant") | (clinvar['Type']=="deletion") | (clinvar['Type']=="duplication") | (clinvar['Type']=="insertion") | (clinvar['Type']=="indel") | (clinvar['Type']=="short repeat")),:]
clinvar=clinvar.loc[((clinvar['ClinicalSignificance']=="Pathogenic") | (clinvar['ClinicalSignificance']=="Likely pathogenic") | (clinvar['ClinicalSignificance']=="Pathogenic/Likely pathogenic")),:]
clinvar=clinvar.loc[~(clinvar['OriginSimple']=="somatic"),:]
clinvar=clinvar.dropna(subset='PhenotypeIDS')
clinvar=clinvar.loc[clinvar['PhenotypeIDS'].str.contains("OMIM:"),:]

In [44]:
# get the relationships between clinvar allele IDs and omim phenotype ids 
clinvarOmimPheno=pandas.DataFrame(clinvar['PhenotypeIDS'].str.split(';').tolist(), index=clinvar['AlleleID'].values).stack()
clinvarOmimPheno=clinvarOmimPheno[clinvarOmimPheno.str.contains("OMIM:")]
clinvarAlleleIDs=clinvarOmimPheno.index.get_level_values(0)
clinvarAlleleOmimIDs=clinvarOmimPheno.str.split("OMIM:").str[1].str.split(",").str[0].str.strip()
clinvarAllelesToOmimIDs=pandas.DataFrame({'clinvarAlleleID':clinvarAlleleIDs,'phenoMIM':clinvarAlleleOmimIDs.values})

In [45]:
# select down to just the allele IDs for which we have at least one inheritance pattern in omim
clinvarAllelesWithInheritance=clinvarAllelesToOmimIDs.merge(inheritanceMap,how='inner',on='phenoMIM')
# select down to just the variants without conflicts in the inheritance
clinvarAllelesWithInheritance2=clinvarAllelesWithInheritance.drop_duplicates(subset=['clinvarAlleleID','inheritance'],keep='first')
clinvarAllelesWithInheritance3=clinvarAllelesWithInheritance2.drop_duplicates(subset='clinvarAlleleID',keep=False)
# get the original data on the alleles with omim inheritance
clinvar2=clinvar.merge(clinvarAllelesWithInheritance3,how='inner',left_on=['AlleleID'],right_on=['clinvarAlleleID'])
clinvar2.loc[:,'ReviewStatus'].value_counts()

criteria provided, single submitter                     18637
no assertion criteria provided                          17311
criteria provided, multiple submitters, no conflicts     6006
reviewed by expert panel                                  524
no interpretation for the single variant                   25
practice guideline                                         14
Name: ReviewStatus, dtype: int64

In [46]:
clinvar3=clinvar2.loc[~(clinvar2['ReviewStatus']=="no assertion criteria provided"),:]
clinvar3=clinvar3.loc[~(clinvar3['ReviewStatus']=="no assertion for the individual variant"),:]
len(clinvar3.loc[clinvar3['inheritance']=='Autosomal recessive',:]) # this is the number of recessive variants we have now (but this includes coding and noncoding)

15330

In [47]:
len(clinvar3.loc[clinvar3['inheritance']=='Autosomal dominant',:]) # this is the number of dominant variants we have now (but this includes coding and noncoding)

9876

In [48]:
clinvar3AR=clinvar3.loc[clinvar3['inheritance']=='Autosomal recessive',['Chromosome','Start','ReferenceAllele','AlternateAllele']]
clinvar3AD=clinvar3.loc[clinvar3['inheritance']=='Autosomal dominant',['Chromosome','Start','ReferenceAllele','AlternateAllele']]
clinvar3AR=clinvar3AR.rename(columns={'Start':'Position'}).sort_values(by=['Chromosome','Position','ReferenceAllele','AlternateAllele'])
clinvar3AD=clinvar3AD.rename(columns={'Start':'Position'}).sort_values(by=['Chromosome','Position','ReferenceAllele','AlternateAllele'])
clinvar3AR=clinvar3AR.dropna()
clinvar3AD=clinvar3AD.dropna()
clinvar3AR=clinvar3AR.loc[clinvar3AR['Position']!=-1,:]
clinvar3AD=clinvar3AD.loc[clinvar3AD['Position']!=-1,:]
clinvar3AR.to_csv('clinvar/clinvar_201901_oneToFourStarPathogenicVariants_autosomalRecessive_locations.txt',sep='\t',index=False)
clinvar3AD.to_csv('clinvar/clinvar_201901_oneToFourStarPathogenicVariants_autosomalDominant_locations.txt',sep='\t',index=False)


## 2020

In [49]:
clinvar=pandas.read_csv('clinvar/variant_summary_2020-01.txt.gz',sep='\t',low_memory=False,compression='gzip')
clinvar.loc[:,'Type'].value_counts()

single nucleotide variant    1097964
deletion                       74859
copy number gain               39397
copy number loss               39089
duplication                    33118
short repeat                   18188
indel                           8391
insertion                       6804
undetermined variant             824
inversion                        580
Translocation                    298
NT expansion                     133
protein only                     103
complex                           60
fusion                             6
tandem duplication                 1
Name: Type, dtype: int64

In [50]:
clinvar=clinvar.rename(columns={'#AlleleID':'AlleleID'})
clinvar=clinvar.loc[clinvar['Assembly']=="GRCh37",:]
clinvar=clinvar.loc[((clinvar['Type']=="single nucleotide variant") | (clinvar['Type']=="deletion") | (clinvar['Type']=="duplication") | (clinvar['Type']=="insertion") | (clinvar['Type']=="indel") | (clinvar['Type']=="short repeat")),:]
clinvar=clinvar.loc[((clinvar['ClinicalSignificance']=="Pathogenic") | (clinvar['ClinicalSignificance']=="Likely pathogenic") | (clinvar['ClinicalSignificance']=="Pathogenic/Likely pathogenic")),:]
clinvar=clinvar.loc[~(clinvar['OriginSimple']=="somatic"),:]
clinvar=clinvar.dropna(subset='PhenotypeIDS')
clinvar=clinvar.loc[clinvar['PhenotypeIDS'].str.contains("OMIM:"),:]

In [51]:
# get the relationships between clinvar allele IDs and omim phenotype ids 
clinvarOmimPheno=pandas.DataFrame(clinvar['PhenotypeIDS'].str.split(';').tolist(), index=clinvar['AlleleID'].values).stack()
clinvarOmimPheno=clinvarOmimPheno[clinvarOmimPheno.str.contains("OMIM:")]
clinvarAlleleIDs=clinvarOmimPheno.index.get_level_values(0)
clinvarAlleleOmimIDs=clinvarOmimPheno.str.split("OMIM:").str[1].str.split(",").str[0].str.strip()
clinvarAllelesToOmimIDs=pandas.DataFrame({'clinvarAlleleID':clinvarAlleleIDs,'phenoMIM':clinvarAlleleOmimIDs.values})

In [52]:
# select down to just the allele IDs for which we have at least one inheritance pattern in omim
clinvarAllelesWithInheritance=clinvarAllelesToOmimIDs.merge(inheritanceMap,how='inner',on='phenoMIM')
# select down to just the variants without conflicts in the inheritance
clinvarAllelesWithInheritance2=clinvarAllelesWithInheritance.drop_duplicates(subset=['clinvarAlleleID','inheritance'],keep='first')
clinvarAllelesWithInheritance3=clinvarAllelesWithInheritance2.drop_duplicates(subset='clinvarAlleleID',keep=False)
# get the original data on the alleles with omim inheritance
clinvar2=clinvar.merge(clinvarAllelesWithInheritance3,how='inner',left_on=['AlleleID'],right_on=['clinvarAlleleID'])
clinvar2.loc[:,'ReviewStatus'].value_counts()

criteria provided, single submitter                     21755
no assertion criteria provided                          18849
criteria provided, multiple submitters, no conflicts     7596
reviewed by expert panel                                  719
practice guideline                                         11
Name: ReviewStatus, dtype: int64

In [53]:
clinvar3=clinvar2.loc[~(clinvar2['ReviewStatus']=="no assertion criteria provided"),:]
clinvar3=clinvar3.loc[~(clinvar3['ReviewStatus']=="no assertion for the individual variant"),:]
len(clinvar3.loc[clinvar3['inheritance']=='Autosomal recessive',:]) # this is the number of recessive variants we have now (but this includes coding and noncoding)

18010

In [54]:
len(clinvar3.loc[clinvar3['inheritance']=='Autosomal dominant',:]) # this is the number of dominant variants we have now (but this includes coding and noncoding)

12071

In [55]:
clinvar3AR=clinvar3.loc[clinvar3['inheritance']=='Autosomal recessive',['Chromosome','Start','ReferenceAllele','AlternateAllele']]
clinvar3AD=clinvar3.loc[clinvar3['inheritance']=='Autosomal dominant',['Chromosome','Start','ReferenceAllele','AlternateAllele']]
clinvar3AR=clinvar3AR.rename(columns={'Start':'Position'}).sort_values(by=['Chromosome','Position','ReferenceAllele','AlternateAllele'])
clinvar3AD=clinvar3AD.rename(columns={'Start':'Position'}).sort_values(by=['Chromosome','Position','ReferenceAllele','AlternateAllele'])
clinvar3AR=clinvar3AR.dropna()
clinvar3AD=clinvar3AD.dropna()
clinvar3AR=clinvar3AR.loc[clinvar3AR['Position']!=-1,:]
clinvar3AD=clinvar3AD.loc[clinvar3AD['Position']!=-1,:]
clinvar3AR.to_csv('clinvar/clinvar_202001_oneToFourStarPathogenicVariants_autosomalRecessive_locations.txt',sep='\t',index=False)
clinvar3AD.to_csv('clinvar/clinvar_202001_oneToFourStarPathogenicVariants_autosomalDominant_locations.txt',sep='\t',index=False)


# Get New Variants For Each Year

In [1]:
import pandas
AD202101=pandas.read_csv('clinvar/clinvar_202101_oneToFourStarPathogenicVariants_autosomalDominant_locations.txt',sep='\t',low_memory=False)
AD202001=pandas.read_csv('clinvar/clinvar_202001_oneToFourStarPathogenicVariants_autosomalDominant_locations.txt',sep='\t',low_memory=False)
AD201901=pandas.read_csv('clinvar/clinvar_201901_oneToFourStarPathogenicVariants_autosomalDominant_locations.txt',sep='\t',low_memory=False)
AD201801=pandas.read_csv('clinvar/clinvar_201801_oneToFourStarPathogenicVariants_autosomalDominant_locations.txt',sep='\t',low_memory=False)

clinvar2020=pandas.read_csv('clinvar/variant_summary_2020-01.txt.gz',sep='\t',low_memory=False,compression='gzip')
clinvar2020=clinvar2020.loc[clinvar2020['Assembly']=="GRCh37",:]
clinvar2020=clinvar2020.loc[((clinvar2020['Type']=="single nucleotide variant") | (clinvar2020['Type']=="Deletion") | (clinvar2020['Type']=="Duplication") | (clinvar2020['Type']=="Microsatellite") | (clinvar2020['Type']=="Indel") | (clinvar2020['Type']=="Insertion")),:]
clinvar2020=clinvar2020.loc[((clinvar2020['ClinicalSignificance']=="Pathogenic") | (clinvar2020['ClinicalSignificance']=="Likely pathogenic") | (clinvar2020['ClinicalSignificance']=="Pathogenic/Likely pathogenic")),:]
clinvar2020=clinvar2020.loc[~(clinvar2020['OriginSimple']=="somatic"),:]
clinvar2020=clinvar2020.rename(columns={'Start':'Position'})

clinvar2019=pandas.read_csv('clinvar/variant_summary_2019-01.txt.gz',sep='\t',low_memory=False,compression='gzip')
clinvar2019=clinvar2019.loc[clinvar2019['Assembly']=="GRCh37",:]
clinvar2019=clinvar2019.loc[((clinvar2019['Type']=="single nucleotide variant") | (clinvar2019['Type']=="deletion") | (clinvar2019['Type']=="duplication") | (clinvar2019['Type']=="insertion") | (clinvar2019['Type']=="indel") | (clinvar2019['Type']=="short repeat")),:]
clinvar2019=clinvar2019.loc[((clinvar2019['ClinicalSignificance']=="Pathogenic") | (clinvar2019['ClinicalSignificance']=="Likely pathogenic") | (clinvar2019['ClinicalSignificance']=="Pathogenic/Likely pathogenic")),:]
clinvar2019=clinvar2019.loc[~(clinvar2019['OriginSimple']=="somatic"),:]
clinvar2019=clinvar2019.rename(columns={'Start':'Position'})

clinvar2018=pandas.read_csv('clinvar/variant_summary_2018-01.txt.gz',sep='\t',low_memory=False,compression='gzip')
clinvar2018=clinvar2018.loc[clinvar2018['Assembly']=="GRCh37",:]
clinvar2018=clinvar2018.loc[((clinvar2018['Type']=="single nucleotide variant") | (clinvar2018['Type']=="deletion") | (clinvar2018['Type']=="duplication") | (clinvar2018['Type']=="insertion") | (clinvar2018['Type']=="indel") | (clinvar2018['Type']=="short repeat")),:]
clinvar2018=clinvar2018.loc[((clinvar2018['ClinicalSignificance']=="Pathogenic") | (clinvar2018['ClinicalSignificance']=="Likely pathogenic") | (clinvar2018['ClinicalSignificance']=="Pathogenic/Likely pathogenic")),:]
clinvar2018=clinvar2018.loc[~(clinvar2018['OriginSimple']=="somatic"),:]
clinvar2018=clinvar2018.rename(columns={'Start':'Position'})

clinvar2017=pandas.read_csv('clinvar/variant_summary_2017-01.txt.gz',sep='\t',low_memory=False,compression='gzip')
clinvar2017=clinvar2017.loc[clinvar2017['Assembly']=="GRCh37",:]
clinvar2017=clinvar2017.loc[((clinvar2017['Type']=="single nucleotide variant") | (clinvar2017['Type']=="deletion") | (clinvar2017['Type']=="duplication") | (clinvar2017['Type']=="insertion") | (clinvar2017['Type']=="indel") | (clinvar2017['Type']=="short repeat")),:]
clinvar2017=clinvar2017.loc[((clinvar2017['ClinicalSignificance']=="Pathogenic") | (clinvar2017['ClinicalSignificance']=="Likely pathogenic") | (clinvar2017['ClinicalSignificance']=="Likely pathogenic;Pathogenic") | (clinvar2017['ClinicalSignificance']=='Pathogenic;not provided') | (clinvar2017['ClinicalSignificance']=='Likely pathogenic;not provided')),:]
clinvar2017=clinvar2017.loc[~(clinvar2017['OriginSimple']=="somatic"),:]
clinvar2017=clinvar2017.rename(columns={'Start':'Position'})


AD202101=AD202101.merge(clinvar2020,how='left',on=['Chromosome','Position','ReferenceAllele','AlternateAllele'],indicator=True)
AD202101=AD202101.loc[AD202101['_merge']=='left_only',['Chromosome','Position','ReferenceAllele','AlternateAllele']].reset_index(drop=True)
AD202101.to_csv('clinvar/clinvar_202101_oneToFourStarPathogenicVariants_autosomalDominant_locations_v2.txt',sep='\t',index=False)

AD202001=AD202001.merge(clinvar2019,how='left',on=['Chromosome','Position','ReferenceAllele','AlternateAllele'],indicator=True)
AD202001=AD202001.loc[AD202001['_merge']=='left_only',['Chromosome','Position','ReferenceAllele','AlternateAllele']].reset_index(drop=True)
AD202001.to_csv('clinvar/clinvar_202001_oneToFourStarPathogenicVariants_autosomalDominant_locations_v2.txt',sep='\t',index=False)

AD201901=AD201901.merge(clinvar2018,how='left',on=['Chromosome','Position','ReferenceAllele','AlternateAllele'],indicator=True)
AD201901=AD201901.loc[AD201901['_merge']=='left_only',['Chromosome','Position','ReferenceAllele','AlternateAllele']].reset_index(drop=True)
AD201901.to_csv('clinvar/clinvar_201901_oneToFourStarPathogenicVariants_autosomalDominant_locations_v2.txt',sep='\t',index=False)

AD201801=AD201801.merge(clinvar2017,how='left',on=['Chromosome','Position','ReferenceAllele','AlternateAllele'],indicator=True)
AD201801=AD201801.loc[AD201801['_merge']=='left_only',['Chromosome','Position','ReferenceAllele','AlternateAllele']].reset_index(drop=True)
AD201801.to_csv('clinvar/clinvar_201801_oneToFourStarPathogenicVariants_autosomalDominant_locations_v2.txt',sep='\t',index=False)

AR202101=pandas.read_csv('clinvar/clinvar_202101_oneToFourStarPathogenicVariants_autosomalRecessive_locations.txt',sep='\t',low_memory=False)
AR202001=pandas.read_csv('clinvar/clinvar_202001_oneToFourStarPathogenicVariants_autosomalRecessive_locations.txt',sep='\t',low_memory=False)
AR201901=pandas.read_csv('clinvar/clinvar_201901_oneToFourStarPathogenicVariants_autosomalRecessive_locations.txt',sep='\t',low_memory=False)
AR201801=pandas.read_csv('clinvar/clinvar_201801_oneToFourStarPathogenicVariants_autosomalRecessive_locations.txt',sep='\t',low_memory=False)
AR201701=pandas.read_csv('clinvar/clinvar_201701_oneToFourStarPathogenicVariants_autosomalRecessive_locations.txt',sep='\t',low_memory=False)

AR202101=AR202101.merge(clinvar2020,how='left',on=['Chromosome','Position','ReferenceAllele','AlternateAllele'],indicator=True)
AR202101=AR202101.loc[AR202101['_merge']=='left_only',['Chromosome','Position','ReferenceAllele','AlternateAllele']].reset_index(drop=True)
AR202101.to_csv('clinvar/clinvar_202101_oneToFourStarPathogenicVariants_autosomalRecessive_locations_v2.txt',sep='\t',index=False)

AR202001=AR202001.merge(clinvar2019,how='left',on=['Chromosome','Position','ReferenceAllele','AlternateAllele'],indicator=True)
AR202001=AR202001.loc[AR202001['_merge']=='left_only',['Chromosome','Position','ReferenceAllele','AlternateAllele']].reset_index(drop=True)
AR202001.to_csv('clinvar/clinvar_202001_oneToFourStarPathogenicVariants_autosomalRecessive_locations_v2.txt',sep='\t',index=False)

AR201901=AR201901.merge(clinvar2018,how='left',on=['Chromosome','Position','ReferenceAllele','AlternateAllele'],indicator=True)
AR201901=AR201901.loc[AR201901['_merge']=='left_only',['Chromosome','Position','ReferenceAllele','AlternateAllele']].reset_index(drop=True)
AR201901.to_csv('clinvar/clinvar_201901_oneToFourStarPathogenicVariants_autosomalRecessive_locations_v2.txt',sep='\t',index=False)

AR201801=AR201801.merge(clinvar2017,how='left',on=['Chromosome','Position','ReferenceAllele','AlternateAllele'],indicator=True)
AR201801=AR201801.loc[AR201801['_merge']=='left_only',['Chromosome','Position','ReferenceAllele','AlternateAllele']].reset_index(drop=True)
AR201801.to_csv('clinvar/clinvar_201801_oneToFourStarPathogenicVariants_autosomalRecessive_locations_v2.txt',sep='\t',index=False)


# END