# Setup

In [None]:
# install necessary modules
!pip install wget pandas numpy biopython
# download the resources to the 'resources' folder
!python -m wget https://zuchnerlab.s3.amazonaws.com/VariantPathogenicity/MaverickTrainingResources.tar.gz
!tar -zxvf MaverickTrainingResources.tar.gz

# Create OMIM inheritance map

In [None]:
import pandas
# Load the OMIM gene map, keep the four columns used below, and discard rows
# that lack either an approved gene symbol or a phenotype entry.
genemapColumns=['MIM Number','Approved Symbol','Ensembl Gene ID','Phenotypes']
genemap=pandas.read_csv('resources/genemap2_20200114.txt',sep='\t',low_memory=False,comment='#')
genemap=genemap.loc[:,genemapColumns].dropna(subset=['Approved Symbol','Phenotypes'])
genemap

In [None]:
# gene MIM number -> approved symbol, kept so gene names can be re-attached later
mimToGene=genemap[['MIM Number','Approved Symbol']]
# one row per ';'-separated phenotype entry, indexed by the gene's MIM number
phenotypeLists=genemap['Phenotypes'].str.split(';').tolist()
phenotypeTable=pandas.DataFrame(phenotypeLists,index=genemap['MIM Number'].values)
phenotypes=phenotypeTable.stack().str.strip()
phenotypes

In [None]:
# Drop entries whose text starts with "?", "[" or "{"
# (NOTE(review): in OMIM notation these presumably mark provisional, non-disease and
# susceptibility entries -- confirm against the genemap2 documentation).
# The mask is deliberately not called `filter`, which would shadow the Python builtin.
excludedEntry=phenotypes.str[0].isin(["?","[","{"])
phenotypes2=phenotypes.loc[~excludedEntry]
# Keep only entries that carry a six-digit number followed by " (".
# Raw string: "\(" is an invalid escape sequence in a normal string literal.
hasPhenotypeMIM=phenotypes2.str.contains(r" [0-9]{6} \(")
phenotypes2=phenotypes2.loc[hasPhenotypeMIM]
phenotypes2

In [None]:
# Pull the individual fields out of each phenotype entry. As implied by the patterns below,
# an entry looks like "<name>, <six-digit number> (<digit>), <inheritance>".
# All regular expressions are raw strings so that "\(" / "\)" are not treated as
# (invalid) string escape sequences.
# gene MIM number: outer level of the stacked index
locusMIMs=phenotypes2.index.get_level_values(0)
# phenotype MIM number: last comma-separated token before the "(<digit>)" marker
phenotypeMIMs=phenotypes2.str.split(r"\([0-9]\)").str[0].str.split(',').str[-1].str.strip()
# inheritance: the text following "(<digit>), " (missing when nothing follows the marker)
phenotypeInheritances=phenotypes2.str.split(r"\([0-9]\), ").str[1].str.strip()
# mapping method: the text between the parentheses that follow the six-digit number
mappingMethod=phenotypes2.str.split(r" [0-9]{6} \(").str[1].str.split(')').str[0]

In [None]:
# Assemble the parsed fields into one table: one row per (gene, phenotype entry) pair.
mapColumns={
    'geneMIM':locusMIMs,
    'phenoMIM':phenotypeMIMs.values,
    'inheritance':phenotypeInheritances.values,
    'mappingMethod':mappingMethod.values,
}
newMap=pandas.DataFrame(mapColumns)
newMap

In [None]:
# persist the unfiltered gene -> phenotype -> inheritance table as a tab-separated file
newMap.to_csv(
    'resources/geneToPhenoToInheritanceMap.txt',
    index=False,
    sep='\t',
)

In [None]:
# keep only the purely autosomal dominant / autosomal recessive entries ...
inheritancesOfInterest=["Autosomal dominant","Autosomal recessive"]
newMap2=newMap.loc[newMap['inheritance'].isin(inheritancesOfInterest),:]
# ... whose mapping method value is '3'
isMethodThree=newMap2['mappingMethod']=='3'
newMap3=newMap2.loc[isMethodThree,:]
# add back in the gene names
geneNameColumns={'MIM Number':'geneMIM','Approved Symbol':'GeneSymbol'}
mimToGene=mimToGene.rename(columns=geneNameColumns)
newMap4=newMap3.merge(mimToGene,how='inner',on='geneMIM')
newMap4.to_csv('resources/geneToPhenoToInheritanceMap_filtered.txt',index=False,sep='\t')

# Parse 2020 Clinvar variants

In [None]:
import pandas
# Load the filtered gene -> phenotype -> inheritance map written above.
# phenoMIM is read as text on purpose: further down it is the merge key against phenotype IDs
# parsed out of ClinVar strings, and pandas refuses to merge a string key with an integer key.
inheritanceMap=pandas.read_csv('resources/geneToPhenoToInheritanceMap_filtered.txt',sep='\t',low_memory=False,dtype={'phenoMIM':str})
# the same gene/phenotype/inheritance triple can appear more than once; keep a single copy
inheritanceMap=inheritanceMap.drop_duplicates(subset=['geneMIM','phenoMIM','inheritance'],keep='first')
inheritanceMap

In [None]:
# read the January 2020 ClinVar variant summary and tabulate the variant types it contains
clinvar=pandas.read_csv(
    'resources/variant_summary_2020-01.txt',
    sep='\t',
    low_memory=False,
)
clinvar['Type'].value_counts()

In [None]:
# Restrict ClinVar to GRCh37 records of the small-variant types, then split it into a benign
# table and a pathogenic table (the latter keeps the name `clinvar`).
smallVariantTypes=["single nucleotide variant","deletion","duplication","short repeat","indel","insertion"]
benignLabels=["Benign","Likely benign","Benign/Likely benign"]
pathogenicLabels=["Pathogenic","Likely pathogenic","Pathogenic/Likely pathogenic"]

clinvar=clinvar.loc[clinvar['Assembly']=="GRCh37",:]
clinvar=clinvar.loc[clinvar['Type'].isin(smallVariantTypes),:]
# benign set: benign / likely benign assertions that are not of somatic origin
benign=clinvar.loc[clinvar['ClinicalSignificance'].isin(benignLabels),:]
benign=benign.loc[benign['OriginSimple']!="somatic",:]
# pathogenic set: pathogenic / likely pathogenic assertions that are not of somatic origin ...
clinvar=clinvar.loc[clinvar['ClinicalSignificance'].isin(pathogenicLabels),:]
clinvar=clinvar.loc[clinvar['OriginSimple']!="somatic",:]
# ... and that reference at least one OMIM phenotype.
# na=False: a missing PhenotypeIDS value counts as "no OMIM id" instead of producing a NaN
# in the mask, which .loc cannot use for boolean indexing.
clinvar=clinvar.loc[clinvar['PhenotypeIDS'].str.contains("OMIM:",na=False),:]

In [None]:
# get the relationships between clinvar allele IDs and omim phenotype ids
# one row per ';'-separated phenotype ID, indexed by the allele it belongs to
phenotypeIDLists=clinvar['PhenotypeIDS'].str.split(';').tolist()
clinvarOmimPheno=pandas.DataFrame(phenotypeIDLists,index=clinvar['AlleleID'].values).stack()
mentionsOmim=clinvarOmimPheno.str.contains("OMIM:")
clinvarOmimPheno=clinvarOmimPheno[mentionsOmim]
clinvarAlleleIDs=clinvarOmimPheno.index.get_level_values(0)
# the OMIM id is whatever follows "OMIM:" up to the next comma
textAfterOmimTag=clinvarOmimPheno.str.split("OMIM:").str[1]
clinvarAlleleOmimIDs=textAfterOmimTag.str.split(",").str[0].str.strip()
alleleToOmimColumns={'clinvarAlleleID':clinvarAlleleIDs,'phenoMIM':clinvarAlleleOmimIDs.values}
clinvarAllelesToOmimIDs=pandas.DataFrame(alleleToOmimColumns)

In [None]:
# select down to just the allele IDs for which we have at least one inheritance pattern in omim
clinvarAllelesWithInheritance=clinvarAllelesToOmimIDs.merge(inheritanceMap,on='phenoMIM',how='inner')
# select down to just the variants without conflicts in the inheritance:
# first collapse repeated (allele, inheritance) pairs, then drop every allele that still has
# more than one row, i.e. more than one distinct inheritance
clinvarAllelesWithInheritance2=clinvarAllelesWithInheritance.drop_duplicates(
    subset=['clinvarAlleleID','inheritance'],keep='first')
clinvarAllelesWithInheritance3=clinvarAllelesWithInheritance2.drop_duplicates(
    subset='clinvarAlleleID',keep=False)
# get the original data on the alleles with omim inheritance
clinvar2=clinvar.merge(
    clinvarAllelesWithInheritance3,
    how='inner',
    left_on=['AlleleID'],
    right_on=['clinvarAlleleID'],
)
clinvar2['ReviewStatus'].value_counts()

In [None]:
# discard submissions that came without assertion criteria
hasCriteria=clinvar2['ReviewStatus']!="no assertion criteria provided"
clinvar3=clinvar2.loc[hasCriteria,:]
int((clinvar3['inheritance']=='Autosomal recessive').sum()) # this is the number of recessive variants we have now (but this includes coding and noncoding)

In [None]:
int((clinvar3['inheritance']=='Autosomal dominant').sum()) # this is the number of dominant variants we have now (but this includes coding and noncoding)

In [None]:
# write the sorted genomic locations of the recessive and the dominant pathogenic variants
locationColumns=['Chromosome','Start','ReferenceAllele','AlternateAllele']
isRecessive=clinvar3['inheritance']=='Autosomal recessive'
isDominant=clinvar3['inheritance']=='Autosomal dominant'
clinvar3AR=clinvar3.loc[isRecessive,locationColumns].sort_values(by=locationColumns)
clinvar3AD=clinvar3.loc[isDominant,locationColumns].sort_values(by=locationColumns)
clinvar3AR.to_csv('resources/clinvar_pathogenic_AR_locations.txt',index=False,sep='\t')
clinvar3AD.to_csv('resources/clinvar_pathogenic_AD_locations.txt',index=False,sep='\t')


In [None]:
# deal with clinvar and gnomAD benigns
locationColumns=['Chromosome','Start','ReferenceAllele','AlternateAllele']
# ClinVar benigns: drop submissions without assertion criteria and keep only the location columns
benign=benign.loc[benign['ReviewStatus']!="no assertion criteria provided",:]
benign=benign.loc[:,locationColumns]
benign.to_csv('resources/clinvar_oneToFourStarBenign_locations.txt',index=False,sep='\t')
# append the gnomAD variant list, then sort and keep one row per distinct location/allele combination
gnomad=pandas.read_csv('resources/gnomADVariantsSeenAsHomAltTwiceInExomes.txt',sep='\t',low_memory=False)
benign2=pandas.concat([benign,gnomad],axis=0,ignore_index=True).sort_values(by=locationColumns)
benign2=benign2.drop_duplicates(subset=locationColumns,keep='first')
benign2=benign2.reset_index(drop=True)
benign2.to_csv('resources/clinvar_benign_locations.txt',index=False,sep='\t')
len(benign2) # this is the number of benign variants we have now

In [None]:
%%bash
# turn the files into VCFs and process with Annovar
# (the %%bash cell magic only works as the very first line of the cell, so this comment sits below it)
cd resources
dos2unix clinvar_oneToFourStar*

# the same four steps are applied to each of the three variant sets
for base in clinvar_pathogenic_AR clinvar_pathogenic_AD clinvar_benign; do
    # 1. skip the column header of the locations table and emit one VCF record per row
    #    (CHROM, POS, ".", REF, ALT, ".", PASS, ".", GT, 0/1) below the shared header.vcf
    tail -n +2 "${base}_locations.txt" | awk -F'\t' -v OFS='\t' '{print $1,$2,".",$3,$4,".","PASS",".","GT","0/1"}' | cat header.vcf - > "${base}.vcf"
    # 2. convert the VCF to ANNOVAR input
    annovar/convert2annovar.pl -format vcf4 "${base}.vcf" > "${base}.avinput"
    # 3. gene-based annotation against the GENCODE basic v33 (lifted to hg19) gene set
    annovar/annotate_variation.pl -dbtype wgEncodeGencodeBasicV33lift37 -buildver hg19 --exonicsplicing "${base}.avinput" annovar/humandb/
    # 4. wild-type and altered protein sequences for every affected transcript
    annovar/coding_change.pl "${base}.avinput.exonic_variant_function" annovar/humandb/hg19_wgEncodeGencodeBasicV33lift37.txt annovar/humandb/hg19_wgEncodeGencodeBasicV33lift37Mrna.fa --includesnp --onlyAltering --alltranscript > "${base}.coding_changes.txt"
done

cd ..

In [None]:
import pandas
import numpy as np
from Bio import SeqIO
# table of the transcripts that are allowed to be selected
approvedTranscripts=pandas.read_csv('resources/gencodeBasicFullLengthTranscriptsConversionTable.txt',sep='\t',low_memory=False)

canonical=pandas.read_csv('resources/gnomad211_constraint_canonical_simple.txt',sep='\t',low_memory=False)
# remove the gnomad canonical transcripts that are not approvedTranscripts
canonical=canonical.loc[canonical['transcript'].isin(approvedTranscripts['transcriptIDShort'].values),:].reset_index(drop=True)

GTEx=pandas.read_csv('resources/GTEx.V7.tx_medians.021820.tsv',sep='\t',low_memory=False)
# remove the non-approvedTranscripts from the expression data
GTEx=GTEx.loc[GTEx['transcript_id'].isin(approvedTranscripts['transcriptIDShort'].values),:].reset_index(drop=True)
# add an overall expression column: the per-transcript (row-wise) mean of columns 2..54.
# axis=1 is essential here: the default column-wise mean returns a Series indexed by column
# name, which lines up with none of the row labels and would leave overallAvg entirely NaN.
GTEx['overallAvg']=GTEx.iloc[:,2:55].mean(axis=1)

# protein sequence of every approved transcript, keyed by the second '|'-separated field of the FASTA id
sequences={}
# set lookup instead of scanning the whole transcriptID array once per FASTA record
approvedTranscriptIDs=set(approvedTranscripts['transcriptID'].values)
for record in SeqIO.parse("resources/gencode.v33lift37.pc_translations.fa","fasta"):
    transcriptID=record.id.split('|')[1]
    if transcriptID in approvedTranscriptIDs:
        sequences[transcriptID]=record.seq

def _assignTranscript(sample,row,transcriptIDShort,approvedTranscripts):
    """Write the identifiers of the chosen transcript (and of its gene) into row `row` of `sample`."""
    match=approvedTranscripts.loc[approvedTranscripts['transcriptIDShort']==transcriptIDShort,:]
    sample.loc[row,'TranscriptIDShort']=transcriptIDShort
    for target,source in (('TranscriptID','transcriptID'),('geneName','geneName'),('geneID','geneID'),('geneIDShort','geneIDShort')):
        # .values[0] raises IndexError when the transcript is not in approvedTranscripts
        sample.loc[row,target]=match[source].values[0]


def _mostExpressedTranscript(GTEx,transcriptIDs):
    """Return the entry with the highest `overallAvg` among `transcriptIDs`, or None if GTEx lists none of them."""
    expressed=GTEx.loc[GTEx['transcript_id'].isin(transcriptIDs),:]
    if len(expressed)==0:
        return None
    # the identifier is taken from the first column of the GTEx table
    return expressed.sort_values(by=['overallAvg'],ascending=False).iloc[0,0]


def _parseChangePos(description):
    """Read the changed position from a coding-change FASTA description line.

    The position is the 8th space-separated token; for a range written "a-b" the start is used.
    It is returned as an int so the integer ChangePos column keeps its dtype; a token that is
    not an integer is returned unchanged.
    """
    token=description.split(' ')[7].split('-')[0]
    try:
        return int(token)
    except ValueError:
        return token


def groomAnnovarOutput(base,sequences=sequences,approvedTranscripts=approvedTranscripts,canonical=canonical,GTEx=GTEx):
    """Select one transcript per annotated variant and attach its wild-type and altered protein sequence.

    Reads resources/<base>.avinput.exonic_variant_function, resources/<base>_locations.txt and
    resources/<base>.coding_changes.txt, and writes resources/<base>.groomed.txt.

    Transcript choice per variant, considering only transcripts listed in `approvedTranscripts`:
      * exactly one canonical transcript -> that one;
      * no canonical transcript -> the only approved transcript, otherwise the one with the highest
        GTEx `overallAvg`, otherwise the one with the longest amino acid sequence;
      * several canonical transcripts -> the canonical one with the highest GTEx `overallAvg`,
        otherwise the canonical one with the longest amino acid sequence.
    Rows that end up without a wild-type sequence, without an altered sequence, or with
    ChangePos == -1 are not written.

    Parameters
    ----------
    base : str
        File name stem of the variant set, e.g. 'clinvar_benign'.
    sequences : dict
        Protein sequence per full transcript ID.
    approvedTranscripts, canonical, GTEx : pandas.DataFrame
        Reference tables; default to the module-level tables of the same names.

    Returns
    -------
    None
    """

    sample=pandas.read_csv("resources/" + base + ".avinput.exonic_variant_function",sep='\t',low_memory=False,header=None,
                        names=['line','varType','location','hg19_chr','hg19_pos(1-based)','end','ref','alt','genotype','qual','depth'])
    # convert the position, ref, and alt alleles to long form
    longForm=pandas.read_csv("resources/" + base + "_locations.txt",sep='\t',low_memory=False)
    longForm=longForm.rename(columns={'Chromosome':'chrom','Start':'pos_long','ReferenceAllele':'ref_long','AlternateAllele':'alt_long'})
    # 'line<N>' (1-based) is matched to row N-1 of the locations table
    sample['lineNum']=sample.loc[:,'line'].str[4:].astype(int)-1
    sample=sample.merge(longForm,how='inner',left_on='lineNum',right_on=longForm.index)
    sample=sample.loc[:,['line','varType','location','hg19_chr','pos_long','end','ref_long','alt_long','genotype','qual','depth']].rename(columns={'pos_long':'hg19_pos(1-based)','ref_long':'ref','alt_long':'alt'}).reset_index(drop=True)
    # add new columns with placeholders to be filled in
    sample['WildtypeSeq']=""
    sample['AltSeq']=""
    sample['ChangePos']=-1
    sample['TranscriptID']=""
    sample['TranscriptIDShort']=sample['location'].str.split(':',expand=True)[1].str[:15]
    sample['geneName']=sample['location'].str.split(':',expand=True)[0]
    sample['geneID']=""
    sample['geneIDShort']=""

    # membership sets, built once instead of scanning the arrays for every transcript of every row
    canonicalIDs=set(canonical['transcript'].values)
    approvedIDs=set(approvedTranscripts['transcriptID'].values)

    for i in range(len(sample)):
        if i % 1000 == 0:
            print(str(i) + ' rows completed')
        # one 'gene:transcript:...' entry per transcript; the last comma-separated element is skipped
        # (presumably because the field ends with a trailing comma -- TODO confirm)
        fullIDs=[entry.split(':')[1] for entry in sample.loc[i,'location'].split(',')[:-1]]
        canonicals=[]
        transcripts=[]
        transcriptLengths=[]
        for fullID in fullIDs:
            shortID=fullID[:15]
            if shortID in canonicalIDs:
                canonicals.append(shortID)
            if fullID in approvedIDs:
                transcripts.append(shortID)
                transcriptLengths.append(len(sequences[fullID]))

        if len(transcripts)==0:
            # no approved transcript: TranscriptID stays empty and the row is dropped at the end
            continue

        if len(canonicals)==1:
            chosen=canonicals[0]
        elif len(canonicals)==0 and len(transcripts)==1:
            chosen=transcripts[0]
        else:
            # several canonical transcripts -> choose among those; none -> choose among all approved ones
            candidates=canonicals if len(canonicals)>1 else transcripts
            # pick the candidate with the highest expression
            chosen=_mostExpressedTranscript(GTEx,candidates)
            if chosen is None:
                # none of the candidates has measured expression: pick the one with the longest
                # amino acid sequence; if multiple tie for longest, this picks the one we saw first
                ranked=[(shortID,length) for shortID,length in zip(transcripts,transcriptLengths) if shortID in candidates]
                if len(ranked)==0:
                    # no candidate has a known sequence length: fall back to every approved transcript
                    ranked=list(zip(transcripts,transcriptLengths))
                chosen=max(ranked,key=lambda pair: pair[1])[0]
        _assignTranscript(sample,i,chosen,approvedTranscripts)

    # row label of every 'line' id, so each FASTA record is matched without scanning the table
    rowOfLine={line:row for row,line in zip(sample.index,sample['line'])}
    for record in SeqIO.parse("resources/" + base + ".coding_changes.txt", "fasta"):
        row=rowOfLine.get(record.id)
        if row is None:
            # the record belongs to a line that is not in the table
            continue
        recordTranscript=record.description.split(' ')[1]
        # only use the transcript that we selected above
        if sample.loc[row,'TranscriptID']!=recordTranscript:
            continue
        proteinSeq=str(record.seq)
        if 'WILDTYPE' in record.description:
            # accept the wild-type sequence only if, without its final character, it equals the reference translation
            if proteinSeq[:-1]==str(sequences[recordTranscript]):
                sample.loc[row,'WildtypeSeq']=proteinSeq
        else:
            sample.loc[row,'AltSeq']=proteinSeq
            if 'startloss' in record.description:
                sample.loc[row,'ChangePos']=1
            elif 'silent' in record.description:
                sample.loc[row,'ChangePos']=-1
            else:
                sample.loc[row,'ChangePos']=_parseChangePos(record.description)
    sample2=sample.loc[~((sample['WildtypeSeq']=="") | (sample['AltSeq']=="") | (sample['ChangePos']==-1)),:]
    sample2.to_csv("resources/" + base + '.groomed.txt',sep='\t',index=False)
    return


In [None]:
# groom the ANNOVAR output of the three variant sets, in the same order as before
for variantSet in ('clinvar_pathogenic_AR','clinvar_pathogenic_AD','clinvar_benign'):
    groomAnnovarOutput(variantSet)

In [None]:
import pandas
# transcript-level gnomAD constraint table
constraint=pandas.read_csv('resources/gnomad211_constraint_canonical_simple.txt',sep='\t',low_memory=False)

# non-numeric chromosome names are replaced by integer codes so the column can be cast to int
chromosomeCodes=(('X',23),('Y',24),('MT',25))

# gnomAD exome allele frequencies
gnomadAF=pandas.read_csv('resources/gnomad211_exomes_AFs.txt',sep='\t',low_memory=False)
for chromName,chromCode in chromosomeCodes:
    gnomadAF.loc[gnomadAF['hg19_chr']==chromName,'hg19_chr']=chromCode
gnomadAF['hg19_chr']=gnomadAF['hg19_chr'].astype(int)

# CCR: only X is recoded here; keep the highest ccr_pct per position
CCR=pandas.read_csv('resources/ccrs.enumerated.txt',sep='\t',low_memory=False)
CCR.loc[CCR['chrom']=='X','chrom']=23
CCR['chrom']=CCR['chrom'].astype(int)
CCR=CCR.sort_values(by=['chrom','pos','ccr_pct'],ascending=[True,True,False])
CCR=CCR.drop_duplicates(subset=['chrom','pos'],keep='first').reset_index(drop=True)

# pext: keep the highest value per position
pext=pandas.read_csv('resources/gnomAD_pext_values.txt',sep='\t',low_memory=False)
for chromName,chromCode in chromosomeCodes:
    pext.loc[pext['chr']==chromName,'chr']=chromCode
pext['chr']=pext['chr'].astype(int)
pext=pext.sort_values(by=['chr','pos','pext'],ascending=[True,True,False])
pext=pext.drop_duplicates(subset=['chr','pos'],keep='first').reset_index(drop=True)

# GERP (file has no header row): keep the highest value per position
gerp=pandas.read_csv('resources/gerpOnExons.txt',sep='\t',low_memory=False,header=None,names=['chr','pos','gerp'])
for chromName,chromCode in chromosomeCodes:
    gerp.loc[gerp['chr']==chromName,'chr']=chromCode
gerp['chr']=gerp['chr'].astype(int)
gerp=gerp.sort_values(by=['chr','pos','gerp'],ascending=[True,True,False])
gerp=gerp.drop_duplicates(subset=['chr','pos'],keep='first').reset_index(drop=True)

# gene-level scores
GDI=pandas.read_csv('resources/GDI.groomed.txt',sep='\t',low_memory=False)
RVIS=pandas.read_csv('resources/RVIS.groomed.txt',sep='\t',low_memory=False)

def _maxScoreOverIndelSpans(indels,scores,chromColumn,scoreColumn):
    """Return, for each indel row, the highest `scoreColumn` value across the reference-allele span.

    The span of a row starts one base after its recorded position and covers `length` positions.
    A row whose span has no entry in `scores` yields NaN. Progress is printed every 100 rows.
    """
    total=len(indels)
    best=[]
    rows=zip(indels['hg19_chr'],indels['hg19_pos(1-based)'],indels['length'])
    for i,(chrom,pos,length) in enumerate(rows):
        if i%100==0:
            print(str(i) + ' rows complete of ' + str(total))
        startPos=pos+1
        endPos=startPos+length
        inSpan=(scores[chromColumn]==chrom) & (scores['pos'].isin(range(startPos,endPos)))
        best.append(scores.loc[inSpan,scoreColumn].max())
    return best


def _attachPositionalScore(sample2,sampleSNVs,sampleIndels,scores,chromColumn,scoreColumn,targetColumn):
    """Add `targetColumn` to `sample2` in place: exact-position lookup for SNVs, span maximum for indels."""
    sample2[targetColumn]=float('nan')
    sampleIndels[targetColumn]=float('nan')
    snvScores=sampleSNVs.merge(scores,how='left',left_on=['hg19_chr','hg19_pos(1-based)'],right_on=[chromColumn,'pos']).set_index(sampleSNVs.index)
    indelScores=_maxScoreOverIndelSpans(sampleIndels,scores,chromColumn,scoreColumn)
    if len(sampleIndels)>0:
        sampleIndels[targetColumn]=indelScores
    sample2.loc[snvScores.index,targetColumn]=snvScores.loc[:,scoreColumn].values
    sample2.loc[sampleIndels.index,targetColumn]=sampleIndels.loc[:,targetColumn].values


def annotateVariantsAndFilter(base,constraint=constraint,gnomadAF=gnomadAF,CCR=CCR,pext=pext,gerp=gerp,GDI=GDI,RVIS=RVIS,variantType='normal'):
    """Annotate the groomed variants of one set and write resources/<base>.annotated.txt.

    Reads resources/<base>.groomed.txt and merges on, in order: gnomAD allele frequencies,
    gnomAD constraint (by transcript, then by gene ID, then by gene name), CCR, pext, GERP,
    GDI and RVIS. Variants whose varType mentions 'splicing' are removed, single-base
    substitutions with ChangePos == 1 get the altered sequence "*", and the result is
    de-duplicated on chromosome/position/ref/alt.

    Parameters
    ----------
    base : str
        File name stem of the variant set, e.g. 'clinvar_benign'.
    constraint, gnomadAF, CCR, pext, gerp, GDI, RVIS : pandas.DataFrame
        Annotation tables; default to the module-level tables of the same names.
    variantType : str
        'dominant' keeps only variants with controls_AF equal to 0, 'recessive' keeps only
        variants with controls_nhomalt equal to 0; any other value applies no such filter.

    Returns
    -------
    None
    """
    import os
    import pandas
    chromosomeCodes=(('X',23),('Y',24),('MT',25))
    sample=pandas.read_csv('resources/' + base + '.groomed.txt',sep='\t',low_memory=False)
    for chromName,chromCode in chromosomeCodes:
        sample.loc[sample['hg19_chr']==chromName,'hg19_chr']=chromCode
    sample['hg19_chr']=sample['hg19_chr'].astype(int)

    # merge on the allele frequency data
    sample=sample.merge(gnomadAF,how='left',on=['hg19_chr','hg19_pos(1-based)','ref','alt'])

    # merge on the constraint data (try transcript ID merge first)
    sampleTranscript=sample.merge(constraint,how='inner',left_on=['TranscriptIDShort'],right_on=['transcript'])
    notMatched=sample.loc[~(sample['TranscriptIDShort'].isin(sampleTranscript['TranscriptIDShort'])),:]
    # gene-level constraint table: the working directory is tried first (where it was read from
    # originally); otherwise it is taken from the 'resources' folder like every other input file
    geneLevelPath='gnomad211_constraint_simple_geneLevel.txt'
    if not os.path.exists(geneLevelPath):
        geneLevelPath='resources/' + geneLevelPath
    constraint=pandas.read_csv(geneLevelPath,sep='\t',low_memory=False)
    sampleGeneID=notMatched.merge(constraint,how='inner',left_on=['geneIDShort'],right_on=['gene_id'])
    notMatched2=notMatched.loc[~(notMatched['geneIDShort'].isin(sampleGeneID['geneIDShort'])),:]
    # left merge: variants without any constraint match are kept, with empty constraint columns
    sampleGeneName=notMatched2.merge(constraint,how='left',left_on=['geneName'],right_on=['gene'])
    # stack them all back together
    sample2=pandas.concat([sampleTranscript,sampleGeneID,sampleGeneName],axis=0,ignore_index=True)
    for chromName,chromCode in chromosomeCodes:
        sample2.loc[sample2['hg19_chr']==chromName,'hg19_chr']=chromCode
    sample2['hg19_chr']=sample2['hg19_chr'].astype(int)

    # split into single-position variants and indels; indels are scored over their reference span
    sampleSNVs=sample2.loc[sample2['varType'].isin(['nonsynonymous SNV','synonymous SNV','stopgain','stoploss']),['hg19_chr','hg19_pos(1-based)']]
    sampleIndels=sample2.loc[sample2['varType'].isin(['frameshift insertion','frameshift deletion','frameshift substitution',
                                                    'nonframeshift insertion','nonframeshift deletion','nonframeshift substitution']),['hg19_chr','hg19_pos(1-based)','ref']].copy()
    sampleIndels['length']=sampleIndels['ref'].str.len()

    # merge on the CCR data
    _attachPositionalScore(sample2,sampleSNVs,sampleIndels,CCR,'chrom','ccr_pct','CCR')
    # merge on the pext data
    _attachPositionalScore(sample2,sampleSNVs,sampleIndels,pext,'chr','pext','pext')
    # merge on the GERP data
    _attachPositionalScore(sample2,sampleSNVs,sampleIndels,gerp,'chr','gerp','gerp')

    sample2=sample2.drop_duplicates(subset=['hg19_chr','hg19_pos(1-based)','ref','alt'],keep='first')
    sample2=sample2.drop(columns=['line','location','end','qual','depth','gene','transcript', 'canonical','gene_id'])
    sample2=sample2.sort_values(by=['hg19_chr','hg19_pos(1-based)','ref','alt']).reset_index(drop=True)

    # merge on GDI data
    sample2=sample2.merge(GDI,how='left',on='geneName')
    # merge on RVIS data
    sample2=sample2.merge(RVIS,how='left',on='geneName')

    # filtration steps that are only performed on the training and testing sets
    # remove variants within 2bp of exon-intron boundaries
    sample2=sample2.loc[~(sample2['varType'].str.contains('splicing')),:].reset_index(drop=True)

    # correct the sequence of proteins whose alt sequence doesn't start with an M to being non-translated
    # (all five conditions are combined with '&'; the last pair used to be joined with '*')
    isStartSubstitution=((sample2['ChangePos']==1) & (sample2['ref'].str.len()==1) & (sample2['alt'].str.len()==1)
                         & (sample2['ref']!='-') & (sample2['alt']!='-'))
    sample2.loc[isStartSubstitution,'AltSeq']="*"

    if variantType=='dominant':
        # get rid of any variants seen in gnomAD
        sample2['controls_AF']=sample2['controls_AF'].replace('.',0).fillna(0)
        sample2=sample2.loc[sample2['controls_AF']==0,:].reset_index(drop=True)
    elif variantType=='recessive':
        # get rid of any variants seen in the homozygous state in gnomAD
        sample2['controls_nhomalt']=sample2['controls_nhomalt'].replace('.',0).fillna(0)
        sample2=sample2.loc[sample2['controls_nhomalt']==0,:].reset_index(drop=True)

    sample2=sample2.sort_values(by=['hg19_chr','hg19_pos(1-based)','ref','alt']).reset_index(drop=True)
    sample2=sample2.drop_duplicates(subset=['hg19_chr','hg19_pos(1-based)','ref','alt'],keep='first').reset_index(drop=True)
    sample2.to_csv('resources/' + base + '.annotated.txt',sep='\t',index=False)
    return



In [None]:
# annotate the three variant sets; the benign set is run with the function's default variantType
annotationRuns=(
    ('clinvar_pathogenic_AR',{'variantType':'recessive'}),
    ('clinvar_pathogenic_AD',{'variantType':'dominant'}),
    ('clinvar_benign',{}),
)
for variantSet,options in annotationRuns:
    annotateVariantsAndFilter(variantSet,**options)


In [None]:
# remove duplicates and create validation set and training sets
variantKey=['hg19_chr','hg19_pos(1-based)','ref','alt']
# class labels: 2 = recessive pathogenic, 1 = dominant pathogenic, 0 = benign
AR=pandas.read_csv('resources/clinvar_pathogenic_AR.annotated.txt',sep='\t',low_memory=False).assign(classLabel=2)
AD=pandas.read_csv('resources/clinvar_pathogenic_AD.annotated.txt',sep='\t',low_memory=False).assign(classLabel=1)
benign=pandas.read_csv('resources/clinvar_benign.annotated.txt',sep='\t',low_memory=False).assign(classLabel=0)
allData=pandas.concat([AR,AD,benign],axis=0,ignore_index=True).sort_values(by=variantKey)
# keep=False: a variant that occurs more than once is removed entirely
allData=allData.drop_duplicates(subset=variantKey,keep=False).reset_index(drop=True)
outputColumns=['varType', 'hg19_chr', 'hg19_pos(1-based)', 'ref', 'alt', 'WildtypeSeq', 'AltSeq', 'ChangePos', 'TranscriptID', 'TranscriptIDShort',
       'geneName', 'geneID', 'geneIDShort', 'pLI', 'pNull', 'pRec', 'mis_z', 'lof_z', 'controls_AF', 'controls_nhomalt', 'CCR', 'pext', 'gerp', 'GDI', 'RVIS_ExAC_0.05', 'classLabel']
allData=allData.loc[:,outputColumns]

# hold out 1000 randomly drawn variants (fixed seed) for validation; everything else is training data
validationSet=allData.sample(n=1000,replace=False,random_state=1)
validationSet=validationSet.sort_values(by=variantKey,ascending=True)
trainingSet=allData.drop(validationSet.index)
trainingSet=trainingSet.sort_values(by=variantKey,ascending=True)
trainingSet.to_csv('resources/trainingSet.txt',index=False,sep='\t')
validationSet.to_csv('resources/validationSet.txt',index=False,sep='\t')


In [None]:
# number of training variants per class label
trainingSet['classLabel'].value_counts()

In [None]:
# number of validation variants per class label
validationSet['classLabel'].value_counts()

# Repeat with 2021 Clinvar variants

In [None]:
# note that clinvar updated their format between 1/2020 and 1/2021, so some of this is different from the above to compensate for that
import pandas
# one row per (geneMIM, phenoMIM, inheritance) combination; the first occurrence wins
inheritanceMap=(
    pandas.read_csv('resources/geneToPhenoToInheritanceMap_filtered.txt',sep='\t',low_memory=False)
    .drop_duplicates(subset=['geneMIM','phenoMIM','inheritance'],keep='first')
)
inheritanceMap

In [None]:
# load the 2021-01 ClinVar variant summary and tabulate the variant types it contains
clinvar=pandas.read_csv('resources/variant_summary_2021-01.txt',sep='\t',low_memory=False)
clinvar['Type'].value_counts()

In [None]:
# Restrict ClinVar to GRCh37 records of the variant types handled here, then split
# into a non-somatic benign set and a non-somatic pathogenic set with OMIM phenotype IDs.
variantTypesKept=["single nucleotide variant","Deletion","Duplication","Microsatellite","Indel","Insertion"]
benignSignificances=["Benign","Likely benign","Benign/Likely benign"]
pathogenicSignificances=["Pathogenic","Likely pathogenic","Pathogenic/Likely pathogenic"]

clinvar=clinvar.loc[clinvar['Assembly']=="GRCh37",:]
clinvar=clinvar.loc[clinvar['Type'].isin(variantTypesKept),:]
# benign has to be carved out before clinvar is narrowed to pathogenic records below
benign=clinvar.loc[clinvar['ClinicalSignificance'].isin(benignSignificances),:]
benign=benign.loc[benign['OriginSimple']!="somatic",:]
clinvar=clinvar.loc[clinvar['ClinicalSignificance'].isin(pathogenicSignificances),:]
clinvar=clinvar.loc[clinvar['OriginSimple']!="somatic",:]
# na=False: a missing PhenotypeIDS would otherwise put NaN in the mask, which .loc rejects;
# regex=False: "OMIM:" is a literal prefix, not a pattern
clinvar=clinvar.loc[clinvar['PhenotypeIDS'].str.contains("OMIM:",na=False,regex=False),:]

In [None]:
# get the relationships between clinvar allele IDs and omim phenotype ids
# split PhenotypeIDS on ';' and stack, giving one entry per (AlleleID, phenotype-id string)
phenotypeIdLists=clinvar['PhenotypeIDS'].str.split(';').tolist()
clinvarOmimPheno=pandas.DataFrame(phenotypeIdLists,index=clinvar['AlleleID'].values).stack()
hasOmimID=clinvarOmimPheno.str.contains("OMIM:")
clinvarOmimPheno=clinvarOmimPheno[hasOmimID]
# level 0 of the stacked index is the AlleleID supplied as the frame index above
clinvarAlleleIDs=clinvarOmimPheno.index.get_level_values(0)
# text between the first "OMIM:" and the next comma, as a stripped string
clinvarAlleleOmimIDs=(
    clinvarOmimPheno.str.split("OMIM:").str[1]
    .str.split(",").str[0]
    .str.strip()
)
clinvarAllelesToOmimIDs=pandas.DataFrame({
    'clinvarAlleleID':clinvarAlleleIDs,
    'phenoMIM':clinvarAlleleOmimIDs.values,
})

In [None]:
# select down to just the allele IDs for which we have at least one inheritance pattern in omim
# The ClinVar-side phenoMIM values are strings cut out of PhenotypeIDS, whereas inheritanceMap
# comes from read_csv, which may have inferred a numeric dtype for its phenoMIM column. pandas
# refuses to merge an object key against a numeric one, so compare the right-hand key as str
# (a no-op when it is already a string column).
inheritanceByPheno=inheritanceMap.assign(phenoMIM=inheritanceMap['phenoMIM'].astype(str))
clinvarAllelesWithInheritance=clinvarAllelesToOmimIDs.merge(inheritanceByPheno,how='inner',on='phenoMIM')
# select down to just the variants without conflicts in the inheritance:
# first collapse repeats of the same (allele, inheritance) pair, then drop every allele
# that still has more than one row, i.e. more than one distinct inheritance
clinvarAllelesWithInheritance2=clinvarAllelesWithInheritance.drop_duplicates(subset=['clinvarAlleleID','inheritance'],keep='first')
clinvarAllelesWithInheritance3=clinvarAllelesWithInheritance2.drop_duplicates(subset='clinvarAlleleID',keep=False)
# get the original data on the alleles with omim inheritance
clinvar2=clinvar.merge(clinvarAllelesWithInheritance3,how='inner',left_on=['AlleleID'],right_on=['clinvarAlleleID'])
clinvar2['ReviewStatus'].value_counts()

In [None]:
# discard records whose ReviewStatus is "no assertion criteria provided"
clinvar3=clinvar2[clinvar2['ReviewStatus']!="no assertion criteria provided"]
# this is the number of recessive variants we have now (but this includes coding and noncoding)
len(clinvar3[clinvar3['inheritance']=='Autosomal recessive'])

In [None]:
# this is the number of dominant variants we have now (but this includes coding and noncoding)
len(clinvar3[clinvar3['inheritance']=='Autosomal dominant'])

In [None]:
# Write the VCF-style coordinates of the recessive and dominant pathogenic variants,
# renamed to the Start / ReferenceAllele / AlternateAllele headers and sorted by location.
vcfColumns=['Chromosome','PositionVCF','ReferenceAlleleVCF','AlternateAlleleVCF']
vcfColumnRenames={'PositionVCF':'Start','ReferenceAlleleVCF':'ReferenceAllele','AlternateAlleleVCF':'AlternateAllele'}
locationKey=['Chromosome','Start','ReferenceAllele','AlternateAllele']

isRecessive=clinvar3['inheritance']=='Autosomal recessive'
isDominant=clinvar3['inheritance']=='Autosomal dominant'
clinvar3AR=clinvar3.loc[isRecessive,vcfColumns].rename(columns=vcfColumnRenames).sort_values(by=locationKey)
clinvar3AD=clinvar3.loc[isDominant,vcfColumns].rename(columns=vcfColumnRenames).sort_values(by=locationKey)
clinvar3AR.to_csv('resources/clinvar_2021_oneToFourStarPathogenicVariants_autosomalRecessive_locations.txt',sep='\t',index=False)
clinvar3AD.to_csv('resources/clinvar_2021_oneToFourStarPathogenicVariants_autosomalDominant_locations.txt',sep='\t',index=False)


In [None]:
# deal with clinvar benigns (no need to incorporate gnomAD again)
locationKey=['Chromosome','Start','ReferenceAllele','AlternateAllele']
benign=(
    # drop unreviewed assertions and keep only the VCF-style coordinate columns
    benign.loc[benign['ReviewStatus']!="no assertion criteria provided",
               ['Chromosome','PositionVCF','ReferenceAlleleVCF','AlternateAlleleVCF']]
    .rename(columns={'PositionVCF':'Start','ReferenceAlleleVCF':'ReferenceAllele','AlternateAlleleVCF':'AlternateAllele'})
    .sort_values(by=locationKey)
    # one row per distinct variant
    .drop_duplicates(subset=locationKey,keep='first')
    .reset_index(drop=True)
)
benign.to_csv('resources/clinvar_2021_oneToFourStarBenign_locations.txt',sep='\t',index=False)
len(benign) # this is the number of benign variants we have now

In [None]:
# remove the old variants
def _readUsableAutosomalLocations(path):
    """Load a tab-separated locations table and keep only its usable autosomal rows.

    Rows on chromosome X, Y or MT, rows whose Start is -1, and rows whose reference or
    alternate allele is the string 'na' are dropped. The index is reset and the
    Chromosome and Start columns are cast to int.
    """
    table=pandas.read_csv(path,sep='\t',low_memory=False)
    unusable=(
        table['Chromosome'].isin(['X','Y','MT'])
        | (table['Start']==-1)
        | (table['ReferenceAllele']=='na')
        | (table['AlternateAllele']=='na')
    )
    table=table.loc[~unusable,:].reset_index(drop=True)
    table['Chromosome']=table['Chromosome'].astype(int)
    table['Start']=table['Start'].astype(int)
    return table


def _splitOffUnseen(table,seen):
    """Left-merge table onto seen by location; return (merged, rows absent from seen)."""
    merged=table.merge(seen,how='left',on=['Chromosome','Start','ReferenceAllele','AlternateAllele'],indicator=True)
    unseen=merged.loc[merged['_merge']=='left_only',:].drop(columns=['_merge'])
    return merged,unseen


# every variant already written to the training or validation set, reduced to its location
# columns and renamed to match the ClinVar location files
trainingSet=pandas.read_csv('resources/trainingSet.txt',sep='\t',low_memory=False)
validationSet=pandas.read_csv('resources/validationSet.txt',sep='\t',low_memory=False)
trainingSet=(
    pandas.concat([trainingSet,validationSet],axis=0,ignore_index=True)
    .loc[:,['hg19_chr','hg19_pos(1-based)','ref','alt']]
    .rename(columns={'hg19_chr':'Chromosome','hg19_pos(1-based)':'Start','ref':'ReferenceAllele','alt':'AlternateAllele'})
)

benign=_readUsableAutosomalLocations('resources/clinvar_2021_oneToFourStarBenign_locations.txt')
benign2,benign3=_splitOffUnseen(benign,trainingSet)
benign3.to_csv('resources/clinvar_2021_oneToFourStarBenign_locations_new.txt',sep='\t',index=False)

AD=_readUsableAutosomalLocations('resources/clinvar_2021_oneToFourStarPathogenicVariants_autosomalDominant_locations.txt')
AD2,AD3=_splitOffUnseen(AD,trainingSet)
AD3.to_csv('resources/clinvar_2021_oneToFourStarPathogenicVariants_autosomalDominant_locations_new.txt',sep='\t',index=False)

AR=_readUsableAutosomalLocations('resources/clinvar_2021_oneToFourStarPathogenicVariants_autosomalRecessive_locations.txt')
AR2,AR3=_splitOffUnseen(AR,trainingSet)
AR3.to_csv('resources/clinvar_2021_oneToFourStarPathogenicVariants_autosomalRecessive_locations_new.txt',sep='\t',index=False)



In [None]:
%%bash
# turn the files into VCFs and process with Annovar
# (the %%bash magic has to be the first line of the cell, so this comment sits below it)
cd resources/
# normalise line endings of the 2021 location files written by the previous cells
dos2unix clinvar_2021_oneToFourStar*

# drop the header row, emit one minimal VCF record per variant, and prepend the shared VCF header
tail -n +2 clinvar_2021_oneToFourStarPathogenicVariants_autosomalRecessive_locations_new.txt | awk -F'\t' -v OFS='\t' '{print $1,$2,".",$3,$4,".","PASS",".","GT","0/1"}' | cat ../header.vcf - > clinvar_2021_pathogenic_AR.vcf
tail -n +2 clinvar_2021_oneToFourStarPathogenicVariants_autosomalDominant_locations_new.txt | awk -F'\t' -v OFS='\t' '{print $1,$2,".",$3,$4,".","PASS",".","GT","0/1"}' | cat ../header.vcf - > clinvar_2021_pathogenic_AD.vcf
tail -n +2 clinvar_2021_oneToFourStarBenign_locations_new.txt | awk -F'\t' -v OFS='\t' '{print $1,$2,".",$3,$4,".","PASS",".","GT","0/1"}' | cat ../header.vcf - > clinvar_2021_benign.vcf

# run the same three Annovar steps on every 2021 set; looping keeps the file names in step,
# so each annotate_variation.pl call reads the .avinput that convert2annovar.pl just wrote
for variantSet in pathogenic_AR pathogenic_AD benign; do
    base="clinvar_2021_${variantSet}"
    annovar/convert2annovar.pl -format vcf4 "${base}.vcf" > "${base}.avinput"
    annovar/annotate_variation.pl -dbtype wgEncodeGencodeBasicV33lift37 -buildver hg19 --exonicsplicing "${base}.avinput" annovar/humandb/
    annovar/coding_change.pl "${base}.avinput.exonic_variant_function" annovar/humandb/hg19_wgEncodeGencodeBasicV33lift37.txt annovar/humandb/hg19_wgEncodeGencodeBasicV33lift37Mrna.fa --includesnp --onlyAltering --alltranscript > "${base}.coding_changes.txt"
done
cd ..

In [None]:
# Run groomAnnovarOutput (defined earlier in the notebook) once per 2021 ClinVar set.
# NOTE(review): presumably this consumes the Annovar files produced by the bash cell
# above for the same base name — confirm against the function definition.
groomAnnovarOutput('clinvar_2021_pathogenic_AR')
groomAnnovarOutput('clinvar_2021_pathogenic_AD')
groomAnnovarOutput('clinvar_2021_benign')

In [None]:
# Annotate and filter the three 2021 ClinVar sets. The pathogenic sets pass their
# inheritance mode explicitly; the benign set is called without variantType.
for _base,_extraArgs in (
    ('clinvar_2021_pathogenic_AR',{'variantType':'recessive'}),
    ('clinvar_2021_pathogenic_AD',{'variantType':'dominant'}),
    ('clinvar_2021_benign',{}),
):
    annotateVariantsAndFilter(_base,**_extraArgs)


In [None]:
# create known and novel gene test sets
variantKey=['hg19_chr','hg19_pos(1-based)','ref','alt']
outputColumns=['varType', 'hg19_chr', 'hg19_pos(1-based)', 'ref', 'alt', 'WildtypeSeq', 'AltSeq', 'ChangePos', 'TranscriptID', 'TranscriptIDShort',
       'geneName', 'geneID', 'geneIDShort', 'pLI', 'pNull', 'pRec', 'mis_z', 'lof_z', 'controls_AF', 'controls_nhomalt', 'CCR', 'pext', 'gerp', 'GDI', 'RVIS_ExAC_0.05', 'classLabel']

# classLabel: 2 for the AR file, 1 for the AD file, 0 for the benign file
AR=pandas.read_csv('resources/clinvar_2021_pathogenic_AR.annotated.txt',sep='\t',low_memory=False).assign(classLabel=2)
AD=pandas.read_csv('resources/clinvar_2021_pathogenic_AD.annotated.txt',sep='\t',low_memory=False).assign(classLabel=1)
benign=pandas.read_csv('resources/clinvar_2021_benign.annotated.txt',sep='\t',low_memory=False).assign(classLabel=0)

allData=(
    pandas.concat([AR,AD,benign],axis=0,ignore_index=True)
    .sort_values(by=variantKey)
    # keep=False discards every copy of a repeated variant, not just the extras
    .drop_duplicates(subset=variantKey,keep=False)
    .reset_index(drop=True)
    .loc[:,outputColumns]
)

# split into known and novel gene sets
# a gene is "known" when the training or validation set holds a variant of it with classLabel>0
trainingSet=pandas.read_csv('resources/trainingSet.txt',sep='\t',low_memory=False)
validationSet=pandas.read_csv('resources/validationSet.txt',sep='\t',low_memory=False)
trainingSet=pandas.concat([trainingSet,validationSet],axis=0,ignore_index=True)
trainingSet=trainingSet[trainingSet['classLabel']>0].reset_index(drop=True)

geneSeenBefore=allData['geneName'].isin(trainingSet['geneName'].values)

known=allData.loc[geneSeenBefore,:].reset_index(drop=True).sort_values(by=variantKey,ascending=True)
known.to_csv('resources/knownGenes.txt',sep='\t',index=False)
novel=allData.loc[~geneSeenBefore,:].reset_index(drop=True).sort_values(by=variantKey,ascending=True)
novel.to_csv('resources/novelGenes.txt',sep='\t',index=False)


In [None]:
# number of known-gene test variants per classLabel
known['classLabel'].value_counts()

In [None]:
# number of novel-gene test variants per classLabel
novel['classLabel'].value_counts()