In [1]:
import os

from pysam import VariantFile

In [2]:
# Open the three input VCFs; the paths are listed in child, father, mother order.
childVCF, fatherVCF, motherVCF = [
    VariantFile(vcfPath)
    for vcfPath in (
        "HG002-NA24385-50x_filtered.vcf",
        "HG003.hs37d5.60x.1.converted_filtered.vcf",
        "HG004.hs37d5.60x.1.converted_filtered.vcf",
    )
]

In [3]:
def findMaxChromPos(VCFs):
    """Find the largest record position across several VCFs.

    Parameters
    ----------
    VCFs : iterable
        Objects whose ``fetch()`` yields records exposing a ``pos`` attribute.

    Returns
    -------
    tuple
        ``(largest, digits)`` where ``largest`` is the biggest ``pos`` seen
        (never below 0, which is also the result when there are no records)
        and ``digits`` is the number of characters in its decimal form.
    """
    positions = (record.pos for source in VCFs for record in source.fetch())
    # The running maximum starts at 0, so the result is floored there.
    largest = max(0, max(positions, default=0))
    return largest, len(str(largest))

In [4]:
#VCFs = (childVCF, fatherVCF, motherVCF)
#count, digits = findMaxChromPos(VCFs)
#print("Maximum position among all chromosomes: " + str(count) + ", number of digits: " + str(digits))

In [5]:
# TODO: check whether there is a variant that returns the child to the reference genome

In [6]:
def checkPosIntersection(childVariant, parentVariant):
    """Decide whether a child record shares an ALT allele with a parent record.

    The caller is expected to pass two records for the same chromosome and
    position; this function only compares REF, ALT and the ``GT`` of the first
    sample of each record. Genotypes 0/1 and 1/1 are handled explicitly; every
    other genotype tuple is treated like 1/2 (two ALT alleles).

    Parameters
    ----------
    childVariant, parentVariant
        Records exposing ``ref``, ``alts`` and ``samples.itervalues()``.

    Returns
    -------
    tuple
        ``(True, childVariant)`` when an ALT allele is shared. The child record
        is edited in place where the shared part is smaller than the child's
        call: its ``GT`` becomes ``(0, 1)`` and, when only one of two ALT
        alleles is shared, its ``alts`` is reduced to the shared allele.
        ``(False, None)`` when REF differs, nothing is shared, or either
        record has no ALT allele.
    """
    if childVariant.ref != parentVariant.ref:
        return False, None

    childSample = next(childVariant.samples.itervalues())
    parentSample = next(parentVariant.samples.itervalues())
    childGT = childSample['GT']
    parentGT = parentSample['GT']
    gt01 = (0, 1)
    gt11 = (1, 1)

    # Only the first two ALT alleles are ever consulted. Normalising to a tuple
    # of at most two entries lets the comparisons below use membership tests
    # instead of alts[1], which raised IndexError for single-ALT records whose
    # genotype was not exactly 0/1 or 1/1.
    childALT = tuple(childVariant.alts or ())[:2]
    parentALT = tuple(parentVariant.alts or ())[:2]
    if not childALT or not parentALT:
        # A record without ALT alleles cannot share one.
        return False, None

    if parentGT == gt01 or parentGT == gt11:
        # The parent carries a single ALT allele.
        parentAllele = parentALT[0]
        if childGT == gt01:
            # child 0/1: same ALT -> keep the child record as it is
            if childALT[0] == parentAllele:
                return True, childVariant
        elif childGT == gt11:
            if childALT[0] == parentAllele:
                # A 0/1 parent can contribute only one copy, so the shared
                # part of a 1/1 child is 0/1; a 1/1 parent leaves it at 1/1.
                if parentGT == gt01:
                    childSample['GT'] = gt01
                return True, childVariant
        elif parentAllele in childALT:
            # child treated as 1/2: keep only the allele shared with the parent.
            # GT is lowered before alts shrinks so it never references a
            # removed allele.
            childSample['GT'] = gt01
            childVariant.alts = parentVariant.alts
            return True, childVariant
        return False, None

    # The parent is treated as 1/2 from here on.
    if childGT == gt01 or childGT == gt11:
        # child 0/1 or 1/1: shared if its ALT is one of the parent's two
        if childALT[0] in parentALT:
            childSample['GT'] = gt01
            return True, childVariant
        return False, None

    # Both are treated as 1/2.
    bothShared = (
        len(childALT) == 2
        and len(parentALT) == 2
        and (childALT == parentALT or childALT == parentALT[::-1])
    )
    if bothShared:
        # Same pair of ALT alleles in either order: the child record is unchanged.
        return True, childVariant
    for allele in childALT:
        # Only one allele is shared; the child's first ALT takes precedence.
        if allele in parentALT:
            childSample['GT'] = gt01
            childVariant.alts = (allele,)
            return True, childVariant

    return False, None

In [7]:
def findIntersection(childVCF, parentVCF, outputFileName):
    """Write the child variants that are shared with one parent.

    Both inputs are walked in parallel. Whenever the two current records have
    the same chromosome and position they are handed to
    ``checkPosIntersection``, and the record it returns is written to
    ``outputFileName`` (created with the child's header). Processing stops at
    the first ``chrX`` record met in either input, so sex chromosomes and
    anything after them are skipped.

    Parameters
    ----------
    childVCF, parentVCF
        Open variant files to read; each is iterated via ``fetch()``.
    outputFileName
        Path of the VCF to write.

    Notes
    -----
    Assumes both inputs are sorted by chromosome in the order their headers
    declare the contigs, then by position -- TODO confirm for new inputs.
    Chromosomes missing from both headers are ranked in the order they are
    first encountered. After a match both inputs advance by one record.
    """
    # Chromosome order: the child's header first, then any extra parent contigs.
    # Ordering by (contig rank, pos) replaces the original comparison of raw
    # positions across different chromosomes, which could advance the wrong
    # input and looped forever when two different chromosomes had equal positions.
    contigRank = {}
    for vcf in (childVCF, parentVCF):
        for contigName in vcf.header.contigs:
            contigRank.setdefault(contigName, len(contigRank))

    def sortKey(variant):
        return contigRank.setdefault(variant.chrom, len(contigRank)), variant.pos

    # TODO: check filename extension
    outputVCF = VariantFile(outputFileName, 'w', header=childVCF.header)
    try:
        childRecords = childVCF.fetch()
        parentRecords = parentVCF.fetch()
        childVariant = next(childRecords, None)
        parentVariant = next(parentRecords, None)

        while childVariant is not None and parentVariant is not None:
            # sex chromosomes are skipped
            if childVariant.chrom == "chrX" or parentVariant.chrom == "chrX":
                break

            childKey = sortKey(childVariant)
            parentKey = sortKey(parentVariant)
            if childKey < parentKey:
                childVariant = next(childRecords, None)
            elif parentKey < childKey:
                parentVariant = next(parentRecords, None)
            else:
                # child variant position and parent variant position are matched
                condition, variant = checkPosIntersection(childVariant, parentVariant)
                if condition:
                    outputVCF.write(variant)
                childVariant = next(childRecords, None)
                parentVariant = next(parentRecords, None)
    finally:
        # Close on every exit path; the original returned early (chrX or end of
        # input) without closing the output file.
        outputVCF.close()

In [8]:
#findIntersection(childVCF, motherVCF, "outputCM.vcf")
#findIntersection(childVCF, fatherVCF, "outputCF.vcf")

In [9]:
def correctIntersectionToUnion(variant):
    """Reduce a record's first-sample genotype to 0/1, in place.

    A ``(1, 1)`` genotype becomes ``(0, 1)``. A ``(1, 2)`` genotype becomes
    ``(0, 1)`` and ``alts`` is cut down to its first allele. Any other
    genotype leaves the record untouched.

    Returns the same ``variant`` object that was passed in.
    """
    sample = next(variant.samples.itervalues())
    genotype = sample['GT']
    if genotype not in ((1, 1), (1, 2)):
        return variant

    # GT is lowered first so it no longer references the second allele when
    # alts is shortened.
    sample['GT'] = (0, 1)
    if genotype == (1, 2):
        variant.alts = (variant.alts[0],)
    return variant

In [12]:
def findUnion(VCF1, VCF2, outputFileName):
    """Merge two VCFs into one file containing the records of both.

    The inputs are walked in parallel, ordered by chromosome and position.

    * A record present in only one input is passed through
      ``correctIntersectionToUnion`` and written.
    * Records at the same chromosome and position are combined by the
      first-sample genotypes ``gt1`` / ``gt2``:

      - 0/1 and 0/1 with equal ``alts``: written as 1/1
      - 0/1 and 0/1 with different ``alts``: written as 1/2 carrying both
        first ALT alleles
      - 1/1 with 1/1, 1/2 with 1/2, or ``gt2`` 0/1 with ``gt1`` 1/1 or 1/2:
        the ``VCF1`` record is written unchanged
      - ``gt1`` 0/1 with ``gt2`` 1/1 or 1/2: the ``VCF2`` record is written
      - anything else: an ``error`` line is printed and nothing is written

    A ``union on <chrom>`` line is printed the first time a matched position
    is seen on each chromosome.

    Parameters
    ----------
    VCF1, VCF2
        Open variant files to read; each is iterated via ``fetch()``.
    outputFileName
        Path of the VCF to write; it is created with ``VCF1``'s header.

    Notes
    -----
    Assumes both inputs are sorted by chromosome in the order their headers
    declare the contigs, then by position -- TODO confirm for new inputs.
    Chromosomes missing from both headers are ranked in the order they are
    first encountered.

    NOTE(review): two 0/1 calls with the same ALT are always promoted to 1/1;
    confirm this is the intended meaning of the union for every input pair.
    """
    # Chromosome order: VCF1's header first, then any extra VCF2 contigs.
    # Ordering by (contig rank, pos) replaces the original comparison of raw
    # positions across different chromosomes, which could advance the wrong
    # input and looped forever when two different chromosomes had equal positions.
    contigRank = {}
    for vcf in (VCF1, VCF2):
        for contigName in vcf.header.contigs:
            contigRank.setdefault(contigName, len(contigRank))

    def sortKey(variant):
        return contigRank.setdefault(variant.chrom, len(contigRank)), variant.pos

    chrom = ""
    # TODO: check filename extension
    outputVCF = VariantFile(outputFileName, 'w', header=VCF1.header)
    try:
        records1 = VCF1.fetch()
        records2 = VCF2.fetch()
        variant1 = next(records1, None)
        variant2 = next(records2, None)

        # Keep going until BOTH inputs are exhausted. The original returned as
        # soon as one ran out, dropping the pending record and the whole tail
        # of the other input from the union.
        while variant1 is not None or variant2 is not None:
            key1 = sortKey(variant1) if variant1 is not None else None
            key2 = sortKey(variant2) if variant2 is not None else None

            if key2 is None or (key1 is not None and key1 < key2):
                # variant2 on that position is gt 0/0
                outputVCF.write(correctIntersectionToUnion(variant1))
                variant1 = next(records1, None)
                continue
            if key1 is None or key2 < key1:
                # variant1 on that position is gt 0/0
                outputVCF.write(correctIntersectionToUnion(variant2))
                variant2 = next(records2, None)
                continue

            # variant positions are matched
            sample1 = next(variant1.samples.itervalues())
            gt1 = sample1['GT']
            gt2 = next(variant2.samples.itervalues())['GT']

            if gt1 == (0, 1) and gt2 == (0, 1):
                if variant1.alts == variant2.alts:
                    sample1['GT'] = (1, 1)
                else:
                    # alts must grow before GT may reference a second allele
                    variant1.alts = (variant1.alts[0], variant2.alts[0])
                    sample1['GT'] = (1, 2)
                outputVCF.write(variant1)
            elif (gt1 == (1, 1) and gt2 == (1, 1)) or (gt1 == (1, 2) and gt2 == (1, 2)) \
                    or (gt2 == (0, 1) and (gt1 == (1, 1) or gt1 == (1, 2))):
                outputVCF.write(variant1)
            elif gt1 == (0, 1) and (gt2 == (1, 1) or gt2 == (1, 2)):
                outputVCF.write(variant2)
            else:
                print("error "+str(gt1)+" "+str(gt2))

            if chrom != variant1.chrom:
                chrom = variant1.chrom
                print("union on "+chrom)

            variant1 = next(records1, None)
            variant2 = next(records2, None)
    finally:
        # Close on every exit path so the output is always flushed.
        outputVCF.close()

In [13]:
# Build the child/parent intersections only when they are not on disk yet, so
# this cell also works on a fresh kernel without re-scanning the inputs on
# every run. Delete the output files to force a rebuild.
if not os.path.exists("outputCM.vcf"):
    findIntersection(childVCF, motherVCF, "outputCM.vcf")
if not os.path.exists("outputCF.vcf"):
    findIntersection(childVCF, fatherVCF, "outputCF.vcf")
outputCM = VariantFile("outputCM.vcf")
outputCF = VariantFile("outputCF.vcf")
findUnion(outputCM, outputCF, "outputUnion.vcf")

union on chr1
union on chr2
union on chr3
union on chr4
union on chr5
union on chr6
union on chr7
union on chr8
union on chr9
union on chr10
union on chr11
union on chr12
union on chr13
union on chr14
union on chr15
union on chr16
union on chr17
union on chr18
union on chr19
union on chr20
union on chr21
union on chr22
