In [84]:
from pysam import VariantFile

In [85]:
# Open the child, father and mother call sets for reading, in that order.
childVCF, fatherVCF, motherVCF = map(
    VariantFile,
    (
        "HG002-NA24385-50x_filtered.vcf",
        "HG003.hs37d5.60x.1.converted_filtered.vcf",
        "HG004.hs37d5.60x.1.converted_filtered.vcf",
    ),
)

In [86]:
def findMaxChromPos(VCFs):
    """Return the largest record position in *VCFs* and its digit count.

    Args:
        VCFs: iterable of objects whose ``fetch()`` yields records exposing
            a numeric ``pos`` attribute.

    Returns:
        A ``(maxPos, digits)`` tuple: the highest ``pos`` seen across all
        files (0 when there are no records) and the number of characters in
        its decimal representation.
    """
    highest = 0
    for variantFile in VCFs:
        positions = (record.pos for record in variantFile.fetch())
        # default=0 keeps the running maximum untouched for an empty file.
        highest = max(highest, max(positions, default=0))
    digitCount = len(str(abs(highest)))
    return highest, digitCount

In [87]:
#VCFs = (childVCF, fatherVCF, motherVCF)
#count, digits = findMaxChromPos(VCFs)
#print("Maximum position among all chromosomes: " + str(count) + ", number of digits: " + str(digits))

In [88]:
# check if there's a variation which returns child to reference genome

In [89]:
def _carriedAlts(genotype, alts):
    """Return the distinct ALT alleles a genotype refers to, in index order.

    Allele index 0 (reference) and missing alleles (``None``) are ignored, as
    are indices that point past the end of *alts*.
    """
    if not genotype or not alts:
        return []
    indices = sorted({allele for allele in genotype if allele})
    return [alts[index - 1] for index in indices if index <= len(alts)]


def checkPosIntersection(childVariant, parentVariant):
    """Check whether child and parent share an ALT allele at one position.

    Both records are expected to describe the same position; only REF, ALT
    and the GT of the first sample of each record are inspected.

    The child and parent "share" the variant when they have the same REF and
    at least one ALT allele referenced by the child's genotype is also
    referenced by the parent's genotype. Genotypes are interpreted through
    their allele indices, so 0/1, 1/1 and 1/2 behave as before, while
    reordered (1/0), haploid, partially missing or hom-ref calls no longer
    fall into a branch that assumes a 1/2 genotype.

    Args:
        childVariant: child record; it is modified in place on a match.
        parentVariant: parent record; never modified.

    Returns:
        ``(True, childVariant)`` when an ALT allele is shared. The child
        record then carries GT ``(0, 1)`` and, if its genotype referenced
        several ALT alleles or the shared one was not the first listed, its
        ALT list is reduced to the shared allele. The first child allele (in
        index order) that the parent also carries wins.
        ``(False, None)`` otherwise.
    """
    if childVariant.ref != parentVariant.ref:
        return False, None

    childSample = next(childVariant.samples.itervalues())
    parentSample = next(parentVariant.samples.itervalues())

    childCarried = _carriedAlts(childSample['GT'], childVariant.alts)
    parentCarried = _carriedAlts(parentSample['GT'], parentVariant.alts)

    sharedAlt = next((alt for alt in childCarried if alt in parentCarried), None)
    if sharedAlt is None:
        return False, None

    # The intersection is always reported as a heterozygous call. The genotype
    # is rewritten before ALT is shrunk so it never points at a removed allele.
    hetGT = (0, 1)
    if childSample['GT'] != hetGT:
        childSample['GT'] = hetGT
    if len(childCarried) > 1 or childVariant.alts[0] != sharedAlt:
        childVariant.alts = (sharedAlt,)
    return True, childVariant

In [90]:
def findIntersection(childVCF, parentVCF, outputFileName):
    """Write the child variants that are shared with a parent to a new VCF.

    The two files are walked in parallel as a merge-join on (chromosome,
    position). Every pair of records found at the same position is handed to
    ``checkPosIntersection``; when it reports a match, the (possibly
    modified) child record is written to the output file.

    Assumptions: both files are sorted by position within each chromosome
    and list their chromosomes in the same order.

    Processing stops at the first record of either file whose chromosome is
    "chrX", so that chromosome and everything after it is left out.
    NOTE(review): confirm the inputs really name the contig "chrX" (rather
    than e.g. "X"); otherwise this stop condition never triggers.

    Args:
        childVCF: opened child variant file; its header is reused for the
            output file.
        parentVCF: opened parent variant file.
        outputFileName: path of the VCF to create.

    Returns:
        None. The output file is closed on every exit path.
    """
    # TODO: check the filename extension of outputFileName.
    outputVCF = VariantFile(outputFileName, 'w', header=childVCF.header)
    try:
        childRecords = childVCF.fetch()
        parentRecords = parentVCF.fetch()
        childVariant = next(childRecords, None)
        parentVariant = next(parentRecords, None)
        # Chromosomes the child stream has visited so far. Used to decide
        # which stream is lagging when the two sit on different chromosomes.
        childSeen = set()

        while childVariant is not None and parentVariant is not None:
            # sex chromosomes are skipped
            if childVariant.chrom == "chrX" or parentVariant.chrom == "chrX":
                break
            childSeen.add(childVariant.chrom)

            if childVariant.chrom == parentVariant.chrom:
                if childVariant.pos == parentVariant.pos:
                    # child and parent variant positions are matched
                    condition, variant = checkPosIntersection(childVariant, parentVariant)
                    if condition:
                        outputVCF.write(variant)
                    childVariant = next(childRecords, None)
                    parentVariant = next(parentRecords, None)
                elif childVariant.pos < parentVariant.pos:
                    childVariant = next(childRecords, None)
                else:
                    parentVariant = next(parentRecords, None)
            elif parentVariant.chrom in childSeen:
                # The child has already left the parent's chromosome, so the
                # parent is the one lagging behind. Positions are not
                # comparable across chromosomes and are deliberately ignored.
                parentVariant = next(parentRecords, None)
            else:
                # The child has not reached the parent's chromosome yet.
                childVariant = next(childRecords, None)
    finally:
        outputVCF.close()

In [91]:
# Intersect the child's variants with the mother's and write them to outputCM.vcf.
findIntersection(
    childVCF=childVCF,
    parentVCF=motherVCF,
    outputFileName="outputCM.vcf",
)