# Looking at crossover events across all strains.

In [1]:
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

# Loading in necessary functions.

In [2]:
#importing necessary modules
import vcfparser
import sys
from operator import itemgetter
import matplotlib.pyplot as plt
import ast
from joblib import Parallel, delayed
import random
import numpy as np
from sklearn.metrics import jaccard_similarity_score

# Loading necessary parameters.
# Directory holding the phased VCF files (relative to this notebook).
vcfroot = "../../vcf/GATK/"

# One phased VCF per strain; the order matches strainIDs below.
vcfFileNames = [
    "1335full_phased.vcf",
    "1007_phased.vcf",
    "1012_phased.vcf",
    "1013_phased.vcf",
    "1014_phased.vcf",
    "1015_phased.vcf",
    "IT_phased.vcf",
]
strainIDs = [
    "1335",
    "1007",
    "1012",
    "1013",
    "1014",
    "1015",
    "3367",
]

# Parse every VCF once, in the same order as vcfFileNames.
vcfObjects = [vcfparser.VCF(vcfroot + vcf_name) for vcf_name in vcfFileNames]

#SNPs: 166939
#INDELs: 8634
#VARs: 175573
#HAPBLOCKs: 27724
AVG SNP DEPTH: 78.9326377063
TOTAL LENGTH COVERED: 22884028 bps

#SNPs: 127221
#INDELs: 6114
#VARs: 133335
#HAPBLOCKs: 17129
AVG SNP DEPTH: 31.2347845652
TOTAL LENGTH COVERED: 21074161 bps

#SNPs: 133119
#INDELs: 7099
#VARs: 140218
#HAPBLOCKs: 12206
AVG SNP DEPTH: 55.4267283801
TOTAL LENGTH COVERED: 22834488 bps

#SNPs: 225281
#INDELs: 11471
#VARs: 236752
#HAPBLOCKs: 19570
AVG SNP DEPTH: 49.9257197405
TOTAL LENGTH COVERED: 27558517 bps

#SNPs: 71266
#INDELs: 2285
#VARs: 73551
#HAPBLOCKs: 11309
AVG SNP DEPTH: 26.0904950307
TOTAL LENGTH COVERED: 13439146 bps

#SNPs: 137099
#INDELs: 7461
#VARs: 144560
#HAPBLOCKs: 12428
AVG SNP DEPTH: 50.858065855
TOTAL LENGTH COVERED: 23842001 bps

#SNPs: 266584
#INDELs: 13112
#VARs: 279696
#HAPBLOCKs: 23241
AVG SNP DEPTH: 46.7624277787
TOTAL LENGTH COVERED: 25769954 bps



In [3]:
def loaddict(filename):
    """Read a file containing a Python literal/expression and return its value.

    Parameters:
        filename: path of the text file to read.

    Returns:
        Whatever object the file's text evaluates to (the callers in this
        notebook treat it as a dict).

    Raises:
        IOError/OSError if the file cannot be opened; any exception raised
        while evaluating the file's text.
    """
    # Use a context manager so the handle is closed deterministically instead
    # of being left to the garbage collector.
    with open(filename, 'r') as handle:
        s = handle.read()
    # WARNING(security): eval() executes arbitrary code found in the file.
    # Only use this on files produced by this pipeline. If the files contain
    # nothing but plain literals, ast.literal_eval would be a safer drop-in
    # -- TODO confirm the on-disk format before switching.
    return eval(s)

In [4]:
#Compares two pairs of variants from two strains
#and detects if there is a sign of crossover.
#List of potential outputs

#-2: fatal error happened
#-1: point mutation detected
#0: no crossover detected
#1: crossover detected

def _phasedAlleles(variant):
    """Return (haplotype-1 allele, haplotype-2 allele) for one phased variant.

    The variant's format["HP"] entry decides which of ref/alt is placed on
    haplotype 1. Returns None when neither ref nor alt is marked as 1.
    """
    phase = variant.format["HP"]
    if phase.ref == 1:
        return variant.ref, variant.alt
    if phase.alt == 1:
        return variant.alt, variant.ref
    return None


def isCrossOver(v0a,v1a,v0b,v1b,verbose=False):
    """Compare a pair of variants between strain A and strain B for crossover.

    Parameters:
        v0a, v1a: first and second variant of the pair in strain A.
        v0b, v1b: the same two positions in strain B.
        verbose:  when True, print why a pair was rejected / flagged.

    Each variant must expose .ref, .alt and .format["HP"] with .ref/.alt
    attributes (presumably vcfparser variant records -- confirm against
    vcfparser).

    Returns:
        -2: phasing information unusable (neither ref nor alt on haplotype 1)
        -1: the alleles differ between the strains (point mutation)
         0: no crossover detected
         1: crossover detected
    """
    # NOTE: despite its name, isMutation() returns True when the two strains
    # carry the SAME allele pair, so "not isMutation(...)" means a mutation.
    #check for the first position.
    if not isMutation(v0a.ref, v0a.alt, v0b.ref, v0b.alt):
        if verbose: print("mutation1")
        return -1

    #check for the second position.
    if not isMutation(v1a.ref, v1a.alt, v1b.ref, v1b.alt):
        if verbose: print("mutation2")
        return -1

    # Resolve the phase of all four variants, in the original checking order
    # (strain A first, then strain B), bailing out on the first unusable one.
    phased = []
    for variant in (v0a, v1a, v0b, v1b):
        alleles = _phasedAlleles(variant)
        if alleles is None:
            print("This should never happen")
            return -2
        phased.append(alleles)

    #a1 and a2 both represent haplotypes for strain A.
    a1 = [phased[0][0], phased[1][0]]
    a2 = [phased[0][1], phased[1][1]]
    #b1 and b2 both represent haplotypes for strain B.
    b1 = [phased[2][0], phased[3][0]]
    b2 = [phased[2][1], phased[3][1]]

    #now check if there is a crossover: strain A's first haplotype matches
    #neither of strain B's haplotypes.
    if a1 != b1 and a1 != b2:
        if verbose:
            # Single-argument print(...) emits the same bytes under the
            # Python 2 print statement and the Python 3 print function.
            print("")
            print("%s %s" % (a1, b1))
            print("%s %s" % (a2, b2))
        return 1
    return 0

#This function checks if a point mutation has happened in this pair;
#the pair will be disregarded if a point mutation is spotted.
#NOTE: it returns True when the alleles match (i.e. NO mutation) and False otherwise.
def isMutation(x1, x2, y1, y2):
    """Tell whether allele pair (x1, x2) equals pair (y1, y2) in either order.

    NOTE: the name is misleading -- the result is True when the two pairs
    agree (no point mutation) and False when they differ (mutation).
    """
    # Try the straight pairing first, then the swapped one.
    for first, second in ((y1, y2), (y2, y1)):
        if x1 == first and x2 == second:
            return True
    return False

In [21]:
def pairsFromDict(dic):
    """Flatten a {key: iterable} mapping into a set of (key, item) tuples.

    Only keys whose first three characters are "Chr" are kept; every other
    key is ignored.
    """
    return set(
        (chrom, pair)
        for chrom in dic.keys()
        if chrom[:3] == "Chr"
        for pair in dic[chrom]
    )

# Functions for visualization.

In [18]:
#get the common MISP pairs.

In [23]:
# Seed with the pairs found when comparing the first strain against the second ...
common_pairs = pairsFromDict(
    loaddict(vcfFileNames[0] + "vs" + vcfFileNames[1] + ".MISPpairs")
)
# ... then keep only the pairs that also show up against every remaining strain.
for i in range(2, len(vcfFileNames)):
    common_pairs &= pairsFromDict(
        loaddict(vcfFileNames[0] + "vs" + vcfFileNames[i] + ".MISPpairs")
    )

In [24]:
# Show the pairs shared by every strain comparison.
print(common_pairs)

set([('Chr3', (1450867, 1450886)), ('Chr12', (1110751, 1110752)), ('Chr1', (584858, 584865)), ('Chr4', (1498430, 1498434)), ('Chr5', (1087088, 1087107)), ('Chr15', (75777, 75800)), ('Chr5', (317174, 317186)), ('Chr12', (145016, 145017)), ('Chr1', (1156976, 1156979)), ('Chr5', (2109945, 2109946)), ('Chr1', (1376348, 1376349)), ('Chr5', (2060140, 2060141)), ('Chr20', (326505, 326510)), ('Chr2', (2170311, 2170312)), ('Chr9', (934967, 934968)), ('Chr5', (1062733, 1062744)), ('Chr1', (1206953, 1206959)), ('Chr3', (1805746, 1805766)), ('Chr14', (618751, 618752)), ('Chr5', (1087107, 1087119)), ('Chr11a', (6241, 6242)), ('Chr22', (153098, 153099)), ('Chr24', (183952, 183953)), ('Chr1', (1055425, 1055436)), ('Chr3', (913997, 914015)), ('Chr3', (2422352, 2422355)), ('Chr8', (449077, 449078)), ('Chr9', (192481, 192482)), ('Chr5', (1353548, 1353549)), ('Chr1', (2422435, 2422449)), ('Chr7', (1774380, 1774381))])
