In [0]:
import pandas as pd
import numpy as np
import vcf
import time
import re
from scipy.stats.binom import pmf

In [0]:
class Pileup_line:
    def __init__(self, line): 
        split_line = line.split("\t")
        self.sequence = split_line[0]
        self.position = split_line[1]
        self.ref_base = split_line[2]
        self.read_count = split_line[3]
        self.read_results = split_line[4]
        self.base_quality = split_line[5]
        self.mapping_quality = split_line[6]
        
    # Pomocna funkcija za racunanje kvaliteta baze iz odgovarajuceg ascii karaktera
    # Phred quality score: https://en.wikipedia.org/wiki/Phred_quality_score
    def calc_base_quality(ascii_char):
        return 1-10**(-ord(ascii_char)/10)
    
    # Ista funkcija kao gore, samo sto se prosledi redni broj baze/read-a
    def calc_base_quality(self, base_num):
        # base_num < self.read_count !!!!
        return 1-10**(-ord(self.base_quality[base_num])/10)
    
    # Pomocna funkcija za racunanje broja pojavljivanja karaktera u stringu
    # Ovo sam mislila za prebrojavanje npr baza koje se poklapaju sa referentnom tj. broj . u read_results
    def num_of_occurence(self, char):
        return self.read_results.count(char)
    
    def count(self):
        #treba resiti pocetak i kraj stringa tj ^neki_znak i neki_znak$ ignorisati
        
        #sa ovim ispod izvrsavanje koda traje 28sekundi
        #posle ^ moze da ide bilo koji karakter, ukljucujuci i one bitne pa sa ovom linijom to brisem da ne bismo pogresno racunale
        #self.read_results = re.sub(r"\^.",'',self.read_results) 

        
        num_C = self.read_results.count('g') + self.read_results.count('C')
        num_G = self.read_results.count('c') + self.read_results.count('G')
        num_A = self.read_results.count('t') + self.read_results.count('A')
        num_T = self.read_results.count('a') + self.read_results.count('T')
        num_match = self.read_results.count('.') + self.read_results.count(',')
        num_deletions = self.read_results.count('*') + self.read_results.count('-')
        num_insertions = self.read_results.count('+')
        
        if self.ref_base == 'C':
            num_C += num_match
        elif self.ref_base == 'G':
            num_G += num_match
        elif self.ref_base == 'A':
            num_A += num_match
        elif self.ref_base == 'T':
            num_T += num_match
        

    def determine_genotype(Variation variation1, Variation variation2):
		    k1 = variation1.number_of_supporting_reads
		    k2 = variation2.number_of_supporting_reads
		    n = k1+k2
		    p = 0.5 # ovaj broj moze da se modifikuje i da predstavlja kvalitet baza
		    P_k1k1 = pmf(k1,n,p)
		    P_k1k2 = pmf(k2,n,p)
		    P_k2k2 = pmf(n,n,p)
		    # napomena: brojevi genotipa 1, 2 .. zavise od toga kojim se redosledom pojavljuju u ALT polju u vcf fajlu
		    # ali cu ja ovde da radim ovako posto mi nista drugacije ne pada na pamet:
		    # na osnovu onoga sto dobijem ovde za genotip cu da formiram ALT polje
		    # 1) ako je genotip 0/0 nemamo alt polje uopste
		    # 2) ako je genotip 1/1, ili 0/1 imam samo jedan slucaj u ALT polju
		    # 3) ako je genotip 1/2 imacu dva slucaja u ALT polju
		    # 4) pretpostavljam da necemo imati vise od 3 slucaja u alt polju
		    genotype_list = []
		    if P_k1k1 > P_k1k2 and P_k1k1 > P_k2k2: # genotype = 'k1/k1'
			      if variation1.variation_type == 'REF':
				        genotype = '0/0'
			      else:
				        genotype = '1/1'
				        genotype_list.append(variation1)
		    elif P_k1k2 > P_k1k1 and P_k1k2 > P_k2k2: # genotype = 'k1/k2'
			      if variation1.variation_type == 'REF' or variation2.variation_type == 'REF':
				        genotype = '0/1'
				        if variation1.variation_type == 'REF':
					          genotype_list.append(variation1)
				        else:
					          genotype_list.append(variation2)
			      else:
				        genotype = '1/2'
				        genotype_list.append(variation1)
				        genotype_list.append(variation2)
		    else: # genotype = 'k2/k2'
			      if variation2.variation_type == 'REF':
				        genotype = '0/0'
			      else:
				        genotype = '1/1'
				        genotype_list.append(variation2)
		
		    return genotype, genotype_list



In [0]:
class Variation: # ili tako nesto
    # variation_type = {'SNV', 'insertion', 'deletion', 'inversion ?'}
    # variation_sequence = string od velikih slova, npr 'A' za SNV, 'AG' za inserciju/deleciju
    # quality ??
    
    # prosta metoda koja ce da konvertuje string iz pileupa: ,+1ga --> GA

In [0]:
with open('/sbgenomics/project-files/merged-normal_pileup.pileup', 'r') as pileup_file:
    start = time.time()
    for line in iter(pileup_file.readline, ''):
        position = pileup_file.tell() # uzimamo poziciju dokle smo stigli
        pileup = Pileup_line(line)
		    variation1, variation2 = pileup.determine_variation()
        genotype, genotype_list = pileup.determine_genotype(variation1, variation2)
        #inicijalizacija polja ref i alt u zavisnosti od tipa varijacije
        if genotype == '0/0':
			      continue
		    elif genotype == '1/1' or genotype == '0/1': # proveri jel / ili \
			      variation = genotype_list[0] # u ovim slucajevima imamo samo jedan element u genotype_list.. ovako je samo lakse dalje
            if variation.variation_type == 'insertion':
                REF = pileup.ref_base
                ALT = pileup.ref_base + variation.variation_sequence
            elif variation_type == 'deletion':
                REF = pileup.ref_base + variation.variation_sequence
                ALT = pileup.ref_base
            else: # SNV
                REF = pileup.ref_base
                ALT = variation.variation_sequence
        
            if variation.variation_type == 'insertion' or variation.variation_type == 'deletion':
                condition = True
                while(condition):
                    for i in range(len(variation.variation_sequence)):
                        line = pileup_file.readline()
                        pileup = Pileup_line(line)
                        if variation.variation_sequence[i] == pileup.ref_base:
                            ALT += ref_base
                            REF += ref_base
                        else:
                            condition = false
                            pileup_file.seek(position)
                            break
		    else: # genotype == '1/2' # OVAJ ELSE NIJE ZAVRSEN!!!!
			      for i in range(2):
				        variation = genotype_list[i] # u ovim slucajevima imamo samo jedan element u genotype_list.. ovako je samo lakse dalje
            	  if variation.variation_type == 'insertion':
                	  REF = pileup.ref_base
                	  ALT = pileup.ref_base + variation.variation_sequence
            	  elif variation_type == 'deletion':
                  	REF = pileup.ref_base + variation.variation_sequence
                  	ALT = pileup.ref_base
            	  else: # SNV
                  	REF = pileup.ref_base
                  	ALT = variation.variation_sequence
        
              	if variation.variation_type == 'insertion' or variation.variation_type == 'deletion':
                  	condition = True
                  	while(condition):
                      	for i in range(len(variation.variation_sequence)):
                      	    line = pileup_file.readline()
                    	      pileup = Pileup_line(line)
                    	      if variation.variation_sequence[i] == pileup.ref_base:
                    	          ALT += ref_base
                    	          REF += ref_base
                      	    else:
                      	        condition = false
                      	        pileup_file.seek(position)
                      	        break


        
    end = time.time()
    print(end-start)

28.33063554763794
