<a href="https://colab.research.google.com/github/adrijanailic/GI_projekat/blob/master/skripta_no2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import pandas as pd
import numpy as np
import vcf
import time
import re
import collections
from scipy.stats import binom

In [0]:
class Variation:
    """A single allele observed at one pileup position.

    Attributes:
        variation_type: One of 'REF', 'SNV', 'insertion' or 'deletion'.
        variation_sequence: '.' for REF, the observed base for an SNV
            (e.g. 'A'), or the inserted/deleted bases for an indel
            (e.g. 'AGC').
        number_of_supporting_reads: Number of reads that show this allele.
        VAF: number_of_supporting_reads divided by total_num_of_reads
            (0.0 when the total is zero).
    """

    def __init__(self, sequence, number_of_supporting_reads, total_num_of_reads):
        """Build a variation from a pileup read-result token.

        Args:
            sequence: Pileup token, e.g. '.', 'A', '.+3AGC' or '.-2TG'.
            number_of_supporting_reads: Reads supporting this allele.
            total_num_of_reads: Read depth at the position; a numeric string
                is accepted because pileup columns are parsed as text.
        """
        self.variation_type, self.variation_sequence = self.convert_pileup_string(sequence)
        self.number_of_supporting_reads = number_of_supporting_reads
        # Coerce so a depth read straight from a pileup column (a string)
        # works, and avoid ZeroDivisionError on zero-depth positions.
        total = float(total_num_of_reads)
        self.VAF = number_of_supporting_reads / total if total else 0.0

    @staticmethod
    def convert_pileup_string(sequence):
        """Classify a pileup token and extract its allele sequence.

        Args:
            sequence: A single character ('.' for the reference base, any
                other character for an SNV) or an indel token whose second
                character is '+' (insertion) or anything else (deletion),
                e.g. ',+3AGC'.

        Returns:
            A (variation_type, variation_sequence) tuple. variation_type is
            'REF', 'SNV', 'insertion' or 'deletion'; variation_sequence is
            the token itself for single characters, or only the alphabetic
            characters after the sign for indels (',+3AGC' -> 'AGC').
        """
        if len(sequence) == 1:
            variation_type = 'REF' if sequence == '.' else 'SNV'
            return variation_type, sequence

        variation_type = 'insertion' if sequence[1] == '+' else 'deletion'
        # Drop the anchor base and the sign, then keep letters only so the
        # length digits are discarded: ',+3AGC' -> 'AGC'.
        variation_sequence = ''.join(filter(str.isalpha, sequence[2:]))
        return variation_type, variation_sequence

In [0]:
class Pileup_line:
    """One parsed line of a tab-separated pileup file.

    Attributes:
        sequence: Column 1 (reference sequence name).
        position: Column 2, kept as the original string.
        ref_base: Column 3 (reference base).
        read_count: Column 4 converted to int.
        read_results: Column 5 (per-read base string).
        base_quality: Column 6 (one quality character per read).
        mapping_quality: Column 7 if present, otherwise ''.
    """

    # ASCII offset of Phred quality characters (Phred+33 encoding).
    PHRED_OFFSET = 33

    def __init__(self, line):
        """Split a pileup line into its columns.

        Args:
            line: Raw line, with or without its trailing newline.

        Raises:
            IndexError: If the line has fewer than six columns.
            ValueError: If the read-count column is not an integer.
        """
        # Strip the line ending so it does not end up in the last column.
        split_line = line.rstrip('\r\n').split("\t")
        self.sequence = split_line[0]
        self.position = split_line[1]
        self.ref_base = split_line[2]
        # Stored as int so it can be used as the denominator of the VAF.
        self.read_count = int(split_line[3])
        self.read_results = split_line[4]
        self.base_quality = split_line[5]
        # The mapping-quality column is optional in pileup output.
        self.mapping_quality = split_line[6] if len(split_line) > 6 else ''

    @staticmethod
    def _phred_char_to_probability(ascii_char):
        """Return the probability that a base call is correct.

        The Phred score is the character code minus PHRED_OFFSET and the
        error probability is 10 ** (-score / 10).
        See https://en.wikipedia.org/wiki/Phred_quality_score
        """
        score = ord(ascii_char) - Pileup_line.PHRED_OFFSET
        return 1 - 10 ** (-score / 10)

    def calc_base_quality(self, base_num):
        """Return the base-call correctness probability of read `base_num`.

        Args:
            base_num: Zero-based index into base_quality; must be smaller
                than read_count.

        Raises:
            IndexError: If base_num is outside base_quality.
        """
        return self._phred_char_to_probability(self.base_quality[base_num])

    @staticmethod
    def ideal_rr_count(read_result):
        """Count the characters of a read-result string that has no markup.

        Commas are turned into dots and lowercase letters into uppercase so
        that both strands are counted together.

        Returns:
            A list of (character, count) tuples ordered from the most to the
            least common character; empty for an empty string.
        """
        normalized = read_result.replace(',', '.').upper()
        return collections.Counter(normalized).most_common()

    @staticmethod
    def segregate_indels(read_result):
        """Split indel tokens out of a read-result string.

        Every run of digits is taken as an indel length. The token spans the
        two characters before the digits (the anchor base and the '+'/'-'
        sign), the digits themselves and as many following characters as the
        number says.

        Args:
            read_result: Read-result string; '^x' read-start markers should
                already be removed because 'x' may itself be a digit.

        Returns:
            A (remaining, indels_list) tuple: the read-result string with all
            indel tokens removed, and the tokens in order of appearance
            (e.g. ['.+2AG', ',-1c']).
        """
        indels_list = []
        kept_parts = []
        cursor = 0
        for match in re.finditer(r'\d+', read_result):
            if match.start() < cursor:
                # Digits inside an already consumed token are not a new indel.
                continue
            # Never reach back before what has already been consumed (or
            # before the start of the string).
            token_start = max(match.start() - 2, cursor)
            token_end = match.end() + int(match.group())
            kept_parts.append(read_result[cursor:token_start])
            indels_list.append(read_result[token_start:token_end])
            cursor = token_end
        kept_parts.append(read_result[cursor:])
        return ''.join(kept_parts), indels_list

    @staticmethod
    def delete_carets(read_result):
        """Remove every '^' together with the single character after it."""
        return re.sub(r'\^.', '', read_result)

    def determine_variant(self):
        """Find the two most frequently observed alleles at this position.

        Read-start markers ('^' plus the following character) and '$'
        read-end markers are removed, indel tokens are separated from plain
        bases, and all alleles are ranked by the number of supporting reads.

        Returns:
            A list of exactly two Variation objects, most supported first.
            When fewer than two alleles are observed the list is padded with
            'REF' variations that have zero supporting reads.
        """
        read_count = self.read_count
        observations = []

        # NOTE(review): zero-depth lines appear to carry a '*' placeholder in
        # the read-result column; it is not treated as an observed allele.
        if read_count:
            # Carets go first: the character after '^' can be '+', '-' or a
            # digit and must not be mistaken for indel markup.
            read_result = self.delete_carets(self.read_results).replace('$', '')

            indel_counts = []
            if '+' in read_result or '-' in read_result:
                read_result, indels_list = self.segregate_indels(read_result)
                # Commas -> dots, lowercase -> uppercase, as for plain bases.
                indel_counts = collections.Counter(
                    indel.replace(',', '.').upper() for indel in indels_list
                ).most_common()

            observations = self.ideal_rr_count(read_result) + indel_counts
            observations.sort(key=lambda pair: pair[1], reverse=True)

        variants = [
            Variation(token, count, read_count)
            for token, count in observations[:2]
        ]
        while len(variants) < 2:
            variants.append(Variation('.', 0, read_count))
        return variants

    # Alias kept because calling code uses this spelling.
    determine_variation = determine_variant

    @staticmethod
    def determine_genotype(variation1, variation2, p_error=0.01):
        """Pick the most likely diploid genotype for two candidate alleles.

        With k1 and k2 supporting reads (n = k1 + k2), the binomial
        likelihood of seeing k1 reads of allele 1 is evaluated for three
        hypotheses: homozygous allele 1 (success probability 1 - p_error),
        heterozygous (0.5) and homozygous allele 2 (p_error).

        Genotype numbering follows the order the alleles would take in the
        ALT field: '0/0' has no ALT allele, '1/1' and '0/1' have one, and
        '1/2' has two.

        Args:
            variation1: Object with number_of_supporting_reads and
                variation_type attributes.
            variation2: Same, for the second allele.
            p_error: Assumed probability that a read shows the wrong allele.

        Returns:
            A (genotype, genotype_list) tuple where genotype is '0/0', '0/1',
            '1/1' or '1/2' and genotype_list holds the non-reference
            variations in ALT order (empty for '0/0').
        """
        k1 = variation1.number_of_supporting_reads
        k2 = variation2.number_of_supporting_reads
        n = k1 + k2
        if n == 0:
            # No evidence at all: report the reference genotype.
            return '0/0', []

        # Log-likelihoods avoid underflow at high depth.
        log_hom1 = binom.logpmf(k1, n, 1 - p_error)
        log_het = binom.logpmf(k1, n, 0.5)
        log_hom2 = binom.logpmf(k1, n, p_error)

        is_ref1 = variation1.variation_type == 'REF'
        is_ref2 = variation2.variation_type == 'REF'

        if log_hom1 >= log_het and log_hom1 >= log_hom2:  # genotype k1/k1
            return ('0/0', []) if is_ref1 else ('1/1', [variation1])

        if log_het >= log_hom2:  # genotype k1/k2
            if is_ref1 and is_ref2:
                return '0/0', []
            if is_ref1:
                return '0/1', [variation2]
            if is_ref2:
                return '0/1', [variation1]
            return '1/2', [variation1, variation2]

        # genotype k2/k2
        return ('0/0', []) if is_ref2 else ('1/1', [variation2])

In [0]:
def _alleles_for(anchor_base, variation):
    """Return the (REF, ALT) allele strings of one variation.

    anchor_base is the reference base of the pileup line the variation was
    called on. Insertions append their sequence to ALT, deletions append it
    to REF, and anything else is treated as an SNV.
    """
    if variation.variation_type == 'insertion':
        return anchor_base, anchor_base + variation.variation_sequence
    if variation.variation_type == 'deletion':
        return anchor_base + variation.variation_sequence, anchor_base
    return anchor_base, variation.variation_sequence  # SNV


def _extend_through_repeat(pileup_file, variation, ref_allele, alt_allele, resume_position):
    """Extend an indel's REF and ALT over following copies of its sequence.

    Reads ahead in pileup_file and, for as long as the reference bases of the
    following lines keep spelling out the indel sequence, appends those bases
    to both alleles. Stops at the first mismatch or at end of file and always
    seeks back to resume_position so the caller's iteration is unaffected.

    Returns:
        The extended (ref_allele, alt_allele) tuple.
    """
    motif = variation.variation_sequence
    extending = bool(motif)  # an empty motif would otherwise loop forever
    while extending:
        for base in motif:
            next_line = pileup_file.readline()
            if not next_line or Pileup_line(next_line).ref_base != base:
                extending = False
                break
            ref_allele += base
            alt_allele += base
    pileup_file.seek(resume_position)
    return ref_allele, alt_allele


with open('/sbgenomics/project-files/merged-normal_pileup.pileup', 'r') as pileup_file:
    start = time.time()
    # readline() is used instead of iterating the file object because tell()
    # is not available while a text file is being iterated.
    for line in iter(pileup_file.readline, ''):
        position = pileup_file.tell()  # where to resume after any look-ahead
        if not line.strip():
            continue
        pileup = Pileup_line(line)
        variation1, variation2 = pileup.determine_variant()
        # Called through the class because the method takes no instance.
        genotype, genotype_list = Pileup_line.determine_genotype(variation1, variation2)
        if genotype == '0/0':
            continue

        # '1/1' and '0/1' carry one ALT allele, '1/2' carries two.
        anchor_base = pileup.ref_base
        ref_alleles = []
        alt_alleles = []
        for variation in genotype_list:
            ref_allele, alt_allele = _alleles_for(anchor_base, variation)
            if variation.variation_type in ('insertion', 'deletion'):
                ref_allele, alt_allele = _extend_through_repeat(
                    pileup_file, variation, ref_allele, alt_allele, position)
            ref_alleles.append(ref_allele)
            alt_alleles.append(alt_allele)

        # TODO: the '1/2' case is unfinished. It is still unclear how a single
        # REF should be formed when there are two ALT alleles (e.g. an
        # insertion on one and a deletion on the other); for now both REF and
        # ALT are comma-joined per allele. ALT should already be correct.
        REF = ','.join(ref_alleles)
        ALT = ','.join(alt_alleles)
        # TODO: REF/ALT are computed but not yet written to any output.

    end = time.time()
    print(end-start)