In [96]:
def tabulateCopies(geneMap, copiesMap):
    """Count, per gene, how many copies fall on each Y strain, the X and the autosomes.

    Parameters
    ----------
    geneMap : dict
        Maps ``"contig,catName,srcGene"`` to a list of alternative locations,
        each formatted as ``"<chromosome>:<start>-<end>"``.
    copiesMap : dict
        Maps a Y strain label (``"YL"``, ``"YM"``, ``"YS"``) to a dictionary of
        ``contig name -> copy number`` (the copy number may be a string; it is
        converted with ``int``).

    Returns
    -------
    dict
        Maps every key of ``geneMap`` to the tuple
        ``(ys_count, ym_count, yl_count, x_count, auto_count)``.

    Raises
    ------
    KeyError
        If a Y contig (or its strain) is missing from ``copiesMap``.

    Notes
    -----
    Every Y hit is weighted by the copy number of its contig. The source contig
    named in the key is counted as well: Y contigs by copy number,
    ``Muller_A-AD`` as one X copy, anything else as one autosomal copy.
    Each key and its tuple are printed as they are computed.
    """
    x_chromosome = "Muller_A-AD"
    autosome_set = {"Muller_C", "Muller_E", "Muller_F", "Muller_B"}

    def y_bucket_and_copies(contig_name):
        # Contig names look like "YL_tig00000771_pilon": the strain is the text
        # before the first "_" and the copy-number tables are keyed by what
        # follows the three-character "Yx_" prefix.
        y_strain = contig_name.split("_")[0]
        copies = int(copiesMap[y_strain][contig_name[3:]])
        # Any strain that is neither YL nor YM is tallied with YS.
        bucket = y_strain if y_strain in ("YL", "YM") else "YS"
        return bucket, copies

    finalMap = {}
    for key, curr_locations in geneMap.items():
        counts = {"YS": 0, "YM": 0, "YL": 0, "X": 0, "auto": 0}

        # Tally the alternative locations listed for this gene.
        for location in curr_locations:
            chromosome = location.split(":")[0]
            if chromosome == x_chromosome:
                counts["X"] += 1
            elif chromosome in autosome_set:
                counts["auto"] += 1
            elif chromosome.startswith("Y"):
                bucket, copies = y_bucket_and_copies(chromosome)
                counts[bucket] += copies
            # Locations on any other sequence are not counted.

        # The gene's own (source) contig contributes one more hit. The contig is
        # the first comma-separated field of the key; comparing the whole split
        # list against a string can never match, which sent X genes to the
        # autosome tally.
        source_contig = key.split(",")[0]
        if source_contig.startswith("Y"):
            bucket, copies = y_bucket_and_copies(source_contig)
            counts[bucket] += copies
        elif source_contig == x_chromosome:
            counts["X"] += 1
        else:
            counts["auto"] += 1

        finalMap[key] = (
            counts["YS"],
            counts["YM"],
            counts["YL"],
            counts["X"],
            counts["auto"],
        )
        print(key, finalMap[key])

    return finalMap
    

In [97]:
def populateCovMap(covFile):
    """Read a copy-number table into a ``contig -> copies`` dictionary.

    Each non-blank line is split on whitespace; the first field is used as the
    contig name and the eighth field as its copy number. The copy number is
    stored as the string read from the file.

    @param covFile: path of the whitespace-delimited copy-number file
    @return: dict mapping contig name to its copy-number string
    @raise IndexError: if a non-blank line has fewer than eight fields
    """
    cov_map = {}
    # The context manager closes the file even when a malformed line raises.
    with open(covFile, 'r') as covData:
        for curr_line in covData:
            fields = curr_line.split()
            if not fields:
                # Blank lines (e.g. a trailing newline) carry no record.
                continue
            cov_map[fields[0]] = fields[7]

    return cov_map

def populateGeneMap(geneFile):
    """Read the gene table into a ``gene key -> alternative locations`` dictionary.

    Each non-blank line is split on whitespace into, in order: contig, source
    gene, a comma-separated list of alternative locations, and the CAT gene
    name. The dictionary key is ``"contig,catName,srcGene"``.

    @param geneFile: path of the whitespace-delimited gene file
    @return: dict mapping ``"contig,catName,srcGene"`` to the list of
             alternative locations
    @raise IndexError: if a non-blank line has fewer than four fields
    """
    geneMap = {}
    # The context manager closes the file even when a malformed line raises.
    with open(geneFile, 'r') as genes:
        for curr_line in genes:
            fields = curr_line.split()
            if not fields:
                # Blank lines (e.g. a trailing newline) carry no record.
                continue
            contig, src_gene, cat_name = fields[0], fields[1], fields[3]
            alt_loc = fields[2].split(",")  # retrieve the locations as a list

            geneMap[contig + "," + cat_name + "," + src_gene] = alt_loc

    return geneMap

In [102]:
import json
def main():
    """Build the per-gene copy table and write it to a text file.

    Loads the three Y copy-number tables, loads the gene map, tabulates the
    counts with ``tabulateCopies`` and writes one ``key,(counts)`` line per
    gene to ``dpse_pre_bionano_genes_Y_tables.txt``.
    """
    mapYL = populateCovMap("LarY_cffc_asm_v2_maskedAutoX.copyNumber.tsv")
    mapYM = populateCovMap("MedY_cffc_asm_v2_maskedAutoX.copyNumber.tsv")
    mapYS = populateCovMap("SmaY_cffc_asm_v2_maskedAutoX.copyNumber.tsv")
    copiesMap = {"YL":mapYL, "YM":mapYM, "YS":mapYS}
    ## ADD FILE NAME HERE
    geneMap = populateGeneMap("dpse_pre_bionano_onlyGenes_IDd_Ys_fullCol.tsv")
    finalMap = tabulateCopies(geneMap, copiesMap)

    # `with` guarantees the output is flushed and closed even if a write fails.
    with open("dpse_pre_bionano_genes_Y_tables.txt", 'w') as f:
        for k, v in finalMap.items():
            f.write(str(k) + "," + str(v) + "\n")

In [103]:
# Run the pipeline: loads the copy-number and gene tables, tabulates the counts
# and writes dpse_pre_bionano_genes_Y_tables.txt.
main()

Contig_12,dpse_pre_b_G0000003,LOC26533999 (5, 2, 3, 1, 1)
Contig_12,dpse_pre_b_G0000004,LOC117184351 (5, 1, 3, 1, 1)
Contig_12,dpse_pre_b_G0000005,LOC117185426 (3, 1, 3, 1, 1)
Contig_13,dpse_pre_b_G0000006,LOC26532143 (2, 1, 3, 0, 2)
Contig_14,dpse_pre_b_G0000007,trnN (4, 0, 0, 0, 1)
Contig_14,dpse_pre_b_G0000008,trnA (4, 0, 0, 0, 1)
Contig_14,dpse_pre_b_G0000009,trnG (2, 0, 0, 0, 1)
Contig_14,dpse_pre_b_G0000010,COX3 (5, 0, 2, 1, 2)
Contig_14,dpse_pre_b_G0000011,ATP6 (5, 0, 2, 1, 2)
Contig_14,dpse_pre_b_G0000012,ATP8 (1, 0, 0, 0, 1)
Contig_14,dpse_pre_b_G0000013,trnD (1, 0, 0, 0, 1)
Contig_14,dpse_pre_b_G0000014,trnK (1, 0, 0, 0, 1)
Contig_14,dpse_pre_b_G0000015,ND6 (7, 2, 2, 2, 2)
Contig_14,dpse_pre_b_G0000016,trnP (7, 2, 2, 3, 2)
Contig_14,dpse_pre_b_G0000017,trnT (7, 2, 2, 3, 2)
Contig_14,dpse_pre_b_G0000018,ND4L (7, 2, 2, 1, 2)
Contig_14,dpse_pre_b_G0000019,ND4 (5, 1, 1, 0, 1)
Contig_14,dpse_pre_b_G0000020,trnH (5, 2, 0, 0, 1)
Contig_14,dpse_pre_b_G0000021,trnR (4, 0, 0, 0, 1)
Con

In [47]:
# Inspect the parsed gene map. It is built here because the `geneMap` created
# inside main() is local to that function; without this assignment the loop
# raises NameError on a fresh kernel.
geneMap = populateGeneMap("dpse_pre_bionano_onlyGenes_IDd_Ys_fullCol.tsv")
for k, v in geneMap.items():
    print(k, v)

Contig_12,dpse_pre_b_G0000003,LOC26533999 ['Muller_A-AD:8252-9267', 'YL_tig00000771_pilon:17871-18747', 'YL_tig00000771_pilon:34233-35104', 'YL_tig00000771_pilon:56123-56850', 'YM_tig00006468_pilon:271501-272370', 'YM_tig00006468_pilon:338284-338655', 'YS_tig00006386_pilon:80909-81927', 'YS_tig00006585_pilon:87081-87442', 'YS_tig00035851_pilon:1551-2262']
Contig_12,dpse_pre_b_G0000004,LOC117184351 ['Muller_A-AD:8498-9252', 'YL_tig00000771_pilon:17982-18732', 'YL_tig00000771_pilon:34344-35089', 'YL_tig00000771_pilon:56123-56835', 'YM_tig00006468_pilon:271516-272282', 'YS_tig00006386_pilon:80924-81678', 'YS_tig00006585_pilon:87081-87384', 'YS_tig00035851_pilon:1567-2247']
Contig_12,dpse_pre_b_G0000005,LOC117185426 ['Muller_A-AD:8932-9199', 'YL_tig00000771_pilon:18417-18679', 'YL_tig00000771_pilon:34778-35036', 'YL_tig00000771_pilon:56521-56782', 'YM_tig00006468_pilon:271569-271831', 'YS_tig00006386_pilon:80977-81254', 'YS_tig00035851_pilon:1941-2199']
Contig_13,dpse_pre_b_G0000006,LOC265

In [None]:
### MASTER COPY (scratch area, edit freely) -- NOTE: running this cell redefines the functions from the cells above


## take in tsv file from CAT --> tabulate the counts for genes found in YL, YM, YS etc

def populateCovMap(covFile):
  """Read a copy-number table into a ``contig -> copies`` dictionary.

  Each non-blank line is split on whitespace; the first field is used as the
  contig name and the eighth field as its copy number. The copy number is
  stored as the string read from the file.

  @param covFile: path of the whitespace-delimited copy-number file
  @return: dict mapping contig name to its copy-number string
  @raise IndexError: if a non-blank line has fewer than eight fields
  """
  cov_map = {}
  # Iterating over the handle advances one line per pass, so the loop ends at
  # end of file and every line is parsed (not just the first one).
  with open(covFile, 'r') as covData:
    for curr_line in covData:
      fields = curr_line.split()
      if not fields:
        # Blank lines (e.g. a trailing newline) carry no record.
        continue
      cov_map[fields[0]] = fields[7]

  return cov_map

def populateGeneMap(geneFile):
  """Read the gene table into a ``gene key -> alternative locations`` dictionary.

  Each non-blank line is split on whitespace into, in order: contig, source
  gene, a comma-separated list of alternative locations, and the CAT gene
  name. The dictionary key is ``"contig,catName,srcGene"``.

  @param geneFile: path of the whitespace-delimited gene file
  @return: dict mapping ``"contig,catName,srcGene"`` to the list of
           alternative locations
  @raise IndexError: if a non-blank line has fewer than four fields
  """
  geneMap = {}
  # The context manager closes the file even when a malformed line raises.
  with open(geneFile, 'r') as genes:
    for curr_line in genes:
      fields = curr_line.split()
      if not fields:
        # Blank lines (e.g. a trailing newline) carry no record.
        continue
      contig, src_gene, cat_name = fields[0], fields[1], fields[3]
      alt_loc = fields[2].split(",")  # retrieve the locations as a list

      geneMap[contig + "," + cat_name + "," + src_gene] = alt_loc

  # Return the dictionary that was actually filled in above.
  return geneMap


def tabulateCopies(geneMap, copiesMap):
  """Count, per gene, how many copies fall on each Y strain, the X and the autosomes.

  Parameters
  ----------
  geneMap : dict
    Maps ``"contig,catName,srcGene"`` to a list of alternative locations,
    each formatted as ``"<chromosome>:<start>-<end>"``.
  copiesMap : dict
    Maps a Y strain label (``"YL"``, ``"YM"``, ``"YS"``) to a dictionary of
    ``contig name -> copy number`` (converted with ``int``).

  Returns
  -------
  dict
    Maps every key of ``geneMap`` to the tuple
    ``(ys_count, ym_count, yl_count, x_count, auto_count)``.

  Raises
  ------
  KeyError
    If a Y contig (or its strain) is missing from ``copiesMap``.

  Notes
  -----
  Every Y hit is weighted by the copy number of its contig. The source contig
  named in the key is counted exactly once per gene: Y contigs by copy number,
  ``Muller_A-AD`` as one X copy, anything else as one autosomal copy.
  """
  x_chromosome = "Muller_A-AD"
  autosome_set = {"Muller_C", "Muller_E", "Muller_F", "Muller_B"}

  def y_bucket_and_copies(contig_name):
    # Contig names look like "YL_tig00000771_pilon": the strain is the text
    # before the first "_" and the copy-number tables are keyed by what
    # follows the three-character "Yx_" prefix.
    y_strain = contig_name.split("_")[0]
    copies = int(copiesMap[y_strain][contig_name[3:]])
    # Any strain that is neither YL nor YM is tallied with YS.
    bucket = y_strain if y_strain in ("YL", "YM") else "YS"
    return bucket, copies

  finalMap = {}
  for key, curr_locations in geneMap.items():
    counts = {"YS": 0, "YM": 0, "YL": 0, "X": 0, "auto": 0}

    # Iterate over the location strings themselves; they are not indices.
    for location in curr_locations:
      chromosome = location.split(":")[0]
      if chromosome == x_chromosome:
        counts["X"] += 1
      elif chromosome in autosome_set:
        counts["auto"] += 1
      elif chromosome.startswith("Y"):
        bucket, copies = y_bucket_and_copies(chromosome)
        counts[bucket] += copies
      # Locations on any other sequence are not counted.

    # Final check, done once per gene and outside the location loop: the
    # gene's own (source) contig contributes one more hit.
    source_contig = key.split(",")[0]
    if source_contig.startswith("Y"):
      bucket, copies = y_bucket_and_copies(source_contig)
      counts[bucket] += copies
    elif source_contig == x_chromosome:
      counts["X"] += 1
    else:
      counts["auto"] += 1

    finalMap[key] = (
      counts["YS"],
      counts["YM"],
      counts["YL"],
      counts["X"],
      counts["auto"],
    )

  return finalMap

def checkCopies(strain, copiesMap):
  # NOTE(review): unfinished stub. `copyNumber` is never assigned in this
  # function, so the return below raises NameError unless a global of that
  # name happens to exist. Neither `strain` nor `copiesMap` is used yet.

  return copyNumber

def main():
  """Load the copy-number tables and the gene table, then tabulate the counts.

  @return: the dictionary produced by ``tabulateCopies`` (gene key -> tuple of
           counts)
  """
  mapYL = populateCovMap("LarY_cffc_asm_v2_maskedAutoX.copyNumber.tsv")
  mapYM = populateCovMap("MedY_cffc_asm_v2_maskedAutoX.copyNumber.tsv")
  mapYS = populateCovMap("SmaY_cffc_asm_v2_maskedAutoX.copyNumber.tsv")
  copiesMap = {"YL":mapYL, "YM":mapYM, "YS":mapYS}
  # populateGeneMap requires the gene-table path; calling it with no argument
  # raises TypeError.
  geneMap = populateGeneMap("dpse_pre_bionano_onlyGenes_IDd_Ys_fullCol.tsv")
  # Hand the table back to the caller instead of discarding it.
  return tabulateCopies(geneMap, copiesMap)
