In [5]:
import numpy as np
import pandas as pd
import csv
import re
import time
from collections import Counter
from itertools import combinations

# Let displayed DataFrames show up to 100 rows and 100 columns before pandas truncates them.
pd.options.display.max_rows = 100
pd.options.display.max_columns = 100

In [37]:
def nucleotide_file_to_counts_single_allele(nucleotide_file_name, reference_sequence, site_start, site_end,
                                            timepoints, rep, table_col_name, save_path, encoding=None):
    """Tabulate per-site codon counts from a nucleotide-variant count file.

    The CSV is read with pandas; the first three data rows and the first column
    are discarded and the following row is used as the column header. Rows whose
    ``hgvs_nt`` entry contains ``'X'`` are dropped.

    For every timepoint, every site starts with the summed count of all rows
    (wild type included) on the reference codon and zero on the other 63 codons.
    Each non-``'_wt'`` row is then applied to a copy of ``reference_sequence``
    and, at every site whose codon differs from the reference codon, the row's
    count is moved from the reference codon to the variant codon.

    Args:
        nucleotide_file_name: Path of the CSV file to read.
        reference_sequence: Reference nucleotide string; it is cut into
            consecutive 3-letter codons, one per site.
        site_start: Label of the first site (inclusive).
        site_end: Label of the last site (inclusive).
        timepoints: Sequence of generation labels; the k-th entry uses the
            k-th count column of ``table_col_name``.
        rep: Replicate label written to the ``replicate`` output column.
        table_col_name: Column labels to select: first the variant-string
            column, then one count column per timepoint.
        save_path: Destination of the zip-compressed CSV with columns
            ``replicate, generation, site, codon, counts``.
        encoding: Optional text encoding forwarded to ``pandas.read_csv``.
            ``None`` (default) keeps pandas' default; set it for files that
            otherwise raise ``UnicodeDecodeError``.

    Returns:
        pandas.DataFrame: the columns of the cleaned table from the third one
        onward cast to float, in which the columns from the third of *those*
        onward are divided by the row sum over all of the float columns.

    Raises:
        IndexError: if there are more sites than reference codons, or more
            timepoints than count columns.
        KeyError: if a variant produces a codon that is not one of the 64
            A/C/G/T triplets.
    """
    codon_length = 3

    # The real header sits in a data row, so columns mix names and values;
    # reading everything as objects keeps that independent of dtype inference.
    df_data = pd.read_csv(nucleotide_file_name, low_memory=False, dtype=object, encoding=encoding)
    df_data = df_data.fillna(0)
    df_data = df_data.drop([0, 1, 2], axis=0)
    df_data = df_data.drop(df_data.columns[0], axis=1)
    df_data = df_data.reset_index(drop=True)
    df_data.columns = df_data.iloc[0]
    df_data = df_data.drop([0], axis=0)

    # Discard variants whose description contains an 'X'.
    df_data = df_data[~df_data['hgvs_nt'].str.contains('X', regex=False)]

    # Returned frequency table (independent of the replicate columns selected below).
    df_frequency = df_data.loc[:, df_data.columns[2]:].astype('float')
    first_scaled = df_frequency.columns[2]
    df_frequency.loc[:, first_scaled:] = df_frequency.loc[:, first_scaled:].div(df_frequency.sum(axis=1), axis=0)

    # All 64 tri-nucleotide units in alphabetical order (AAA, AAC, ..., TTT).
    CODONS = [a + b + c for a in 'ACGT' for b in 'ACGT' for c in 'ACGT']

    site_list = list(range(site_start, site_end + 1))
    raw_codon = [reference_sequence[k:k + codon_length]
                 for k in range(0, len(reference_sequence), codon_length)]
    allele_counts_columns = ['replicate', 'generation', 'site', 'codon', 'counts']

    allele_counts_table = df_data[table_col_name]
    variant_column = allele_counts_table.columns.tolist()[0]
    allele_counts_table = allele_counts_table.rename(columns={variant_column: 'variants'})
    count_table_columns = allele_counts_table.columns.tolist()
    mutant_table = allele_counts_table[allele_counts_table['variants'] != '_wt']

    # Total reads (wild type + mutants) per timepoint.
    total_count = [sum(int(value) for value in allele_counts_table[count_table_columns[k + 1]].tolist())
                   for k in range(len(timepoints))]

    # Pair every generation label with its count-column position once,
    # instead of calling timepoints.index() inside the hot loops.
    gen_columns = [(gen, timepoints.index(gen)) for gen in timepoints]

    codon_allele_dict = {}
    for gen, column in gen_columns:
        per_site = codon_allele_dict[gen] = {}
        for position, site in enumerate(site_list):
            site_counts = dict.fromkeys(CODONS, 0)
            # Everything starts on the reference codon; mutants are moved off below.
            site_counts[raw_codon[position]] = total_count[column]
            per_site[site] = site_counts

    reference_list = list(reference_sequence)
    variant_strings = mutant_table['variants'].tolist()
    count_rows = mutant_table.iloc[:, 1:].values.tolist()
    n_variants = len(variant_strings)

    for row_number, (variants_allele, raw_counts) in enumerate(zip(variant_strings, count_rows)):
        print("Progress {:2.1%}".format(row_number / n_variants), end="\r")
        mutation_number = [int(value) for value in raw_counts]

        # Letters after the leading one alternate (old base, new base); each run
        # of digits is a 1-based position in the reference sequence.
        letters = [ch for ch in variants_allele if ch.isalpha()][1:]
        positions = re.findall(r"(\d+)", variants_allele)
        variant_list = reference_list.copy()
        for j, position in enumerate(positions):
            variant_list[int(position) - 1] = letters[2 * j + 1]
        variant_sequence = ''.join(variant_list)
        variant_codon = [variant_sequence[k:k + codon_length]
                         for k in range(0, len(variant_sequence), codon_length)]

        for ref_codon, new_codon, site in zip(raw_codon, variant_codon, site_list):
            if ref_codon != new_codon:
                for gen, column in gen_columns:
                    site_counts = codon_allele_dict[gen][site]
                    site_counts[ref_codon] -= mutation_number[column]
                    site_counts[new_codon] += mutation_number[column]

    allele_counts_list = [[rep, gen, site, codon, counts]
                          for gen, site_codon_counts in codon_allele_dict.items()
                          for site, codon_counts in site_codon_counts.items()
                          for codon, counts in codon_counts.items()]

    codon_counts_table = pd.DataFrame(data=allele_counts_list, columns=allele_counts_columns)
    codon_counts_table.to_csv(save_path, sep=',', index=False, compression='zip')
    return df_frequency



In [38]:
def nucleotide_file_to_counts_double_allele(nucleotide_file_name, reference_sequence, site_start, site_end,
                                            timepoints, rep, table_col_name, save_path, encoding=None):
    """Tabulate codon-pair counts for every pair of sites from a variant count file.

    The CSV is read with pandas; the first three data rows and the first column
    are discarded and the following row is used as the column header. Rows whose
    ``hgvs_nt`` entry contains ``'X'`` are dropped.

    For every timepoint and every ordered site pair (site_1 before site_2), the
    pair of reference codons starts with the summed count of all rows (wild
    type included). Each non-``'_wt'`` row is applied to a copy of
    ``reference_sequence``; for every site pair in which at least one site's
    codon differs from the reference, the row's count is moved once from the
    reference codon pair to the codon pair carried by that variant. Only codon
    pairs that receive an entry this way (plus the reference pairs) are written.

    Args:
        nucleotide_file_name: Path of the CSV file to read.
        reference_sequence: Reference nucleotide string; it is cut into
            consecutive 3-letter codons, one per site.
        site_start: Label of the first site (inclusive).
        site_end: Label of the last site (inclusive).
        timepoints: Sequence of generation labels; the k-th entry uses the
            k-th count column of ``table_col_name``.
        rep: Replicate label written to the ``replicate`` output column.
        table_col_name: Column labels to select: first the variant-string
            column, then one count column per timepoint.
        save_path: Destination of the zip-compressed CSV with columns
            ``replicate, generation, site_1, codon_1, site_2, codon_2, counts``.
        encoding: Optional text encoding forwarded to ``pandas.read_csv``.
            ``None`` (default) keeps pandas' default; set it for files that
            otherwise raise ``UnicodeDecodeError``.

    Returns:
        pandas.DataFrame: the columns of the cleaned table from the third one
        onward cast to float, in which the columns from the third of *those*
        onward are divided by the row sum over all of the float columns.

    Raises:
        IndexError: if there are more sites than reference codons, or more
            timepoints than count columns.
    """
    codon_length = 3

    # The real header sits in a data row, so columns mix names and values;
    # reading everything as objects keeps that independent of dtype inference.
    df_data = pd.read_csv(nucleotide_file_name, low_memory=False, dtype=object, encoding=encoding)
    df_data = df_data.fillna(0)
    df_data = df_data.drop([0, 1, 2], axis=0)
    df_data = df_data.drop(df_data.columns[0], axis=1)
    df_data = df_data.reset_index(drop=True)
    df_data.columns = df_data.iloc[0]
    df_data = df_data.drop([0], axis=0)

    # Discard variants whose description contains an 'X'.
    df_data = df_data[~df_data['hgvs_nt'].str.contains('X', regex=False)]

    # Returned frequency table (independent of the replicate columns selected below).
    df_frequency = df_data.loc[:, df_data.columns[2]:].astype('float')
    first_scaled = df_frequency.columns[2]
    df_frequency.loc[:, first_scaled:] = df_frequency.loc[:, first_scaled:].div(df_frequency.sum(axis=1), axis=0)

    site_list = list(range(site_start, site_end + 1))
    raw_codon = [reference_sequence[k:k + codon_length]
                 for k in range(0, len(reference_sequence), codon_length)]
    allele_counts_columns = ['replicate', 'generation', 'site_1', 'codon_1', 'site_2', 'codon_2', 'counts']

    allele_counts_table = df_data[table_col_name]
    variant_column = allele_counts_table.columns.tolist()[0]
    allele_counts_table = allele_counts_table.rename(columns={variant_column: 'variants'})
    count_table_columns = allele_counts_table.columns.tolist()
    mutant_table = allele_counts_table[allele_counts_table['variants'] != '_wt']

    # Total reads (wild type + mutants) per timepoint.
    total_count = [sum(int(value) for value in allele_counts_table[count_table_columns[k + 1]].tolist())
                   for k in range(len(timepoints))]

    # Pair every generation label with its count-column position once,
    # instead of calling timepoints.index() inside the hot loops.
    gen_columns = [(gen, timepoints.index(gen)) for gen in timepoints]

    # Nested layout: generation -> site_1 -> codon_1 -> site_2 -> codon_2 -> count.
    n_sites = len(site_list)
    codon_allele_dict = {}
    for gen, column in gen_columns:
        per_site = codon_allele_dict[gen] = {}
        for idx_i in range(n_sites):
            partners = {}
            per_site[site_list[idx_i]] = {raw_codon[idx_i]: partners}
            for idx_j in range(idx_i + 1, n_sites):
                partners[site_list[idx_j]] = {raw_codon[idx_j]: total_count[column]}

    reference_list = list(reference_sequence)
    variant_strings = mutant_table['variants'].tolist()
    count_rows = mutant_table.iloc[:, 1:].values.tolist()
    n_variants = len(variant_strings)

    for row_number, (variants_allele, raw_counts) in enumerate(zip(variant_strings, count_rows)):
        print("Progress {:2.1%}".format(row_number / n_variants), end="\r")
        mutation_number = [int(value) for value in raw_counts]

        # Letters after the leading one alternate (old base, new base); each run
        # of digits is a 1-based position in the reference sequence.
        letters = [ch for ch in variants_allele if ch.isalpha()][1:]
        positions = re.findall(r"(\d+)", variants_allele)
        variant_list = reference_list.copy()
        for j, position in enumerate(positions):
            variant_list[int(position) - 1] = letters[2 * j + 1]
        variant_sequence = ''.join(variant_list)
        variant_codon = [variant_sequence[k:k + codon_length]
                         for k in range(0, len(variant_sequence), codon_length)]

        # Indices (into site_list) of the sites whose codon was changed.
        changed = [k for k, (ref_codon, new_codon, _site)
                   in enumerate(zip(raw_codon, variant_codon, site_list))
                   if ref_codon != new_codon]
        changed_lookup = set(changed)

        # Every site pair touching a changed site, each exactly once. A pair of
        # two changed sites is emitted when its earlier member is visited, so it
        # is skipped when reached again from the later member.
        affected_pairs = []
        for idx_v in changed:
            for idx_s in range(idx_v):
                if idx_s not in changed_lookup:
                    affected_pairs.append((idx_s, idx_v))
            for idx_s in range(idx_v + 1, n_sites):
                affected_pairs.append((idx_v, idx_s))

        for idx_i, idx_j in affected_pairs:
            site_i, site_j = site_list[idx_i], site_list[idx_j]
            wt_i, wt_j = raw_codon[idx_i], raw_codon[idx_j]
            codon_i, codon_j = variant_codon[idx_i], variant_codon[idx_j]
            for gen, column in gen_columns:
                reads = mutation_number[column]
                by_codon = codon_allele_dict[gen][site_i]
                by_codon[wt_i][site_j][wt_j] -= reads
                pair_counts = by_codon.setdefault(codon_i, {}).setdefault(site_j, {})
                pair_counts[codon_j] = pair_counts.get(codon_j, 0) + reads

    allele_counts_list = [[rep, gen, site_i, codon_i, site_j, codon_j, count_j]
                          for gen, by_site_i in codon_allele_dict.items()
                          for site_i, by_codon_i in by_site_i.items()
                          for codon_i, by_site_j in by_codon_i.items()
                          for site_j, by_codon_j in by_site_j.items()
                          for codon_j, count_j in by_codon_j.items()]

    codon_counts_table = pd.DataFrame(data=allele_counts_list, columns=allele_counts_columns)
    codon_counts_table.to_csv(save_path, sep=',', index=False, compression='zip')
    return df_frequency



In [39]:
# --- YAP1: settings for this data set ---
REFER_SEQ = 'GACGTTCCACTGCCGGCTGGTTGGGAAATGGCTAAAACTAGTTCTGGTCAGCGTTACTTCCTGAACCACATCGACCAGACCACCACGTGGCAGGACCCGCGT'
FILE_NAME = './data/raw_data/YAP1_nucleotide_variant.csv'
TARGET_NAME = 'YAP1'
COUNT_PATH = './outputs/allele_counts/'
START, END = 1, 34
GEN = [0, 1, 2, 3]
REP = [1, 2]
# Per replicate: the variant-string column followed by one count column per generation in GEN.
TABLE_COL = {rep: ['hgvs_nt'] + ['{}_c_{}'.format(prefix, gen) for gen in GEN]
             for rep, prefix in zip(REP, ['101208', '110307'])}

# Write the single-allele and then the double-allele count table of every replicate.
for rep in REP:
    for allele_kind, count_alleles in (('single', nucleotide_file_to_counts_single_allele),
                                       ('double', nucleotide_file_to_counts_double_allele)):
        run_label = '{}, replicate {}, {} allele'.format(TARGET_NAME, rep, allele_kind)
        print(run_label)
        SAVE_PATH = '{}{}_{}_allele_rep{}.csv.zip'.format(COUNT_PATH, TARGET_NAME, allele_kind, rep)
        count_alleles(FILE_NAME, REFER_SEQ, START, END, GEN, rep, TABLE_COL[rep], SAVE_PATH)
        print(run_label + ' finished')
    

YAP1, replicate 1, single allele
YAP1, replicate 1, single allele finished
YAP1, replicate 1, double allele
YAP1, replicate 1, double allele finished
YAP1, replicate 2, single allele
YAP1, replicate 2, single allele finished
YAP1, replicate 2, double allele
YAP1, replicate 2, double allele finished


In [40]:
REFER_SEQ = 'GATTTATCTGCTCTTCGCGTTGAAGAAGTACAAAATGTCATTAATGCTATGCAGAAAATCTTAGAGTGTCCCATCTGCCTGGAGTTGATCAAGGAACCTGTCTCCACAAAGTGTGACCACATATTTTGCAAATTTTGCATGCTGAAACTTCTCAACCAGAAGAAAGGGCCTTCACAGTGTCCTTTATGTAAGAATGATATAACCAAAAGGAGCCTACAAGAAAGTACGAGATTTAGTCAACTTGTTGAAGAGCTATTGAAAATCATTTGTGCTTTTCAGCTTGACACAGGTTTGGAGTATGCAAACAGCTATAATTTTGCAAAAAAGGAAAATAACTCTCCTGAACATCTAAAAGATGAAGTTTCTATCATCCAAAGTATGGGCTACAGAAACCGTGCCAAAAGACTTCTACAGAGTGAACCCGAAAATCCTTCCTTGCAGGAAACCAGTCTCAGTGTCCAACTCTCTAACCTTGGAACTGTGAGAACTCTGAGGACAAAGCAGCGGATACAACCTCAAAGGACGTCTGTCTACATTGAATTGGGATCTGATTCTTCTGAAGATACCGTTAATAAGGCAACTTATTGCAGTGTGGGAGATCAAGAATTGTTACAAATCACCCCTCAAGGAACCAGGGATGAAATCAGTTTGGATTCTGCAAAAAAGGCTGCTTGTGAATTTTCTGAGACGGATGTAACAAATACTGAACATCATCAACCCAGTAATAATGATTTGAACACCACTGAGAAGCGTGCAGCTGAGAGGCATCCAGAAAAGTATCAGGGTAGTTCTGTTTCAAACTTGCATGTGGAGCCATGTGGCACAAATACTCATGCCAGCTCATTACAGCATGAGAACAGCAGTTTATTACTCACTAAAGACAGAATGAATGTAGAAAAGGCTGAGTTC'
# --- Y2H_1: settings for this data set (REFER_SEQ is assigned on the line above) ---
FILE_NAME = './data/raw_data/Y2H_nucleotide_variant.csv'
TARGET_NAME = 'Y2H_1'
COUNT_PATH = './outputs/allele_counts/'
START, END = 1, 303
GEN = [0, 18, 37, 45]
REP = [1, 2, 3]
# Per replicate: the variant-string column followed by one count column per generation in GEN.
TABLE_COL = {rep: ['hgvs_nt'] + ['Y2H_1_Rep{}_c_{}'.format(rep, gen) for gen in GEN]
             for rep in REP}

# Write the single-allele and then the double-allele count table of every replicate.
for rep in REP:
    for allele_kind, count_alleles in (('single', nucleotide_file_to_counts_single_allele),
                                       ('double', nucleotide_file_to_counts_double_allele)):
        run_label = '{}, replicate {}, {} allele'.format(TARGET_NAME, rep, allele_kind)
        print(run_label)
        SAVE_PATH = '{}{}_{}_allele_rep{}.csv.zip'.format(COUNT_PATH, TARGET_NAME, allele_kind, rep)
        count_alleles(FILE_NAME, REFER_SEQ, START, END, GEN, rep, TABLE_COL[rep], SAVE_PATH)
        print(run_label + ' finished')
    

Y2H_1, replicate 1, single allele
Y2H_1, replicate 1, single allele finished
Y2H_1, replicate 1, double allele
Y2H_1, replicate 1, double allele finished
Y2H_1, replicate 2, single allele
Y2H_1, replicate 2, single allele finished
Y2H_1, replicate 2, double allele
Y2H_1, replicate 2, double allele finished
Y2H_1, replicate 3, single allele
Y2H_1, replicate 3, single allele finished
Y2H_1, replicate 3, double allele
Y2H_1, replicate 3, double allele finished


In [44]:
REFER_SEQ = 'GATTTATCTGCTCTTCGCGTTGAAGAAGTACAAAATGTCATTAATGCTATGCAGAAAATCTTAGAGTGTCCCATCTGCCTGGAGTTGATCAAGGAACCTGTCTCCACAAAGTGTGACCACATATTTTGCAAATTTTGCATGCTGAAACTTCTCAACCAGAAGAAAGGGCCTTCACAGTGTCCTTTATGTAAGAATGATATAACCAAAAGGAGCCTACAAGAAAGTACGAGATTTAGTCAACTTGTTGAAGAGCTATTGAAAATCATTTGTGCTTTTCAGCTTGACACAGGTTTGGAGTATGCAAACAGCTATAATTTTGCAAAAAAGGAAAATAACTCTCCTGAACATCTAAAAGATGAAGTTTCTATCATCCAAAGTATGGGCTACAGAAACCGTGCCAAAAGACTTCTACAGAGTGAACCCGAAAATCCTTCCTTGCAGGAAACCAGTCTCAGTGTCCAACTCTCTAACCTTGGAACTGTGAGAACTCTGAGGACAAAGCAGCGGATACAACCTCAAAGGACGTCTGTCTACATTGAATTGGGATCTGATTCTTCTGAAGATACCGTTAATAAGGCAACTTATTGCAGTGTGGGAGATCAAGAATTGTTACAAATCACCCCTCAAGGAACCAGGGATGAAATCAGTTTGGATTCTGCAAAAAAGGCTGCTTGTGAATTTTCTGAGACGGATGTAACAAATACTGAACATCATCAACCCAGTAATAATGATTTGAACACCACTGAGAAGCGTGCAGCTGAGAGGCATCCAGAAAAGTATCAGGGTAGTTCTGTTTCAAACTTGCATGTGGAGCCATGTGGCACAAATACTCATGCCAGCTCATTACAGCATGAGAACAGCAGTTTATTACTCACTAAAGACAGAATGAATGTAGAAAAGGCTGAGTTC'
# --- Y2H_2: settings for this data set (REFER_SEQ is assigned on the line above) ---
FILE_NAME = './data/raw_data/Y2H_nucleotide_variant.csv'
TARGET_NAME = 'Y2H_2'
COUNT_PATH = './outputs/allele_counts/'
START, END = 1, 303
GEN = [0, 16, 41, 64]
REP = [1, 2, 3]
# Per replicate: the variant-string column followed by one count column per generation in GEN.
TABLE_COL = {rep: ['hgvs_nt'] + ['Y2H_2_Rep{}_c_{}'.format(rep, gen) for gen in GEN]
             for rep in REP}

# Write the single-allele and then the double-allele count table of every replicate.
for rep in REP:
    for allele_kind, count_alleles in (('single', nucleotide_file_to_counts_single_allele),
                                       ('double', nucleotide_file_to_counts_double_allele)):
        run_label = '{}, replicate {}, {} allele'.format(TARGET_NAME, rep, allele_kind)
        print(run_label)
        SAVE_PATH = '{}{}_{}_allele_rep{}.csv.zip'.format(COUNT_PATH, TARGET_NAME, allele_kind, rep)
        count_alleles(FILE_NAME, REFER_SEQ, START, END, GEN, rep, TABLE_COL[rep], SAVE_PATH)
        print(run_label + ' finished')
    

Y2H_2, replicate 1, single allele
Y2H_2, replicate 1, single allele finished
Y2H_2, replicate 1, double allele
Y2H_2, replicate 1, double allele finished
Y2H_2, replicate 2, single allele
Y2H_2, replicate 2, single allele finished
Y2H_2, replicate 2, double allele
Y2H_2, replicate 2, double allele finished
Y2H_2, replicate 3, single allele
Y2H_2, replicate 3, single allele finished
Y2H_2, replicate 3, double allele
Y2H_2, replicate 3, double allele finished


In [45]:
REFER_SEQ = 'GATTTATCTGCTCTTCGCGTTGAAGAAGTACAAAATGTCATTAATGCTATGCAGAAAATCTTAGAGTGTCCCATCTGCCTGGAGTTGATCAAGGAACCTGTCTCCACAAAGTGTGACCACATATTTTGCAAATTTTGCATGCTGAAACTTCTCAACCAGAAGAAAGGGCCTTCACAGTGTCCTTTATGTAAGAATGATATAACCAAAAGGAGCCTACAAGAAAGTACGAGATTTAGTCAACTTGTTGAAGAGCTATTGAAAATCATTTGTGCTTTTCAGCTTGACACAGGTTTGGAGTATGCAAACAGCTATAATTTTGCAAAAAAGGAAAATAACTCTCCTGAACATCTAAAAGATGAAGTTTCTATCATCCAAAGTATGGGCTACAGAAACCGTGCCAAAAGACTTCTACAGAGTGAACCCGAAAATCCTTCCTTGCAGGAAACCAGTCTCAGTGTCCAACTCTCTAACCTTGGAACTGTGAGAACTCTGAGGACAAAGCAGCGGATACAACCTCAAAGGACGTCTGTCTACATTGAATTGGGATCTGATTCTTCTGAAGATACCGTTAATAAGGCAACTTATTGCAGTGTGGGAGATCAAGAATTGTTACAAATCACCCCTCAAGGAACCAGGGATGAAATCAGTTTGGATTCTGCAAAAAAGGCTGCTTGTGAATTTTCTGAGACGGATGTAACAAATACTGAACATCATCAACCCAGTAATAATGATTTGAACACCACTGAGAAGCGTGCAGCTGAGAGGCATCCAGAAAAGTATCAGGGTAGTTCTGTTTCAAACTTGCATGTGGAGCCATGTGGCACAAATACTCATGCCAGCTCATTACAGCATGAGAACAGCAGTTTATTACTCACTAAAGACAGAATGAATGTAGAAAAGGCTGAGTTC'
# --- E3: settings for this data set (REFER_SEQ is assigned on the line above) ---
FILE_NAME = './data/raw_data/Y2H_nucleotide_variant.csv'
TARGET_NAME = 'E3'
COUNT_PATH = './outputs/allele_counts/'
START, END = 1, 303
GEN = [0, 1, 2, 3, 4, 5]
REP = [1, 2, 3, 4, 5, 6]
# Count-column prefix used for each replicate, listed in REP order.
SAMPLE_PREFIX = ['PlusE2NewRep3', 'PlusE2NewRep4', 'PlusE2NewRep5',
                 'PlusE2Rep3', 'PlusE2Rep4', 'PlusE2Rep5']
# Per replicate: the variant-string column followed by one count column per generation in GEN.
TABLE_COL = {rep: ['hgvs_nt'] + ['{}_c_{}'.format(prefix, gen) for gen in GEN]
             for rep, prefix in zip(REP, SAMPLE_PREFIX)}

# Write the single-allele and then the double-allele count table of every replicate.
for rep in REP:
    for allele_kind, count_alleles in (('single', nucleotide_file_to_counts_single_allele),
                                       ('double', nucleotide_file_to_counts_double_allele)):
        run_label = '{}, replicate {}, {} allele'.format(TARGET_NAME, rep, allele_kind)
        print(run_label)
        SAVE_PATH = '{}{}_{}_allele_rep{}.csv.zip'.format(COUNT_PATH, TARGET_NAME, allele_kind, rep)
        count_alleles(FILE_NAME, REFER_SEQ, START, END, GEN, rep, TABLE_COL[rep], SAVE_PATH)
        print(run_label + ' finished')
    

E3, replicate 1, single allele
E3, replicate 1, single allele finished
E3, replicate 1, double allele
E3, replicate 1, double allele finished
E3, replicate 2, single allele
E3, replicate 2, single allele finished
E3, replicate 2, double allele
E3, replicate 2, double allele finished
E3, replicate 3, single allele
E3, replicate 3, single allele finished
E3, replicate 3, double allele
E3, replicate 3, double allele finished
E3, replicate 4, single allele
E3, replicate 4, single allele finished
E3, replicate 4, double allele
E3, replicate 4, double allele finished
E3, replicate 5, single allele
E3, replicate 5, single allele finished
E3, replicate 5, double allele
E3, replicate 5, double allele finished
E3, replicate 6, single allele
E3, replicate 6, single allele finished
E3, replicate 6, double allele
E3, replicate 6, double allele finished


In [43]:
REFER_SEQ = 'ATAGAGAAGTTTAAACTTCTTGCAGAGAAAGTGGAGGAAATCGTGGCAAAGAATGCGCGGGCAGAAATAGACTACAGCGATGCCCCGGACGAGTTCAGAGACCCTCTGATGGACACCCTGATGACCGATCCCGTGAGACTGCCCTCTGGCACCGTCATGGACCGTTCTATCATCCTGCGGCATCTGCTCAACTCCCCCACCGACCCCTTCAACCGCCAGATGCTGACTGAGAGCATGCTGGAGCCAGTGCCAGAGCTAAAGGAGCAGATTCAGGCCTGGATGAGAGAGAAACAGAGCAGTGACCACTGA'
# --- Ube4b: settings for this data set (REFER_SEQ is assigned on the line above) ---
FILE_NAME = './data/raw_data/Ube4b_nucleotide_variant.csv'
TARGET_NAME = 'Ube4b'
COUNT_PATH = './outputs/allele_counts/'
START, END = 1, 103
GEN = [0, 1, 2, 3]
REP = [1, 2]
# Per replicate: the variant-string column followed by one count column per generation in GEN.
# Replicates 1 and 2 read the columns prefixed 'Rep_2' and 'Rep_3' respectively.
TABLE_COL = {rep: ['hgvs_nt'] + ['{}_c_{}'.format(prefix, gen) for gen in GEN]
             for rep, prefix in zip(REP, ['Rep_2', 'Rep_3'])}

# Write the single-allele and then the double-allele count table of every replicate.
for rep in REP:
    for allele_kind, count_alleles in (('single', nucleotide_file_to_counts_single_allele),
                                       ('double', nucleotide_file_to_counts_double_allele)):
        run_label = '{}, replicate {}, {} allele'.format(TARGET_NAME, rep, allele_kind)
        print(run_label)
        SAVE_PATH = '{}{}_{}_allele_rep{}.csv.zip'.format(COUNT_PATH, TARGET_NAME, allele_kind, rep)
        count_alleles(FILE_NAME, REFER_SEQ, START, END, GEN, rep, TABLE_COL[rep], SAVE_PATH)
        print(run_label + ' finished')


Ube4b, replicate 1, single allele
Ube4b, replicate 1, single allele finished
Ube4b, replicate 1, double allele
Ube4b, replicate 1, double allele finished
Ube4b, replicate 2, single allele
Ube4b, replicate 2, single allele finished
Ube4b, replicate 2, double allele
Ube4b, replicate 2, double allele finished


In [41]:
# --- TpoR: settings for this data set ---
# NOTE(review): the recorded run of this cell stopped right after its first
# message with "UnicodeDecodeError: 'utf-8' codec can't decode byte 0xc9", i.e.
# FILE_NAME could not be decoded as UTF-8 -- confirm the file's actual
# encoding/format before re-running.
REFER_SEQ = 'ACCGAGACCGCCTGGATCTCCTTGGTGACCGCTCTGCATCTAGTGCTGGGCCTCAGCGCCGTCCTGGGCCTGCTGCTGCTGAGGTGGCAGTTT'
FILE_NAME = './data/raw_data/TpoR_nucleotide_counts.csv'
TARGET_NAME = 'TpoR'
COUNT_PATH = './outputs/allele_counts/'
START, END = 1, 31
GEN = [0, 1]
REP = [1, 2, 3, 4, 5, 6]
# Per replicate: the variant-string column followed by one count column per generation in GEN.
# Replicates 1..6 read the columns of 'Replicate_A' .. 'Replicate_F'.
TABLE_COL = {rep: ['hgvs_nt'] + ['Replicate_{}_c_{}'.format(letter, gen) for gen in GEN]
             for rep, letter in zip(REP, 'ABCDEF')}

# Write the single-allele and then the double-allele count table of every replicate.
for rep in REP:
    for allele_kind, count_alleles in (('single', nucleotide_file_to_counts_single_allele),
                                       ('double', nucleotide_file_to_counts_double_allele)):
        run_label = '{}, replicate {}, {} allele'.format(TARGET_NAME, rep, allele_kind)
        print(run_label)
        SAVE_PATH = '{}{}_{}_allele_rep{}.csv.zip'.format(COUNT_PATH, TARGET_NAME, allele_kind, rep)
        count_alleles(FILE_NAME, REFER_SEQ, START, END, GEN, rep, TABLE_COL[rep], SAVE_PATH)
        print(run_label + ' finished')


TpoR, replicate 1, single allele


UnicodeDecodeError: 'utf-8' codec can't decode byte 0xc9 in position 10: invalid continuation byte