## Fine tuning data preparation

* In the DNABERT
    * Their data for pretrain (`6_3k.txt`) didn't really have common sequence with their fine tuning data.
    * There are two fine tuning training data: `train.tsv` (21Mb) and `dev.tsv` (660K)
    * Those data are labelled (delimited by tab) into 2 classes, 0 or 1.


In [1]:
from css_utility import *

In [2]:
# To peek how data look like:
path="../database/dnabert/examples/sample_data/ft/6/"
file_list=[os.path.join(path, file) for file in os.listdir(path)]
file_list

['../database/dnabert/examples/sample_data/ft/6/train.tsv',
 '../database/dnabert/examples/sample_data/ft/6/dev.tsv']

In [3]:
train_pre=pd.read_csv("../database/dnabert/examples/sample_data/pre/6_3k.txt", sep="\n",  \
                      header=None, names=["sequence"])  # data for pretrain
train=pd.read_csv(file_list[0], sep="\t")
dev=pd.read_csv(file_list[1], sep="\t")

In [4]:
train.head()

Unnamed: 0,sequence,label
0,CACAGC ACAGCC CAGCCA AGCCAG GCCAGC CCAGCC CAGC...,0
1,CTAATC TAATCT AATCTA ATCTAG TCTAGT CTAGTA TAGT...,1
2,GGAAGA GAAGAG AAGAGG AGAGGG GAGGGA AGGGAA GGGA...,1
3,CGAAAG GAAAGC AAAGCA AAGCAA AGCAAT GCAATC CAAT...,1
4,TGACTC GACTCC ACTCCC CTCCCA TCCCAA CCCAAA CCAA...,1


In [5]:
print("len(train_pre): ", len(train_pre))
print("len(train.tsv): ", len(train))
print("len(dev.tsv): ", len(dev))

len(train_pre):  3000
len(train.tsv):  32366
len(dev.tsv):  1000


## Read chr2 for creating the similar dataset

In [6]:
chr2_path=np.sort(all_files)[1]
chr2_path

'../database/bed/unzipped/E002_15_coreMarks_stateno.bed'

In [7]:
df=bed2df_expanded(chr2_path)

In [8]:
all_unit_css=df2unitcss(df)

In [9]:
len(all_unit_css[1])

1215996

In [10]:
with open("../database/temp_files/css_gene_lst_all", "rb") as f:
    css_gene_lst_all=pickle.load(f) # genic area only

In [11]:
len(css_gene_lst_all[0])

5255

In [12]:
len(css_gene_lst_all[0][5000])

97262

In [12]:
# length distribution first!
# then the non-genic region!

In [13]:
def flatLst(lst):
        flatten_lst=[elm for sublst in lst for elm in sublst]
        return flatten_lst

In [14]:
flatten_css_gene_lst=flatLst(css_gene_lst_all)
len(flatten_css_gene_lst)

52260

In [15]:
def len_css_gene(css_gene_lst_all,color="teal"):
    def flatLst(lst):
        flatten_lst=[elm for sublst in lst for elm in sublst]
        return flatten_lst
    flatten_css_gene_lst=flatLst(css_gene_lst_all)
#     fig=plt.figure(figsize=(6,4))    
#     plt.hist(flatten_css_gene_lst)
# #     sns.histplot(flatten_css_gene_lst, kde=False, color=color, log_scale=True, element="step", fill=False)
# #     sns.histplot(flatten_css_gene_lst, kde=False, color=color, log_scale=True)
#     plt.xlabel("Length of each element in dataset", fontsize=12)
#     plt.show()
    return flatten_css_gene_lst

In [16]:
flatten_css_gene_lst=len_css_gene(css_gene_lst_all)

In [17]:
len(flatten_css_gene_lst)

52260

In [18]:
# plt.hist(flatten_css_gene_lst)  ## too large? kernel dies everytime I run this

In [25]:
g_lst_chr=whGene2GLChr(whole_gene_file='../database/RefSeq/RefSeq.WholeGene.bed')

In [26]:
# for i in range(len(g_df_chr_lst)):
len(g_lst_chr)   

24

In [30]:
for i in range(len(g_lst_chr)):
    gene_df=g_lst_chr[i]
    print('gene_df["TxStart"].iloc[0] is {} at {}-th chromosome.'.format(gene_df["TxStart"].iloc[0], i))
    assert gene_df["TxStart"].iloc[0]<=2
    if i==3:
        break

gene_df["TxStart"].iloc[0] is 11873 at 0-th chromosome.


AssertionError: 

In [63]:
len(gene_df)

5255

In [72]:
gene_df["TxStart"].iloc[5254]

249200441

In [59]:
#check whether the start, end is inside the whole css length (not from the very first, end at the very last)

for i in range(len(g_df_chr_lst)):
    g_df_chr=g_df_chr_lst[i]["TxStart"]  # pd.Series
    css_chr=css_lst_chr[i]
    assert g_df_chr.iloc[0] <=2, "Gene starts from the very first location."
    assert g_df_chr.iloc[-1]>=len(css_chr), "Gene ends at the very last location."   

In [81]:
def compNonGene2css(whole_gene_file,df):
    """
    Input: Reference gene file, df (CSS)
    Output: list of chromosome-wise list that contains the css at "non-genic" area only.
    """
    g_lst_chr=whGene2GLChr(whole_gene_file) # list of gene table df per chromosome
    css_lst_chr=df2longcss(df) # list of long css per chromosome
    total_chr=len(g_lst_chr)
    
    css_Ngene_lst_all=[]
    for i in tqdm_notebook(range(total_chr)):
        css=css_lst_chr[i]   # long css of i-th chromosome
        gene_df=g_lst_chr[i] # gene df of i-th chromosome
        
        assert gene_df["TxStart"].iloc[0]>=1, "Gene starts from the very first location at {}-th chromosome.".format(i)
        assert gene_df["TxEnd"].iloc[-1]<=len(css), "Gene ends at the very last location at {}-th chromosome.".format(i)  
                
        css_Ngene_lst_chr=[]
        for j in range(len(gene_df)):
            if j==0:
                ng_start=1 # to avoid any "zero" causing problem 
                ng_end=gene_df["TxStart"].iloc[j]
#                 css_gene=css[g_start:g_end] 
            elif j==len(gene_df)-1: 
                ng_start=gene_df["TxEnd"].iloc[j]
                ng_end=len(css)-1
            else:
                ng_start=gene_df["TxEnd"].iloc[j-1]
                if j <=3:
                    print("j: {} | ng_start: {}".format(j, ng_start))
                ng_end=gene_df["TxStart"].iloc[j]
                if j <=3:
                    print("j: {} | ng_end: {}".format(j, ng_end))
        
            css_Ngene=css[ng_start:ng_end]
            css_Ngene_lst_chr.append(css_Ngene)
        
        css_Ngene_lst_all.append(css_Ngene_lst_chr)   
    assert len(css_Ngene_lst_all)==total_chr
    return css_Ngene_lst_all

In [82]:
css_Ngene_lst_all=compNonGene2css(whole_gene_file,df)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=24.0), HTML(value='')))

j: 1 | ng_start: 14409
j: 1 | ng_end: 14361
j: 2 | ng_start: 29370
j: 2 | ng_end: 17368
j: 3 | ng_start: 17436
j: 3 | ng_end: 17368
j: 1 | ng_start: 46588
j: 1 | ng_end: 218135
j: 2 | ng_start: 264743
j: 2 | ng_end: 218135
j: 3 | ng_start: 256340
j: 3 | ng_end: 218135
j: 1 | ng_start: 66175
j: 1 | ng_end: 238278
j: 2 | ng_start: 451097
j: 2 | ng_end: 238278
j: 3 | ng_start: 451097
j: 3 | ng_end: 239325
j: 1 | ng_start: 88099
j: 1 | ng_end: 53178
j: 2 | ng_start: 88099
j: 2 | ng_end: 53178
j: 3 | ng_start: 88099
j: 3 | ng_end: 53178
j: 1 | ng_start: 190087
j: 1 | ng_end: 191625
j: 2 | ng_start: 195468
j: 2 | ng_end: 204874
j: 3 | ng_start: 218297
j: 3 | ng_end: 218337
j: 1 | ng_start: 148159
j: 1 | ng_end: 181465
j: 2 | ng_start: 205484
j: 2 | ng_end: 292056
j: 3 | ng_start: 351355
j: 3 | ng_end: 292056
j: 1 | ng_start: 155461
j: 1 | ng_end: 149717
j: 2 | ng_start: 155461
j: 2 | ng_end: 192968
j: 3 | ng_start: 300740
j: 3 | ng_end: 330135
j: 1 | ng_start: 117024
j: 1 | ng_end: 158344
j:

In [85]:
gene_df.iloc[:10]

Unnamed: 0,chromosome,TxStart,TxEnd,name
1463,chr1,11873,14409,NR_046018
1460,chr1,14361,29370,NR_024540
1458,chr1,17368,17436,NR_106918
1455,chr1,17368,17436,NR_128720
1457,chr1,17368,17436,NR_107062
1454,chr1,17368,17436,NR_107063
1456,chr1,30365,30503,NR_036051
1453,chr1,30365,30503,NR_036267
1465,chr1,30365,30503,NR_036268
1464,chr1,30365,30503,NR_036266


In [None]:
############ Oh,,, genes are overlapped! I need to make a gene table not to overlap.. to show the non-genic (intergenic)


In [86]:
g_df_chr_lst=whGene2GLChr(whole_gene_file='../database/RefSeq/RefSeq.WholeGene.bed')

In [87]:
g_df_chr_lst

[     chromosome    TxStart      TxEnd          name
 1463       chr1      11873      14409     NR_046018
 1460       chr1      14361      29370     NR_024540
 1458       chr1      17368      17436     NR_106918
 1455       chr1      17368      17436     NR_128720
 1457       chr1      17368      17436     NR_107062
 ...         ...        ...        ...           ...
 5247       chr1  249144202  249153315  NM_001193328
 5249       chr1  249144202  249153125  NM_001136036
 5250       chr1  249144202  249153315     NM_017865
 5253       chr1  249200441  249213345  NM_001017434
 5254       chr1  249200441  249213345     NM_170725
 
 [5255 rows x 4 columns],
      chromosome    TxStart      TxEnd          name
 6571       chr2      38813      46588  NM_001077710
 5504       chr2     218135     264743     NR_104223
 6572       chr2     218135     256340  NM_001282687
 6573       chr2     218135     261130     NR_104227
 5511       chr2     218135     264866  NM_001282682
 ...         ...  

In [99]:
g_df_chr_lst[0]["TxStart"].iloc[0]

11873

In [116]:
# try to create a function for collapsing the row by my self


def count_sameStart(g_df_chr_lst,chr_no):
    cnt_same_start=0
    tot_start=len(g_df_chr_lst[chr_no])
    for i in range(len(g_df_chr_lst[chr_no])):
        chr1=g_df_chr_lst[chr_no]["TxStart"]
        if i==0:
            continue
        elif chr1.iloc[i]==chr1.iloc[i-1]:
            cnt_same_start+=1  # how many same start in rows
        else:
            continue
    prop_same_start=cnt_same_start/tot_start
    return cnt_same_start, prop_same_start

In [108]:
cnt_same_start, prop_same=count_sameStart(g_df_chr_lst,0)

In [109]:
cnt_same_start, prop_same

(2171, 0.4131303520456708)

In [117]:
def count_sameEnd(g_df_chr_lst,chr_no):
    cnt_same_end=0
    tot_end=len(g_df_chr_lst[chr_no])
    for i in range(len(g_df_chr_lst[chr_no])):
        chr1=g_df_chr_lst[chr_no]["TxEnd"]       
        if i==0:
            continue
        elif chr1.iloc[i]==chr1.iloc[i-1]:
            cnt_same_end+=1  # how many same start in rows
        else:
            continue
    prop_same_end=cnt_same_end/tot_end
    return cnt_same_end, prop_same_end

In [112]:
cnt_same_end, prop_same=count_sameEnd(g_df_chr_lst,0)

In [113]:
cnt_same_end, prop_same

(2033, 0.3868696479543292)

In [None]:
def count_samePos(g_df_chr_lst):
    cnt_same_start_all=[]
    cnt_same_end_all=[]
    tot_chr_no=len(g_df_chr_lst)
    
    ########### count the same start position ###########
    def count_sameStart(g_df_chr_lst,chr_no):
        cnt_same_start=0
        tot_start=len(g_df_chr_lst[chr_no])
        for i in range(len(g_df_chr_lst[chr_no])):
            chr1=g_df_chr_lst[chr_no]["TxStart"]
            if i==0:
                continue
            elif chr1.iloc[i]==chr1.iloc[i-1]:
                cnt_same_start+=1  # how many same start in rows
            else:
                continue
        prop_same_start=cnt_same_start/tot_start
        return cnt_same_start, prop_same_start
    
    ########### count the same end position ############
    def count_sameEnd(g_df_chr_lst,chr_no):
        cnt_same_end=0
        tot_end=len(g_df_chr_lst[chr_no])
        for i in range(len(g_df_chr_lst[chr_no])):
            chr1=g_df_chr_lst[chr_no]["TxEnd"]       
            if i==0:
                continue
            elif chr1.iloc[i]==chr1.iloc[i-1]:
                cnt_same_end+=1  # how many same start in rows
            else:
                continue
        prop_same_end=cnt_same_end/tot_end
        return cnt_same_end, prop_same_end
    ####################################################
    
    for i in range(tot_chr_no):
        

In [118]:
len(g_df_chr_lst)

24

In [97]:
with open("../database/test_chr1gene.txt", "wb") as f:
    pickle.dump(g_df_chr_lst[0],f)

In [None]:
def compGene2css(whole_gene_file,df):
    g_lst_chr=whGene2GLChr(whole_gene_file) # list of gene table df per chromosome
    css_lst_chr=df2longcss(df) # list of long css per chromosome
    total_chr=len(g_lst_chr)
    
    css_gene_lst_all=[]
    for i in tqdm_notebook(range(total_chr)):
        css=css_lst_chr[i]   # long css of i-th chromosome
        gene_df=g_lst_chr[i] # gene df of i-th chromosome
        
        css_gene_lst_chr=[]
        for j in range(len(gene_df)): # it's right, because we scan the gene file from the df. and only get the no.
            g_start=gene_df["TxStart"].iloc[j]-1  # python counts form 0
            g_end=gene_df["TxEnd"].iloc[j]+1      # python excludes the end
            
            css_gene=css[g_start:g_end]           # cut the gene area only
            css_gene_lst_chr.append(css_gene)     # store in the list
          
        css_gene_lst_all.append(css_gene_lst_chr)  # list of list
    
    assert len(css_gene_lst_all)==total_chr
    return css_gene_lst_all