### TEST for utility
Various functions to process the initial bed data

In [121]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from motif_utils import seq2kmer

In [122]:
# File name reader: list every entry in the unzipped .bed directory.
# os.listdir returns entries in arbitrary order, so the listing is sorted to
# keep positional lookups (e.g. all_files[0]) reproducible across runs.

path='../database/bed/unzipped/'
bed_files=sorted(os.listdir(path))

def file_list_maker(path, files):
    """Return a list with every name in `files` joined onto `path`.

    path  : directory prefix placed in front of each name
    files : iterable of file names
    """
    return [os.path.join(path, name) for name in files]
# path-joined name of every entry listed in bed_files
all_files=file_list_maker(path, bed_files)

In [123]:
# show the first path in the list
all_files[0]

'../database/bed/unzipped/E001_15_coreMarks_stateno.bed'

In [124]:
# test file: a single .bed path passed to the loader functions in later cells
test_filename='../database/bed/unzipped/E017_15_coreMarks_stateno.bed'

In [125]:
# state number (1-15) -> single-letter code ("A"-"O")
state_dict=dict(enumerate("ABCDEFGHIJKLMNO", start=1))

In [126]:
# create dataframe from bed file
# bed file here means: EXXX_15_coreMarks_stateno.bed

def bed2df_as_is(filename):

    """Create a dataframe from a tab-separated .bed file, as is.

    The dataframe contains the following columns:
    chromosome |  start |  end  | state

    The file is read without a header row and its last row is always
    dropped (NOTE: presumably a trailing metadata row -- confirm for new
    inputs). "start" and "end" are converted to numeric; "state" is left
    exactly as read.

    filename : path of the .bed file
    returns  : pandas DataFrame
    """

    df_raw=pd.read_csv(filename, sep='\t', lineterminator='\n', header=None, low_memory=False)
    df=df_raw.rename(columns={0:"chromosome",1:"start",2:"end",3:"state"})
    # .copy() detaches the trimmed frame from the one it was sliced from, so
    # the column assignments below do not raise SettingWithCopyWarning
    df=df.iloc[:-1].copy()
    df["start"]=pd.to_numeric(df["start"])
    df["end"]=pd.to_numeric(df["end"])

    return df

In [127]:
def bed2df_expanded(filename):

    """Create an expanded dataframe from a tab-separated .bed file.

    The dataframe contains the following columns:
    chromosome |  start |  end  | state | length | unit | state_seq | state_seq_full

    length         : end - start
    unit           : length / 100, truncated to an integer
    state_seq      : letter obtained by looking "state" up in the notebook-level
                     `state_dict`
    state_seq_full : state_seq repeated `unit` times

    The file is read without a header row and its last row is always
    dropped (NOTE: presumably a trailing metadata row -- confirm for new
    inputs).

    filename : path of the .bed file
    returns  : pandas DataFrame
    """

    df_raw=pd.read_csv(filename, sep='\t', lineterminator='\n', header=None, low_memory=False)
    df=df_raw.rename(columns={0:"chromosome",1:"start",2:"end",3:"state"})
    # .copy() detaches the trimmed frame from the one it was sliced from, so
    # the column assignments below do not raise SettingWithCopyWarning
    df=df.iloc[:-1].copy()
    df["start"]=pd.to_numeric(df["start"])
    df["end"]=pd.to_numeric(df["end"])
    df["state"]=pd.to_numeric(df["state"])
    df["length"]=df["end"]-df["start"]
    df["unit"]=(df["length"]/100).astype(int)

    df["state_seq"]=df["state"].map(state_dict)
    # integer column * string column repeats each letter `unit` times
    df["state_seq_full"]=df["unit"]*df["state_seq"]

    return df

In [128]:
def numchr(df):
    """Count the distinct values in the "chromosome" column of `df`.

    Fails an assertion when `df` has no column with that name.
    """
    has_chromosome = "chromosome" in df.columns
    assert has_chromosome, "Check your df has the column named 'chromosome'"
    chromosomes = df["chromosome"]
    return chromosomes.nunique()

In [132]:
def total_df_maker(all_files):
    """Build one expanded dataframe per bed file.

    Each entry of `all_files` is passed to the function named
    'bed2df_expanded'; the results are returned as a list, in input order.
    """
    return [bed2df_expanded(bed_path) for bed_path in all_files]

In [133]:
# load the test file with only the four original columns and preview it
df_asis=bed2df_as_is(test_filename)
df_asis.head()

Unnamed: 0,chromosome,start,end,state
0,chr1,0,9800,15
1,chr1,9800,10800,9
2,chr1,10800,13000,15
3,chr1,13000,13200,7
4,chr1,13200,16000,5


In [134]:
# load the test file with the derived columns added and preview it
df=bed2df_expanded(test_filename)
df.head()

Unnamed: 0,chromosome,start,end,state,length,unit,state_seq,state_seq_full
0,chr1,0,9800,15,9800,98,O,OOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOO...
1,chr1,9800,10800,9,1000,10,I,IIIIIIIIII
2,chr1,10800,13000,15,2200,22,O,OOOOOOOOOOOOOOOOOOOOOO
3,chr1,13000,13200,7,200,2,G,GG
4,chr1,13200,16000,5,2800,28,E,EEEEEEEEEEEEEEEEEEEEEEEEEEEE


In [135]:
# number of distinct chromosome labels in the test dataframe
numchr(df)

25

In [136]:
# one expanded dataframe per path in all_files
total_df_list=total_df_maker(all_files)

In [138]:
# preview the dataframe built from the first path in all_files
total_df_list[0].head()

Unnamed: 0,chromosome,start,end,state,length,unit,state_seq,state_seq_full
0,chr1,0,9800,15,9800,98,O,OOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOO...
1,chr1,9800,10600,9,800,8,I,IIIIIIII
2,chr1,10600,540400,15,529800,5298,O,OOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOO...
3,chr1,540400,540800,1,400,4,A,AAAA
4,chr1,540800,569800,15,29000,290,O,OOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOO...


In [140]:
# preview the dataframe at position 126 of the list
total_df_list[126].head()

Unnamed: 0,chromosome,start,end,state,length,unit,state_seq,state_seq_full
0,chr1,0,54000,15,54000,540,O,OOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOO...
1,chr1,54000,56200,14,2200,22,N,NNNNNNNNNNNNNNNNNNNNNN
2,chr1,56200,84000,15,27800,278,O,OOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOO...
3,chr1,84000,88800,14,4800,48,N,NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
4,chr1,88800,235400,15,146600,1466,O,OOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOO...


In [None]:
# To collect the number of sequences for each chromosome, scan the start column.
# A row (other than the first) whose start is 0 is treated as the first segment
# of a new chromosome, so the row right before it closes the previous one.
total_row=len(df)
start_values=df["start"].to_numpy()
end_values=df["end"].to_numpy()

# positions i>0 where start==0, found in one vectorized pass instead of a
# per-row .iloc loop
restart_rows=np.flatnonzero(start_values[1:]==0)+1

chr_len=list(end_values[restart_rows-1])    # "end" of the row just before each restart
chr_check=list(start_values[restart_rows])  # the matching start values (all 0)
chr_index=(restart_rows-1).tolist()         # row position of the row just before each restart

# the last chromosome has no following restart row, so close it explicitly
end_len=df["end"].iloc[-1]
end_index=total_row-1

chr_len.append(end_len)
chr_index.append(end_index)

# exactly one entry per chromosome is expected
assert len(chr_len)==df["chromosome"].nunique()
assert len(chr_index)==df["chromosome"].nunique()

### class test

In [74]:
class bed2df_cls:
    """Load a tab-separated .bed file into an expanded dataframe.

    Attributes set by the constructor:
    fname  : the path that was passed in
    df     : dataframe with the columns
             chromosome | start | end | state | length | unit | state_seq | state_seq_full
    df_len : number of rows in df
    numchr : number of distinct values in the "chromosome" column
    """

    def __init__(self, fname):
        """Read `fname`, build the expanded dataframe and print a short
        summary of the available attributes."""

        self.fname=fname

        df_raw=pd.read_csv(fname, sep='\t', lineterminator='\n',
                           header=None, low_memory=False)
        df=df_raw.rename(columns={0:"chromosome",1:"start",2:"end",3:"state"})
        # remove the end row: it displayed the cell id and track no.
        # .copy() detaches the trimmed frame from the one it was sliced from,
        # so the column assignments below do not raise SettingWithCopyWarning
        df=df.iloc[:-1].copy()
        df["start"]=pd.to_numeric(df["start"])
        df["end"]=pd.to_numeric(df["end"])
        df["length"]=df["end"]-df["start"]
        df["unit"]=(df["length"]/100).astype(int)  # length / 100, truncated

        # state number (1-15) -> single-letter code
        state_dict={1:"A", 2:"B", 3:"C", 4:"D", 5:"E",6:"F",7:"G",8:"H" ,
                    9:"I" ,10:"J",11:"K", 12:"L", 13:"M", 14:"N", 15:"O"}

        df["state"]=pd.to_numeric(df["state"])
        df["state_seq"]=df["state"].map(state_dict)
        # integer column * string column repeats each letter `unit` times
        df["state_seq_full"]=df["unit"]*df["state_seq"]

        self.df=df
        self.df_len=len(df)
        self.numchr=df["chromosome"].nunique()

        print(".df : dataframe \n.df_len : length of dataframe \n.numchr : no. of chromosome")
        
     

In [76]:
# Build the class-based view of the test file and preview its dataframe.
# test_filename is the notebook-level test path; the bare name `filename`
# is not defined at notebook level.
df_test=bed2df_cls(test_filename)
df_test.df.head()

.df : dataframe 
.df_len : length of dataframe 
.numchr : no. of chromosome


Unnamed: 0,chromosome,start,end,state,length,unit,state_seq,state_seq_full
0,chr1,0,9800,15,9800,98,O,OOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOO...
1,chr1,9800,10800,9,1000,10,I,IIIIIIIIII
2,chr1,10800,13000,15,2200,22,O,OOOOOOOOOOOOOOOOOOOOOO
3,chr1,13000,13200,7,200,2,G,GG
4,chr1,13200,16000,5,2800,28,E,EEEEEEEEEEEEEEEEEEEEEEEEEEEE
