In [14]:
from css_utility_expansion_dev import *

In [15]:
# test_file.bed is a preprocessed BED file: its 4th column (the state) was reduced
# to the numeric state id by keeping only the text before the first underscore.
# It was produced with:
#   awk 'BEGIN{FS=OFS="\t"} {split($4, a, "_"); $4 = a[1]; print}' filename.bed > modified_filename.bed

bed_path = "../database/bed/IHEC_unzipped/test_file.bed"

In [17]:
def bed2df_expanded(filename, state_num=15):
    """Build an expanded dataframe from a chromatin-state .bed file.

    The file is read as header-less, tab-separated text and its first four
    columns are named chromosome, start, end and state. The last row read
    from the file is discarded before any further processing.

    Parameters
    ----------
    filename : str
        Path to the .bed file. The start, end and state columns must be
        convertible with ``pd.to_numeric``.
    state_num : int, default 15
        Selects the state lookup table: ``18`` uses ``state_dict_18``,
        any other value uses ``state_dict``.

    Returns
    -------
    pandas.DataFrame
        Dataframe with the columns
        chromosome | start | end | state | length | unit | state_seq | state_seq_full
        where ``length`` is ``end - start``, ``unit`` is the number of
        200-bp bins in the segment, ``state_seq`` is the value the lookup
        table assigns to the state, and ``state_seq_full`` is that value
        repeated ``unit`` times.

    Raises
    ------
    FileNotFoundError
        If ``filename`` does not exist.
    """
    if not os.path.exists(filename):
        raise FileNotFoundError("Please provide a valid file path.")

    column_names = {0: "chromosome", 1: "start", 2: "end", 3: "state"}
    table = pd.read_csv(
        filename, sep='\t', lineterminator='\n', header=None, low_memory=False
    ).rename(columns=column_names)

    # The final row is dropped, as in the original implementation.
    # NOTE(review): presumably a trailing non-data row in the source files — confirm.
    table = table[:-1]

    for numeric_col in ("start", "end", "state"):
        table[numeric_col] = pd.to_numeric(table[numeric_col])

    table["length"] = table["end"] - table["start"]
    # chromatin state is annotated every 200 bp (18th May 2022)
    table["unit"] = (table["length"] / 200).astype(int)

    # Only the lookup table differs between the 18-state and the default case.
    state_lookup = state_dict_18 if state_num == 18 else state_dict
    table["state_seq"] = table["state"].map(state_lookup)
    table["state_seq_full"] = table["unit"] * table["state_seq"]

    return table

In [18]:
# Build the expanded dataframe using the 18-state lookup table.
df = bed2df_expanded(bed_path, state_num=18)

In [19]:
# Show the expanded dataframe as the cell output.
df

Unnamed: 0,chromosome,start,end,state,length,unit,state_seq,state_seq_full
0,chr10,0,73800,18,73800,369,R,RRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRR...
1,chr10,73800,74200,2,400,2,B,BB
2,chr10,74200,76200,6,2000,10,F,FFFFFFFFFF
3,chr10,76200,76600,2,400,2,B,BB
4,chr10,76600,87200,18,10600,53,R,RRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRR...
...,...,...,...,...,...,...,...,...
331556,chrX,156000200,156000400,11,200,1,K,K
331557,chrX,156000400,156000600,9,200,1,I,I
331558,chrX,156000600,156001200,10,600,3,J,JJJ
331559,chrX,156001200,156002000,9,800,4,I,IIII


In [20]:
# df2unitcss is defined outside this section (presumably in css_utility_expansion_dev).
# The result is indexable and its first element is sliced as a string further down;
# NOTE(review): presumably one unit-level state string per chromosome — confirm.
test_css = df2unitcss(df)

In [22]:
# Length of the first element returned by df2unitcss.
len(test_css[0])

668987

In [26]:
# Inspect a 500-character window (positions 5000-5499) of the first element.
test_css[0][5000:5500]

'EEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFEEEEEEEEEEEEEEEEEEEEEEEEEEEEFFJJJJJJJJJFFFFFFFFFFFFFFFAAAAAAAAAAJJJGGGGGFFFFEEEEEEEEEEFGHCCAAAAAAAAAAAAAIIIIFFFFFFFKKKKKIIIIIIIIIIIIIIIIJJJJFFFFFFFFJJJJJJJJJIIJJJJJJJJJIIIIIJJFFFFFEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEGGGGGGGGGGGGGGGEEEEEEEEEEEEEEFFFFEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEE'