# utility_gene
Utilities for handling the human gene location file based on the human reference genome *hg19*

<br>

**Gene file info**
* This file contains the locations of genes on the human genome.
* Location: (local linux DLBOX2 ➡️) `../database/RefSeq/RefSeq.WholeGene.bed` (server ➡️) `euphonium:/work/Database/UCSC/hg19` 
* Structure: 
 * tab-delimited <br>
 * columns: `{0:"chromosome",1:"TxStart",2:"TxEnd",3:"name",4:"unk0",5:'strand', 6:'cdsStart', 7:'cdsEnd',8:"unk1",9:"exonCount",10:"unk2",11:"unk3"}`


**Function list**


In [None]:
!jupyter nbconvert --to script utility_gene.ipynb

In [1]:
import os
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from motif_utils import seq2kmer
from scipy.stats import norm
import collections
import operator
import itertools
import pickle
from tqdm import tqdm
from tqdm.notebook import tqdm_notebook

## Gene file preprocessing

In [2]:
# Path to the RefSeq whole-gene BED file; being relative, it resolves against the current working directory.
whole_gene_file='../database/RefSeq/RefSeq.WholeGene.bed'

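### whGene2GLChr
* Preprocesses the whole-gene file and returns a list of chromosome-wise DataFrames (chr1–chr22, chrX, chrY), each sorted by `TxStart`.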

In [None]:
# Preprocess the whole-gene data and produce a chromosome-wise list of genes.
# Each element of the returned list is a DataFrame for one chromosome.

def whGene2GLChr(whole_gene_file='../database/RefSeq/RefSeq.WholeGene.bed'):
    """Load the whole-gene file and split it into per-chromosome gene tables.

    The file is read as headerless, tab-delimited text. Only the first four
    columns are kept, named ``chromosome``, ``TxStart``, ``TxEnd`` and
    ``name``. Rows whose chromosome is not one of chr1-chr22, chrX or chrY
    are dropped.

    Parameters
    ----------
    whole_gene_file : str
        Path to the tab-delimited gene file.

    Returns
    -------
    list of pandas.DataFrame
        24 DataFrames ordered chr1, ..., chr22, chrX, chrY, each sorted by
        ``TxStart`` in ascending order. A chromosome with no rows in the
        file yields an empty DataFrame.
    """
    g_df_raw = pd.read_csv(whole_gene_file, sep='\t', lineterminator='\n',
                           header=None, low_memory=False)
    g_df_int = g_df_raw.rename(columns={0: "chromosome", 1: "TxStart", 2: "TxEnd", 3: "name",
                                        4: "unk0", 5: 'strand', 6: 'cdsStart', 7: 'cdsEnd',
                                        8: "unk1", 9: "exonCount", 10: "unk2", 11: "unk3"})
    g_df = g_df_int[["chromosome", "TxStart", "TxEnd", "name"]]

    # Keep only the regular chromosomes (autosomes 1-22 plus X and Y).
    chr_lst = ['chr' + str(n) for n in range(1, 23)] + ['chrX', 'chrY']
    g_df = g_df.loc[g_df["chromosome"].isin(chr_lst)]

    # Build the per-chromosome frames directly. Storing them through
    # ``locals()[name] = ...`` is unreliable: writes to ``locals()`` inside a
    # function are not guaranteed to persist, and on Python 3.13+ the
    # subsequent ``locals()[name]`` lookup raises KeyError.
    return [
        g_df[g_df["chromosome"] == chr_num].sort_values("TxStart")
        for chr_num in chr_lst
    ]