# TEST pycl package
**This notebook contains tests for all functions contained in the pycl package.**

In [1]:
# Widen the notebook container so the long tabular outputs below stay readable.
# `IPython.display` is the public module for these names; importing them from
# `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:75% !important; }</style>"))

# Standard library: `remove` deletes the files downloaded by the wget tests
from os import remove

# Import every function from pycl so the cells below can call them unqualified
from pycl import *
import pycl

---
# PREDICATES

## is_readable_file

In [2]:
help(is_readable_file)

Help on function is_readable_file in module pycl:

is_readable_file(fp)
    Verify the readability of a file or list of file



In [3]:
# Negative case: the path does not exist, so the call raises OSError and the
# error message is printed instead of "OK"
try:
    is_readable_file("./test_data/KJHYTGYUJ")
    print("OK")
except OSError as err:
    print(err)

./test_data/KJHYTGYUJ is not a valid file


In [4]:
# Positive case: the file exists and is readable, so no exception is raised
# and "OK" is printed
try:
    is_readable_file("./test_data/RADAR_Secondary.txt")
    print("OK")
except OSError as err:
    print(err)

OK


## is_gziped

In [5]:
help(is_gziped)

Help on function is_gziped in module pycl:

is_gziped(fp)
    Return True if the file is Gziped else False



In [6]:
# Plain-text input: expected result is False
is_gziped("./test_data/RADAR_Secondary.txt")

False

In [7]:
# Input with a .gz extension: expected result is True
is_gziped("./test_data/RADAR_Secondary.txt.gz")

True

---
# PATH MANIPULATION

## file_basename

In [8]:
help(file_basename)

Help on function file_basename in module pycl:

file_basename(path)
    Return the basename of a file without folder location and extension



In [9]:
# The folder and both extensions (.txt and .gz) are stripped here: 'RADAR_Secondary'
file_basename("./test_data/RADAR_Secondary.txt.gz")

'RADAR_Secondary'

## file_extension


In [10]:
help(file_extension)

Help on function file_extension in module pycl:

file_extension(path)
    Return The extension of a file in lower-case



In [11]:
# Only the final extension is returned, without the leading dot: 'gz'
file_extension("./test_data/RADAR_Secondary.txt.gz")

'gz'

## file_name

In [12]:
help(file_name)

Help on function file_name in module pycl:

file_name(path)
    Return The complete name of a file with the extension but without folder location



In [13]:
# The folder part is dropped; the file name keeps all of its extensions
file_name("./test_data/test/RADAR_Secondary.txt.gz")

'RADAR_Secondary.txt.gz'

## dir_name

In [14]:
help(dir_name)

Help on function dir_name in module pycl:

dir_name(path)
    Return the complete path where is located the file without the file name



In [15]:
# NOTE(review): the recorded result is only the last folder name ('test'), not
# the complete path that the docstring describes — confirm which one is intended
dir_name("./test_data/test/RADAR_Secondary.txt.gz")

'test'

---
# STRING FORMATTING

## supersplit

In [16]:
help(supersplit)

Help on function supersplit in module pycl:

supersplit(string, separator='')
    like split but can take a list of separators instead of a simple separator



In [17]:
a = "chr7\t74138\t774138\tA>I|LOC100129917|LUNG:LYMPHOBLASTOID_CELL_LINE|15342557:15258596:22327324\t0"

# Same line split three ways: on a list of separators, on the default
# separator ('' in the signature) and on a single explicit separator
for sep in (["\t", "|"], "", "|"):
    print(supersplit(a, sep))

['chr7', '74138', '774138', 'A>I', 'LOC100129917', 'LUNG:LYMPHOBLASTOID_CELL_LINE', '15342557:15258596:22327324', '0']
['chr7', '74138', '774138', 'A>I|LOC100129917|LUNG:LYMPHOBLASTOID_CELL_LINE|15342557:15258596:22327324', '0']
['chr7\t74138\t774138\tA>I', 'LOC100129917', 'LUNG:LYMPHOBLASTOID_CELL_LINE', '15342557:15258596:22327324\t0']


## rm_blank

In [18]:
help(rm_blank)

Help on function rm_blank in module pycl:

rm_blank(name, replace='')
    Replace blank spaces in a name by a given character (default = remove)
    Blanks at extremities are always removed and nor replaced



In [19]:
a = "chr\t\t17|LU NG:LYMPHOBLAST    OID_CELL_LINE|15342557:152585     96:22327324\t0"

# First with the default replacement ('' = blanks removed), then with each
# run of blanks replaced by a visible marker
for repl in ("", "*"):
    print(rm_blank(a, replace=repl))

chr17|LUNG:LYMPHOBLASTOID_CELL_LINE|15342557:15258596:223273240
chr*17|LU*NG:LYMPHOBLAST*OID_CELL_LINE|15342557:152585*96:22327324*0


---
# FILE MANIPULATION

## copyFile

In [20]:
help(copyFile)

Help on function copyFile in module pycl:

copyFile(src, dest)
    Copy a single file to a destination file or folder (with error handling/reporting)
    @param src Source file path
    @param dest Path of the folder where to copy the source file



In [21]:
# Error case: the destination folder is the source's own folder, so copyFile
# reports that source and destination are the same file (message printed below)
copyFile(src="./test_data/RADAR_Secondary.txt", dest="./test_data/")

Error: './test_data/RADAR_Secondary.txt' and './test_data/RADAR_Secondary.txt' are the same file


In [22]:
# Make the cell self-contained on a fresh checkout: the destination folder is
# otherwise only created by the mkdir test further down. pycl's mkdir behaves
# like `mkdir -p`, so this is a no-op when the folder already exists.
mkdir("./test_data/test_dir")
copyFile(src="./test_data/RADAR_Secondary.txt", dest="./test_data/test_dir/")

## gzip_file

In [23]:
help(gzip_file)

Help on function gzip_file in module pycl:

gzip_file(in_path, out_path=None)
    @param in_path Path of the input uncompressed file
    @param out_path Path of the output compressed file (facultative)
    @exception  OSError Can be raise by open



In [24]:
# out_path omitted: the archive is written next to the input (same path + ".gz")
# and its absolute path is returned
gzip_file("./test_data/RADAR_Secondary.txt")

Compressing ./test_data/RADAR_Secondary.txt


'/home/aleg/Programming/Python3/pycl/test_data/RADAR_Secondary.txt.gz'

## gunzip_file

In [25]:
help(gunzip_file)

Help on function gunzip_file in module pycl:

gunzip_file(in_path, out_path=None)
    @param in_path Path of the input compressed file
    @param out_path Path of the output uncompressed file (facultative)
    @exception  OSError Can be raise by open



In [26]:
# out_path omitted: the output file is the input path without its ".gz" suffix,
# and its absolute path is returned
gunzip_file("./test_data/RADAR_Secondary.txt.gz")

Uncompressing ./test_data/RADAR_Secondary.txt.gz


'/home/aleg/Programming/Python3/pycl/test_data/RADAR_Secondary.txt'

---
# FILE INFORMATION

## head

In [27]:
help(head)

Help on function head in module pycl:

head(file, n=10, ignore_hashtag_line=False)
    Emulate linux head cmd. Also works for gzip files



In [28]:
head("./test_data/RADAR_Main.txt", n= 3)

#chromosome	position	gene	strand	annot1	annot2	alu?	non_alu_repetitive?	conservation_chimp	conservation_rhesus	conservation_mouse
chr1	206256301	C1orf186	-	intronic	intronic	no	no	N	N	N
chr6	116991832	intergenic	-	intergenic	intergenic	no	no	N	N	N



In [29]:
# ignore_hashtag_line=True skips the "#" header line, so three data lines are shown
head("./test_data/RADAR_Main.txt", n=3, ignore_hashtag_line=True)

chr1	206256301	C1orf186	-	intronic	intronic	no	no	N	N	N
chr6	116991832	intergenic	-	intergenic	intergenic	no	no	N	N	N
chr7	30504355	NOD1	-	intronic	intronic	no	no	N	N	N



In [30]:
# Same call on a gzip-compressed file: head reads it without prior decompression
head("./test_data/RADAR_Secondary.txt.gz", n=3, ignore_hashtag_line=True)

chr1:1037916	Peng et al 2012	Lymphoblastoid cell line	9	66.67
chr1:1156882	Peng et al 2012	Lymphoblastoid cell line	42	36.59
chr1:1157460	Peng et al 2012	Lymphoblastoid cell line	66	22.73



## linerange

In [31]:
help(linerange)

Help on function linerange in module pycl:

linerange(file, range_list=[])
    Print a range of lines in a file according to a list of start end lists



In [32]:
# No range_list given: only the first three and last three lines are shown
# (each prefixed with its 0-based line number, see output)
file = "./test_data/RADAR_Secondary.txt"
linerange (file)

0	#location	reference	tissue	coverage	editing_level(%)
1	chr1:1037916	Peng et al 2012	Lymphoblastoid cell line	9	66.67
2	chr1:1156882	Peng et al 2012	Lymphoblastoid cell line	42	36.59

97	chr1:10560773	Peng et al 2012	Lymphoblastoid cell line	20	40.00
98	chr1:10602697	Peng et al 2012	Lymphoblastoid cell line	5	60.00
99	chr1:11138237	Peng et al 2012	Lymphoblastoid cell line	14	42.86



In [33]:
file = "./test_data/RADAR_Secondary.txt"

# Each [start, end] pair is printed with both bounds included; the last pair
# deliberately extends beyond line 99
linerange(
    file,
    range_list=[
        [0, 5],
        [10, 11],
        [75, 80],
        [98, 105],
    ],
)

0	#location	reference	tissue	coverage	editing_level(%)
1	chr1:1037916	Peng et al 2012	Lymphoblastoid cell line	9	66.67
2	chr1:1156882	Peng et al 2012	Lymphoblastoid cell line	42	36.59
3	chr1:1157460	Peng et al 2012	Lymphoblastoid cell line	66	22.73
4	chr1:1252441	Peng et al 2012	Lymphoblastoid cell line	11	72.73
5	chr1:1252443	Peng et al 2012	Lymphoblastoid cell line	11	45.45

10	chr1:1594977	Peng et al 2012	Lymphoblastoid cell line	227	24.67
11	chr1:1594978	Peng et al 2012	Lymphoblastoid cell line	228	4.82

75	chr1:6710585	Peng et al 2012	Lymphoblastoid cell line	30	65.52
76	chr1:6710595	Peng et al 2012	Lymphoblastoid cell line	28	50.00
77	chr1:6941764	Peng et al 2012	Lymphoblastoid cell line	7	57.14
78	chr1:7908257	Peng et al 2012	Lymphoblastoid cell line	10	40.00
79	chr1:7980494	Peng et al 2012	Lymphoblastoid cell line	38	24.32
80	chr1:7980525	Peng et al 2012	Lymphoblastoid cell line	24	37.50

98	chr1:10602697	Peng et al 2012	Lymphoblastoid cell line	5	60.00
99	chr1:11138237	Peng et

## colsum

In [2]:
help(colsum)

Help on function colsum in module pycl:

colsum(file, colrange=None, separator='', header=False, ignore_hashtag_line=False, max_items=10, ret_type='md')
    Create a summary of selected columns of a file
    Possible return types:
        md = markdown formatted table,
        dict = raw parsing dict,
        report = Indented_text_report
        html = for ipython notebook



In [35]:
# Markdown summary (default ret_type) of columns 0, 2 and 6; max_items caps the
# number of distinct values listed per column, the rest collapsing into "..."
print(colsum("./test_data/RADAR_Main.txt", header=True, colrange=[0,2,6], max_items=15))

|#chromosome|chr1|chr17|chr9|chr15|chr6|chr14|chr18|chr2|chrY|chr4|chr7|
|:---|:---|:---|:---|:---|:---|:---|:---|:---|:---|:---|:---|
|Count|4|3|2|2|2|1|1|1|1|1|1|

|gene|RABEP1|NUP133|JUB|GREB1L|SPHKAP|NLGN4Y|CELSR2|RBPJ|TLE4|SOCS7|ADPGK|UBE2O|TSC1|GRIK2|MEF2A|...|
|:---|:---|:---|:---|:---|:---|:---|:---|:---|:---|:---|:---|:---|:---|:---|:---|:---|
|Count|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|...|

|alu?|no|
|:---|:---|
|Count|19|




In [36]:
# ret_type="dict" returns the raw counts: column index -> OrderedDict(value -> count)
colsum("./test_data/RADAR_Main.txt", header=True, ret_type="dict", colrange=[0,3])

OrderedDict([(0,
              OrderedDict([('chr1', 4),
                           ('chr6', 2),
                           ('chr7', 1),
                           ('chr15', 2),
                           ('chr9', 2),
                           ('chr17', 3),
                           ('chr4', 1),
                           ('chrY', 1),
                           ('chr2', 1),
                           ('chr18', 1),
                           ('chr14', 1)])),
             (3, OrderedDict([('-', 10), ('+', 9)]))])

In [37]:
# Indented text report over every column, splitting fields on both tab and "|"
# and listing at most five values per column
clean_report = colsum(
    "./test_data/RADAR_clean.txt",
    separator=["\t", "|"],
    header=True,
    ignore_hashtag_line=True,
    max_items=5,
    ret_type="report",
)
print(clean_report)

0
	chr1	997
1
	225974746	1
	225974735	1
	225974581	1
	224599486	1
	224584888	1
	...	...
2
	225974746	1
	225974735	1
	225974581	1
	224599486	1
	224584888	1
	...	...
3
	A>I	997
4
	FDPS	34
	MDM4	31
	CTSS	28
	DNAJC11	25
	S100PBP	24
	...	...
5
	YH	997
6
	22327324	997
7
	33.33	31
	66.67	31
	50.00	23
	57.14	22
	60.00	22
	...	...
8
	-	527
	+	470



## fastcount

In [38]:
help(fastcount)

Help on function fastcount in module pycl:

fastcount(file)
    Efficient way to count the number of lines in a file



In [39]:
# Counts every line of the file, "#" comment lines included (200 here)
fastcount("./test_data/Small_m5C_Squires_hg38.bed")

200

## simplecount

In [40]:
help(simplecount)

Help on function simplecount in module pycl:

simplecount(filename, ignore_hashtag_line=False)
    Simple way to count the number of lines in a file with more options



In [41]:
# With ignore_hashtag_line=True the "#" comment lines are not counted:
# 194 here versus the 200 lines reported by fastcount for the same file
simplecount("./test_data/Small_m5C_Squires_hg38.bed", ignore_hashtag_line=True)

194

---
# DIRECTORY MANIPULATION

## mkdir

In [42]:
help(mkdir)

Help on function mkdir in module pycl:

mkdir(fp, level=1)
    Reproduce the ability of UNIX "mkdir -p" command
    (ie if the path already exits no exception will be raised).
    Can create nested directories by recursivity
    @param  fp path name where the folder should be created
    @level  level   level in the path where to start to create the directories.
                    Used by the program for the recursive creation of directories
    @exception  OSError or PermissionError can be raise by os.mkdir



In [43]:
mkdir("./test_data/test_dir")

In [44]:
# Nested creation: every missing level is created in turn (one "Creating" line each)
mkdir ("./test/test/test")
# Remove the scratch tree straight away (IPython shell escape)
!rm -rf ./test

Creating /home/aleg/Programming/Python3/pycl/test
Creating /home/aleg/Programming/Python3/pycl/test/test
Creating /home/aleg/Programming/Python3/pycl/test/test/test


---
# SHELL MANIPULATION

## make_cmd_str

In [45]:
help(make_cmd_str)

Help on function make_cmd_str in module pycl:

make_cmd_str(prog_name, opt_dict={}, opt_list=[])
    Create a Unix like command line string from a
    @param prog_name Name (if added to the system path) or path of the program
    @param opt_dict Dictionary of option arguments such as "-t 5". The option flag have to
    be the key (without "-") and the the option value in the dictionary value. If no value is
    requested after the option flag "None" had to be assigned to the value field.
    @param opt_list List of simple command line arguments
    @exemple make_cmd_str("bwa", {"b":None, t":6, "i":"../idx/seq.fa"}, ["../read1", "../read2"])



In [46]:
# A None value yields a bare flag ("-b"); other values follow their flag.
# NOTE(review): keys are given WITH their leading "-" and appear unchanged in the
# result, whereas the docstring asks for keys without "-" — confirm the expected form
make_cmd_str("bwa", {"-b":None, "-t":6, "-i":"../idx/seq.fa"}, ["../read1", "../read2"])

'bwa -b -t 6 -i ../idx/seq.fa ../read1 ../read2 '

## bash_basic

In [47]:
help(bash_basic)

Help on function bash_basic in module pycl:

bash_basic(cmd)
    Sent basic bash command



In [48]:
# bash_basic emits the command output itself and returns None, so the calls are
# made bare: wrapping them in print() only added a spurious "None" after each one
bash_basic("ls -l")
bash_basic("echo TTTT")
bash_basic("grep ./test_data/RADAR_Secondary.txt")

total 144
-rw-rw-r-- 1 aleg aleg 35141 Jun  6 10:20 LICENSE
drwxrwxr-x 2 aleg aleg  4096 Aug 12 16:36 __pycache__
-rwxrwxrwx 1 aleg aleg 32346 Aug 12 16:35 pycl.py
-rwxrwxrwx 1 aleg aleg   438 Aug  9 12:03 README.md
drwxrwxr-x 3 aleg aleg  4096 Aug 11 17:15 test_data
-rw-rw-r-- 1 aleg aleg 62443 Aug 12 16:28 test_pycl.ipynb


None
TTTT


None


None


## bash

In [49]:
help(bash)

Help on function bash in module pycl:

bash(cmd, stdin=None, ret_stderr=False, ret_stdout=True, str_output=True)
    Run a command line in the default shell and return the standard output
    @param  cmd A command line string formatted as a string
    @param  stdinput    Facultative parameters to redirect an object to the standard input
    @param  ret_stderr  If True the standard error output will be returned
    @param  ret_stdout  If True the standard output will be returned
    @param  str_output  Transform the std output in a string instead of the bytes-like object
    @note If ret_stderr and ret_stdout are True a tuple will be returned and if both are False
    None will be returned
    @return If no standard error return the standard output as a string
    @exception  OSError Raise if a message is return on the standard error output
    @exception  (ValueError,OSError) May be raise by Popen



In [50]:
print(bash("ls"))

LICENSE
__pycache__
pycl.py
README.md
test_data
test_pycl.ipynb



In [51]:
# Pipe emulation: the inner call returns the `ls` output as a bytes-like object
# (str_output=False), which is then fed to `head -n 2` through stdin
bash("head -n 2", stdin=bash("ls", str_output=False))

'LICENSE\n__pycache__\n'

In [52]:
# Misspelled command (presumably on purpose): with ret_stderr=True, and
# ret_stdout left at True, a (stdout, stderr) tuple is returned that carries
# the shell's error message
bash("hed -n 2", ret_stderr=True)

('', '/bin/sh: 1: hed: not found\n')

## bash_live

In [2]:
help(bash_live)

Help on function bash_live in module pycl:

bash_live(cmd, live='stdout', print_stdout=True, ret_stdout=False, log_stdout=None, print_stderr=True, ret_stderr=False, log_stderr=None)
    More advanced verssion of bash calling with live printing of the standard output and possibilities to log the redirect
    the output and error as a string return or directly in files.
    @param  cmd A command line string formatted as a string
    @param  print_stdout    If True the standard output will be LIVE printed through the system standard output stream
    @param  ret_stdout      If True the standard output will be returned as a string
    @param  log_stdout      If a filename is given, the standard output will logged in this file
    @param  print_stderr    If True the standard error will be printed through the system standard error stream
    @param  ret_stderr      If True the standard error will be returned as a string
    @param  log_stderr      If a filename is given, the standard error w

In [3]:
bash_live("ls", print_stdout=True, ret_stdout=True,)

LICENSE
__pycache__
pycl.py
README.md
test_data
test_pycl.ipynb


'LICENSE\n__pycache__\npycl.py\nREADME.md\ntest_data\ntest_pycl.ipynb\n'

In [16]:
# `XX` is not a valid command, so every loop iteration fails: the error lines are
# printed and the non-zero exit code (127) is reported once the command ends
bash_live("for i in 1 2 3 4; do echo $i && sleep 1 && XX ;done", live="stderr",  print_stdout=True, ret_stdout=True, print_stderr=True)

/bin/sh: 1: XX: not found
/bin/sh: 1: XX: not found
/bin/sh: 1: XX: not found
/bin/sh: 1: XX: not found
Error code #127 during execution of the command : for i in 1 2 3 4; do echo $i && sleep 1 && XX ;done


In [10]:
# Nothing is printed or returned by bash_live here: stdout is only logged to the
# given file, which head() then displays to check its content
bash_live("ls", print_stdout=False, ret_stdout=False, log_stdout="./test_data/stdout.txt")
head("./test_data/stdout.txt")

LICENSE
__pycache__
pycl.py
README.md
test_data
test_pycl.ipynb
Only 6 lines in the file



---
# DICTIONARY FORMATTING

## dict_to_md

In [53]:
help(dict_to_md)

Help on function dict_to_md in module pycl:

dict_to_md(d, key_label='', value_label='', transpose=False, sort_by_key=False, sort_by_val=True, max_items=None)
    Def to transform a dict into a markdown formated table



In [54]:
d = {"a": 12, "b": 14, "c": 8, "d": 56, "e": 76}

# Default layout: one row per key, ordered by decreasing value
print(dict_to_md(d, key_label="Letter", value_label="Number", sort_by_val=True))
# Transposed layout, truncated after the first two items
print(dict_to_md(d, key_label="Letter", value_label="Number", transpose=True, max_items=2))

|Letter|Number|
|:---|:---|
|e|76|
|d|56|
|b|14|
|a|12|
|c|8|

|Letter|e|d|...|
|:---|:---|:---|:---|
|Number|76|56|...|



## dict_to_report

In [55]:
help(dict_to_report)

Help on function dict_to_report in module pycl:

dict_to_report(d, tab='\t', ntab=0, sep=':', sort_dict=True, max_items=None)
    Recursive function to return a text report from nested dict or OrderedDict objects



In [56]:
# Three nesting levels, custom indentation string
d = {
    "a": 12,
    "b": 14,
    "c": {
        "c1": 12,
        "c2": {"c2.1": 33221, "c2.2": 765},
        "c3": 32,
        "c4": 443,
    },
    "d": 56,
    "e": 76,
}
print(dict_to_report(d, tab=" | "))

# Same structure with a longer innermost dict, to show the max_items truncation
d = {
    "a": 12,
    "b": 14,
    "c": {
        "c1": 12,
        "c2": {
            "c2.1": 33221,
            "c2.2": 765,
            "c2.3": 7533,
            "c2.4": 76433,
            "c2.5": 876543,
            "c2.6": 89765,
            "c2.7": 8654,
        },
        "c3": 32,
        "c4": 443,
    },
    "d": 56,
    "e": 76,
}
print(dict_to_report(d, tab="--", max_items=4, sort_dict=True))

a:12
b:14
c
 | c1:12
 | c2
 |  | c2.1:33221
 |  | c2.2:765
 | c3:32
 | c4:443
d:56
e:76

a:12
b:14
c
--c1:12
--c2
----c2.5:876543
----c2.6:89765
----c2.4:76433
----c2.1:33221
----...:...
--c3:32
--c4:443
d:56
e:76



## dict_to_html

In [2]:
help(dict_to_html)

Help on class dict_to_html in module pycl:

class dict_to_html(collections.OrderedDict)
 |  Overridden dict class which takes a 2 level dict and renders an HTML Table in IPython Notebook
 |  Using the magic repr_html_
 |  {'a':{'val1':2,'val2':3},'b':{'val1':4,'val2':5},'c':{'val1':7,'val2':8}}
 |  
 |  Method resolution order:
 |      dict_to_html
 |      collections.OrderedDict
 |      builtins.dict
 |      builtins.object
 |  
 |  Methods defined here:
 |  
 |  __init__(self, d, max_col=20, max_row=20)
 |      Initialize self.  See help(type(self)) for accurate signature.
 |  
 |  ----------------------------------------------------------------------
 |  Methods inherited from collections.OrderedDict:
 |  
 |  __delitem__(self, key, /)
 |      Delete self[key].
 |  
 |  __eq__(self, value, /)
 |      Return self==value.
 |  
 |  __ge__(self, value, /)
 |      Return self>=value.
 |  
 |  __gt__(self, value, /)
 |      Return self>value.
 |  
 |  __iter__(self, /)
 |      Implement i

In [6]:
# Two-level dict: outer keys become table rows, inner keys become columns
dict_to_html(
    {
        "a": {"val1": 2, "val2": 3, "val3": 45},
        "b": {"val1": 4, "val2": 5, "val3": 55},
        "c": {"val1": 7, "val2": 8, "val3": 6},
    }
)

0,1,2,3
,val1,val2,val3
c,7,8,6
b,4,5,55
a,2,3,45


In [7]:
# Same data, with the rendered table limited to two columns and two rows
dict_to_html(
    {
        "a": {"val1": 2, "val2": 3, "val3": 45},
        "b": {"val1": 4, "val2": 5, "val3": 55},
        "c": {"val1": 7, "val2": 8, "val3": 6},
    },
    max_col=2,
    max_row=2,
)

0,1,2
,val1,val2
c,7,8
b,4,5


---
# TABLE FORMATTING

## reformat_table

In [12]:
help(reformat_table)

Help on function reformat_table in module pycl:

reformat_table(input_file, output_file='', return_df=False, init_template=[], final_template=[], header='', keep_original_header=True, header_from_final_template=False, replace_internal_space='_', replace_null_val='*', subst_dict={}, filter_dict=[], predicate=None, standard_template=None, verbose=False)
    Reformat a table given an initial and a final line templates indicated as a list where numbers
    indicate the data column and strings the formatting characters
    
    @param  input_file   A file with a structured text formatting (gzipped or not)
    @param  output_file   A file path to output the reformatted table (if empty will not write in a file)
    @param  return_df   If true will return a pandas dataframe containing the reformated table (Third party pandas package required)
            by default the columns will be names after the final template [DEFAULT:False]
    @param  init_template   A list of indexes and separators de

In [3]:
# With numeric index
# In a template, integers designate data columns and strings are the literal
# formatting characters found (init_template) or written (final_template)
# between them. Here columns 0, 1, 2 and 6 are kept and a constant name field is
# inserted; the original "#" header is replaced by the `header` string.
reformat_table(
    input_file="./test_data/Small_m5C_Squires_hg38.bed",
    output_file="./test_data/Small_m5C_Squires_hg38_reformat.bed",
    init_template=[0,"\t",1,"\t",2,"\t",3,"|",4,"\t",5,"\t",6],
    final_template=[0,"\t",1,"\t",2,"\tm5C|*|HeLa|22344696\t-\t",6],
    replace_internal_space='_',
    replace_null_val="*",
    keep_original_header=False,
    header="# New header\n"
    )

# Show the first and last lines of the input, then of the reformatted output
linerange ("./test_data/Small_m5C_Squires_hg38.bed")
linerange ("./test_data/Small_m5C_Squires_hg38_reformat.bed")

0	# Transcriptome-wide map of m5C [hg38 coordinates]
1	# Reference: Squires et al., Nucleic Acids Res. 40, 5023 (2012) [PMID 22344696, DOI 10.1093/nar/gks144]
2	#

197	chr1	19311959	19311960	Squires|id185	0	-
198	chr1	19608342	19608343	Squires|id186	0	+
199	chr1	19608343	19608344	Squires|id187	0	+

0	# New header
1	chr1	631539	631540	m5C|*|HeLa|22344696	-	+
2	chr1	631540	631541	m5C|*|HeLa|22344696	-	+

192	chr1	19311959	19311960	m5C|*|HeLa|22344696	-	-
193	chr1	19608342	19608343	m5C|*|HeLa|22344696	-	+
194	chr1	19608343	19608344	m5C|*|HeLa|22344696	-	+



In [4]:
# With str index
# Columns are designated by "{name}" placeholders instead of integers; verbose=True
# prints both templates and the processed/pass/filtered/fail line counts.
# NOTE(review): init_template has no field for the token that precedes "|" in the
# input ("Squires"), and the recorded output shows it attached to {end}
# ("631540	Squires") — confirm whether a field is missing between {end} and {name}.
reformat_table(
    input_file="./test_data/Small_m5C_Squires_hg38.bed",
    output_file="./test_data/Small_m5C_Squires_hg38_reformat.bed",
    init_template=["{chrom}","\t","{start}","\t","{end}","|","{name}","\t","{score}","\t","{strand}"],
    final_template=["{start}","\t","{end}","\tadditional_informations\t","{name}"],
    replace_internal_space='_',
    replace_null_val="*",
    keep_original_header=False,
    header="# New header\n",
    verbose=True
    )

# Show the first and last lines of the input, then of the reformatted output
linerange ("./test_data/Small_m5C_Squires_hg38.bed")
linerange ("./test_data/Small_m5C_Squires_hg38_reformat.bed")

Initial template values
chrom	start	end|name	score	strand
Final template values
start	end	additional_informations	name
194 Lines processed	194 Lines pass	0 Lines filtered out	0 Lines fail
0	# Transcriptome-wide map of m5C [hg38 coordinates]
1	# Reference: Squires et al., Nucleic Acids Res. 40, 5023 (2012) [PMID 22344696, DOI 10.1093/nar/gks144]
2	#

197	chr1	19311959	19311960	Squires|id185	0	-
198	chr1	19608342	19608343	Squires|id186	0	+
199	chr1	19608343	19608344	Squires|id187	0	+

0	# New header
1	631539	631540	Squires	additional_informations	id1
2	631540	631541	Squires	additional_informations	id2

192	19311959	19311960	Squires	additional_informations	id185
193	19608342	19608343	Squires	additional_informations	id186
194	19608343	19608344	Squires	additional_informations	id187



In [5]:
# subst_dict: column index -> {old value: new value}; in the recorded output
# "chr1" becomes "1" in column 0 and "Peng" becomes "22344696" in column 3
subst_dict = {0:{"chr1":"1", "chr2":"2"}, 3:{"Peng":"22344696"}}
# filter_dict: column index -> values to reject; apparently lines whose column 18
# is "intron" are dropped (55 lines filtered out, only "intergenic" ones remain)
filter_dict = {18:["intron"]}
input_file="./test_data/Small_editing_Peng_hg38.bed"
output_file="./test_data/Small_editing_Peng_hg38_reformat.bed"

# keep_original_header is left at its default (True), so the "#" header lines of
# the input are copied to the output unchanged
reformat_table(
    input_file, output_file,
    init_template=[0,"\t",1,"\t",2,"\t",3,"|",4,"|",5,"|",6,"|",7,"|",8,"|",9,"->",10,"|",11,"%|",12,"|",13,"|",14,"|",15,"|",16,"|",17,"|",18,"|",19,"\t",20,"\t",21],
    final_template=[0,"\t",1,"\t",2,"\t",9,">",10,"|",3,"|HeLa|",19,"\t",11,"\t",21],
    replace_internal_space='_',
    replace_null_val="*",
    subst_dict = subst_dict,
    filter_dict = filter_dict,
    verbose=True
    )

# Show the first and last lines of the input, then of the reformatted output
linerange (input_file)
linerange (output_file)

Initial template values
0	1	2	3|4|5|6|7|8|9->10|11%|12|13|14|15|16|17|18|19	20	21
Final template values
0	1	2	9>10|3|HeLa|19	11	21
194 Lines processed	139 Lines pass	55 Lines filtered out	0 Lines fail
0	# Transcriptome-wide map of editing sites [hg38 coordinates]
1	# Reference: Peng et al., Nat. Biotechnol. 30, 253 (2012) [PMID 22327324, DOI 10.1038/nbt.2122]
2	#

197	chr1	9173454	9173455	Peng|chr1|9156101|-|T|Y|A->G|35.14%|99|T|24|C|13|37|intergenic|-	0	-
198	chr1	9173533	9173534	Peng|chr1|9156180|-|T|Y|A->G|24.10%|61|T|148|C|47|195|intergenic|-	0	-
199	chr1	9173535	9173536	Peng|chr1|9156182|-|T|Y|A->G|66.15%|99|C|129|T|66|195|intergenic|-	0	-

0	# Transcriptome-wide map of editing sites [hg38 coordinates]
1	# Reference: Peng et al., Nat. Biotechnol. 30, 253 (2012) [PMID 22327324, DOI 10.1038/nbt.2122]
2	#

142	1	9173454	9173455	A>G|22344696|HeLa|-	35.14	-
143	1	9173533	9173534	A>G|22344696|HeLa|-	24.10	-
144	1	9173535	9173536	A>G|22344696|HeLa|-	66.15	-



In [5]:
input_file="./test_data/Small_editing_Peng_hg38.bed"

# No output_file: the reformatted table is only returned, as a pandas DataFrame
# whose columns are named after the final template
df = reformat_table(
    input_file,
    return_df=True,
    init_template=[0,"\t",1,"\t",2,"\t",3,"|",4,"|",5,"|",6,"|",7,"|",8,"|",9,"->",10,"|",11,"%|",12,"|",13,"|",14,"|",15,"|",16,"|",17,"|",18,"|",19,"\t",20,"\t",21],
    final_template=[0,"\t",1,"\t",2,"\t",9,">",10,"|",3,"|HeLa|",19,"\t",11,"\t",21],
    replace_internal_space='_',
    replace_null_val="*",
    )

# head() prints the lines itself and returns None, so it is called bare:
# wrapping it in print() only appended a stray "None" to the output
head(input_file, 11)

df.head()

# Transcriptome-wide map of editing sites [hg38 coordinates]
# Reference: Peng et al., Nat. Biotechnol. 30, 253 (2012) [PMID 22327324, DOI 10.1038/nbt.2122]
#
# Data cleaned and converted to BED6, coordinate conversion to hg38 using liftOver.
# Maintainer: Maurits Evers (maurits.evers@anu.edu.au)
#
chr1	1102535	1102536	Peng|chr1|1027779|-|T|Y|A->G|66.67%|37|C|6|T|3|9|intron|C1orf159	0	-
chr1	1221501	1221502	Peng|chr1|1146745|-|T|Y|A->G|36.59%|99|T|26|C|15|42|intron|SDF4	0	-
chr1	1222079	1222080	Peng|chr1|1147323|-|T|Y|A->G|22.73%|94|T|51|C|15|66|intron|SDF4	0	-
chr1	1251840	1251841	Peng|chr1|1177084|-|T|Y|A->G|56.25%|99|C|9|T|7|16|intergenic|-	0	-
chr1	1252243	1252244	Peng|chr1|1177487|-|T|Y|A->G|19.44%|30|T|29|C|7|36|intergenic|-	0	-

None


Unnamed: 0,0,1,2,9,10,3,19,11,21
0,chr1,1102535,1102536,A,G,Peng,C1orf159,66.67,-
1,chr1,1221501,1221502,A,G,Peng,SDF4,36.59,-
2,chr1,1222079,1222080,A,G,Peng,SDF4,22.73,-
3,chr1,1251840,1251841,A,G,Peng,-,56.25,-
4,chr1,1252243,1252244,A,G,Peng,-,19.44,-


In [11]:
input_file = "./test_data/gencode_sample.gff3"

# standard_template provides the initial (parsing) template for Ensembl gff3
# transcript lines, so only the final layout is given here; non-transcript
# features are filtered out and the column names come from the final template
df = reformat_table(
    input_file,
    return_df=True,
    standard_template="gff3_ens_transcript",
    keep_original_header=False,
    header_from_final_template=True,
    final_template=["{seqid}","\t","{type}","\t","{start}","\t","{end}","\t","{strand}","\t","{ID}","\t","{gene_type}","\t","{transcript_type}","\t","{gene_name}"],
    verbose=True
    )

# head() prints the lines itself and returns None, so it is called bare
# rather than through print(), which would add a stray "None" to the output
head(input_file, 11)
df.head()

Using gff3 ensembl transcript template. Non-transcript features will be filtered out
Initial template values
seqid	source	type	start	end	score	strand	phase	ID=ID;Parent=Parent;gene_id=gene_id;transcript_id=transcript_id;gene_type=gene_type;gene_status=gene_status;gene_name=gene_name;transcript_type=transcript_type;transcript_status=transcript_status;transcript_name=transcript_name;level=level;transcript_support_level=transcript_support_level;tag=tag;havana_gene=havana_gene;havana_transcript=havana_transcript
Final template values
seqid	type	start	end	strand	ID	gene_type	transcript_type	gene_name
##gff-version 3
#description: evidence-based annotation of the human genome (GRCh38), version 24 (Ensembl 83) - long non-coding RNAs
#provider: GENCODE
#contact: gencode-help@sanger.ac.uk
#format: gff3
#date: 2015-12-03
##sequence-region chr1 1 248956422
chr1	HAVANA	gene	29554	31109	.	+	.	ID=ENSG00000243485.3;gene_id=ENSG00000243485.3;gene_type=lincRNA;gene_status=KNOWN;gene_name=RP11-34P13.3;l

Unnamed: 0,seqid,type,start,end,strand,ID,gene_type,transcript_type,gene_name
0,chr1,transcript,29554,31097,+,ENST00000473358.1,lincRNA,lincRNA,RP11-34P13.3
1,chr1,transcript,30267,31109,+,ENST00000469289.1,lincRNA,lincRNA,RP11-34P13.3
2,chr1,transcript,34554,36081,-,ENST00000417324.1,lincRNA,lincRNA,FAM138A
3,chr1,transcript,35245,36073,-,ENST00000461467.1,lincRNA,lincRNA,FAM138A
4,chr1,transcript,89295,120932,-,ENST00000466430.5,lincRNA,lincRNA,RP11-34P13.7


---
# WEB TOOLS

## url_exist

In [None]:
# Show the docstring of url_exist, as done for every other function; a bare
# help() would instead start the interactive help prompt and block Run All
help(url_exist)

In [None]:
url_exist("http://www.google.com") # When this one will be False it will probably be the end of the world

In [None]:
url_exist("http://www.JUYGKUYHGJHFJ.com")

## wget

In [None]:
help(wget)

In [None]:
# Empty URL, presumably the failure path: the guard below assumes wget returns a
# falsy value when nothing was downloaded — TODO confirm against pycl.wget
outfile = wget("")
if outfile:
    print(outfile)
    # Delete the downloaded file so the test leaves nothing behind
    remove(outfile)

In [None]:
# NOTE(review): this URL does not follow the repository layout used in the next
# cell ("a-slidaster" vs "a-slide/pycl/blob/master") — presumably a deliberately
# broken link to exercise the error path; confirm it is not a typo
outfile = wget("https://github.com/a-slidaster/test_data/RADAR_Secondary.txt.gz")
if outfile:
    print(outfile)
    # Delete the downloaded file so the test leaves nothing behind
    remove(outfile)

In [None]:
# progress_block is presumably the number of bytes between two progress
# reports — TODO confirm against the wget docstring.
# NOTE(review): GitHub "/blob/" URLs normally return the HTML viewer page rather
# than the raw file — confirm this is the intended download target
outfile = wget("https://github.com/a-slide/pycl/blob/master/test_data/RADAR_Secondary.txt.gz", progress_block=10000)
if outfile:
    print(outfile)
    # Delete the downloaded file so the test leaves nothing behind
    remove(outfile)

In [None]:
# Positional form: the 2nd argument looks like the output file name and the 3rd
# like progress_block — TODO confirm against the wget signature.
# NOTE(review): presumably a large binary download; may make Run All slow
outfile = wget("https://www.encodeproject.org/files/ENCFF000HJC/@@download/ENCFF000HJC.bigWig", "test.bigWig", 50000000)
if outfile:
    print(outfile)
    # Delete the downloaded file so the test leaves nothing behind
    remove(outfile)