# TEST pycltools package
**This notebook contains tests for the functions contained in the pycltools package.**

In [167]:
# Jupyter specific imports (IPython.display is the public entry point;
# importing display from IPython.core.display is deprecated)
from IPython.display import display, HTML, Markdown
# Import of required packages
from os import remove
# Import all the functions from pycltools
from pycltools.pycltools import *

# JUPYTER NOTEBOOK SPECIFIC TOOLS

## jhelp

In [168]:
help(jhelp)

Help on function jhelp in module pycltools.pycltools:

jhelp(function, full=True, print_private=False, **kwargs)
    Print a nice looking help string based on the name of a declared function. By default print the function
    definition and description
    * function
        Name of a declared function or class method
    * full
        If True, the help string will included a description of all arguments



In [169]:
jhelp(jhelp, full=True)

## jprint

In [170]:
jhelp(jprint, full=True)

In [45]:
txt="Lorem ipsum condimentum elementum sapien nam eleifend quisque sapien curae"
jprint(txt,font="sans", color="purple", size=200, bold=True)

In [46]:
txt="Lorem ipsum\n\tcondimentum elementum\n\t\tsapien nam eleifend quisque\n\t\t\tsapien curae"
jprint(txt,font="sans", color="powderblue", size=200, bold=True, line_height=50)

In [47]:
jprint("Lorem","ipsum","condimentum","elementum", 1, True, bold=False, italic=False, highlight=False, underlined=True, striked=False, subscripted=False, superscripted=False, font="calibri", color="grey", size=250, align="center")

## toogle_code

In [171]:
jhelp(toogle_code, full=True)

In [49]:
#toogle_code()

## larger_display

In [172]:
jhelp(larger_display, full=True)

In [173]:
larger_display(100)

# PREDICATES

## is_readable_file

In [174]:
jhelp(is_readable_file, full=True)

In [53]:
# Check a path that is not a valid file: the OSError message is printed,
# otherwise "OK" is reported.
try:
    is_readable_file("./data/KJHYTGYUJ")
except OSError as err:
    print(err)
else:
    print("OK")

./data/KJHYTGYUJ is not a valid file


In [54]:
# Same check on a path to an existing file: "OK" is reported unless an
# OSError is raised, in which case its message is printed.
try:
    is_readable_file("./data/RADAR_Secondary.txt")
except OSError as err:
    print(err)
else:
    print("OK")

OK


## is_gziped

In [175]:
jhelp(is_gziped, full=True)

In [56]:
is_gziped("./data/RADAR_Secondary.txt")

False

In [57]:
is_gziped("./data/RADAR_Secondary.txt.gz")

True

## has_extension

In [176]:
jhelp(has_extension, full=True)

In [59]:
has_extension("./data/test/RADAR_Secondary.txt.gz", "gz")

True

In [60]:
has_extension("./data/test/RADAR_Secondary.txt.gz", "fa")

False

In [61]:
has_extension("./data/test/RADAR_Secondary.txt.gz", "txt", -2)

True

---

# PATH MANIPULATION

## file_basename

In [177]:
jhelp(file_basename, full=True)

In [63]:
file_basename("./data/RADAR_Secondary.txt.gz")

'RADAR_Secondary'

## extensions

In [178]:
jhelp(extensions, full=True)

In [65]:
print(extensions("./data/RADAR_Secondary.txt.gz"))
print(extensions("./data/RADAR_Secondary.txt"))
print(extensions("./data/RADAR_Secondary"))

.txt.gz
.txt



## extensions_list

In [66]:
jhelp(extensions_list, full=True)

In [67]:
print(extensions_list("./data/RADAR_Secondary.txt.gz"))
print(extensions_list("./data/RADAR_Secondary.txt"))
print(extensions_list("./data/RADAR_Secondary"))

['txt', 'gz']
['txt']
[]


## file_name

In [179]:
jhelp(file_name, full=True)

In [69]:
file_name("./data/test/RADAR_Secondary.txt.gz")

'RADAR_Secondary.txt.gz'

## dir_name

In [180]:
jhelp(dir_name, full=True)

In [71]:
print(dir_name("./data/test/RADAR_Secondary.txt.gz"))
print(dir_name("./__init__.py"))
print(dir_name("/bin/bash"))

test
.
bin


## dir_path

In [72]:
jhelp(dir_path, full=True)

In [73]:
print(dir_path("./data/test/RADAR_Secondary.txt.gz"))
print(dir_path("./__init__.py"))
print(dir_path("/bin/bash"))

./data/test
.
/bin


---

# STRING FORMATTING

## supersplit

In [181]:
jhelp(supersplit, full=True)

In [75]:
a = "chr7\t74138\t774138\tA>I|LOC100129917|LUNG:LYMPHOBLASTOID_CELL_LINE|15342557:15258596:22327324\t0"

print(supersplit(a, ["\t","|"]))

print(supersplit(a))

print(supersplit(a, "|"))

['chr7', '74138', '774138', 'A>I', 'LOC100129917', 'LUNG:LYMPHOBLASTOID_CELL_LINE', '15342557:15258596:22327324', '0']
['chr7', '74138', '774138', 'A>I|LOC100129917|LUNG:LYMPHOBLASTOID_CELL_LINE|15342557:15258596:22327324', '0']
['chr7\t74138\t774138\tA>I', 'LOC100129917', 'LUNG:LYMPHOBLASTOID_CELL_LINE', '15342557:15258596:22327324\t0']


## rm_blank

In [182]:
jhelp(rm_blank, full=True)

In [77]:
a = "chr\t\t17|LU NG:LYMPHOBLAST    OID_CELL_LINE|15342557:152585     96:22327324\t0"

print(rm_blank(a))

print(rm_blank(a, replace="*"))

chr17|LUNG:LYMPHOBLASTOID_CELL_LINE|15342557:15258596:223273240
chr*17|LU*NG:LYMPHOBLAST*OID_CELL_LINE|15342557:152585*96:22327324*0


---

# FILE MANIPULATION

## copyFile

In [183]:
jhelp(copyFile, full=True)

In [79]:
copyFile(src="./data/RADAR_Secondary.txt", dest="./data/")

Error: './data/RADAR_Secondary.txt' and './data/RADAR_Secondary.txt' are the same file


In [80]:
copyFile(src="./data/RADAR_Secondary.txt", dest="./data/RADAR_Secondary_copy.txt")

## gzip_file

In [184]:
jhelp(gzip_file, full=True)

In [82]:
gzip_file("./data/RADAR_Secondary.txt")

Compressing ./data/RADAR_Secondary.txt


'/home/aleg/Programming/pycltools/docs/data/RADAR_Secondary.txt.gz'

## gunzip_file

In [185]:
jhelp(gunzip_file, full=True)

In [84]:
gunzip_file("./data/RADAR_Secondary.txt.gz")

Uncompressing ./data/RADAR_Secondary.txt.gz


'/home/aleg/Programming/pycltools/docs/data/RADAR_Secondary.txt'

---

# FILE INFORMATION

## linerange

In [186]:
jhelp(linerange, full=True)

In [86]:
file = "./data/RADAR_Secondary.txt"
linerange (file)

0	#location	reference	tissue	coverage	editing_level(%)
1	chr1:1037916	Peng et al 2012	Lymphoblastoid cell line	9	66.67
2	chr1:1156882	Peng et al 2012	Lymphoblastoid cell line	42	36.59
...
97	chr1:10560773	Peng et al 2012	Lymphoblastoid cell line	20	40.00
98	chr1:10602697	Peng et al 2012	Lymphoblastoid cell line	5	60.00
99	chr1:11138237	Peng et al 2012	Lymphoblastoid cell line	14	42.86


In [87]:
file = "./data/gencode_sample.gff3"
linerange (file, [[2,5],[10,12],[98,100]], max_char_line=100)

...
2	#provider: GENCODE
3	#contact: gencode-help@sanger.ac.uk
4	#format: gff3
5	#date: 2015-12-03
...
10	chr1	HAVANA	exon	30564	30667	.	+	.	ID=exon:ENST00000473358.1:2;Parent=ENST00000473358.1;gene_id=E...
11	chr1	HAVANA	exon	30976	31097	.	+	.	ID=exon:ENST00000473358.1:3;Parent=ENST00000473358.1;gene_id=E...
12	chr1	HAVANA	transcript	30267	31109	.	+	.	ID=ENST00000469289.1;Parent=ENSG00000243485.3;gene_id=EN...
...
98	chr1	HAVANA	exon	287517	287921	.	-	.	ID=exon:ENST00000335577.4:2;Parent=ENST00000335577.4;gene_id...
99	chr1	HAVANA	gene	357383	359681	.	-	.	ID=ENSG00000236743.1;gene_id=ENSG00000236743.1;gene_type=lin...
100	chr1	HAVANA	transcript	357383	359681	.	-	.	ID=ENST00000441866.1;Parent=ENSG00000236743.1;gene_id...
...


In [88]:
file = "./data/RADAR_Secondary.txt.gz"
linerange (file, line_numbering=False)

#location	reference	tissue	coverage	editing_level(%)
chr1:1037916	Peng et al 2012	Lymphoblastoid cell line	9	66.67
chr1:1156882	Peng et al 2012	Lymphoblastoid cell line	42	36.59
...
chr1:10560773	Peng et al 2012	Lymphoblastoid cell line	20	40.00
chr1:10602697	Peng et al 2012	Lymphoblastoid cell line	5	60.00
chr1:11138237	Peng et al 2012	Lymphoblastoid cell line	14	42.86


## cat

In [187]:
jhelp(cat, full=True)

In [90]:
file = "./data/RADAR_Secondary.txt.gz"
cat (file, max_lines=10)

#location	reference	tissue	coverage	editing_level(%)
chr1:1037916	Peng et al 2012	Lymphoblastoid cell line	9	66.67
chr1:1156882	Peng et al 2012	Lymphoblastoid cell line	42	36.59
chr1:1157460	Peng et al 2012	Lymphoblastoid cell line	66	22.73
chr1:1252441	Peng et al 2012	Lymphoblastoid cell line	11	72.73
...
chr1:10521237	Peng et al 2012	Lymphoblastoid cell line	34	17.65
chr1:10521238	Peng et al 2012	Lymphoblastoid cell line	35	37.14
chr1:10560773	Peng et al 2012	Lymphoblastoid cell line	20	40.00
chr1:10602697	Peng et al 2012	Lymphoblastoid cell line	5	60.00
chr1:11138237	Peng et al 2012	Lymphoblastoid cell line	14	42.86


In [91]:
file="./data/gencode_sample.gff3"
cat (file, max_lines=20, line_numbering=True, max_char_line=100)

0	##gff-version 3
1	#description: evidence-based annotation of the human genome (GRCh38), version 24 (Ensembl 83) - lo...
2	#provider: GENCODE
3	#contact: gencode-help@sanger.ac.uk
4	#format: gff3
5	#date: 2015-12-03
6	##sequence-region chr1 1 248956422
7	chr1	HAVANA	gene	29554	31109	.	+	.	ID=ENSG00000243485.3;gene_id=ENSG00000243485.3;gene_type=lincRN...
8	chr1	HAVANA	transcript	29554	31097	.	+	.	ID=ENST00000473358.1;Parent=ENSG00000243485.3;gene_id=ENS...
9	chr1	HAVANA	exon	29554	30039	.	+	.	ID=exon:ENST00000473358.1:1;Parent=ENST00000473358.1;gene_id=EN...
...
9990	chr1	HAVANA	exon	221983000	221983143	.	+	.	ID=exon:ENST00000421147.5:3;Parent=ENST00000421147.5...
9991	chr1	HAVANA	transcript	221966410	221984964	.	+	.	ID=ENST00000441160.1;Parent=ENSG00000228437.5;...
9992	chr1	HAVANA	exon	221966410	221966502	.	+	.	ID=exon:ENST00000441160.1:1;Parent=ENST00000441160.1...
9993	chr1	HAVANA	exon	221983000	221983143	.	+	.	ID=exon:ENST00000441160.1:2;Parent=ENST00000441160.1...
9994	chr1	HAVA

## tail

In [188]:
jhelp(tail, full=True)

In [93]:
file = "./data/RADAR_clean.txt"
tail (file, n = 4)

...
chr1	225974581	225974581	A>I|SRP9|YH|22327324	28.89	+
chr1	225974735	225974735	A>I|SRP9|YH|22327324	23.88	+
chr1	225974746	225974746	A>I|SRP9|YH|22327324	71.19	+


In [94]:
file = "./data/RADAR_Secondary.txt.gz"
tail (file, n = 4, line_numbering=True)

...
97	chr1:10560773	Peng et al 2012	Lymphoblastoid cell line	20	40.00
98	chr1:10602697	Peng et al 2012	Lymphoblastoid cell line	5	60.00
99	chr1:11138237	Peng et al 2012	Lymphoblastoid cell line	14	42.86


In [95]:
file="./data/gencode_sample.gff3"
tail (file, n = 5, max_char_line=100)

...
chr1	HAVANA	transcript	222041705	222064763	.	-	.	ID=ENST00000438158.1;Parent=ENSG00000232679.1;gene_...
chr1	HAVANA	exon	222064685	222064763	.	-	.	ID=exon:ENST00000438158.1:1;Parent=ENST00000438158.1;gene...
chr1	HAVANA	exon	222058414	222058678	.	-	.	ID=exon:ENST00000438158.1:2;Parent=ENST00000438158.1;gene...
chr1	HAVANA	exon	222041705	222041922	.	-	.	ID=exon:ENST00000438158.1:3;Parent=ENST00000438158.1;gene...


## head

In [189]:
jhelp(head, full=True)

In [97]:
head("./data/RADAR_Main.txt", n= 3)

#chromosome position  gene       strand annot1     annot2     alu? non_alu_repetitive? conservation_chimp conservation_rhesus conservation_mouse 
chr1        206256301 C1orf186   -      intronic   intronic   no   no                  N                  N                   N                  
chr6        116991832 intergenic -      intergenic intergenic no   no                  N                  N                   N                  



In [98]:
head("./data/RADAR_Main.txt", ignore_comment_line=True,n= 3)

chr1 206256301 C1orf186   - intronic   intronic   no no N N N 
chr6 116991832 intergenic - intergenic intergenic no no N N N 
chr7 30504355  NOD1       - intronic   intronic   no no N N N 



In [99]:
head("./data/RADAR_Main.txt", n=5, max_char_line=110)

#chromosome position  gene       strand annot1     annot2     alu? non_alu_repetitive? conservation_chimp cons...
chr1        206256301 C1orf186   -      intronic   intronic   no   no                  N                  N   ...
chr6        116991832 intergenic -      intergenic intergenic no   no                  N                  N   ...
chr7        30504355  NOD1       -      intronic   intronic   no   no                  N                  N   ...
chr1        85127959  SSX2IP     -      Syn        Gln->Gln   no   no                  N                  N   ...



In [100]:
head("./data/RADAR_Secondary.txt.gz", n=6, ignore_comment_line=True)

chr1:1037916 Peng et al 2012 Lymphoblastoid cell line 9  66.67 
chr1:1156882 Peng et al 2012 Lymphoblastoid cell line 42 36.59 
chr1:1157460 Peng et al 2012 Lymphoblastoid cell line 66 22.73 
chr1:1252441 Peng et al 2012 Lymphoblastoid cell line 11 72.73 
chr1:1252443 Peng et al 2012 Lymphoblastoid cell line 11 45.45 
chr1:1253357 Peng et al 2012 Lymphoblastoid cell line 31 32.26 



In [101]:
head("./data/sample.sam", n=6, ignore_comment_line=True)

chr1|35235|35295|-|5.1   272 chr12 37283     0 61M * 0 0 *                                                  *                                                  
chr1|90965|91025|-|7.57  256 chr16 90215899  0 61M * 0 0 *                                                  *                                                  
chr1|91055|91115|-|7.60  256 chr2  168290980 0 61M * 0 0 *                                                  *                                                  
chr1|92081|92141|-|8.1   272 chr1  268657    0 61M * 0 0 *                                                  *                                                  
chr1|92111|92171|-|8.2   256 chr5  181462264 0 61M * 0 0 *                                                  *                                                  
chr1|110943|111003|-|9.1 0   chrY  24307299  0 61M * 0 0 AATGAAAGATATGTGTTTTTCATATTACCAGGTAGATGATAAGGAGATTT IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII 



In [102]:
head ("./data/sample_100.bam", n=6)

chr1|1736694|1736754|-|168.51      256 chr6  108404793 0  32M29H   * 0 0 *                                                  *                                                  
chr1|20158612|20158672|+|508.32    0   chr1  20158612  60 61M      * 0 0 CTCAGAGGCTTGAAAAGTAGCATCCACCCCCTTCTGGGCATCAATCACAG IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII 
chr1|47096793|47096853|-|1008.6    272 chr1  156061950 0  2H54M5H  * 0 0 *                                                  *                                                  
chr1|65003940|65004000|-|1364.17   256 chr13 107349700 0  16M1I44M * 0 0 *                                                  *                                                  
chr1|108202106|108202166|+|1958.74 0   chr1  108202106 60 61M      * 0 0 GGACAGAAAACAAATCAGTAGTTACCAGTTGTGACTAGCGGGAAGGGAAT IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII 
chr1|147173091|147173151|+|2353.12 272 chr2  74122749  0  22H39M   * 0 0 *                                              

## linesample

In [103]:
jhelp(linesample, full=True)

In [104]:
linesample("./data/RADAR_clean.txt", n_lines=10, line_numbering=True)

77	chr1	6710595	6710595	A>I|DNAJC11|YH|22327324	50.00	-
96	chr1	10521237	10521237	A>I|DFFA|YH|22327324	17.65	-
266	chr1	32737172	32737172	A>I|LCK|YH|22327324	35.71	+
342	chr1	40205396	40205396	A>I|PPIE|YH|22327324	63.64	+
448	chr1	52875019	52875019	A>I|PRPF38A|YH|22327324	38.89	+
533	chr1	85449497	85449497	A>I|MCOLN2|YH|22327324	20.00	-
610	chr1	114296188	114296188	A>I|PHTF1|YH|22327324	20.59	-
767	chr1	155444343	155444343	A>I|ASH1L|YH|22327324	42.86	-
824	chr1	157516004	157516004	A>I|FCRL5|YH|22327324	21.88	-
946	chr1	204526795	204526795	A>I|MDM4|YH|22327324	32.29	+


In [105]:
linesample("./data/RADAR_Secondary.txt.gz", n_lines=10, line_numbering=True)

4	chr1:1252441	Peng et al 2012	Lymphoblastoid cell line	11	72.73
8	chr1:1418532	Peng et al 2012	Lymphoblastoid cell line	5	60.00
51	chr1:6608345	Peng et al 2012	Lymphoblastoid cell line	13	46.15
56	chr1:6707305	Peng et al 2012	Lymphoblastoid cell line	33	39.39
61	chr1:6708354	Peng et al 2012	Lymphoblastoid cell line	15	40.00
62	chr1:6708680	Peng et al 2012	Lymphoblastoid cell line	24	25.00
63	chr1:6708681	Peng et al 2012	Lymphoblastoid cell line	24	20.83
75	chr1:6710585	Peng et al 2012	Lymphoblastoid cell line	30	65.52
90	chr1:10520702	Peng et al 2012	Lymphoblastoid cell line	98	11.22
93	chr1:10520751	Peng et al 2012	Lymphoblastoid cell line	166	28.92


## count_uniq

In [106]:
jhelp(count_uniq, full=True)

In [107]:
count_uniq("./data/Small_editing_Peng_hg38.bed", colnum=17, sep=['\t',"|"])

17
intergenic    110
intron         55
3-UTR          17
unknown        12
dtype: int64

In [108]:
count_uniq("./data/gencode_sample.gff3", colnum=17, sep=["\t","=", ";"], select_values={2:["transcript", "exon"], 6:"+"})

17
lincRNA                     2031
antisense                   1600
processed_transcript         686
sense_intronic               105
TEC                           36
sense_overlapping             11
3prime_overlapping_ncrna       2
dtype: int64

## colsum

In [190]:
jhelp(colsum, full=True)

In [110]:
display(Markdown(colsum("./data/RADAR_Main.txt", header=True, colrange=[0,2,6], max_items=15)))

|#chromosome|chr1|chr17|chr9|chr15|chr6|chr14|chr18|chr2|chrY|chr4|chr7|
|:---|:---|:---|:---|:---|:---|:---|:---|:---|:---|:---|:---|
|Count|4|3|2|2|2|1|1|1|1|1|1|

|gene|RABEP1|NUP133|JUB|GREB1L|SPHKAP|NLGN4Y|CELSR2|RBPJ|TLE4|SOCS7|ADPGK|UBE2O|TSC1|GRIK2|MEF2A|...|
|:---|:---|:---|:---|:---|:---|:---|:---|:---|:---|:---|:---|:---|:---|:---|:---|:---|
|Count|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|...|

|alu?|no|
|:---|:---|
|Count|19|



In [111]:
colsum("./data/RADAR_Main.txt", header=True, ret_type="dict", colrange=[0,3])

OrderedDict([(0,
              OrderedDict([('chr1', 4),
                           ('chr6', 2),
                           ('chr7', 1),
                           ('chr15', 2),
                           ('chr9', 2),
                           ('chr17', 3),
                           ('chr4', 1),
                           ('chrY', 1),
                           ('chr2', 1),
                           ('chr18', 1),
                           ('chr14', 1)])),
             (3, OrderedDict([('-', 10), ('+', 9)]))])

In [112]:
print(colsum(
        "./data/RADAR_clean.txt",
        header=True,
        ignore_hashtag_line=True,
        ret_type="report",
        separator=["\t","|"],
        max_items=5))

0
	chr1	997
1
	225974746	1
	225974735	1
	225974581	1
	224599486	1
	224584888	1
	...	...
2
	225974746	1
	225974735	1
	225974581	1
	224599486	1
	224584888	1
	...	...
3
	A>I	997
4
	FDPS	34
	MDM4	31
	CTSS	28
	DNAJC11	25
	S100PBP	24
	...	...
5
	YH	997
6
	22327324	997
7
	33.33	31
	66.67	31
	50.00	23
	57.14	22
	60.00	22
	...	...
8
	-	527
	+	470



## fastcount

In [191]:
jhelp(fastcount, full=True)

In [114]:
fastcount("./data/RADAR_Secondary.txt")

100

In [115]:
fastcount("./data/RADAR_Secondary.txt.gz")

100

## simplecount

In [192]:
jhelp(simplecount, full=True)

In [117]:
simplecount("./data/Small_m5C_Squires_hg38.bed", ignore_hashtag_line=True)

194

In [118]:
simplecount("./data/RADAR_Secondary.txt.gz")

100

---

# DIRECTORY MANIPULATION

## mkdir

In [193]:
jhelp(mkdir, full=True)

In [120]:
mkdir("./data/test_dir")

In [121]:
mkdir ("./test/test/test")
!rm -rf ./test

Creating /home/aleg/Programming/pycltools/docs/test
Creating /home/aleg/Programming/pycltools/docs/test/test
Creating /home/aleg/Programming/pycltools/docs/test/test/test


---

# SHELL MANIPULATION

## make_cmd_str

In [194]:
jhelp(make_cmd_str, full=True)

In [123]:
make_cmd_str("bwa", {"-b":None, "-t":6, "-i":"../idx/seq.fa"}, ["../read1", "../read2"])

'bwa -b -t 6 -i ../idx/seq.fa ../read1 ../read2 '

## bash_basic

In [195]:
jhelp(bash_basic, full=True)

In [125]:
# bash_basic prints the command output itself and returns None, so it is
# called directly: wrapping each call in print() only adds a stray "None" line.
bash_basic("ls -l")
bash_basic("echo TTTT")
bash_basic("grep ./data/RADAR_Secondary.txt")

total 136
drwxrwxr-x 3 aleg aleg  4096 Dec 10 10:54 data
-rw-rw-r-- 1 aleg aleg 39582 Dec 10 12:05 pycltools_functions_list.ipynb
-rw-rw-r-- 1 aleg aleg 93686 Dec 10 12:07 pycltools_tests.ipynb


None
TTTT


None


None


## bash

In [196]:
jhelp(bash, full=True)

In [127]:
bash("ls", print_stdout=True, ret_stdout=True,)

data
pycltools_functions_list.ipynb
pycltools_tests.ipynb


'data\npycltools_functions_list.ipynb\npycltools_tests.ipynb\n'

In [128]:
bash("for i in 1 2 3 4; do echo $i && sleep 1 && ls error ;done", live="stderr",  print_stdout=True, ret_stdout=True, print_stderr=True)

ls: cannot access 'error': No such file or directory
ls: cannot access 'error': No such file or directory
ls: cannot access 'error': No such file or directory
ls: cannot access 'error': No such file or directory
Error code #2 during execution of the command : for i in 1 2 3 4; do echo $i && sleep 1 && ls error ;done


In [129]:
bash("ls", print_stdout=False, ret_stdout=False, log_stdout="./data/stdout.txt")
head("./data/stdout.txt")

Only 3 lines in the file
data                           
pycltools_functions_list.ipynb 
pycltools_tests.ipynb          



## bash_update

In [197]:
jhelp(bash_update, full=True)

In [131]:
#bash_update("htop")

---

# DICTIONARY FORMATTING

## dict_to_md

In [198]:
jhelp(dict_to_md, full=True)

In [133]:
d = {"a":12,"b":14,"c":8,"d":56,"e":76}
display(Markdown(dict_to_md(d, "Letter", "Number", sort_by_val=True)))
display(Markdown(dict_to_md(d, "Letter", "Number", transpose=True, max_items=3)))

|Letter|Number|
|:---|:---|
|e|76|
|d|56|
|b|14|
|a|12|
|c|8|


|Letter|e|d|b|...|
|:---|:---|:---|:---|:---|
|Number|76|56|14|...|


## dict_to_report

In [199]:
jhelp(dict_to_report, full=True)

In [135]:
d = {"a":12,"b":14,"c":{"c1":12,"c2":{"c2.1":33221,"c2.2":765},"c3":32,"c4":443},"d":56,"e":76}
print(dict_to_report(d, tab=" | "))

d = {"a":12,"b":14,"c":{"c1":12,"c2":{"c2.1":33221,"c2.2":765, "c2.3":7533,"c2.4":76433,"c2.5":876543,"c2.6":89765,"c2.7":8654},"c3":32,"c4":443},"d":56,"e":76}
print(dict_to_report(d, tab="--", max_items=4, sort_dict=True))

a:12
b:14
c
 | c1:12
 | c2
 |  | c2.1:33221
 |  | c2.2:765
 | c3:32
 | c4:443
d:56
e:76

a:12
b:14
c
--c1:12
--c2
----c2.5:876543
----c2.6:89765
----c2.4:76433
----c2.1:33221
----...:...
--c3:32
--c4:443
d:56
e:76



---

# TABLE FORMATTING

## reformat_table

In [200]:
jhelp(reformat_table, full = True)

In [137]:
# With numeric index
reformat_table(
    input_file="./data/Small_m5C_Squires_hg38.bed",
    output_file="./data/Small_m5C_Squires_hg38_reformat.bed",
    init_template=[0,"\t",1,"\t",2,"\t",3,"|",4,"\t",5,"\t",6],
    final_template=[0,"\t",1,"\t",2,"\tm5C|*|HeLa|22344696\t-\t",6],
    replace_internal_space='_',
    replace_null_val="*",
    keep_original_header=False,
    header="# New header\n"
    )

linerange ("./data/Small_m5C_Squires_hg38.bed")
linerange ("./data/Small_m5C_Squires_hg38_reformat.bed")

0	# Transcriptome-wide map of m5C [hg38 coordinates]
1	# Reference: Squires et al., Nucleic Acids Res. 40, 5023 (2012) [PMID 22344696, DOI 10.1093/nar/gks144]
2	#
...
197	chr1	19311959	19311960	Squires|id185	0	-
198	chr1	19608342	19608343	Squires|id186	0	+
199	chr1	19608343	19608344	Squires|id187	0	+
0	# New header
1	chr1	631539	631540	m5C|*|HeLa|22344696	-	+
2	chr1	631540	631541	m5C|*|HeLa|22344696	-	+
...
192	chr1	19311959	19311960	m5C|*|HeLa|22344696	-	-
193	chr1	19608342	19608343	m5C|*|HeLa|22344696	-	+
194	chr1	19608343	19608344	m5C|*|HeLa|22344696	-	+


In [138]:
# With str index
reformat_table(
    input_file="./data/Small_m5C_Squires_hg38.bed",
    output_file="./data/Small_m5C_Squires_hg38_reformat.bed",
    init_template=["{chrom}","\t","{start}","\t","{end}","|","{name}","\t","{score}","\t","{strand}"],
    final_template=["{start}","\t","{end}","\tadditional_informations\t","{name}"],
    replace_internal_space='_',
    replace_null_val="*",
    keep_original_header=False,
    header="# New header\n",
    verbose=True
    )

linerange ("./data/Small_m5C_Squires_hg38.bed")
linerange ("./data/Small_m5C_Squires_hg38_reformat.bed")

Enumerated named argument list:
	verbose: True
	standard_template: None
	predicate: None
	filter_dict: []
	subst_dict: {}
	replace_null_val: *
	replace_internal_space: _
	header_from_final_template: False
	keep_original_header: False
	header: # New header

	final_template: ['{start}', '\t', '{end}', '\tadditional_informations\t', '{name}']
	init_template: ['{chrom}', '\t', '{start}', '\t', '{end}', '|', '{name}', '\t', '{score}', '\t', '{strand}']
	return_df: False
	output_file: ./data/Small_m5C_Squires_hg38_reformat.bed
	input_file: ./data/Small_m5C_Squires_hg38.bed
Unenumerated named arguments list:
Initial template values
chrom	start	end|name	score	strand
Final template values
start	end	additional_informations	name
194 Lines processed	194 Lines pass	0 Lines filtered out	0 Lines fail
0	# Transcriptome-wide map of m5C [hg38 coordinates]
1	# Reference: Squires et al., Nucleic Acids Res. 40, 5023 (2012) [PMID 22344696, DOI 10.1093/nar/gks144]
2	#
...
197	chr1	19311959	19311960	Squires|i

In [139]:
subst_dict = {0:{"chr1":"1", "chr2":"2"}, 3:{"Peng":"22344696"}}
filter_dict = {18:["intron"]}
input_file="./data/Small_editing_Peng_hg38.bed"
output_file="./data/Small_editing_Peng_hg38_reformat.bed"

reformat_table(
    input_file, output_file,
    init_template=[0,"\t",1,"\t",2,"\t",3,"|",4,"|",5,"|",6,"|",7,"|",8,"|",9,"->",10,"|",11,"%|",12,"|",13,"|",14,"|",15,"|",16,"|",17,"|",18,"|",19,"\t",20,"\t",21],
    final_template=[0,"\t",1,"\t",2,"\t",9,">",10,"|",3,"|HeLa|",19,"\t",11,"\t",21],
    replace_internal_space='_',
    replace_null_val="*",
    subst_dict = subst_dict,
    filter_dict = filter_dict,
    verbose=True
    )

linerange (input_file)
linerange (output_file)

Enumerated named argument list:
	verbose: True
	standard_template: None
	predicate: None
	filter_dict: {18: ['intron']}
	subst_dict: {0: {'chr1': '1', 'chr2': '2'}, 3: {'Peng': '22344696'}}
	replace_null_val: *
	replace_internal_space: _
	header_from_final_template: False
	keep_original_header: True
	header: 
	final_template: [0, '\t', 1, '\t', 2, '\t', 9, '>', 10, '|', 3, '|HeLa|', 19, '\t', 11, '\t', 21]
	init_template: [0, '\t', 1, '\t', 2, '\t', 3, '|', 4, '|', 5, '|', 6, '|', 7, '|', 8, '|', 9, '->', 10, '|', 11, '%|', 12, '|', 13, '|', 14, '|', 15, '|', 16, '|', 17, '|', 18, '|', 19, '\t', 20, '\t', 21]
	return_df: False
	output_file: ./data/Small_editing_Peng_hg38_reformat.bed
	input_file: ./data/Small_editing_Peng_hg38.bed
Unenumerated named arguments list:
Initial template values
0	1	2	3|4|5|6|7|8|9->10|11%|12|13|14|15|16|17|18|19	20	21
Final template values
0	1	2	9>10|3|HeLa|19	11	21
194 Lines processed	139 Lines pass	55 Lines filtered out	0 Lines fail
0	# Transcriptome-wide 

In [140]:
input_file = "./data/Small_editing_Peng_hg38.bed"

# Parse the table into a DataFrame instead of writing an output file.
# No final template is given, so it is derived from the init template.
df = reformat_table(
    input_file,
    return_df=True,
    init_template=[0,"\t",1,"\t",2,"\t",3,"|",4,"|",5,"|",6,"|",7,"|",8,"|",9,"->",10,"|",11,"%|",12,"|",13,"|",14,"|",15,"|",16,"|",17,"|",18,"|",19,"\t",20,"\t",21],
    replace_internal_space='_',
    replace_null_val="*",
    verbose=True)

# head() prints the lines itself and returns None, so it is not wrapped in
# print() (which would only append a spurious "None" to the output).
head(input_file, 11)

df.head()

Enumerated named argument list:
	verbose: True
	standard_template: None
	predicate: None
	filter_dict: []
	subst_dict: {}
	replace_null_val: *
	replace_internal_space: _
	header_from_final_template: False
	keep_original_header: True
	header: 
	final_template: []
	init_template: [0, '\t', 1, '\t', 2, '\t', 3, '|', 4, '|', 5, '|', 6, '|', 7, '|', 8, '|', 9, '->', 10, '|', 11, '%|', 12, '|', 13, '|', 14, '|', 15, '|', 16, '|', 17, '|', 18, '|', 19, '\t', 20, '\t', 21]
	return_df: True
	output_file: 
	input_file: ./data/Small_editing_Peng_hg38.bed
Unenumerated named arguments list:
No final template given. Create final template from init template
Initial template values
0	1	2	3|4|5|6|7|8|9->10|11%|12|13|14|15|16|17|18|19	20	21
Final template values
0	1	2	3|4|5|6|7|8|9->10|11%|12|13|14|15|16|17|18|19	20	21
# Transcriptome-wide map of editing sites [hg38 coordinates]
# Reference: Peng et al., Nat. Biotechnol. 30, 253 (2012) [PMID 22327324, DOI 10.1038/nbt.2122]
#
# Data cleaned and converted

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,12,13,14,15,16,17,18,19,20,21
0,chr1,1102535,1102536,Peng,chr1,1027779,-,T,Y,A,...,37,C,6,T,3,9,intron,C1orf159,0,-
1,chr1,1221501,1221502,Peng,chr1,1146745,-,T,Y,A,...,99,T,26,C,15,42,intron,SDF4,0,-
2,chr1,1222079,1222080,Peng,chr1,1147323,-,T,Y,A,...,94,T,51,C,15,66,intron,SDF4,0,-
3,chr1,1251840,1251841,Peng,chr1,1177084,-,T,Y,A,...,99,C,9,T,7,16,intergenic,-,0,-
4,chr1,1252243,1252244,Peng,chr1,1177487,-,T,Y,A,...,30,T,29,C,7,36,intergenic,-,0,-


In [141]:
input_file = "./data/gencode_sample.gff3"

# Parse the gff3 file with the built-in "gff3_ens_transcript" template and
# return a DataFrame whose header comes from the final template.
df = reformat_table(
    input_file,
    return_df=True,
    standard_template="gff3_ens_transcript",
    keep_original_header=False,
    header_from_final_template=True,
    verbose=True
    )

# head() prints the lines itself and returns None, so it is not wrapped in
# print() (which would only append a spurious "None" to the output).
head(input_file, 11)
df.head()

Enumerated named argument list:
	verbose: True
	standard_template: gff3_ens_transcript
	predicate: None
	filter_dict: []
	subst_dict: {}
	replace_null_val: *
	replace_internal_space: _
	header_from_final_template: True
	keep_original_header: False
	header: 
	final_template: []
	init_template: []
	return_df: True
	output_file: 
	input_file: ./data/gencode_sample.gff3
Unenumerated named arguments list:
Using gff3 ensembl transcript template. Non-transcript features will be filtered out
No final template given. Create final template from init template
Initial template values
seqid	source	type	start	end	score	strand	phase	ID=ID;Parent=Parent;gene_id=gene_id;transcript_id=transcript_id;gene_type=gene_type;gene_status=gene_status;gene_name=gene_name;transcript_type=transcript_type;transcript_status=transcript_status;transcript_name=transcript_name;level=level;transcript_support_level=transcript_support_level;tag=tag;havana_gene=havana_gene;havana_transcript=havana_transcript
Final template v

Unnamed: 0,seqid,source,type,start,end,score,strand,phase,ID,Parent,...,gene_status,gene_name,transcript_type,transcript_status,transcript_name,level,transcript_support_level,tag,havana_gene,havana_transcript
0,chr1,HAVANA,transcript,29554,31097,.,+,.,ENST00000473358.1,ENSG00000243485.3,...,KNOWN,RP11-34P13.3,lincRNA,KNOWN,RP11-34P13.3-001,2,5,"not_best_in_genome_evidence,dotter_confirmed,b...",OTTHUMG00000000959.2,OTTHUMT00000002840.1
1,chr1,HAVANA,transcript,30267,31109,.,+,.,ENST00000469289.1,ENSG00000243485.3,...,KNOWN,RP11-34P13.3,lincRNA,KNOWN,RP11-34P13.3-002,2,5,"not_best_in_genome_evidence,basic",OTTHUMG00000000959.2,OTTHUMT00000002841.2
2,chr1,HAVANA,transcript,34554,36081,.,-,.,ENST00000417324.1,ENSG00000237613.2,...,KNOWN,FAM138A,lincRNA,KNOWN,FAM138A-001,2,1,basic,OTTHUMG00000000960.1,OTTHUMT00000002842.1
3,chr1,HAVANA,transcript,35245,36073,.,-,.,ENST00000461467.1,ENSG00000237613.2,...,KNOWN,FAM138A,lincRNA,KNOWN,FAM138A-002,2,3;havana_gene=OTTHUMG00000000960.1;havana_tran...,*,*,*
4,chr1,HAVANA,transcript,89295,120932,.,-,.,ENST00000466430.5,ENSG00000238009.6,...,KNOWN,RP11-34P13.7,lincRNA,KNOWN,RP11-34P13.7-001,2,5,"not_best_in_genome_evidence,basic",OTTHUMG00000001096.2,OTTHUMT00000003225.1


# WEB TOOLS

## url_exist

In [201]:
jhelp(url_exist, full=True)

In [143]:
url_exist("http://www.google.com") # If this one ever returns False, it is probably the end of the world

True

In [144]:
# NOTE(review): this domain is not expected to exist, yet the recorded output below is True.
# Confirm whether url_exist itself or the network (e.g. a resolver redirecting unknown hosts) is responsible.
url_exist("http://www.JUYGKUYHGJHFJ.com")

True

## wget

In [202]:
jhelp(wget, full=True)

In [146]:
# Call wget with an empty URL. The recorded output shows only an
# "unknown url type" message, i.e. the returned value was falsy and the
# print/remove branch was skipped.
outfile = wget("")
if outfile:
    print(outfile)
    remove(outfile)

unknown url type: ''


In [147]:
# NOTE(review): per the recorded output this fetches ~259 MB (Bytes: 258930225) from a remote
# server on every run. The third positional argument (50000000) presumably sets the
# progress-report interval (output reports every 50.0 MB) — confirm against the wget docstring.
outfile = wget("https://www.encodeproject.org/files/ENCFF000HJC/@@download/ENCFF000HJC.bigWig", "test.bigWig", 50000000)
# The downloaded file is deleted again when wget returns a truthy value.
if outfile:
    print(outfile)
    remove(outfile)

Downloading: https://www.encodeproject.org/files/ENCFF000HJC/@@download/ENCFF000HJC.bigWig	Bytes: 258930225
50.0 MB Downloaded	[19.31 %]
100.0 MB Downloaded	[38.62 %]
150.0 MB Downloaded	[57.93 %]
200.0 MB Downloaded	[77.24 %]
250.0 MB Downloaded	[96.55 %]
258.9 MB Downloaded	[100 %]
test.bigWig


---
# FUNCTION TOOLS

## print_arg

In [203]:
jhelp(print_arg, full=True)

In [149]:
def test (A,B,C=7,*args, **kwarg):
    """Dummy function used to demonstrate print_arg from inside a function body.

    Accepts two required parameters (A, B), one parameter with a default (C),
    extra positional arguments (*args) and extra keyword arguments (**kwarg),
    and does nothing but call print_arg().
    """
    print_arg()

test(1,2,3,5, z=65, x=100)

Enumerated named argument list:
	C: 3
	B: 2
	A: 1
Unenumerated named arguments list:
	z: 65
	x: 100
Unnamed positional arguments list:
	5


# SSH TOOLS

## scp

In [204]:
jhelp(scp, full=True)

In [151]:
#scp(hostname="ebi-cli-001.ebi.ac.uk", local_file="../README.md", remote_dir="~/test", username="aleg", rsa_private_key="/home/aleg/.ssh/ebi_rsa")

In [152]:
#scp(hostname="ebi", local_file="../README.md", remote_dir="~/test")

# PACKAGE TOOLS

## get_package_file

In [153]:
jhelp(get_package_file, full=True)

In [154]:
# NOTE(review): "pyCL" looks like a former name of the package (it is imported as pycltools above)
# and the recorded output of this cell is empty — confirm the package name and path arguments.
get_package_file("pyCL", "pyCL/")



# SAM/BAM TOOLS

## bam_sample

In [155]:
jhelp(bam_sample, full=True)

In [156]:
bam_sample("./data/sample.sam", fp_out="./data/sample_100.sam", n_reads=100, verbose=True)
linesample("./data/sample_100.sam", n_lines=10, max_char_line=100)

Found 5000 reads in input file
Wrote 100 reads in output file
20	@SQ	SN:chr21	LN:46709983
44	@SQ	SN:KI270305.1	LN:1472
111	@SQ	SN:KI270508.1	LN:1951
146	@SQ	SN:KI270710.1	LN:40176
170	@SQ	SN:KI270734.1	LN:165050
171	@SQ	SN:KI270735.1	LN:42811
217	chr14|61657775|61657835|+|13447.7	272	chr7	127489894	0	61M	*	0	0	*	*	NM:i:3	MD:Z:39A14A4A1	AS:i:49
234	chr17|43159683|43159737|-|19991.10	272	chr9	131908717	0	55M	*	0	0	*	*	NM:i:0	MD:Z:55	AS:i:55
239	chr18|14010134|14010194|+|21568.4	272	chr5	4925139	0	61M	*	0	0	*	*	NM:i:0	MD:Z:61	AS:i:61
266	chr3|138485055|138485115|+|33361.101	256	chr12	6132886	0	61M	*	0	0	*	*	NM:i:5	MD:Z:16G4C0A3C25G8	AS:...


In [157]:
bam_sample("./data/sample.sam", fp_out="./data/sample_100.bam", n_reads=100, verbose=True)
!samtools view "./data/sample_100.bam" | head

Found 5000 reads in input file
Wrote 100 reads in output file
chr1|805036|805096|+|89.10	272	chr8	436410	0	61M	*	0	0	*	*	NM:i:3	MD:Z:7A19C0A32	AS:i:46
chr1|110408997|110409057|+|2013.22	272	chr15	35143322	0	13H48M	*	0	0	*	*	NM:i:3	MD:Z:37G0A2T6	AS:i:37
chr1|121462469|121462529|+|2240.83	0	chr1	121462469	48	61M	*	0	0	AATCTATTTATTTATTTTTCTTCAGTGTTACAATGAAACAACATTGCTTTATTTAAATTTT	IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII	NM:i:0	MD:Z:61	AS:i:61	XS:i:46
chr1|205386423|205386483|+|3446.41	272	KI270750.1	53599	0	7H47M7H	*	0	0	*	*	NM:i:3	MD:Z:27C0A6A11	AS:i:32
chr1|221508699|221508759|+|3731.6	272	chrX	69857918	0	37M24H	*	0	0	*	*	NM:i:0	MD:Z:37	AS:i:37
chr1|246607871|246607931|+|4121.10	256	chr19	29557507	0	19H42M	*	0	0	*	*	NM:i:0	MD:Z:42	AS:i:42
chr10|14878128|14878188|-|4488.16	256	chr15	84959120	0	17H44M	*	0	0	*	*	NM:i:0	MD:Z:44	AS:i:44
chr10|65751058|65751118|+|5083.9	272	chr2	222774610	0	18H43M	*	0	0	*	*	NM:i:1	MD:Z:6G36	AS:i:38
chr10|106187699|106187759|+|5744.7	272	c

In [158]:
bam_sample("./data/sample.txt", fp_out="./data/sample_100.bam", n_reads=100, verbose=True)



In [159]:
bam_sample("./data/sample.sam", fp_out="./data/sample_100.txt", n_reads=100, verbose=True)



# DNA SEQUENCE TOOLS

## base_generator

In [205]:
jhelp(base_generator, full = True)

In [161]:
bg = base_generator()
for i in range(10):
    print (next(bg))

A
T
C
A
T
C
G
G
T
T


In [162]:
bg = base_generator(bases=['A', 'T', 'C', 'G', 'N'], weights=[0.8, 0.8, 0.2, 0.2, 0.1])
for i in range(10):
    print (next(bg))

G
T
A
G
G
G
G
A
T
G


## make_sequence

In [206]:
jhelp(make_sequence, full=True)

In [164]:
make_sequence()

'ACTGGCGTCGGATCGTGAGGTACTGATATTTCCGGCTCGCTGCCTATACCTATCAGTCCAAGTATGATGACTAGGAAGAACGCTAGTAATAGTGGGCGTTCACGGTTGAGAACCTCTTATTCATGGAAATAAATATTGAGTCTTGTGGGTCTGATAAGCGTTCCCCAAGTAAGTACGAAAAATCTGAGAGCCAAAGGAACTACCGTTATGAGGATCTCTGTTTAAATTCTGATAATATGTATTTGGATCCGAAATACGCGGTGATGGTGTGTAGTTACCTTAGGCTGATCGGTAAGCACTGCATCTACAGTTATAGTCCCCACTTTTCGTTTGCAAGCAAAAGTTGATCTATGTCACCCTCAATCTCGTAAAGGTGTTGCTATGGTTAAAGTAAGTGTCTCCTAGTGCTGATCAGAGCAAACGCTAAGGGAAAGGGGAGCTAAGCCCTTATGATCAAAGAGACAGATGGCTTAGCGCCCAATTCAGCTATTATGTGAAATACATGTACGGGAAAAATTCTTCACTTGGAAGAAACAATGGTGAGTCTTTATCCAGGAACATGTAAGGAATTTGTAGTTCCAAATTCGGTCTATGTCCAATGATGACAGAAGCTAACGTATTGCGTTATGAATCAGGTGTACTTGTGTTTGATTTTAGTAATCCTTCGACTGAATTTGCATCTGTGGACGAGATATCACGGAGATTTGGGTGTCTCTACTTGAACATCATAGTTTGTCATAGGGCTAGTTCTTGGCATTTAATAAAATTAATAATATTGACTAATAACAACGCGACTGTTCGTCGCTAAATTGAAAACCATACAATGATCTATTTCAATACCTATTTGTCCCCACAGTAATCGATTTGCTTTATTTATAAGAGAAGATTATCAATATTTTAAGTTCTATGAATTCCTAGCACTCATAGGTCTGTGTCCCGGTGTTCCAATCTGGTGTCAACGTCGATCAGCCTTTGTCTAGTTCTTAATCTAGAGTTTAG

In [165]:
make_sequence(bases=['A', 'T', 'C', 'G', 'N'], weights=[], length=100)

'TATNGGATTNANGGCGTNGAATGNATNANCGTTGNNCCAAATTGANCGNTGTNNTTNGATNNTNAGGCTTGCCCTCNCGCAAAACCNGNCAACTTNNNNG'

In [166]:
make_sequence(bases=['A', 'T', 'C', 'G', 'N'], weights=[0.8, 0.8, 0.2, 0.2, 0.1], length=100)

'ATCATGATCGNTTTTAATCAAAATTATCTTAATAAATTAATTTCTATTTTANGNAANAGATATCTNTCTTCCTNATACNCAATATAAGTTAAAACTAGGG'