diff --git a/pysequila/aoh.ipynb b/pysequila/aoh.ipynb index fa3ee7a..47647aa 100644 --- a/pysequila/aoh.ipynb +++ b/pysequila/aoh.ipynb @@ -4,7 +4,7 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": false, + "collapsed": true, "jupyter": { "outputs_hidden": false }, @@ -18,27 +18,38 @@ "mkdir data\n", "cd data\n", " \n", - "#wget http://biodatageeks.ii.pw.edu.pl/sequila/data/WES/NA12878.proper.wes.chr1.md.bam\n", "wget http://biodatageeks.ii.pw.edu.pl/sequila/data/WES/NA12878.proper.wes.chr1.md.bam\n", "wget http://biodatageeks.ii.pw.edu.pl/sequila/data/WES/Homo_sapiens_assembly18.chr1.fasta\n", - "wget http://biodatageeks.ii.pw.edu.pl/sequila/data/WES/Homo_sapiens_assembly18.chr1.fasta.fai\n" + "wget http://biodatageeks.ii.pw.edu.pl/sequila/data/WES/Homo_sapiens_assembly18.chr1.fasta.fai" ] }, { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "pip install pandas" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ - "pip install pandas\n", "pip install matplotlib" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": { - "collapsed": false, + "collapsed": true, "jupyter": { "outputs_hidden": false }, @@ -59,9 +70,9 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": { - "collapsed": false, + "collapsed": true, "jupyter": { "outputs_hidden": false }, @@ -75,14 +86,14 @@ "spark = SparkSession \\\n", ".builder \\\n", ".appName(f'{app_name}') \\\n", + ".config('spark.driver.memory','16g') \\\n", ".getOrCreate()" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": { - "collapsed": false, "jupyter": { "outputs_hidden": false }, @@ -90,18 +101,7 @@ "name": "#%%\n" } }, - "outputs": [ - { - "data": { - "text/plain": [ - "DataFrame[]" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "from pysequila import SequilaSession\n", "ss = SequilaSession(spark)\n", @@ -113,72 +113,8 @@ }, { "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Collecting pandas\n", - " Downloading pandas-1.1.4-cp37-cp37m-manylinux1_x86_64.whl (9.5 MB)\n", - "\u001b[K |████████████████████████████████| 9.5 MB 1.6 MB/s eta 0:00:01 |▉ | 235 kB 451 kB/s eta 0:00:21\n", - "\u001b[?25hCollecting numpy>=1.15.4\n", - " Downloading numpy-1.19.3-cp37-cp37m-manylinux2010_x86_64.whl (14.9 MB)\n", - "\u001b[K |████████████████████████████████| 14.9 MB 1.2 MB/s eta 0:00:01\n", - "\u001b[?25hRequirement already satisfied: python-dateutil>=2.7.3 in /home/jovyan/venv/pysequila/lib/python3.7/site-packages (from pandas) (2.8.1)\n", - "Collecting pytz>=2017.2\n", - " Downloading pytz-2020.1-py2.py3-none-any.whl (510 kB)\n", - "\u001b[K |████████████████████████████████| 510 kB 3.3 MB/s eta 0:00:01\n", - "\u001b[?25hRequirement already satisfied: six>=1.5 in /home/jovyan/venv/pysequila/lib/python3.7/site-packages (from python-dateutil>=2.7.3->pandas) (1.15.0)\n", - "Installing collected packages: numpy, pytz, pandas\n", - "Successfully installed numpy-1.19.3 pandas-1.1.4 pytz-2020.1\n", - "Note: you may need to restart the kernel to use updated packages.\n" - ] - } - ], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Collecting matplotlib\n", - " Downloading matplotlib-3.3.2-cp37-cp37m-manylinux1_x86_64.whl (11.6 MB)\n", - "\u001b[K |████████████████████████████████| 11.6 MB 6.2 MB/s eta 0:00:01 |██████████████████████ | 8.0 MB 6.2 MB/s eta 0:00:01\n", - "\u001b[?25hCollecting pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.3\n", - " Downloading pyparsing-2.4.7-py2.py3-none-any.whl (67 kB)\n", - "\u001b[K |████████████████████████████████| 67 kB 2.1 MB/s eta 0:00:01\n", - "\u001b[?25hCollecting kiwisolver>=1.0.1\n", - " Downloading kiwisolver-1.2.0-cp37-cp37m-manylinux1_x86_64.whl (88 kB)\n", - "\u001b[K |████████████████████████████████| 88 kB 2.6 MB/s eta 0:00:01\n", - "\u001b[?25hRequirement already satisfied: python-dateutil>=2.1 in /home/jovyan/venv/pysequila/lib/python3.7/site-packages (from matplotlib) (2.8.1)\n", - "Collecting cycler>=0.10\n", - " Downloading cycler-0.10.0-py2.py3-none-any.whl (6.5 kB)\n", - "Collecting pillow>=6.2.0\n", - " Downloading Pillow-8.0.1-cp37-cp37m-manylinux1_x86_64.whl (2.2 MB)\n", - "\u001b[K |████████████████████████████████| 2.2 MB 3.9 MB/s eta 0:00:01\n", - "\u001b[?25hRequirement already satisfied: numpy>=1.15 in /home/jovyan/venv/pysequila/lib/python3.7/site-packages (from matplotlib) (1.19.3)\n", - "Requirement already satisfied: certifi>=2020.06.20 in /home/jovyan/venv/pysequila/lib/python3.7/site-packages (from matplotlib) (2020.6.20)\n", - "Requirement already satisfied: six>=1.5 in /home/jovyan/venv/pysequila/lib/python3.7/site-packages (from python-dateutil>=2.1->matplotlib) (1.15.0)\n", - "Installing collected packages: pyparsing, kiwisolver, cycler, pillow, matplotlib\n", - "Successfully installed cycler-0.10.0 kiwisolver-1.2.0 matplotlib-3.3.2 pillow-8.0.1 pyparsing-2.4.7\n", - "Note: you may need to restart the kernel to use updated packages.\n" - ] - } - ], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 23, + "execution_count": null, "metadata": { - "collapsed": false, "jupyter": { "outputs_hidden": false }, @@ -186,530 +122,7 @@ "name": "#%%\n" } }, - "outputs": [ - { - "data": { - "text/html": [ - "
| \n", - " | sample_id | \n", - "qname | \n", - "flag | \n", - "contig | \n", - "pos | \n", - "pos_start | \n", - "pos_end | \n", - "mapq | \n", - "cigar | \n", - "rnext | \n", - "pnext | \n", - "tlen | \n", - "seq | \n", - "qual | \n", - "tag_AM | \n", - "tag_AS | \n", - "tag_BC | \n", - "tag_BQ | \n", - "tag_BZ | \n", - "tag_CB | \n", - "tag_CC | \n", - "tag_CG | \n", - "tag_CM | \n", - "tag_CO | \n", - "tag_CP | \n", - "tag_CQ | \n", - "tag_CR | \n", - "tag_CS | \n", - "tag_CT | \n", - "tag_CY | \n", - "tag_E2 | \n", - "tag_FI | \n", - "tag_FS | \n", - "tag_FZ | \n", - "tag_H0 | \n", - "tag_H1 | \n", - "tag_H2 | \n", - "tag_HI | \n", - "tag_IH | \n", - "tag_LB | \n", - "tag_MC | \n", - "tag_MD | \n", - "tag_MI | \n", - "tag_MQ | \n", - "tag_NH | \n", - "tag_NM | \n", - "tag_OA | \n", - "tag_OC | \n", - "tag_OP | \n", - "tag_OQ | \n", - "tag_OX | \n", - "tag_PG | \n", - "tag_PQ | \n", - "tag_PT | \n", - "tag_PU | \n", - "tag_Q2 | \n", - "tag_QT | \n", - "tag_QX | \n", - "tag_R2 | \n", - "tag_RG | \n", - "tag_RX | \n", - "tag_SA | \n", - "tag_SM | \n", - "tag_TC | \n", - "tag_U2 | \n", - "tag_UQ | \n", - "
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | \n", - "NA12878 | \n", - "61DC0AAXX100127:8:58:8295:16397 | \n", - "99 | \n", - "MT | \n", - "7 | \n", - "7 | \n", - "107 | \n", - "99 | \n", - "101M | \n", - "MT | \n", - "27 | \n", - "101 | \n", - "AGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTT... | \n", - "ADEAEDBCFCEFFFCDEDFGHEGGHEGBIIFIHGHHIHFFIIFFGG... | \n", - "None | \n", - "None | \n", - "None | \n", - "None | \n", - "None | \n", - "None | \n", - "None | \n", - "None | \n", - "None | \n", - "None | \n", - "None | \n", - "None | \n", - "None | \n", - "None | \n", - "None | \n", - "None | \n", - "CAAGAGCGACAAAGCGGCCAACAGACAATACTAGACAACGAACAGG... | \n", - "None | \n", - "None | \n", - "None | \n", - "None | \n", - "None | \n", - "None | \n", - "None | \n", - "None | \n", - "None | \n", - "None | \n", - "66G34 | \n", - "None | \n", - "99.0 | \n", - "None | \n", - "1 | \n", - "None | \n", - "None | \n", - "None | \n", - "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC... | \n", - "None | \n", - "61DC0.8 | \n", - "None | \n", - "None | \n", - "None | \n", - "None | \n", - "None | \n", - "None | \n", - "None | \n", - "61DC0.8 | \n", - "None | \n", - "None | \n", - "None | \n", - "None | \n", - "None | \n", - "36 | \n", - "
| 1 | \n", - "NA12878 | \n", - "61DC0AAXX100127:8:35:1455:1743 | \n", - "1113 | \n", - "MT | \n", - "9 | \n", - "9 | \n", - "109 | \n", - "90 | \n", - "101M | \n", - "MT | \n", - "9 | \n", - "101 | \n", - "GTCTGTCACCCTTGTAGCCGCTCACGGGAGCTCTCCATGCATTTGG... | \n", - "##########################G1/7=@AEHB5=F=DFC@FE... | \n", - "None | \n", - "None | \n", - "None | \n", - "None | \n", - "None | \n", - "None | \n", - "None | \n", - "None | \n", - "None | \n", - "None | \n", - "None | \n", - "None | \n", - "None | \n", - "None | \n", - "None | \n", - "None | \n", - "CAAACAATGTAGAAATAAACACACAAACCTAAAAATGGTTGAACTA... | \n", - "None | \n", - "None | \n", - "None | \n", - "None | \n", - "None | \n", - "None | \n", - "None | \n", - "None | \n", - "None | \n", - "None | \n", - "4A7A0T2A2A44G36 | \n", - "None | \n", - "NaN | \n", - "None | \n", - "6 | \n", - "None | \n", - "None | \n", - "None | \n", - "##########################B1+8;>>BB>1<B9BB@>BB... | \n", - "None | \n", - "61DC0.8 | \n", - "None | \n", - "None | \n", - "None | \n", - "None | \n", - "None | \n", - "None | \n", - "None | \n", - "61DC0.8 | \n", - "None | \n", - "None | \n", - "None | \n", - "None | \n", - "None | \n", - "34 | \n", - "
| 2 | \n", - "NA12878 | \n", - "61DC0AAXX100127:8:1:5111:5929 | \n", - "1177 | \n", - "MT | \n", - "10 | \n", - "10 | \n", - "110 | \n", - "99 | \n", - "101M | \n", - "MT | \n", - "10 | \n", - "101 | \n", - "TCTATCCCCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGGT... | \n", - "B?=<=:\"$.,/;;@?>CI@CAIEDIIGGGHDEFBIEGCIFHHGIHE... | \n", - "None | \n", - "None | \n", - "None | \n", - "None | \n", - "None | \n", - "None | \n", - "None | \n", - "None | \n", - "None | \n", - "None | \n", - "None | \n", - "None | \n", - "None | \n", - "None | \n", - "None | \n", - "None | \n", - "GTCTAAAAAACTAATTAACAAATATTTGTTATAAATGTTTAGGTTG... | \n", - "None | \n", - "None | \n", - "None | \n", - "None | \n", - "None | \n", - "None | \n", - "None | \n", - "None | \n", - "None | \n", - "None | \n", - "6A56G37 | \n", - "None | \n", - "NaN | \n", - "None | \n", - "2 | \n", - "None | \n", - "None | \n", - "None | \n", - "B<?79:&*0+.8:@=:@C=@=CBCDCBBBB@AD@CDCADCCCCCCC... | \n", - "None | \n", - "61DC0.8 | \n", - "None | \n", - "None | \n", - "None | \n", - "None | \n", - "None | \n", - "None | \n", - "None | \n", - "61DC0.8 | \n", - "None | \n", - "None | \n", - "None | \n", - "None | \n", - "None | \n", - "38 | \n", - "
| 3 | \n", - "NA12878 | \n", - "61DC0AAXX100127:8:47:17866:4686 | \n", - "153 | \n", - "MT | \n", - "20 | \n", - "20 | \n", - "120 | \n", - "99 | \n", - "101M | \n", - "MT | \n", - "20 | \n", - "101 | \n", - "TATTATCCACTCACGGGAGCTCTCCATGCATTTGGTATTTTCGTCT... | \n", - "###########ICA@GGG=DFDFHC=@GH>HG<IAFBHHFDCHCFC... | \n", - "None | \n", - "None | \n", - "None | \n", - "None | \n", - "None | \n", - "None | \n", - "None | \n", - "None | \n", - "None | \n", - "None | \n", - "None | \n", - "None | \n", - "None | \n", - "None | \n", - "None | \n", - "None | \n", - "ATGATAAACTAACACTTGTTATATATGTATGGGTTGTGGGCTTCTG... | \n", - "None | \n", - "None | \n", - "None | \n", - "None | \n", - "None | \n", - "None | \n", - "None | \n", - "None | \n", - "None | \n", - "None | \n", - "5A47G47 | \n", - "None | \n", - "NaN | \n", - "None | \n", - "2 | \n", - "None | \n", - "None | \n", - "None | \n", - "###########B@B@BBB=AB@BB>;?BB:CB<CAC@CCBACCABA... | \n", - "None | \n", - "61DC0.8 | \n", - "None | \n", - "None | \n", - "None | \n", - "None | \n", - "None | \n", - "None | \n", - "None | \n", - "61DC0.8 | \n", - "None | \n", - "None | \n", - "None | \n", - "None | \n", - "None | \n", - "37 | \n", - "
| 4 | \n", - "NA12878 | \n", - "61CC3AAXX100125:5:79:19223:6261 | \n", - "99 | \n", - "MT | \n", - "25 | \n", - "25 | \n", - "100 | \n", - "99 | \n", - "76M | \n", - "MT | \n", - "30 | \n", - "76 | \n", - "ACCACTCACGGGAGCTCTCCATGCATTTGGTATTTTCGTCTGGGGG... | \n", - "ABCADDFCE@GGEHGFHFHHEEIHEEFFII>DEFFFIBDHGIIIII... | \n", - "None | \n", - "None | \n", - "None | \n", - "None | \n", - "None | \n", - "None | \n", - "None | \n", - "None | \n", - "None | \n", - "None | \n", - "None | \n", - "None | \n", - "None | \n", - "None | \n", - "None | \n", - "None | \n", - "CAACAGACATTTCTAGAGAATGAACGGGTTGCAGGGAAGAGTTTTT... | \n", - "None | \n", - "None | \n", - "None | \n", - "None | \n", - "None | \n", - "None | \n", - "None | \n", - "None | \n", - "None | \n", - "None | \n", - "48G27 | \n", - "None | \n", - "99.0 | \n", - "None | \n", - "1 | \n", - "None | \n", - "None | \n", - "None | \n", - "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCC@CCCCCCCCCCCCCCC... | \n", - "None | \n", - "61CC3.5 | \n", - "None | \n", - "None | \n", - "None | \n", - "None | \n", - "None | \n", - "None | \n", - "None | \n", - "61CC3.5 | \n", - "None | \n", - "None | \n", - "None | \n", - "None | \n", - "None | \n", - "29 | \n", - "