diff --git a/pysequila/aoh.ipynb b/pysequila/aoh.ipynb index fa3ee7a..47647aa 100644 --- a/pysequila/aoh.ipynb +++ b/pysequila/aoh.ipynb @@ -4,7 +4,7 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": false, + "collapsed": true, "jupyter": { "outputs_hidden": false }, @@ -18,27 +18,38 @@ "mkdir data\n", "cd data\n", " \n", - "#wget http://biodatageeks.ii.pw.edu.pl/sequila/data/WES/NA12878.proper.wes.chr1.md.bam\n", "wget http://biodatageeks.ii.pw.edu.pl/sequila/data/WES/NA12878.proper.wes.chr1.md.bam\n", "wget http://biodatageeks.ii.pw.edu.pl/sequila/data/WES/Homo_sapiens_assembly18.chr1.fasta\n", - "wget http://biodatageeks.ii.pw.edu.pl/sequila/data/WES/Homo_sapiens_assembly18.chr1.fasta.fai\n" + "wget http://biodatageeks.ii.pw.edu.pl/sequila/data/WES/Homo_sapiens_assembly18.chr1.fasta.fai" ] }, { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "pip install pandas" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ - "pip install pandas\n", "pip install matplotlib" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": { - "collapsed": false, + "collapsed": true, "jupyter": { "outputs_hidden": false }, @@ -59,9 +70,9 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": { - "collapsed": false, + "collapsed": true, "jupyter": { "outputs_hidden": false }, @@ -75,14 +86,14 @@ "spark = SparkSession \\\n", ".builder \\\n", ".appName(f'{app_name}') \\\n", + ".config('spark.driver.memory','16g') \\\n", ".getOrCreate()" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": { - "collapsed": false, "jupyter": { "outputs_hidden": false }, @@ -90,18 +101,7 @@ "name": "#%%\n" } }, - "outputs": [ - { - "data": { - "text/plain": [ - "DataFrame[]" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "from pysequila import SequilaSession\n", "ss = SequilaSession(spark)\n", @@ -113,72 +113,8 @@ }, { "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Collecting pandas\n", - " Downloading pandas-1.1.4-cp37-cp37m-manylinux1_x86_64.whl (9.5 MB)\n", - "\u001b[K |████████████████████████████████| 9.5 MB 1.6 MB/s eta 0:00:01 |▉ | 235 kB 451 kB/s eta 0:00:21\n", - "\u001b[?25hCollecting numpy>=1.15.4\n", - " Downloading numpy-1.19.3-cp37-cp37m-manylinux2010_x86_64.whl (14.9 MB)\n", - "\u001b[K |████████████████████████████████| 14.9 MB 1.2 MB/s eta 0:00:01\n", - "\u001b[?25hRequirement already satisfied: python-dateutil>=2.7.3 in /home/jovyan/venv/pysequila/lib/python3.7/site-packages (from pandas) (2.8.1)\n", - "Collecting pytz>=2017.2\n", - " Downloading pytz-2020.1-py2.py3-none-any.whl (510 kB)\n", - "\u001b[K |████████████████████████████████| 510 kB 3.3 MB/s eta 0:00:01\n", - "\u001b[?25hRequirement already satisfied: six>=1.5 in /home/jovyan/venv/pysequila/lib/python3.7/site-packages (from python-dateutil>=2.7.3->pandas) (1.15.0)\n", - "Installing collected packages: numpy, pytz, pandas\n", - "Successfully installed numpy-1.19.3 pandas-1.1.4 pytz-2020.1\n", - "Note: you may need to restart the kernel to use updated packages.\n" - ] - } - ], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Collecting matplotlib\n", - " Downloading matplotlib-3.3.2-cp37-cp37m-manylinux1_x86_64.whl (11.6 MB)\n", - "\u001b[K |████████████████████████████████| 11.6 MB 6.2 MB/s eta 0:00:01 |██████████████████████ | 8.0 MB 6.2 MB/s eta 0:00:01\n", - "\u001b[?25hCollecting pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.3\n", - " Downloading pyparsing-2.4.7-py2.py3-none-any.whl (67 kB)\n", - "\u001b[K |████████████████████████████████| 67 kB 2.1 MB/s eta 0:00:01\n", - "\u001b[?25hCollecting kiwisolver>=1.0.1\n", - " Downloading kiwisolver-1.2.0-cp37-cp37m-manylinux1_x86_64.whl (88 kB)\n", - "\u001b[K |████████████████████████████████| 88 kB 2.6 MB/s eta 0:00:01\n", - "\u001b[?25hRequirement already satisfied: python-dateutil>=2.1 in /home/jovyan/venv/pysequila/lib/python3.7/site-packages (from matplotlib) (2.8.1)\n", - "Collecting cycler>=0.10\n", - " Downloading cycler-0.10.0-py2.py3-none-any.whl (6.5 kB)\n", - "Collecting pillow>=6.2.0\n", - " Downloading Pillow-8.0.1-cp37-cp37m-manylinux1_x86_64.whl (2.2 MB)\n", - "\u001b[K |████████████████████████████████| 2.2 MB 3.9 MB/s eta 0:00:01\n", - "\u001b[?25hRequirement already satisfied: numpy>=1.15 in /home/jovyan/venv/pysequila/lib/python3.7/site-packages (from matplotlib) (1.19.3)\n", - "Requirement already satisfied: certifi>=2020.06.20 in /home/jovyan/venv/pysequila/lib/python3.7/site-packages (from matplotlib) (2020.6.20)\n", - "Requirement already satisfied: six>=1.5 in /home/jovyan/venv/pysequila/lib/python3.7/site-packages (from python-dateutil>=2.1->matplotlib) (1.15.0)\n", - "Installing collected packages: pyparsing, kiwisolver, cycler, pillow, matplotlib\n", - "Successfully installed cycler-0.10.0 kiwisolver-1.2.0 matplotlib-3.3.2 pillow-8.0.1 pyparsing-2.4.7\n", - "Note: you may need to restart the kernel to use updated packages.\n" - ] - } - ], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 23, + "execution_count": null, "metadata": { - "collapsed": false, "jupyter": { "outputs_hidden": false }, @@ -186,530 +122,7 @@ "name": "#%%\n" } }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
sample_idqnameflagcontigpospos_startpos_endmapqcigarrnextpnexttlenseqqualtag_AMtag_AStag_BCtag_BQtag_BZtag_CBtag_CCtag_CGtag_CMtag_COtag_CPtag_CQtag_CRtag_CStag_CTtag_CYtag_E2tag_FItag_FStag_FZtag_H0tag_H1tag_H2tag_HItag_IHtag_LBtag_MCtag_MDtag_MItag_MQtag_NHtag_NMtag_OAtag_OCtag_OPtag_OQtag_OXtag_PGtag_PQtag_PTtag_PUtag_Q2tag_QTtag_QXtag_R2tag_RGtag_RXtag_SAtag_SMtag_TCtag_U2tag_UQ
0NA1287861DC0AAXX100127:8:58:8295:1639799MT7710799101MMT27101AGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTT...ADEAEDBCFCEFFFCDEDFGHEGGHEGBIIFIHGHHIHFFIIFFGG...NoneNoneNoneNoneNoneNoneNoneNoneNoneNoneNoneNoneNoneNoneNoneNoneCAAGAGCGACAAAGCGGCCAACAGACAATACTAGACAACGAACAGG...NoneNoneNoneNoneNoneNoneNoneNoneNoneNone66G34None99.0None1NoneNoneNoneCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC...None61DC0.8NoneNoneNoneNoneNoneNoneNone61DC0.8NoneNoneNoneNoneNone36
1NA1287861DC0AAXX100127:8:35:1455:17431113MT9910990101MMT9101GTCTGTCACCCTTGTAGCCGCTCACGGGAGCTCTCCATGCATTTGG...##########################G1/7=@AEHB5=F=DFC@FE...NoneNoneNoneNoneNoneNoneNoneNoneNoneNoneNoneNoneNoneNoneNoneNoneCAAACAATGTAGAAATAAACACACAAACCTAAAAATGGTTGAACTA...NoneNoneNoneNoneNoneNoneNoneNoneNoneNone4A7A0T2A2A44G36NoneNaNNone6NoneNoneNone##########################B1+8;>>BB>1<B9BB@>BB...None61DC0.8NoneNoneNoneNoneNoneNoneNone61DC0.8NoneNoneNoneNoneNone34
2NA1287861DC0AAXX100127:8:1:5111:59291177MT101011099101MMT10101TCTATCCCCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGGT...B?=<=:\"$.,/;;@?>CI@CAIEDIIGGGHDEFBIEGCIFHHGIHE...NoneNoneNoneNoneNoneNoneNoneNoneNoneNoneNoneNoneNoneNoneNoneNoneGTCTAAAAAACTAATTAACAAATATTTGTTATAAATGTTTAGGTTG...NoneNoneNoneNoneNoneNoneNoneNoneNoneNone6A56G37NoneNaNNone2NoneNoneNoneB<?79:&*0+.8:@=:@C=@=CBCDCBBBB@AD@CDCADCCCCCCC...None61DC0.8NoneNoneNoneNoneNoneNoneNone61DC0.8NoneNoneNoneNoneNone38
3NA1287861DC0AAXX100127:8:47:17866:4686153MT202012099101MMT20101TATTATCCACTCACGGGAGCTCTCCATGCATTTGGTATTTTCGTCT...###########ICA@GGG=DFDFHC=@GH>HG<IAFBHHFDCHCFC...NoneNoneNoneNoneNoneNoneNoneNoneNoneNoneNoneNoneNoneNoneNoneNoneATGATAAACTAACACTTGTTATATATGTATGGGTTGTGGGCTTCTG...NoneNoneNoneNoneNoneNoneNoneNoneNoneNone5A47G47NoneNaNNone2NoneNoneNone###########B@B@BBB=AB@BB>;?BB:CB<CAC@CCBACCABA...None61DC0.8NoneNoneNoneNoneNoneNoneNone61DC0.8NoneNoneNoneNoneNone37
4NA1287861CC3AAXX100125:5:79:19223:626199MT25251009976MMT3076ACCACTCACGGGAGCTCTCCATGCATTTGGTATTTTCGTCTGGGGG...ABCADDFCE@GGEHGFHFHHEEIHEEFFII>DEFFFIBDHGIIIII...NoneNoneNoneNoneNoneNoneNoneNoneNoneNoneNoneNoneNoneNoneNoneNoneCAACAGACATTTCTAGAGAATGAACGGGTTGCAGGGAAGAGTTTTT...NoneNoneNoneNoneNoneNoneNoneNoneNoneNone48G27None99.0None1NoneNoneNoneCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC@CCCCCCCCCCCCCCC...None61CC3.5NoneNoneNoneNoneNoneNoneNone61CC3.5NoneNoneNoneNoneNone29
\n", - "
" - ], - "text/plain": [ - " sample_id qname flag contig pos pos_start \\\n", - "0 NA12878 61DC0AAXX100127:8:58:8295:16397 99 MT 7 7 \n", - "1 NA12878 61DC0AAXX100127:8:35:1455:1743 1113 MT 9 9 \n", - "2 NA12878 61DC0AAXX100127:8:1:5111:5929 1177 MT 10 10 \n", - "3 NA12878 61DC0AAXX100127:8:47:17866:4686 153 MT 20 20 \n", - "4 NA12878 61CC3AAXX100125:5:79:19223:6261 99 MT 25 25 \n", - "\n", - " pos_end mapq cigar rnext pnext tlen \\\n", - "0 107 99 101M MT 27 101 \n", - "1 109 90 101M MT 9 101 \n", - "2 110 99 101M MT 10 101 \n", - "3 120 99 101M MT 20 101 \n", - "4 100 99 76M MT 30 76 \n", - "\n", - " seq \\\n", - "0 AGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTT... \n", - "1 GTCTGTCACCCTTGTAGCCGCTCACGGGAGCTCTCCATGCATTTGG... \n", - "2 TCTATCCCCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGGT... \n", - "3 TATTATCCACTCACGGGAGCTCTCCATGCATTTGGTATTTTCGTCT... \n", - "4 ACCACTCACGGGAGCTCTCCATGCATTTGGTATTTTCGTCTGGGGG... \n", - "\n", - " qual tag_AM tag_AS tag_BC \\\n", - "0 ADEAEDBCFCEFFFCDEDFGHEGGHEGBIIFIHGHHIHFFIIFFGG... None None None \n", - "1 ##########################G1/7=@AEHB5=F=DFC@FE... None None None \n", - "2 B?=<=:\"$.,/;;@?>CI@CAIEDIIGGGHDEFBIEGCIFHHGIHE... None None None \n", - "3 ###########ICA@GGG=DFDFHC=@GH>HGDEFFFIBDHGIIIII... None None None \n", - "\n", - " tag_BQ tag_BZ tag_CB tag_CC tag_CG tag_CM tag_CO tag_CP tag_CQ tag_CR \\\n", - "0 None None None None None None None None None None \n", - "1 None None None None None None None None None None \n", - "2 None None None None None None None None None None \n", - "3 None None None None None None None None None None \n", - "4 None None None None None None None None None None \n", - "\n", - " tag_CS tag_CT tag_CY tag_E2 \\\n", - "0 None None None CAAGAGCGACAAAGCGGCCAACAGACAATACTAGACAACGAACAGG... \n", - "1 None None None CAAACAATGTAGAAATAAACACACAAACCTAAAAATGGTTGAACTA... \n", - "2 None None None GTCTAAAAAACTAATTAACAAATATTTGTTATAAATGTTTAGGTTG... \n", - "3 None None None ATGATAAACTAACACTTGTTATATATGTATGGGTTGTGGGCTTCTG... \n", - "4 None None None CAACAGACATTTCTAGAGAATGAACGGGTTGCAGGGAAGAGTTTTT... \n", - "\n", - " tag_FI tag_FS tag_FZ tag_H0 tag_H1 tag_H2 tag_HI tag_IH tag_LB tag_MC \\\n", - "0 None None None None None None None None None None \n", - "1 None None None None None None None None None None \n", - "2 None None None None None None None None None None \n", - "3 None None None None None None None None None None \n", - "4 None None None None None None None None None None \n", - "\n", - " tag_MD tag_MI tag_MQ tag_NH tag_NM tag_OA tag_OC tag_OP \\\n", - "0 66G34 None 99.0 None 1 None None None \n", - "1 4A7A0T2A2A44G36 None NaN None 6 None None None \n", - "2 6A56G37 None NaN None 2 None None None \n", - "3 5A47G47 None NaN None 2 None None None \n", - "4 48G27 None 99.0 None 1 None None None \n", - "\n", - " tag_OQ tag_OX tag_PG tag_PQ \\\n", - "0 CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC... None 61DC0.8 None \n", - "1 ##########################B1+8;>>BB>1BB... None 61DC0.8 None \n", - "2 B;?BB:CB 0.35) & (df.baf< 0.65)) | (df.baf > 0.9)) & (df.coverage>20) & (df.coverage<300)]\n", - "plt.rcParams['figure.figsize'] = [15,3]; plt.scatter(df2.pos_start, df2.baf, s=30,alpha=0.1);\n", - "plt.xlabel('Chromsome 1');plt.ylabel('B-allele frequency'); plt.ylim(0,1); plt.show()\n" + "df2 = df[(((df.baf > 0.35) & (df.baf< 0.65)) | (df.baf > 0.9)) & (df.coverage>20) & (df.coverage<300)]" ] }, { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], - "source": [] + "source": [ + "import matplotlib.pyplot as plt\n", + "plt.rcParams['figure.figsize'] = [15,3]\n", + "plt.scatter(df2.pos_start, df2.baf, s=30,alpha=0.1)\n", + "plt.xlabel('Chromosome 1')\n", + "plt.ylabel('B-allele frequency')\n", + "plt.ylim(0,1)\n", + "plt.show()" + ] } ], "metadata": { "kernelspec": { - "display_name": "pysequila", + "display_name": "Python 3", "language": "python", - "name": "pysequila" + "name": "python3" }, "language_info": { "codemirror_mode": { @@ -762,7 +192,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.8" + "version": "3.6.7" } }, "nbformat": 4,