From b303c1be196d90538f7e8b8ca4c28e47b76c99d8 Mon Sep 17 00:00:00 2001 From: Chris Tomkins-Tinch Date: Thu, 28 Feb 2019 20:55:32 -0500 Subject: [PATCH] allow outlier_barcodes to act on single-index runs (#932) * initial changes to allow outlier_barcodes to act on single-index runs * cruft removal --- .../single_index/barcodes.txt | 1000 +++++++++++++++++ .../single_index/expected.txt | 5 + .../single_index/metrics.txt | 36 + test/unit/test_illumina.py | 14 + util/illumina_indices.py | 69 +- 5 files changed, 1096 insertions(+), 28 deletions(-) create mode 100644 test/input/TestIlluminaBarcodeHelper/single_index/barcodes.txt create mode 100644 test/input/TestIlluminaBarcodeHelper/single_index/expected.txt create mode 100644 test/input/TestIlluminaBarcodeHelper/single_index/metrics.txt diff --git a/test/input/TestIlluminaBarcodeHelper/single_index/barcodes.txt b/test/input/TestIlluminaBarcodeHelper/single_index/barcodes.txt new file mode 100644 index 000000000..5487c0bed --- /dev/null +++ b/test/input/TestIlluminaBarcodeHelper/single_index/barcodes.txt @@ -0,0 +1,1000 @@ +Barcode1 Likely_Index_Names1 Barcode2 Likely_Index_Names2 Count +CAGAGAGG N708 Unknown 2236892 +GGACTCCT N705 Unknown 1481969 +TAGGCATG N706 Unknown 1388954 +CAGGCGAT Unknown Unknown 1375668 +GCTACGCT N709 Unknown 1370424 +CTCTCTAC N707 Unknown 1181498 +GTAGAGGA N712 Unknown 1144504 +CGAGGCTG N710 Unknown 1078586 +AAGAGGCA N711 Unknown 910195 +TACAGCAT Unknown Unknown 670800 +CATTTTAT Unknown Unknown 627525 +TCATTCAT Unknown Unknown 624071 +ACTGATAT Unknown Unknown 613337 +ATTCCTAT Unknown Unknown 578494 +CGGAATAT Unknown Unknown 564153 +GACGACAT Unknown Unknown 555371 +TATAATAT Unknown Unknown 545928 +CATGGCAT Unknown Unknown 455796 +ATGAGCAT Unknown Unknown 449580 +CCAACAAT Unknown Unknown 446825 +TAATCGAT Unknown Unknown 409191 +CAACTAAT Unknown Unknown 402513 +TCCCGAAT Unknown Unknown 340067 +CAAAAGAT Unknown Unknown 305712 +CTAGCTAT Unknown Unknown 298921 +CACCGGAT Unknown Unknown 217902 +GAAGAGGA N712 Unknown 8740 +AGAGAGGA Unknown Unknown 8486 +GCTACGAT N709 Unknown 6491 +GGACTCAT N705 Unknown 5693 +AAGAGAGG N708 Unknown 5249 +CAGGCATG N706 Unknown 5051 +CAGGGAGG N708 Unknown 4417 +CAGAGAGA N708,S511 Unknown 4238 +CAGAGAGT N708 Unknown 4217 +GTAGAGGT N712 Unknown 4012 +TAATTCAT Unknown Unknown 4012 +GATACGCT N709 Unknown 3956 +CTTCCTAT Unknown Unknown 3741 +GGAGAGGA N712 Unknown 3650 +CTCTCTAA E502,N707,S502,[N|S|E]502 Unknown 3421 +CAGGCGAG Unknown Unknown 3409 +AAGAGGAA N711 Unknown 3380 +CAGACGAT Unknown Unknown 3260 +GTAGAGAA N712 Unknown 3247 +CAGAGAAG N708 Unknown 3185 +ACTGATCT Unknown Unknown 3119 +CATTTATC Unknown Unknown 3101 +TATAATCT Unknown Unknown 2948 +GCAACGCT N709 Unknown 2912 +CGAGGCTA N710 Unknown 2893 +TAGAGAGG N708 Unknown 2785 +CAGAGCGG N708 Unknown 2785 +GGACTCTA Unknown Unknown 2749 +CAGAGGGG N708 Unknown 2693 +TAGGCAAG N706 Unknown 2675 +CAAGAGGA Unknown Unknown 2615 +ATCTCTAC N707 Unknown 2608 +CGAGGCTT N710 Unknown 2566 +AAGGCGAT S520 Unknown 2561 +CTACGCTA Unknown Unknown 2553 +TAGGCATT N706 Unknown 2541 +GGACTACT N705 Unknown 2499 +CCAACATC Unknown Unknown 2479 +GGAATCCT N705 Unknown 2477 +CAGGAGAT Unknown Unknown 2476 +TATAAATC Unknown Unknown 2454 +GAGAGAGG N708 Unknown 2441 +CAAGCGAT Unknown Unknown 2437 +TATATATC Unknown Unknown 2436 +CTCTCTAT E502,S502,[N|S|E]502 Unknown 2419 +CAGGCGCT Unknown Unknown 2404 +ATTTTATC Unknown Unknown 2402 +GCACGCTA Unknown Unknown 2320 +CAGAGGCA N711 Unknown 2304 +CAAAGATC Unknown Unknown 2268 +GCTACCTA Unknown Unknown 2250 +CAGAGGGA Unknown Unknown 2217 +TAGGCGAT Unknown Unknown 2190 +CGAGGCAG N710 Unknown 2168 +TCATCATC Unknown Unknown 2148 +GAGGCTGA Unknown Unknown 2128 +GAGGCATG N706 Unknown 2123 +CTCTCACA Unknown Unknown 2080 +GGGCTCCT N705 Unknown 2028 +GCTCGCTA N716 Unknown 2027 +CGGAATCT Unknown Unknown 2024 +CTATCTAC N707 Unknown 2014 +TAGCATGA Unknown Unknown 2003 +GCTAGCTA Unknown Unknown 1985 +TAAATATC Unknown Unknown 1983 +GCTACGTA Unknown Unknown 1964 +CCTCTCTT Unknown Unknown 1964 +AGAGGCTG N710 Unknown 1937 +CAGGCGAA Unknown Unknown 1922 +CAGAGGAT Unknown Unknown 1888 +CTCTCTCC N707 Unknown 1872 +TCCCGATC Unknown Unknown 1863 +CAGCGAGG N708 Unknown 1857 +CAGAGAAT Unknown Unknown 1837 +CAGAAGGA Unknown Unknown 1786 +TAGGAATG N706 Unknown 1757 +AATGATAT Unknown Unknown 1745 +CGAGGATG N710 Unknown 1740 +CTCCTACA Unknown Unknown 1725 +CAACAATC Unknown Unknown 1723 +GCAGAGGA N712 Unknown 1702 +TAAGCATG N706 Unknown 1689 +TTCTCTAC N707 Unknown 1664 +GCGACGCT N709 Unknown 1664 +GTACGCTA Unknown Unknown 1663 +GTAGAGGC D502,N712 Unknown 1646 +CAGGCAAT Unknown Unknown 1641 +CAGGGGAT Unknown Unknown 1625 +AAGAGGCT N711 Unknown 1612 +CAAACAAT Unknown Unknown 1596 +AGGCGATC Unknown Unknown 1596 +CAGGCATC Unknown Unknown 1590 +CCTGATAT Unknown Unknown 1582 +TAGACATG N706 Unknown 1572 +GCTAAGCT N709 Unknown 1567 +GAACTCCT N705 Unknown 1567 +CAACTATC Unknown Unknown 1565 +CGAGGCGA Unknown Unknown 1553 +AGGCATGA Unknown Unknown 1526 +CCTTCTCT Unknown Unknown 1522 +CGAGCTGA Unknown Unknown 1519 +CATGCGAT Unknown Unknown 1508 +TGACTCCT N705 Unknown 1498 +CAGGAGGA Unknown Unknown 1498 +GTCTCTAC N707 Unknown 1487 +TACAGCAG Unknown Unknown 1474 +CTCTTACA Unknown Unknown 1469 +AAGGCATG N706 Unknown 1463 +GTAGCGGA N712 Unknown 1442 +CGAGAGGA Unknown Unknown 1433 +GAGGCGAT Unknown Unknown 1421 +TCTCTACA Unknown Unknown 1413 +CCTACGCT N709 Unknown 1402 +TACAGAAT Unknown Unknown 1395 +CATAATAT Unknown Unknown 1373 +TCTACGCT E504,N709,[N|S|E]504 Unknown 1364 +CGACTCCT N705 Unknown 1364 +ATGGCATC Unknown Unknown 1352 +TCTCCTCT E503,E504,S503,[N|S|E]503,[N|S|E]504 Unknown 1346 +GGAGGCTG N710 Unknown 1335 +CTGAGCAT Unknown Unknown 1325 +CTCTCTCA Unknown Unknown 1312 +CAAAGAGG N708 Unknown 1302 +GGTCTCCT N705 Unknown 1292 +GGCTCCTA Unknown Unknown 1285 +CGGGATAT Unknown Unknown 1284 +GTACTCCT N705 Unknown 1278 +CTCACTAC N707 Unknown 1278 +CAGCGATC Unknown Unknown 1275 +TAGGCATA N706 Unknown 1273 +TAAGCATC Unknown Unknown 1272 +CGGAGAGG N708,S511 Unknown 1266 +AGGAATAT Unknown Unknown 1261 +TAGGCGTG N706 Unknown 1259 +CAAGGCAT Unknown Unknown 1257 +ACTGAATC Unknown Unknown 1250 +CAGGCTAT S520 Unknown 1241 +CAGGCAAG Unknown Unknown 1241 +CAGACAGG N708 Unknown 1234 +GGACCCTA Unknown Unknown 1234 +TACCGAAT Unknown Unknown 1233 +AATTTTAT Unknown Unknown 1232 +TCTTCCTT Unknown Unknown 1225 +CACTCTAC N707 Unknown 1223 +CGGAAATC Unknown Unknown 1222 +CACAGCAT Unknown Unknown 1220 +AAAAGGCA N711 Unknown 1214 +ACTGCTAT Unknown Unknown 1209 +CATAGCAT Unknown Unknown 1205 +ACTGAGAT Unknown Unknown 1203 +CTTCTACA Unknown Unknown 1200 +GAAGACAT Unknown Unknown 1199 +GGACTGCT N705 Unknown 1198 +GACTCCTA Unknown Unknown 1197 +TAAAGCAT Unknown Unknown 1196 +TAGGGATG N706 Unknown 1187 +CCAAAATC Unknown Unknown 1168 +TGAGGCTG N710 Unknown 1160 +GGACTCCA N705 Unknown 1148 +ATTACTAT Unknown Unknown 1147 +CGGATATC Unknown Unknown 1135 +GGATCCTA Unknown Unknown 1133 +GTAGAGGG N712 Unknown 1123 +GCTACACT N709 Unknown 1122 +AGAGGCAA Unknown Unknown 1122 +TAGGCAGA Unknown Unknown 1120 +CAGTCGAT Unknown Unknown 1120 +CGAGGGTG N710 Unknown 1119 +AAGAGGCC N711 Unknown 1109 +ATTCATAT Unknown Unknown 1101 +GGACCCCT N705 Unknown 1100 +TCCCGCAT Unknown Unknown 1095 +CATTCTAT Unknown Unknown 1092 +AAGACGCA N711 Unknown 1086 +CAGAATAT Unknown Unknown 1070 +AAGAGCAA Unknown Unknown 1068 +AAGAGACA N711 Unknown 1065 +GTAAGGAA E505,S505,[N|S|E]505 Unknown 1062 +CCACAATC Unknown Unknown 1061 +CAGGGCAT Unknown Unknown 1057 +TACAGCAA Unknown Unknown 1052 +CAGGTGAT Unknown Unknown 1037 +TCCCTCCT Unknown Unknown 1034 +TACACATC Unknown Unknown 1033 +ACTGACAT Unknown Unknown 1032 +TAGAGGAA Unknown Unknown 1030 +CTCGCTAC N707 Unknown 1029 +GCTACTCT E504,N709,[N|S|E]504 Unknown 1027 +GGAGTCCT N705 Unknown 1025 +GGCCTCCT N705 Unknown 1022 +GCTACGCA N709 Unknown 1014 +CAATCGAT Unknown Unknown 1009 +CAGAGATG N708 Unknown 1002 +CAAGGCTG N710 Unknown 999 +CGGGGCTG N710 Unknown 999 +TAAACGAT Unknown Unknown 989 +TATACTAT Unknown Unknown 987 +TCACGAAT Unknown Unknown 977 +CAAAAATC Unknown Unknown 977 +GCTGCGCT N709 Unknown 976 +TATCGATC Unknown Unknown 971 +GGAATATC Unknown Unknown 969 +CCGGCGAT Unknown Unknown 964 +AAGGGGCA N711 Unknown 956 +TTATTCAT Unknown Unknown 956 +CACGACAT Unknown Unknown 950 +TAGGCCTG N706 Unknown 949 +TATAAGAT Unknown Unknown 943 +CTTCCTCT Unknown Unknown 942 +GACAACAT Unknown Unknown 942 +CAAGAGAT Unknown Unknown 941 +TCATTAAT Unknown Unknown 937 +TATGATAT Unknown Unknown 932 +CGGGCTGA Unknown Unknown 932 +AAGAGCAT Unknown Unknown 931 +GACGACAG Unknown Unknown 928 +GAGAGGAA Unknown Unknown 927 +CATATTAT Unknown Unknown 926 +GGCGACAT Unknown Unknown 925 +TAGGCTGA Unknown Unknown 924 +CGAGCCTG N710 Unknown 923 +CAAACGAT Unknown Unknown 922 +CCGAGAGG N708 Unknown 918 +CAGGGATG Unknown Unknown 917 +TCATTCAG Unknown Unknown 907 +TAGGCATC N706 Unknown 903 +TACGACAT Unknown Unknown 900 +CTCTTCTT Unknown Unknown 900 +TAATCGAG Unknown Unknown 898 +CCTTTTAT Unknown Unknown 897 +TATGCATG N706 Unknown 896 +GGACACCT N705 Unknown 896 +ATGAGAAT Unknown Unknown 885 +CGGAAGAT Unknown Unknown 878 +CAGAGTGG N708 Unknown 875 +ATTCCTCT Unknown Unknown 872 +GACGACAA Unknown Unknown 872 +GACGAAAT Unknown Unknown 871 +CATTTCAT Unknown Unknown 869 +TACAACAT Unknown Unknown 869 +TTCCTCCT Unknown Unknown 869 +CGAGGCAT Unknown Unknown 861 +CTCTATAC N707 Unknown 860 +CAGGCCAT Unknown Unknown 859 +ATGAGCAG N722 Unknown 858 +CGGACTAT Unknown Unknown 854 +TATAACAT Unknown Unknown 849 +TTTCCTAT Unknown Unknown 849 +ACTACGCT N709 Unknown 848 +AATGGCAT Unknown Unknown 848 +TCAATCAT Unknown Unknown 845 +GACGCCAT Unknown Unknown 843 +TGGCATGA Unknown Unknown 843 +GAGACATC Unknown Unknown 842 +CTACTATC Unknown Unknown 836 +GCTACGCG N709 Unknown 833 +CATGGCAG Unknown Unknown 832 +TAATAGAT Unknown Unknown 831 +ACAACAAT Unknown Unknown 830 +TAACGATC Unknown Unknown 825 +ATGATATC Unknown Unknown 825 +CAGAAGAT Unknown Unknown 816 +AAACTAAT Unknown Unknown 815 +TAGGCAGG N706 Unknown 813 +GACGAATC Unknown Unknown 813 +CTGGCGAT Unknown Unknown 812 +CCAGGCGA Unknown Unknown 811 +CAGAGCAT Unknown Unknown 808 +TAGGCAAT Unknown Unknown 802 +TATAAAAT Unknown Unknown 800 +GATACGAT Unknown Unknown 798 +CATTTTAG Unknown Unknown 798 +TATTTTAT Unknown Unknown 797 +CGGGCGAT Unknown Unknown 792 +GCTAGGCT N709 Unknown 790 +CGAAATAT Unknown Unknown 789 +TGATTCAT Unknown Unknown 788 +TACACCAT Unknown Unknown 788 +GTTCCTAT Unknown Unknown 786 +TCTAATAT Unknown Unknown 785 +TGGAATAT Unknown Unknown 782 +CAACTCAT Unknown Unknown 774 +TCCTTCCT Unknown Unknown 773 +GGACGCCT N705 Unknown 773 +TGGGCATG N706 Unknown 770 +CCTCTACA Unknown Unknown 770 +CCTTTTCT Unknown Unknown 765 +CAGCCGAT Unknown Unknown 763 +AAAAGATC Unknown Unknown 761 +TACAGCCT D501 Unknown 761 +CAAATAAT Unknown Unknown 759 +CAGGCGGT Unknown Unknown 758 +ATTCCGAT Unknown Unknown 757 +TACGGCAT Unknown Unknown 755 +TCATTCAA Unknown Unknown 754 +CTGAGAGG N708 Unknown 753 +TCAACAAT Unknown Unknown 753 +GCTACCCT N709 Unknown 752 +GCCACGCT N709 Unknown 750 +ACTGAAAT Unknown Unknown 749 +CATTTGAT Unknown Unknown 747 +CTCTCAAC N707 Unknown 743 +GCTATGCT N709 Unknown 737 +AACTAATC Unknown Unknown 736 +GCTACTAT Unknown Unknown 736 +CTGTCTAC N707 Unknown 736 +GTCGACAT Unknown Unknown 735 +TAATCGCT Unknown Unknown 734 +TAGGATGA Unknown Unknown 732 +AGACTCCT N705 Unknown 731 +TATAATAG Unknown Unknown 730 +CATTTAAT Unknown Unknown 726 +CATGCCAT Unknown Unknown 726 +CAGAAAGG N708 Unknown 726 +TATAGCAT D501 Unknown 723 +TATGGCAT Unknown Unknown 722 +GAGACGCT Unknown Unknown 721 +AAGAGGAT Unknown Unknown 718 +GGACTCCG N705 Unknown 718 +ATGAGCAA Unknown Unknown 717 +TCGGCATG N706 Unknown 716 +GACGAGAT Unknown Unknown 716 +GTAGAGAT Unknown Unknown 716 +AAGAGGGA N711 Unknown 713 +CGGAAAAT Unknown Unknown 711 +CCATTCAT Unknown Unknown 707 +ACCGGATC Unknown Unknown 704 +TAGCTATC Unknown Unknown 703 +ATCCTATC Unknown Unknown 701 +CATGGAAT Unknown Unknown 701 +TAAAAGAT Unknown Unknown 700 +ACTAATAT Unknown Unknown 697 +GCCGACAT Unknown Unknown 692 +TAGAGCAT Unknown Unknown 691 +CATGACAT Unknown Unknown 690 +TACAGGAT Unknown Unknown 688 +ACTGATAG Unknown Unknown 688 +CGAGGTGA Unknown Unknown 687 +TGTAATAT Unknown Unknown 685 +CGTTTTAT Unknown Unknown 685 +CCCCCCCC Unknown Unknown 684 +GCTCCGCT N709 Unknown 684 +TAAGCGAT Unknown Unknown 678 +CGAAGCTG N710 Unknown 676 +CGGCATAT Unknown Unknown 673 +TATAGTAT Unknown Unknown 669 +GGTACGCT N709 Unknown 666 +GACGACCT Unknown Unknown 664 +AAAAAAAA Unknown Unknown 664 +CAGGGGGG Unknown Unknown 661 +CAGGCGTT Unknown Unknown 659 +TCATTCCT Unknown Unknown 659 +CGGAATAG Unknown Unknown 658 +CAGTGAGG N708 Unknown 657 +CATAGAGG N708 Unknown 656 +GACAGCAT Unknown Unknown 655 +GTAAAGGA N712 Unknown 653 +TATAATAA Unknown Unknown 652 +AAAAAGAT Unknown Unknown 651 +ATAAGCAT Unknown Unknown 651 +TAACTAAT Unknown Unknown 651 +CTAGCGAT Unknown Unknown 650 +CGGAACAT Unknown Unknown 649 +GTTACGCT N709 Unknown 647 +GAAGAGAT Unknown Unknown 646 +TAATCGAA Unknown Unknown 645 +CAACTGAT Unknown Unknown 645 +AAGGAGAT Unknown Unknown 642 +CCAACGAT Unknown Unknown 640 +ATTCCCAT Unknown Unknown 635 +ACAGATAT Unknown Unknown 635 +GAAGAGAA Unknown Unknown 635 +GGACTTCT N705 Unknown 634 +TACATCAT Unknown Unknown 634 +CAGGCGAC Unknown Unknown 632 +ATTCCTAG Unknown Unknown 632 +TCAGTCAT Unknown Unknown 628 +AAGAAGCA N711 Unknown 625 +GTAGGGAA Unknown Unknown 624 +CAATTTAT Unknown Unknown 623 +AATCCTAT Unknown Unknown 623 +CAGGCAGG Unknown Unknown 623 +CATGTTAT Unknown Unknown 621 +CATTATAT Unknown Unknown 621 +AATAATAT Unknown Unknown 620 +TAATTGAT Unknown Unknown 620 +CGAGGCGG N710 Unknown 620 +CCCCCCCT Unknown Unknown 618 +TAAAATAT Unknown Unknown 617 +TAGGTATG N706 Unknown 611 +CTCTCGAC N707 Unknown 609 +CACGGATC Unknown Unknown 604 +GTCCTCCT Unknown Unknown 604 +TTTAATAT Unknown Unknown 603 +ATTCCTAA Unknown Unknown 598 +CGCGGCTG N710 Unknown 595 +CTCTCCAC N707 Unknown 595 +GGACTCTT N705 Unknown 593 +TAGGCACG N706 Unknown 591 +CACGGCAT Unknown Unknown 590 +CAGATAGG N708 Unknown 590 +GAGAGGCA N711 Unknown 589 +ATTCCAAT Unknown Unknown 589 +CAAAAAAT Unknown Unknown 589 +GCTCCTCT Unknown Unknown 589 +CATTTTCT Unknown Unknown 588 +GATGACAT Unknown Unknown 588 +CCACTAAT Unknown Unknown 585 +CACTAATC Unknown Unknown 585 +AATAGGCA N711 Unknown 585 +AAGAGGCG N711 Unknown 585 +GCACTCCT N705 Unknown 584 +CCCCTCCT Unknown Unknown 583 +TTCTCTTC Unknown Unknown 581 +GGACTCCC N705 Unknown 581 +CAGGGATC Unknown Unknown 580 +TCGTTCAT Unknown Unknown 578 +CCAACAAG Unknown Unknown 577 +TTGGCATG N706 Unknown 577 +CTATCTAT E502,S502,[N|S|E]502 Unknown 576 +CGTAATAT Unknown Unknown 574 +CCAACACT Unknown Unknown 573 +AAGGGCAA Unknown Unknown 570 +GGGAATAT Unknown Unknown 569 +ACTGATAA Unknown Unknown 569 +GCTACGCC N709 Unknown 568 +GAAGAGGT Unknown Unknown 567 +CATTTTAA Unknown Unknown 566 +TCTTTCAT Unknown Unknown 561 +TCACTCAT Unknown Unknown 561 +TAGTCATG N706 Unknown 559 +ATGGGCAT Unknown Unknown 557 +GTAGATGA N712 Unknown 557 +CTTTCTAC N707 Unknown 556 +AGGAGGCA N711 Unknown 551 +GCATTCAT Unknown Unknown 550 +ATTCGTAT Unknown Unknown 550 +CGAATATC Unknown Unknown 549 +CAACTACT Unknown Unknown 549 +ATAGCTAT Unknown Unknown 547 +CAGGCGTG Unknown Unknown 543 +CTAACTAT Unknown Unknown 543 +CTTTTTAT Unknown Unknown 542 +TCCAGAAT Unknown Unknown 541 +TTAGAGGA N712 Unknown 541 +TCATCGAT Unknown Unknown 540 +CAACTAAG Unknown Unknown 538 +CATGCTAT Unknown Unknown 537 +AAGAGCCA N711 Unknown 536 +CAAAAGAG Unknown Unknown 536 +GTGAGGAA Unknown Unknown 534 +ATTGATAT Unknown Unknown 533 +CATGGCAA Unknown Unknown 533 +TACCGCAT Unknown Unknown 533 +TAGTCGAT Unknown Unknown 531 +CGAGACTG N710 Unknown 531 +CAGGGAAG Unknown Unknown 531 +CTCCCTAC E505,N707,S505,[N|S|E]505 Unknown 531 +CGACTAAT S510 Unknown 530 +TCCCGACT Unknown Unknown 529 +ACGAGGCA N711 Unknown 526 +CAGAGACG N708 Unknown 525 +TCCTTCAT Unknown Unknown 524 +CGGCGATC Unknown Unknown 524 +ATACCTAT Unknown Unknown 523 +CCGCGCGG Unknown Unknown 523 +CATGATAT Unknown Unknown 521 +CCAACCAT Unknown Unknown 521 +CACAGGAT Unknown Unknown 520 +CAATAATC Unknown Unknown 519 +CTAGATAT Unknown Unknown 516 +CAGACAAT Unknown Unknown 515 +ATTGCTAT Unknown Unknown 515 +CCATTTTA Unknown Unknown 511 +CAAGCTAT Unknown Unknown 509 +AACGACAT Unknown Unknown 508 +TCCCGGAT Unknown Unknown 506 +ATGAGCCT Unknown Unknown 505 +TTCAGCAT Unknown Unknown 504 +CAACCAAT Unknown Unknown 502 +CCAAAAAT Unknown Unknown 500 +AACAGCAT Unknown Unknown 500 +TACAGATC Unknown Unknown 499 +GGATTCCT N705 Unknown 499 +CGCTCTAC N707 Unknown 497 +GATAAGAT Unknown Unknown 496 +GTAGGGGA N712 Unknown 493 +TCCCGAAG Unknown Unknown 493 +CATGCATC Unknown Unknown 490 +CTAACAAT Unknown Unknown 490 +TCCAGCAT Unknown Unknown 486 +AAGCGGCA N711 Unknown 485 +CTAGCTAG Unknown Unknown 485 +CATGGTAT Unknown Unknown 484 +ACTGGTAT Unknown Unknown 482 +TCATTGAT Unknown Unknown 481 +ATGACATC Unknown Unknown 480 +GCTATCTC Unknown Unknown 479 +ATTAGCAT Unknown Unknown 477 +CAAAGAAT Unknown Unknown 476 +ATTCTATC Unknown Unknown 475 +CGAGGCCG N710 Unknown 475 +GGAGAGGT Unknown Unknown 474 +TATCATAT Unknown Unknown 472 +GACGGCAT Unknown Unknown 472 +TGCAGCAT Unknown Unknown 472 +TACCGGAT Unknown Unknown 471 +GATAATAT Unknown Unknown 471 +CTAGGCTG N710 Unknown 470 +CTAGCAAT Unknown Unknown 468 +CATCTTAT Unknown Unknown 467 +CCAGCAAT Unknown Unknown 467 +CAACGGAT Unknown Unknown 465 +CAAAAGCT Unknown Unknown 465 +CACGCGAT Unknown Unknown 464 +ATGAGGAT Unknown Unknown 464 +CATGTCAT Unknown Unknown 463 +GTGGAGGA N712 Unknown 463 +CCCCGAAT Unknown Unknown 461 +TAATCAAT Unknown Unknown 459 +GAAAAGAT Unknown Unknown 457 +TACAGTAT Unknown Unknown 456 +AAGATGCA N711 Unknown 456 +CAGGCGGG Unknown Unknown 456 +GTAGAAGA N712 Unknown 455 +AAAGGCAA Unknown Unknown 455 +ATTCTTAT Unknown Unknown 454 +CATACGAT Unknown Unknown 454 +CCCAACAA Unknown Unknown 453 +ATGAGGCA N711 Unknown 452 +CGAGTCTG N710 Unknown 451 +CTCTGTAC N707 Unknown 449 +CCTCTTCC Unknown Unknown 449 +ATTCCCTA Unknown Unknown 447 +GCAACAAT Unknown Unknown 447 +CTAGCCAT Unknown Unknown 444 +CAGGGGAG Unknown Unknown 443 +CACCGAAT Unknown Unknown 442 +ATGACCAT Unknown Unknown 438 +AACCGGAT Unknown Unknown 437 +CAGAGCGT Unknown Unknown 435 +CTGGCTAT Unknown Unknown 434 +AGAGCATC Unknown Unknown 434 +CCTACGAT Unknown Unknown 434 +TCTGATAT Unknown Unknown 434 +GAGGCGCT Unknown Unknown 430 +TCCCGAAA Unknown Unknown 429 +CACCGGAG Unknown Unknown 429 +GCTACGTT N709 Unknown 429 +CATTGTAT Unknown Unknown 429 +CGGAATAA Unknown Unknown 428 +CTCTACAT Unknown Unknown 428 +CAGACGCT Unknown Unknown 427 +TAATTAAT Unknown Unknown 424 +CCAACAAA Unknown Unknown 420 +ATAGAGGA D502,N712 Unknown 420 +AAGAGTCA N711 Unknown 417 +GTAGACGA N712 Unknown 414 +CATGGGAT Unknown Unknown 414 +TCATTATC Unknown Unknown 413 +CGGAGTAT Unknown Unknown 412 +TAGAATAT Unknown Unknown 412 +CAAAGGAT Unknown Unknown 412 +GAAAAGGA Unknown Unknown 411 +GCAGAGGT Unknown Unknown 411 +CTACTAAT Unknown Unknown 411 +GGACTCGT N705 Unknown 411 +CAGGGCGG Unknown Unknown 409 +GAACTAAT Unknown Unknown 408 +TAATCCAT Unknown Unknown 408 +CGTGGCTG N710 Unknown 407 +CCTTTCCT D503 Unknown 406 +TCCGAATC Unknown Unknown 405 +CGACTCAT Unknown Unknown 404 +CACCGGCT Unknown Unknown 402 +TCTCGAAT Unknown Unknown 401 +GAAGAGGG Unknown Unknown 398 +CTCTCTGC N707 Unknown 397 +GCTGATAT Unknown Unknown 396 +CAGACAAG Unknown Unknown 396 +GATGGCAT Unknown Unknown 395 +CGGTATAT Unknown Unknown 394 +GTATAGGA N712 Unknown 394 +CAACTAAA Unknown Unknown 393 +CCTGGCAT Unknown Unknown 393 +CATACGCT Unknown Unknown 392 +CTGAATAT Unknown Unknown 391 +GATTTTAT Unknown Unknown 390 +CATTTTTA Unknown Unknown 390 +TACGCATG N706 Unknown 387 +TCATCCAT Unknown Unknown 387 +TCCTCTCT Unknown Unknown 387 +TAATGGAT Unknown Unknown 387 +CAATTCAT Unknown Unknown 386 +CTTTCTAT E502,S502,[N|S|E]502 Unknown 386 +CAGCTAAT Unknown Unknown 386 +GCTACGGT N709 Unknown 385 +CATGGCCT Unknown Unknown 383 +CAGGATCT Unknown Unknown 383 +GAGGACAT Unknown Unknown 382 +AAGAGAGA Unknown Unknown 382 +GAGGAGAT Unknown Unknown 382 +CAAAAGAA Unknown Unknown 381 +ATGAACAT Unknown Unknown 380 +CCAAGAAT Unknown Unknown 379 +TCATTTAT Unknown Unknown 379 +GAGCTCCT Unknown Unknown 378 +CCAATAAT Unknown Unknown 377 +TAGAGAAG Unknown Unknown 375 +GACGATAT Unknown Unknown 375 +CGTGGCAT Unknown Unknown 375 +GAGGAGGA Unknown Unknown 375 +CCAACTAA Unknown Unknown 374 +GTAGAGTA N712 Unknown 371 +CTCTTTAC E505,N707,S505,[N|S|E]505 Unknown 371 +GCAACGAT Unknown Unknown 369 +ATGAGTAT Unknown Unknown 368 +ACAGCATC Unknown Unknown 366 +CAGAGGGT Unknown Unknown 366 +CAAAATAT Unknown Unknown 365 +TAAAGAAT Unknown Unknown 365 +CAGGCTGA Unknown Unknown 363 +GAACTCAT Unknown Unknown 363 +GACGCTAT Unknown Unknown 357 +TAATCTAT Unknown Unknown 355 +CATTGCAT Unknown Unknown 355 +CAACAGAT Unknown Unknown 354 +GGAATCAT Unknown Unknown 353 +ACTTATAT Unknown Unknown 352 +CCCCCTCT Unknown Unknown 352 +GAATCGAT Unknown Unknown 351 +AAATCGAT Unknown Unknown 351 +CATTATCT Unknown Unknown 349 +CAGAGATC Unknown Unknown 348 +CAGGGCTG Unknown Unknown 348 +TAGAGGCA N711 Unknown 347 +CAGGGCGA Unknown Unknown 347 +TCCTCCCT Unknown Unknown 346 +CCGACAAT Unknown Unknown 345 +GAAGGGGA Unknown Unknown 345 +CAGAGGAA Unknown Unknown 345 +GTAGAGCA N712 Unknown 345 +TACGCTAT Unknown Unknown 344 +GCTAAGAT Unknown Unknown 343 +CGAGGTTG N710 Unknown 342 +GTCTCTTC Unknown Unknown 342 +TAGAGATG Unknown Unknown 341 +TTATCGAT Unknown Unknown 341 +TACGCATC Unknown Unknown 341 +TCCCGTAT Unknown Unknown 341 +CATTCATC Unknown Unknown 339 +TGATCGAT Unknown Unknown 339 +CAGAGTAG Unknown Unknown 337 +CAACGAAT Unknown Unknown 337 +CACCGATC Unknown Unknown 337 +ATGCGCAT N722 Unknown 336 +CATCTCGT Unknown Unknown 334 +CATGGATC Unknown Unknown 334 +CAGGGAAT Unknown Unknown 333 +CGAAAGAT Unknown Unknown 333 +ATTCCATC Unknown Unknown 333 +CACCCGAT Unknown Unknown 332 +GGACCTAT Unknown Unknown 332 +CATAGTAT Unknown Unknown 332 +ACATTCAT Unknown Unknown 331 +ACTCTTCC Unknown Unknown 331 +CTACCTAT Unknown Unknown 329 +GAAGCGGA Unknown Unknown 329 +CAAGCAAT Unknown Unknown 328 +CACCGCAT Unknown Unknown 328 +CAGACATG Unknown Unknown 327 +TCCCCGAA D710 Unknown 326 +GGAATGCT Unknown Unknown 325 +AAGAGGTA N711 Unknown 325 +TAATCATC Unknown Unknown 325 +CCTGCTAT Unknown Unknown 325 +CTCTCTCT E502,S502,[N|S|E]502 Unknown 323 +GAGAGGGG Unknown Unknown 322 +CTAGCTAA Unknown Unknown 322 +ACTGTATC Unknown Unknown 322 +ATTTCTAT Unknown Unknown 321 +CAAGTAAT Unknown Unknown 320 +CAGAGGAG Unknown Unknown 319 +ATGAGCCA Unknown Unknown 318 +TAGGCCAT Unknown Unknown 318 +ACTCCTAT Unknown Unknown 318 +ATTAATAT Unknown Unknown 316 +CTAGAGGA N712 Unknown 316 +ATGAGATC Unknown Unknown 316 +CCGGAATA Unknown Unknown 315 +CATAAGAT Unknown Unknown 315 +ATTCATCT Unknown Unknown 315 +CAGGCGGA Unknown Unknown 313 +TAGCCATG N706 Unknown 312 +CAGCGAGT Unknown Unknown 311 +CAGCGGAT Unknown Unknown 310 +CTAGCTAC Unknown Unknown 309 +TAAATCAT Unknown Unknown 308 +GCCGCTAT D502 Unknown 307 +TAACCGAT Unknown Unknown 307 +TACAGCCA Unknown Unknown 307 +CAGGCCGA Unknown Unknown 306 +CTTGCTAT Unknown Unknown 304 +GATAGGCT Unknown Unknown 304 +TCGGCCTG Unknown Unknown 302 +CAGATCTC Unknown Unknown 302 +GAGGGAGG Unknown Unknown 301 +CAGAGTAT Unknown Unknown 301 +TATGCATT E506,S506,[N|S|E]506 Unknown 300 +AAGAGACG Unknown Unknown 299 +TAGTTCAT Unknown Unknown 297 +CCTCCTCT Unknown Unknown 297 +TAGGGAGG Unknown Unknown 296 +ATGATCAT Unknown Unknown 296 +AGAGGATG E503,S503,[N|S|E]503 Unknown 294 +ACTATATC Unknown Unknown 292 +TACAATAT Unknown Unknown 292 +CATCCTAT Unknown Unknown 291 +CGAGTCTT Unknown Unknown 291 +GCCCGAAT Unknown Unknown 291 +CCAACTAT Unknown Unknown 290 +TACAGAAG Unknown Unknown 290 +CATAGAGT S516 Unknown 289 +GACTACAT Unknown Unknown 288 +GACGCATC Unknown Unknown 288 +TTGAGCAT Unknown Unknown 287 +GTGAGCAT Unknown Unknown 285 +TCCCCAAT Unknown Unknown 285 +TAGGAAAG Unknown Unknown 284 +TCATACAT Unknown Unknown 284 +CAAAACAT Unknown Unknown 283 +GGCCTCAT Unknown Unknown 282 +CAGAGCAG Unknown Unknown 282 +TAGAGAAT Unknown Unknown 281 +TCTTCATC Unknown Unknown 280 +CATACTAT Unknown Unknown 280 +GGAATAAT Unknown Unknown 279 +ACGATATC Unknown Unknown 278 +TCCCGATA Unknown Unknown 277 +AAGAGAAG Unknown Unknown 277 +AGTCCTAT Unknown Unknown 276 +GTAGCGGT Unknown Unknown 276 +GACCACAT Unknown Unknown 276 +CCAAAGAT Unknown Unknown 275 +GTAGCTAT Unknown Unknown 274 +AGTGATAT Unknown Unknown 273 +CTTTTATC Unknown Unknown 273 +TATTCGAT Unknown Unknown 273 +GGACTAAT Unknown Unknown 273 +CACCGGAA Unknown Unknown 272 +GAAGAAGA Unknown Unknown 272 +CCATGGCA Unknown Unknown 271 +TCATTCCA Unknown Unknown 271 +TTAGCTAT Unknown Unknown 270 +TATACGAT Unknown Unknown 270 +CATCTAAT S510 Unknown 269 +CATCGCAT Unknown Unknown 268 +GGAATACT Unknown Unknown 268 +AAGGCAAT S520 Unknown 267 +ATTTCCTA Unknown Unknown 267 +ATGCCTAT Unknown Unknown 267 +TCTCCGCT Unknown Unknown 267 +CAGGAGAG Unknown Unknown 266 +GCTGCTAT Unknown Unknown 265 +TCGCGAAT Unknown Unknown 265 +ATCTCGTA Unknown Unknown 265 +GAGGAAGG Unknown Unknown 264 +GGACTTAT Unknown Unknown 264 +TATATTAT Unknown Unknown 263 +TTCCTATC Unknown Unknown 262 +CAAGCATG Unknown Unknown 261 +CAACTCCT Unknown Unknown 260 +CTTCTCCT D505 Unknown 260 +AAGGCGCA Unknown Unknown 259 +CCACCAAT Unknown Unknown 259 +CCAAAAGA Unknown Unknown 258 +GCGACATC Unknown Unknown 257 +CAGAAAAT Unknown Unknown 256 +CATAGAAT Unknown Unknown 256 +ATCAGCAT Unknown Unknown 256 +AGGAGCAT Unknown Unknown 256 +CGGGGATG Unknown Unknown 256 +TACAGCAC Unknown Unknown 255 +CGAACAAT Unknown Unknown 254 +ACGGATAT Unknown Unknown 254 +CGATGCTG N710 Unknown 253 +CAGTCTAT Unknown Unknown 252 +CTAGCTCT Unknown Unknown 251 +CTTGGCAT Unknown Unknown 251 +TAGAGGAT Unknown Unknown 251 +CAAAGCAT Unknown Unknown 251 +CTCGCTAT E502,S502,[N|S|E]502 Unknown 250 +CCGGCGCT Unknown Unknown 250 +GCGCTATC Unknown Unknown 249 +AAGAGAAT Unknown Unknown 248 +CACAGAGG N708 Unknown 247 +TATTATAT Unknown Unknown 247 +TCCCAAAT Unknown Unknown 247 +CTCTCTTC N707 Unknown 246 +CATTCCTA Unknown Unknown 246 +GGAGTCAT Unknown Unknown 245 +GGAGACAT Unknown Unknown 245 +CAATTAAT Unknown Unknown 244 +CTATAATA Unknown Unknown 243 +ATTCCTAC Unknown Unknown 243 +GATCTCGT Unknown Unknown 242 +GAGAGGAT Unknown Unknown 242 +TAGGCGTT Unknown Unknown 241 +CATTCGAT Unknown Unknown 241 +CACCGTAT Unknown Unknown 241 +AAGGCGAA Unknown Unknown 241 +CAACAAAT Unknown Unknown 241 +CTAGGTAT Unknown Unknown 240 +CAAGGAGG Unknown Unknown 240 +TAGACGAT Unknown Unknown 240 +CGAGGATC Unknown Unknown 239 +GAGAGGCT Unknown Unknown 238 +GAAGCGAT Unknown Unknown 238 +TGACTCAT Unknown Unknown 238 +TAGGCTTG N706 Unknown 237 +GGGATCCT Unknown Unknown 237 +TACAGCGT Unknown Unknown 237 +CGGGAGAT Unknown Unknown 237 +TCCCTAAT Unknown Unknown 236 +CAGAGAGC N708 Unknown 236 +TACTCGAT Unknown Unknown 236 +CAGGGCAG Unknown Unknown 236 +CTCACTAT E502,S502,[N|S|E]502 Unknown 236 +GCCACGAT Unknown Unknown 236 +GGAGACCT Unknown Unknown 236 +GACCGGAT Unknown Unknown 235 +TAGACAGG Unknown Unknown 235 +CCCTCTAC N707 Unknown 235 +TACTTCAT Unknown Unknown 234 +CAAAGAAG Unknown Unknown 234 +GTAGGAAT Unknown Unknown 234 +CAGAGGCG Unknown Unknown 234 +CACTCTAT E502,S502,[N|S|E]502 Unknown 233 +TATTCATC Unknown Unknown 233 +TCATTTCA Unknown Unknown 233 +GAAGAAAT Unknown Unknown 233 +AACAGGCA N711 Unknown 233 +CGAGGAGG Unknown Unknown 232 +GGATTCAT Unknown Unknown 232 +GGAGAGGG Unknown Unknown 231 +CAGTTTAT Unknown Unknown 231 +ATATATAA Unknown Unknown 231 +CCCGAATC Unknown Unknown 230 +CAGGAGAA Unknown Unknown 229 +TCCTATCT Unknown Unknown 228 +CAGGCGCA Unknown Unknown 228 +GGAACGCT Unknown Unknown 227 +TAACTCAT Unknown Unknown 227 +GACGTCAT Unknown Unknown 227 +TATCTCGT Unknown Unknown 226 +TCATGCAT Unknown Unknown 226 +CAGAAAAG Unknown Unknown 225 +ATTCTCCT Unknown Unknown 225 +ATAATATC Unknown Unknown 225 +CAGAGACA Unknown Unknown 225 +GCGACGCG Unknown Unknown 225 +CAGAGGCT Unknown Unknown 224 +ACGACATC Unknown Unknown 224 +CAGGATAT Unknown Unknown 224 +AAGACGCT Unknown Unknown 224 +ACTTCTCT Unknown Unknown 224 +TACACGAT Unknown Unknown 223 +CACGCCAT Unknown Unknown 223 +CAAGGCAG Unknown Unknown 222 +GAGGCATT Unknown Unknown 220 +CTCATTCA Unknown Unknown 220 +TAATCGGT Unknown Unknown 220 +TCAGCATC Unknown Unknown 219 +CAGTCTAC Unknown Unknown 219 +ACTGTTAT Unknown Unknown 219 +ACCCGAAT Unknown Unknown 219 +TAAAGGAT Unknown Unknown 219 +GGAGGCTT Unknown Unknown 218 +CTTACTAT Unknown Unknown 217 +CACGGGAT Unknown Unknown 217 +CTACAGCA Unknown Unknown 216 +GAAGCATG Unknown Unknown 215 +TAGGCCTT Unknown Unknown 215 +GAACTACT Unknown Unknown 214 +CAGGCATT Unknown Unknown 214 +CAACTAGT Unknown Unknown 214 +GACGACCA Unknown Unknown 213 +CCTACAAT Unknown Unknown 213 +CTGATATC Unknown Unknown 213 +TATTTCAT Unknown Unknown 212 +ATCCCTAT Unknown Unknown 212 +TAAGCATT Unknown Unknown 211 +CAGCGAAT Unknown Unknown 210 +TTCCGAAT Unknown Unknown 210 +ACCGATAT Unknown Unknown 210 +GACGACGT Unknown Unknown 210 +CAGACGGT Unknown Unknown 209 +CAACCGAT Unknown Unknown 209 +TCCATTCA Unknown Unknown 208 +GTAGCGAA Unknown Unknown 208 +TATAATGT Unknown Unknown 208 +CTCCCTAT E502,S502,[N|S|E]502 Unknown 207 +GCAAAGCT Unknown Unknown 207 +GACGAAAG Unknown Unknown 206 +CAAGCCAT Unknown Unknown 206 +TAATCGAC Unknown Unknown 205 +GGAGACGA Unknown Unknown 205 +CAGATAGT Unknown Unknown 205 +CCTAGCTA Unknown Unknown 205 +CATGGCCA Unknown Unknown 205 +CATTTTTT Unknown Unknown 204 +CCAGCGAT Unknown Unknown 204 +CCTACTAT Unknown Unknown 204 +GCAACACT Unknown Unknown 203 +ACTCATAT Unknown Unknown 202 +GTAGATAA Unknown Unknown 202 +GATGATAT Unknown Unknown 202 +CATGAGAT Unknown Unknown 202 +GACACATC Unknown Unknown 202 +TGAGCATC Unknown Unknown 201 +CCGACGCT Unknown Unknown 201 +GTACAGGA N712 Unknown 200 +CGAGGCCT Unknown Unknown 200 +GTAGTGGA N712 Unknown 200 +CAAGTCAT Unknown Unknown 200 +CTCTCTAG E502,N707,S502,[N|S|E]502 Unknown 200 +TAGGTGAT Unknown Unknown 200 +GAGAGAGT Unknown Unknown 199 +GCAGCGCT Unknown Unknown 198 +AGGCGATA D501 Unknown 198 +CCCCTTCT Unknown Unknown 198 +CATCGGAT Unknown Unknown 197 +CTAGTTAT Unknown Unknown 197 +TAATCGTT Unknown Unknown 196 +TATACGCT Unknown Unknown 196 +GGTCCTAT Unknown Unknown 195 +TAAACAAT Unknown Unknown 195 +GCTCCTAT Unknown Unknown 195 +GGAAGCCT Unknown Unknown 195 +CGACGCTG N710 Unknown 195 +TAAGTCAT Unknown Unknown 194 +TATAAATA Unknown Unknown 194 +CATTTTAC Unknown Unknown 193 +CTAATCGA A505 Unknown 193 +GTTACGAT Unknown Unknown 193 +CACTGATA Unknown Unknown 193 +TCCTGAAT Unknown Unknown 193 +CAACTTAT Unknown Unknown 192 +CACAGAAT Unknown Unknown 192 +GGACTATC Unknown Unknown 192 +CATGAGCA Unknown Unknown 192 +CGGATCTC Unknown Unknown 192 +CTAGTATC Unknown Unknown 192 +ATAGCATC Unknown Unknown 192 +GCCTCCCT Unknown Unknown 192 +GTACTCAT Unknown Unknown 191 +ACCTGATA Unknown Unknown 191 +GAGGAATG Unknown Unknown 191 +TAAGGCAT Unknown Unknown 191 +CACCAGAT Unknown Unknown 190 +TAATATCT Unknown Unknown 189 +CCAACGCT Unknown Unknown 188 +CACTTTAT Unknown Unknown 187 +ATCTCTAA Unknown Unknown 187 +GGGCTACT Unknown Unknown 187 +TCTACGAT Unknown Unknown 186 +GTCTCTAT D502,E502,S502,[N|S|E]502 Unknown 186 +ATGAGCAC Unknown Unknown 186 +TCATTCAC Unknown Unknown 185 +CGAGCTAT Unknown Unknown 185 +TATGCTAT Unknown Unknown 185 +TACTGCAT Unknown Unknown 185 +CGAGGAAG Unknown Unknown 185 diff --git a/test/input/TestIlluminaBarcodeHelper/single_index/expected.txt b/test/input/TestIlluminaBarcodeHelper/single_index/expected.txt new file mode 100644 index 000000000..57f3b5e2c --- /dev/null +++ b/test/input/TestIlluminaBarcodeHelper/single_index/expected.txt @@ -0,0 +1,5 @@ +sample_name expected_barcode_1 expected_barcode_1_name guessed_barcode_1 guessed_barcode_1_name match_type +EBOV-VP24UTR_G10218A-C10220U.DMSO CAAAAGAT alternative_indices_uncertain +EBOV-VP24UTR_G10218A-C10242U.DMSO CACCGGAT alternative_indices_uncertain +EBOV-VP24UTR_G10218A-C10242U.NMIA CTAGCTAT alternative_indices_uncertain +EBOV-VP24UTR_G10218A-C10242U.DENAT TCCCGAAT alternative_indices_uncertain diff --git a/test/input/TestIlluminaBarcodeHelper/single_index/metrics.txt b/test/input/TestIlluminaBarcodeHelper/single_index/metrics.txt new file mode 100644 index 000000000..2b764f1e7 --- /dev/null +++ b/test/input/TestIlluminaBarcodeHelper/single_index/metrics.txt @@ -0,0 +1,36 @@ +## htsjdk.samtools.metrics.StringHeader +# ExtractIlluminaBarcodes BASECALLS_DIR=tmp/00_flowcells/171110_M04004_0212_000000000-BDTT8/Data/Intensities/BaseCalls OUTPUT_DIR=/tmp/8574290.1.broad/tmp-illumina-illumina_demux-8574290-0-5ctjr6gn/extracted_barcodes-qt71mddp LANE=1 READ_STRUCTURE=150T8B150T BARCODE_FILE=/tmp/8574290.1.broad/tmp-illumina-illumina_demux-8574290-0-5ctjr6gn/barcodeData.BDTT8.1x4fc_7rw.txt METRICS_FILE=/idi/sabeti-scratch/alin/software/viral-ngs-etc/projects/190206-LCMV-C2c2/reports/barcodes/barcodes-metrics-BDTT8.1.txt MAX_MISMATCHES=0 MINIMUM_BASE_QUALITY=20 NUM_PROCESSORS=0 MIN_MISMATCH_DELTA=1 MAX_NO_CALLS=2 MINIMUM_QUALITY=2 COMPRESS_OUTPUTS=false VERBOSITY=INFO QUIET=false VALIDATION_STRINGENCY=STRICT COMPRESSION_LEVEL=5 MAX_RECORDS_IN_RAM=500000 CREATE_INDEX=false CREATE_MD5_FILE=false GA4GH_CLIENT_SECRETS=client_secrets.json USE_JDK_DEFLATER=false USE_JDK_INFLATER=false +## htsjdk.samtools.metrics.StringHeader +# Started on: Sat Feb 16 00:09:44 EST 2019 + +## METRICS CLASS picard.illumina.ExtractIlluminaBarcodes$BarcodeMetric +BARCODE BARCODE_WITHOUT_DELIMITER BARCODE_NAME LIBRARY_NAME READS PF_READS PERFECT_MATCHES PF_PERFECT_MATCHES ONE_MISMATCH_MATCHES PF_ONE_MISMATCH_MATCHES PCT_MATCHES RATIO_THIS_BARCODE_TO_BEST_BARCODE_PCT PF_PCT_MATCHES PF_RATIO_THIS_BARCODE_TO_BEST_BARCODE_PCT PF_NORMALIZED_MATCHES +GGACTCCT GGACTCCT pTOPO-LCMV-Arm-emGFP-S-NoEnds_P3-IC-170825-cloned_171103 pTOPO-LCMV-Arm-emGFP-S-NoEnds_P3-IC-170825-cloned_171103.l1 1282976 1239382 1282976 1239382 0 0 0.060095 0.655219 0.065356 0.652624 1.904065 +TAGGCATG TAGGCATG pcDNA34-MGC-ADAR2-V5 pcDNA34-MGC-ADAR2-V5.l1 1192766 1159442 1192766 1159442 0 0 0.055869 0.609149 0.06114 0.61053 1.781253 +CTCTCTAC CTCTCTAC pT7CFE-MGC-ADAR2 pT7CFE-MGC-ADAR2.l1 1051975 1009354 1051975 1009354 0 0 0.049275 0.537247 0.053226 0.531497 1.550672 +CAGAGAGG CAGAGAGG p20_3E5E_fLuc2 p20_3E5E_fLuc2.l1 1958086 1899076 1958086 1899076 0 0 0.091717 1 0.100143 1 2.917554 +GCTACGCT GCTACGCT p20_3E5E_rLuc-fLuc2_6 p20_3E5E_rLuc-fLuc2_6.l1 1184157 1158385 1184157 1158385 0 0 0.055466 0.604752 0.061084 0.609973 1.779629 +CGAGGCTG CGAGGCTG p20_3E5E_rLuc-fLuc2_5 p20_3E5E_rLuc-fLuc2_5.l1 941504 917972 941504 917972 0 0 0.0441 0.480829 0.048407 0.483378 1.410282 +AAGAGGCA AAGAGGCA p20_3E5E_rLuc-fLuc2_7 p20_3E5E_rLuc-fLuc2_7.l1 765762 744083 765762 744083 0 0 0.035868 0.391077 0.039237 0.391813 1.143136 +GTAGAGGA GTAGAGGA p20_3E5E_rLuc-fLuc2_8 p20_3E5E_rLuc-fLuc2_8.l1 940017 917850 940017 917850 0 0 0.044031 0.480069 0.0484 0.483314 1.410095 +ACTGATAT ACTGATAT EBOV-VP24UTR_G10218.DMSO EBOV-VP24UTR_G10218.DMSO.l1 538711 517696 538711 517696 0 0 0.025233 0.275121 0.027299 0.272604 0.795337 +ATGAGCAT ATGAGCAT EBOV-VP24UTR_G10218A.DMSO EBOV-VP24UTR_G10218A.DMSO.l1 382966 366987 382966 366987 0 0 0.017938 0.195582 0.019352 0.193245 0.563803 +ATTCCTAT ATTCCTAT EBOV-VP24UTR_G10218A-A10232G.DMSO EBOV-VP24UTR_G10218A-A10232G.DMSO.l1 490275 472589 490275 472589 0 0 0.022965 0.250385 0.024921 0.248852 0.726039 +CAAAAGAT CAAAAGAT EBOV-VP24UTR_G10218A-C10220U.DMSO EBOV-VP24UTR_G10218A-C10220U.DMSO.l1 264750 255281 264750 255281 0 0 0.012401 0.135209 0.013462 0.134424 0.392189 +CAACTAAT CAACTAAT EBOV-VP24UTR_G10218A-A10240G.DMSO EBOV-VP24UTR_G10218A-A10240G.DMSO.l1 353697 337630 353697 337630 0 0 0.016567 0.180634 0.017804 0.177786 0.518702 +CACCGGAT CACCGGAT EBOV-VP24UTR_G10218A-C10242U.DMSO EBOV-VP24UTR_G10218A-C10242U.DMSO.l1 187070 179344 187070 179344 0 0 0.008762 0.095537 0.009457 0.094438 0.275526 +CAGGCGAT CAGGCGAT EBOV-VP24UTR_G10218.NMIA EBOV-VP24UTR_G10218.NMIA.l1 1179793 1125673 1179793 1125673 0 0 0.055262 0.602524 0.059359 0.592748 1.729373 +CATGGCAT CATGGCAT EBOV-VP24UTR_G10218A.NMIA EBOV-VP24UTR_G10218A.NMIA.l1 386930 371948 386930 371948 0 0 0.018124 0.197606 0.019614 0.195857 0.571424 +CATTTTAT CATTTTAT EBOV-VP24UTR_G10218A-A10232G.NMIA EBOV-VP24UTR_G10218A-A10232G.NMIA.l1 561855 537740 561855 537740 0 0 0.026317 0.286941 0.028356 0.283159 0.826131 +CCAACAAT CCAACAAT EBOV-VP24UTR_G10218A-C10220U.NMIA EBOV-VP24UTR_G10218A-C10220U.NMIA.l1 395259 379142 395259 379142 0 0 0.018514 0.20186 0.019993 0.199646 0.582477 +CGGAATAT CGGAATAT EBOV-VP24UTR_G10218A-A10240G.NMIA EBOV-VP24UTR_G10218A-A10240G.NMIA.l1 487124 470904 487124 470904 0 0 0.022817 0.248776 0.024832 0.247965 0.723451 +CTAGCTAT CTAGCTAT EBOV-VP24UTR_G10218A-C10242U.NMIA EBOV-VP24UTR_G10218A-C10242U.NMIA.l1 268529 257633 268529 257633 0 0 0.012578 0.137139 0.013586 0.135662 0.395802 +GACGACAT GACGACAT EBOV-VP24UTR_G10218.DENAT EBOV-VP24UTR_G10218.DENAT.l1 471442 454685 471442 454685 0 0 0.022082 0.240767 0.023977 0.239424 0.698533 +TAATCGAT TAATCGAT EBOV-VP24UTR_G10218A.DENAT EBOV-VP24UTR_G10218A.DENAT.l1 349709 334453 349709 334453 0 0 0.01638 0.178597 0.017637 0.176114 0.513821 +TACAGCAT TACAGCAT EBOV-VP24UTR_G10218A-A10232G.DENAT EBOV-VP24UTR_G10218A-A10232G.DENAT.l1 588492 559195 588492 559195 0 0 0.027565 0.300545 0.029488 0.294456 0.859092 +TATAATAT TATAATAT EBOV-VP24UTR_G10218A-C10220U.DENAT EBOV-VP24UTR_G10218A-C10220U.DENAT.l1 482453 460095 482453 460095 0 0 0.022598 0.24639 0.024262 0.242273 0.706845 +TCATTCAT TCATTCAT EBOV-VP24UTR_G10218A-A10240G.DENAT EBOV-VP24UTR_G10218A-A10240G.DENAT.l1 541051 519095 541051 519095 0 0 0.025343 0.276316 0.027373 0.273341 0.797487 +TCCCGAAT TCCCGAAT EBOV-VP24UTR_G10218A-C10242U.DENAT EBOV-VP24UTR_G10218A-C10242U.DENAT.l1 287559 278125 287559 278125 0 0 0.013469 0.146857 0.014666 0.146453 0.427284 +NNNNNNNN NNNNNNNN 3814272 2039896 0 0 0 0 0.178661 1.947959 0.107569 1.074152 0 + + diff --git a/test/unit/test_illumina.py b/test/unit/test_illumina.py index 30e08753c..d4df77b2e 100644 --- a/test/unit/test_illumina.py +++ b/test/unit/test_illumina.py @@ -264,6 +264,20 @@ def test_ambiguous(self): self.assertEqualContents(out_report, expected) + def test_single_index_run(self): + dir_prefix = "single_index" + in_dir = util.file.get_test_input_path(self) + in_barcodes = os.path.join(in_dir,dir_prefix,"barcodes.txt") + in_metrics = os.path.join(in_dir,dir_prefix,"metrics.txt") + out_report = util.file.mkstempfname('.txt') + expected = os.path.join(in_dir,dir_prefix,"expected.txt") + + args = [in_barcodes, in_metrics, out_report] + args = illumina.parser_guess_barcodes(argparse.ArgumentParser()).parse_args(args) + args.func_main(args) + + self.assertEqualContents(out_report, expected) + def test_few_assigned(self): dir_prefix = "few_assigned" in_dir = util.file.get_test_input_path(self) diff --git a/util/illumina_indices.py b/util/illumina_indices.py index cdb9e9d3b..554147c7f 100755 --- a/util/illumina_indices.py +++ b/util/illumina_indices.py @@ -576,7 +576,7 @@ def __init__(self, barcode_counts, picard_metrics, sample_name, rows_limit=1000) for row in util.file.read_tabfile_dict(picard_metrics, skip_prefix="#"): barcodes = tuple(row["BARCODE"].split("-")) if "BARCODE_NAME" in row: - self.sample_to_barcodes[row["BARCODE_NAME"]] = barcodes + self.sample_to_barcodes[row["BARCODE_NAME"]] = barcodes self.samples.append(row["BARCODE_NAME"]) self.sample_to_read_counts[row["BARCODE_NAME"]] = int(row["READS"]) elif all(re.match(r'^N+$',barcode) for barcode in barcodes): @@ -587,9 +587,11 @@ def __init__(self, barcode_counts, picard_metrics, sample_name, rows_limit=1000) #Barcode1 Likely_Index_Names1 Barcode2 Likely_Index_Names2 Count #CTCTCTAC N707 AAGGAGTA S507,[N|S|E]507 40324834 for row in util.file.read_tabfile_dict(barcode_counts, rowcount_limit=rows_limit): - self.barcodes_seen[(row["Barcode1"],row["Barcode2"])] = int(row["Count"]) + if (row["Barcode1"],row.get("Barcode2",None)) not in self.barcodes_seen: + self.barcodes_seen[(row["Barcode1"],row.get("Barcode2",None))] = int(row["Count"]) self.barcode_name_map[row["Barcode1"]] = row["Likely_Index_Names1"] - self.barcode_name_map[row["Barcode2"]] = row["Likely_Index_Names2"] + if "Barcode2" in row and row["Barcode2"]: + self.barcode_name_map[row["Barcode2"]] = row["Likely_Index_Names2"] def outlier_barcodes(self, outlier_threshold=0.675, expected_assigned_fraction=0.7, number_of_negative_controls=1): """ @@ -711,30 +713,35 @@ def guess_barcodes_for_sample(self, sample_name): else: del barcodes_seen_novel[barcode_pair] + is_dual_index = len(self.sample_to_barcodes[sample_name]) > 1 + out_dict["expected_barcode_1"] = self.sample_to_barcodes[sample_name][0] out_dict["expected_barcode_1_name"] = ",".join(self.index_reference.guess_index(self.sample_to_barcodes[sample_name][0])) - out_dict["expected_barcode_2"] = self.sample_to_barcodes[sample_name][1] - out_dict["expected_barcode_2_name"] = ",".join(self.index_reference.guess_index(self.sample_to_barcodes[sample_name][1])) - out_dict["expected_barcodes_read_count"] = self.sample_to_read_counts[sample_name] + if is_dual_index: + out_dict["expected_barcode_2"] = self.sample_to_barcodes[sample_name][1] + out_dict["expected_barcode_2_name"] = ",".join(self.index_reference.guess_index(self.sample_to_barcodes[sample_name][1])) + out_dict["expected_barcodes_read_count"] = self.sample_to_read_counts[sample_name] claimed_barcodes = self.sample_to_barcodes[sample_name] found_partial_match = False putative_match = None - # barcodes_seen_novel is sorted by read count, desc - for (barcode_pair,count) in barcodes_seen_novel.items(): - if barcode_pair[0]==claimed_barcodes[0] or barcode_pair[1]==claimed_barcodes[1]: - found_partial_match=True - putative_match = barcode_pair - out_dict["match_type"] = "one_barcode_match" - break - # find index of match to help determine if it is a reasonable guess - idx_of_match = -1 - for (idx,(barcode_pair,count)) in enumerate(self.barcodes_seen.items()): - if barcode_pair==putative_match: - idx_of_match=idx - break + if is_dual_index: + # barcodes_seen_novel is sorted by read count, desc + for (barcode_pair,count) in barcodes_seen_novel.items(): + if barcode_pair[0]==claimed_barcodes[0] or barcode_pair[1]==claimed_barcodes[1]: + found_partial_match=True + putative_match = barcode_pair + out_dict["match_type"] = "one_barcode_match" + break + + # find index of match to help determine if it is a reasonable guess + idx_of_match = -1 + for (idx,(barcode_pair,count)) in enumerate(self.barcodes_seen.items()): + if barcode_pair==putative_match: + idx_of_match=idx + break # if the one-barcode match is too far down the list of barcode pairs seen # (farther down than 1.5x the number of samples) @@ -747,9 +754,10 @@ def guess_barcodes_for_sample(self, sample_name): out_dict["guessed_barcode_1"] = putative_match[0] out_dict["guessed_barcode_1_name"] = self.barcode_name_map[putative_match[0]] - out_dict["guessed_barcode_2"] = putative_match[1] - out_dict["guessed_barcode_2_name"] = self.barcode_name_map[putative_match[1]] - out_dict["guessed_barcodes_read_count"] = self.barcodes_seen[(putative_match[0],putative_match[1])] + if is_dual_index: + out_dict["guessed_barcode_2"] = putative_match[1] + out_dict["guessed_barcode_2_name"] = self.barcode_name_map[putative_match[1]] + out_dict["guessed_barcodes_read_count"] = self.barcodes_seen[(putative_match[0],putative_match[1])] return out_dict @@ -781,7 +789,7 @@ def find_uncertain_barcodes(self, sample_names=None, outlier_threshold=0.675, ex consolidated_guesses = defaultdict(list) for row in guessed_barcodes: - consolidated_guesses[(row["guessed_barcode_1"],row["guessed_barcode_2"])].append(row) + consolidated_guesses[(row["guessed_barcode_1"],row.get("guessed_barcode_2",None))].append(row) final_guesses = [] @@ -790,7 +798,8 @@ def clear_guessed_fields(sample,match_reason,fields_to_clear=None): "guessed_barcode_1_name", "guessed_barcode_2_name", "guessed_barcodes_read_count"] for field in fields_to_clear: - sample[field] = None + if field in sample: + sample[field] = None sample["match_type"] = match_reason return sample @@ -798,7 +807,7 @@ def clear_guessed_fields(sample,match_reason,fields_to_clear=None): if len(samples) > 1: log.warning("Ambiguous! Multiple samples corresponding to guessed barcodes %s:", barcode_pair) for sample in samples: - log.warning("\t%s expected (%s,%s); -> Guessed (%s,%s); match type: %s", sample["sample_name"], sample["expected_barcode_1"],sample["expected_barcode_2"],sample["guessed_barcode_1"],sample["guessed_barcode_2"],sample["match_type"]) + log.warning("\t%s expected (%s,%s); -> Guessed (%s,%s); match type: %s", sample["sample_name"], sample["expected_barcode_1"],sample.get("expected_barcode_2",""),sample["guessed_barcode_1"],sample.get("guessed_barcode_2",""),sample["match_type"]) final_guesses.append(clear_guessed_fields(sample, "alternative_indices_uncertain")) else: @@ -807,11 +816,10 @@ def clear_guessed_fields(sample,match_reason,fields_to_clear=None): final_guesses.append(clear_guessed_fields(sample, "alternatives_have_lower_read_counts")) else: final_guesses.append(sample) - return final_guesses def write_guessed_barcodes(self, out_tsv, guessed_barcodes): - output_header = ["sample_name", + possible_header_cols = ["sample_name", "expected_barcode_1","expected_barcode_2", "expected_barcode_1_name","expected_barcode_2_name", "expected_barcodes_read_count", @@ -820,10 +828,15 @@ def write_guessed_barcodes(self, out_tsv, guessed_barcodes): "guessed_barcodes_read_count", "match_type" ] + output_header_columns = [] + for header_key in possible_header_cols: + for row in guessed_barcodes: + if header_key in row.keys() and header_key not in output_header_columns: + output_header_columns.append(header_key) with open(out_tsv, 'w') as tsvfile: csv.register_dialect('dict_tsv', quoting=csv.QUOTE_MINIMAL, delimiter="\t") - writer = csv.DictWriter(tsvfile, fieldnames=output_header, dialect="dict_tsv") + writer = csv.DictWriter(tsvfile, fieldnames=output_header_columns, dialect="dict_tsv") writer.writeheader() writer.writerows(guessed_barcodes)