-
Notifications
You must be signed in to change notification settings - Fork 66
/
AlignerOptions.h
167 lines (135 loc) · 6.63 KB
/
AlignerOptions.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
/*++
Module Name:
AlignerOptions.h
Abstract:
Common parameters for running single & paired alignment.
Authors:
Ravi Pandya, May, 2012
Environment:
User mode service.
Revision History:
Integrated from SingleAligner.cpp & PairedAligner.cpp
--*/
#pragma once
#include "stdafx.h"
#include "options.h"
#include "Genome.h"
#include "Read.h"
//
// Set INSTRUMENTATION_FOR_PAPER to 1 to generate raw data about hit sets and their
// intersections for the paper.  When it is 0 (the default) none of the declarations
// below are visible.
//
#define INSTRUMENTATION_FOR_PAPER 0

#if INSTRUMENTATION_FOR_PAPER

// Largest log2(hit set size) bucket tracked by the instrumentation tables.
#define MAX_HIT_SIZE_LOG_2 15

//
// In the paired-end aligner, if you have seeds A and B with hit set sizes |A| and |B|,
// then the total time in ns gets added into
// g_alignmentTimeByHitCountsOfEachSeed[log2(|A|)][log2(|B|)].
//
extern _int64 g_alignmentTimeByHitCountsOfEachSeed[MAX_HIT_SIZE_LOG_2 + 1][MAX_HIT_SIZE_LOG_2 + 1];

// Same indexing as above, but adds one per alignment rather than the elapsed time.
extern _int64 g_alignmentCountByHitCountsOfEachSeed[MAX_HIT_SIZE_LOG_2 + 1][MAX_HIT_SIZE_LOG_2 + 1];

// NOTE(review): the remaining tables have the same dimensions; presumably they use the
// same [log2(|A|)][log2(|B|)] indexing -- confirm against the code that updates them.
extern _int64 g_scoreCountByHitCountsOfEachSeed[MAX_HIT_SIZE_LOG_2 + 1][MAX_HIT_SIZE_LOG_2 + 1];
extern _int64 g_setIntersectionSizeByHitCountsOfEachSeed[MAX_HIT_SIZE_LOG_2 + 1][MAX_HIT_SIZE_LOG_2 + 1];
extern _int64 g_100xtotalRatioOfSetIntersectionSizeToSmallerSeedHitCountByCountsOfEachSeed[MAX_HIT_SIZE_LOG_2 + 1][MAX_HIT_SIZE_LOG_2 + 1];

// Scalar totals; defined (not just declared) elsewhere.
extern _int64 g_totalSizeOfSmallerHitSet;
extern _int64 g_totalSizeOfSetIntersection;
extern _int64 g_alignmentsWithMoreThanOneCandidateWhereTheBestCandidateIsScoredFirst;
extern _int64 g_alignmentsWithMoreThanOneCandidate;

#endif // INSTRUMENTATION_FOR_PAPER
// NOTE(review): presumably the MAPQ threshold used when deciding whether an alignment
// counts as a single hit -- the use sites are not in this header; confirm there.
#define MAPQ_LIMIT_FOR_SINGLE_HIT 10
//
// Interface for a group of command-line options that can describe itself and parse
// its own arguments.  AlignerOptions implements it and also holds a pointer to
// another AbstractOptions ("extra") for additional options.
//
struct AbstractOptions
{
    // Virtual so that an implementation destroyed through an AbstractOptions* runs
    // the derived destructor instead of invoking undefined behavior.
    virtual ~AbstractOptions() {}

    // Emits the usage/help text for the options this object handles.
    virtual void usageMessage() = 0;

    // Parses options from argv (argc entries).  n is passed by reference and done is
    // an out parameter.
    // NOTE(review): presumably n is the index of the argument being examined and is
    // advanced past consumed arguments, and the return value reports whether the
    // argument was recognized -- confirm against the implementations.
    virtual bool parse(const char** argv, int argc, int& n, bool *done) = 0;
};
// File formats a SNAPFile can refer to.  Add more as needed (append new enumerators
// at the end so the existing numeric values do not change).
enum FileType {
    UnknownFileType,        // Default for a freshly constructed SNAPFile
    SAMFile,
    FASTQFile,
    BAMFile,
    InterleavedFASTQFile,
    CRAMFile
};
//
// Describes one file (or file pair) named on the command line, together with the
// format and handling flags that go with it.
//
struct SNAPFile {
    // Every member gets a defined value.  isCompressed was previously left
    // uninitialized, so reading it from a default-constructed SNAPFile was
    // undefined behavior; it now defaults to false like the other flags.
    SNAPFile()
        : fileName(NULL),
          secondFileName(NULL),
          fileType(UnknownFileType),
          isCompressed(false),
          isStdio(false),
          omitSQLines(false)
    {}

    const char      *fileName;
    const char      *secondFileName;    // NULL unless set; NOTE(review): presumably the second file of a two-file input -- confirm
    FileType         fileType;
    bool             isCompressed;
    bool             isStdio;           // Only applies to the first file for two-file inputs
    bool             omitSQLines;       // Special (formerly) undocumented option for Charles Chiu's group.  Mostly a bad idea.

    // Factories for the read suppliers that consume this file; defined elsewhere.
    PairedReadSupplierGenerator *createPairedReadSupplierGenerator(int numThreads, bool quicklyDropUnpairedReads, const ReaderContext& context);
    ReadSupplierGenerator *createReadSupplierGenerator(int numThreads, const ReaderContext& context);

    // Fills in *snapFile from args (nArgs entries) and reports through *argsConsumed.
    // NOTE(review): presumably returns false when the arguments do not describe a
    // valid file for the given paired/isInput mode -- confirm in the definition.
    static bool generateFromCommandLine(const char **args, int nArgs, int *argsConsumed, SNAPFile *snapFile, bool paired, bool isInput);
};
//
// Common parameters for running single & paired alignment.  Implements the
// AbstractOptions interface (usageMessage/parse); the constructor and the other
// non-inline members are defined outside this header.
//
struct AlignerOptions : public AbstractOptions
{
// forPairedEnd defaults to false.
// NOTE(review): presumably selects paired-end default values -- confirm in the definition.
AlignerOptions(const char* i_commandLine, bool forPairedEnd = false);
const char *commandLine;
const char *indexDir;
const char *similarityMapFile;
int numThreads;
unsigned maxDist;
float maxDistFraction;
unsigned maxDistForIndels;
unsigned numSeedsFromCommandLine;
double seedCoverage; // Exclusive with numSeeds; this is readSize/seedSize
bool seedCountSpecified; // Has either -n or -sc been specified? This bool is used to make sure they're not both specified on the command line
unsigned maxHits;
int minWeightToCheck;
bool bindToProcessors;
bool ignoreMismatchedIDs;
SNAPFile outputFile;
// NOTE(review): inputs presumably points to an array of nInputs entries -- confirm where it is allocated.
int nInputs;
SNAPFile *inputs;
ReadClippingType clipping;
bool sortOutput;
bool noIndex;
bool noDuplicateMarking;
bool noQualityCalibration; // This doesn't appear to be used.
unsigned sortMemory; // total output sorting buffer size in Gb
// NOTE(review): presumably a bitwise OR of the FilterFlags values declared below -- confirm in passFilter().
unsigned filterFlags;
bool explorePopularSeeds;
bool stopOnFirstHit;
bool useM; // Should we generate CIGAR strings using = and X, or using the old-style M?
unsigned gapPenalty; // if non-zero use gap penalty aligner
AbstractOptions *extra; // extra options
const char *rgLineContents;
const char *perfFileName;
bool useTimingBarrier;
unsigned extraSearchDepth;
const char *defaultReadGroup; // if not specified in input
bool ignoreSecondaryAlignments; // on input, default true
int maxSecondaryAlignmentAdditionalEditDistance;
int maxSecondaryAlignments;
int maxSecondaryAlignmentsPerContig;
int flattenMAPQAtOrBelow;
bool preserveClipping;
float expansionFactor;
bool noUkkonen;
bool noOrderedEvaluation;
bool noTruncation;
bool useAffineGap;
bool useSoftClipping;
// NOTE(review): the following rewards/penalties/bonuses are presumably the scoring
// parameters used when useAffineGap is set -- confirm against the aligner code.
unsigned matchReward;
unsigned subPenalty;
unsigned gapOpenPenalty;
unsigned gapExtendPenalty;
unsigned fivePrimeEndBonus;
unsigned threePrimeEndBonus;
unsigned minReadLength;
bool mapIndex;
bool prefetchIndex;
size_t writeBufferSize;
bool dropIndexBeforeSort;
bool killIfTooSlow;
const char * sortIntermediateDirectory;
bool profile;
bool profileAffineGap;
bool ignoreAlignmentAdjustmentsForOm;
bool emitInternalScore;
// Three chars; NOTE(review): presumably a two-character tag plus terminating NUL -- confirm.
char internalScoreTag[3];
bool altAwareness;
int maxScoreGapToPreferNonALTAlignment;
bool emitALTAlignments;
static bool useHadoopErrorMessages; // This is static because it's global (and I didn't want to push the options object to every place in the code)
static bool outputToStdout; // Likewise
void usage();
// Overrides of the AbstractOptions interface.
virtual void usageMessage();
virtual bool parse(const char** argv, int argc, int& n, bool *done);
// Filter bits; each value is a distinct power of two so they can be combined in one word.
enum FilterFlags
{
FilterUnaligned = 0x0001,
FilterSingleHit = 0x0002,
FilterMultipleHits = 0x0004,
FilterBothMatesMatch = 0x0008,
FilterTooShort = 0x0010
};
// NOTE(review): presumably returns whether the read should be kept given filterFlags -- confirm in the definition.
bool passFilter(Read* read, AlignmentResult result, bool tooShort, bool secondaryAlignment);
// The base implementation reports false; virtual so a derived options class can override it.
virtual bool isPaired() { return false; }
};