
Commit

CHUKWA-642. Added regular expression validation. (Eric Spishak via Eric Yang)

git-svn-id: https://svn.apache.org/repos/asf/incubator/chukwa/trunk@1411813 13f79535-47bb-0310-9956-ffa450edef68
macroadster committed Nov 20, 2012
1 parent 88379bf commit ed0c274
Showing 13 changed files with 178 additions and 25 deletions.
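
Most of the files below apply the same guard: a pattern read from configuration is validated with RegexUtil before it is compiled or passed along, and is skipped with a warning if it does not parse. A minimal sketch of that idiom, assuming only the RegexUtil.isRegex and RegexUtil.regexError helpers shown in this diff (the class and method names in the sketch itself are hypothetical):

import java.util.regex.Pattern;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.chukwa.util.RegexUtil;

public class RegexGuardSketch {
  private static final Log log = LogFactory.getLog(RegexGuardSketch.class);

  // Compile a pattern read from configuration, or log a warning and return
  // null when the value does not parse as a regular expression.
  static Pattern compileOrSkip(String configuredRegex) {
    if (!RegexUtil.isRegex(configuredRegex)) {
      log.warn("Skipping '" + configuredRegex
          + "' because there was an error parsing it as a regex: "
          + RegexUtil.regexError(configuredRegex));
      return null;
    }
    return Pattern.compile(configuredRegex);
  }
}

Filter takes the stricter route further down and surfaces the failure to its callers as a CheckedPatternSyntaxException instead of logging and skipping.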
2 changes: 2 additions & 0 deletions CHANGES.txt
@@ -24,6 +24,8 @@ Trunk (unreleased changes)

BUGS

CHUKWA-642. Added regular expression validation. (Eric Spishak via Eric Yang)

CHUKWA-632. Added getter method for startOffset in AbstractProcessor. (Abhijit Dhar via Eric Yang)

CHUKWA-637. Fixed default cluster selection in hicc. (Eric Yang)

@@ -27,6 +27,7 @@
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.chukwa.util.DatabaseWriter;
import org.apache.hadoop.chukwa.util.RegexUtil;

public class DataExpiration {
private static DatabaseConfig dbc = null;
@@ -50,6 +51,12 @@ public void dropTables(long start, long end) {
while (ki.hasNext()) {
String name = ki.next();
String tableName = dbNames.get(name);
if (!RegexUtil.isRegex(tableName)) {
log.warn("Skipping tableName: '" + tableName
+ "' because there was an error parsing it as a regex: "
+ RegexUtil.regexError(tableName));
return;
}
String[] tableList = dbc.findTableName(tableName, start, end);
for (String tl : tableList) {
log.debug("table name: " + tableList[0]);

@@ -29,6 +29,7 @@
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.chukwa.util.DatabaseWriter;
import org.apache.hadoop.chukwa.util.ExceptionUtil;
import org.apache.hadoop.chukwa.util.RegexUtil;

public class TableCreator {
private static DatabaseConfig dbc = null;
@@ -56,6 +57,12 @@ public void createTables(long start, long end) throws Exception {
while (ki.hasNext()) {
String name = ki.next();
String tableName = dbNames.get(name);
if (!RegexUtil.isRegex(tableName)) {
log.warn("Skipping tableName: '" + tableName
+ "' because there was an error parsing it as a regex: "
+ RegexUtil.regexError(tableName));
return;
}
String[] tableList = dbc.findTableName(tableName, start, end);
log.debug("table name: " + tableList[0]);
try {

@@ -23,6 +23,8 @@
import java.util.regex.PatternSyntaxException;
import org.apache.hadoop.chukwa.Chunk;
import org.apache.hadoop.chukwa.util.Filter;
import org.apache.hadoop.chukwa.util.RegexUtil;
import org.apache.hadoop.chukwa.util.RegexUtil.CheckedPatternSyntaxException;
import org.apache.hadoop.conf.Configuration;
import org.apache.log4j.Logger;
import java.net.ServerSocket;
@@ -167,16 +169,16 @@ public void run() {
*/
public void setup() {
try { //outer try catches IOExceptions
try { //inner try catches Pattern Syntax errors
try { //inner try catches bad command syntax errors
sock.setSoTimeout(timeout);
sock.setKeepAlive(USE_KEEPALIVE);
in = new BufferedReader(new InputStreamReader(sock.getInputStream()));
out = new DataOutputStream(sock.getOutputStream());
String cmd = in.readLine();
if(!cmd.contains(" ")) {

throw new PatternSyntaxException(
"command should be keyword pattern, but no ' ' seen", cmd, -1);
throw new IllegalArgumentException(
"command should be keyword pattern, but no ' ' seen: " + cmd);
}
String uppercased = cmd.substring(0, cmd.indexOf(' ')).toUpperCase();
if(RAW.equals(uppercased))
Expand All @@ -186,24 +188,35 @@ else if(WRITABLE.equals(uppercased))
else if(ASCII_HEADER.equals(uppercased))
fmt = DataFormat.Header;
else {
throw new PatternSyntaxException("bad command '" + uppercased+
throw new IllegalArgumentException("bad command '" + uppercased+
"' -- starts with neither '"+ RAW+ "' nor '"+ WRITABLE + " nor "
+ ASCII_HEADER+"'.", cmd, -1);
+ ASCII_HEADER+"':" + cmd);
}

String cmdAfterSpace = cmd.substring(cmd.indexOf(' ')+1);
if(cmdAfterSpace.toLowerCase().equals("all"))
rules = Filter.ALL;
else
rules = new Filter(cmdAfterSpace);
try {
rules = new Filter(cmdAfterSpace);
} catch (CheckedPatternSyntaxException pse) {
out.write("Error parsing command as a regex: ".getBytes());
out.write(pse.getMessage().getBytes());
out.writeByte('\n');
out.close();
in.close();
sock.close();
log.warn(pse);
return;
}

//now that we read everything OK we can add ourselves to list, and return.
synchronized(tees) {
tees.add(this);
}
out.write("OK\n".getBytes());
log.info("tee to " + sock.getInetAddress() + " established");
} catch(PatternSyntaxException e) {
} catch(IllegalArgumentException e) {
out.write(e.toString().getBytes());
out.writeByte('\n');
out.close();

@@ -41,6 +41,7 @@
import org.apache.hadoop.chukwa.util.ClusterConfig;
import org.apache.hadoop.chukwa.util.DatabaseWriter;
import org.apache.hadoop.chukwa.util.ExceptionUtil;
import org.apache.hadoop.chukwa.util.RegexUtil;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
@@ -220,7 +221,13 @@ public boolean run() throws IOException {
String dbKey = "report.db.name." + recordType;
Matcher m = p.matcher(recordType);
if (dbTables.containsKey(dbKey)) {
String[] tmp = mdlConfig.findTableName(mdlConfig.get(dbKey), record
String tableName = mdlConfig.get(dbKey);
if (!RegexUtil.isRegex(tableName)) {
log.error("Error parsing 'tableName' as a regex: "
+ RegexUtil.regexError(tableName));
return false;
}
String[] tmp = mdlConfig.findTableName(tableName, record
.getTime(), record.getTime());
table = tmp[0];
} else if(m.matches()) {

@@ -31,6 +31,7 @@
import org.apache.hadoop.chukwa.extraction.engine.ChukwaRecord;
import org.apache.hadoop.chukwa.extraction.engine.ChukwaRecordKey;
import org.apache.hadoop.chukwa.extraction.demux.Demux;
import org.apache.hadoop.chukwa.util.RegexUtil;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.JobConf;
@@ -65,6 +66,8 @@ public class TsProcessor extends AbstractProcessor {
static Logger log = Logger.getLogger(TsProcessor.class);

public static final String DEFAULT_DATE_FORMAT = "yyyy-MM-dd HH:mm:ss,SSS";
public static final String DEFAULT_TIME_REGEX = "TsProcessor.default.time.regex";
public static final String TIME_REGEX = "TsProcessor.time.regex.";

private Map<String, Pattern> datePatternMap;
private Map<String, SimpleDateFormat> dateFormatMap;
@@ -155,14 +158,24 @@ private Pattern fetchDateLocationPattern(String dataType) {

JobConf jobConf = Demux.jobConf;
String datePattern = null;
Pattern pattern = null;

if (jobConf != null) {
datePattern = jobConf.get("TsProcessor.default.time.regex", null);
datePattern = jobConf.get("TsProcessor.time.regex." + chunk.getDataType(),
datePattern);
String timeRegexProperty = TIME_REGEX + chunk.getDataType();
datePattern = jobConf.get(DEFAULT_TIME_REGEX, null);
datePattern = jobConf.get(timeRegexProperty, datePattern);
if (datePattern != null) {
if (!RegexUtil.isRegex(datePattern, 1)) {
log.warn("Error parsing '" + DEFAULT_TIME_REGEX + "' or '"
+ timeRegexProperty + "' properties as a regex: "
+ RegexUtil.regexError(datePattern, 1)
+ ". This date pattern will be skipped.");
return null;
}
pattern = Pattern.compile(datePattern);
}
}

Pattern pattern = datePattern != null ? Pattern.compile(datePattern) : null;
datePatternMap.put(dataType, pattern);

return pattern;
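
A hedged sketch of how the two TsProcessor properties introduced above might be set on the Demux JobConf; the data type and the regex values are illustrative only. Since fetchDateLocationPattern validates the value with RegexUtil.isRegex(datePattern, 1), the pattern needs at least one capturing group:

import org.apache.hadoop.mapred.JobConf;

public class TsProcessorConfigSketch {
  public static void main(String[] args) {
    JobConf jobConf = new JobConf();
    // Fallback pattern when no per-datatype pattern is configured; the single
    // capturing group marks the timestamp that TsProcessor will parse.
    jobConf.set("TsProcessor.default.time.regex",
        "(\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2},\\d{3})");
    // Per-datatype override; "SysLog" is a made-up data type used only here.
    jobConf.set("TsProcessor.time.regex.SysLog",
        "^(\\w{3} +\\d+ \\d{2}:\\d{2}:\\d{2})");
  }
}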

@@ -22,6 +22,7 @@
import java.io.IOException;
import java.util.regex.*;
import org.apache.hadoop.chukwa.*;
import org.apache.hadoop.chukwa.util.RegexUtil;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.*;
@@ -46,6 +47,9 @@ public static class ChukwaRecordReader implements

static Logger LOG = Logger.getLogger(ChukwaInputFormat.class);

public static final String DATATYPE_PROPERTY = "chukwa.inputfilter.datatype";
public static final String DATATYPE_PROPERTY_DEFAULT = ".*";

private SequenceFileRecordReader<ChukwaArchiveKey, Chunk> sfrr;
private long lineInFile = 0;
private Chunk curChunk = null;
@@ -58,8 +62,14 @@ public static class ChukwaRecordReader implements
public ChukwaRecordReader(Configuration conf, FileSplit split)
throws IOException {
sfrr = new SequenceFileRecordReader<ChukwaArchiveKey, Chunk>(conf, split);
dtPattern = Pattern
.compile(conf.get("chukwa.inputfilter.datatype", ".*"));
String datatype = conf.get(DATATYPE_PROPERTY, DATATYPE_PROPERTY_DEFAULT);
if (!RegexUtil.isRegex(datatype)) {
LOG.warn("Error parsing '" + DATATYPE_PROPERTY
+ "' property as a regex: " + RegexUtil.regexError(datatype)
+ ". Using default instead: " + DATATYPE_PROPERTY_DEFAULT);
datatype = DATATYPE_PROPERTY_DEFAULT;
}
dtPattern = Pattern.compile(datatype);
}

@Override
8 changes: 5 additions & 3 deletions src/main/java/org/apache/hadoop/chukwa/util/DumpChunks.java
@@ -25,6 +25,7 @@
import java.io.*;
import org.apache.hadoop.chukwa.*;
import org.apache.hadoop.chukwa.conf.ChukwaConfiguration;
import org.apache.hadoop.chukwa.util.RegexUtil.CheckedPatternSyntaxException;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.FileUtil;
@@ -84,13 +85,14 @@ static void dump(String[] args, Configuration conf, PrintStream out) throws IOEx
Filter patterns = null;
if(args[filterArg].toLowerCase().equals("all"))
patterns = Filter.ALL;
else
else {
try {
patterns = new Filter(args[filterArg]);
} catch (PatternSyntaxException pse) {
} catch (CheckedPatternSyntaxException pse) {
System.err.println("Error parsing \"tags\" regular expression: " + pse.getMessage());
System.exit(-1);
return;
}
}

System.err.println("Patterns:" + patterns);
ArrayList<Path> filesToSearch = new ArrayList<Path>();
36 changes: 29 additions & 7 deletions src/main/java/org/apache/hadoop/chukwa/util/Filter.java
@@ -24,10 +24,14 @@
import org.apache.commons.lang.ArrayUtils;
import org.apache.hadoop.chukwa.Chunk;
import org.apache.hadoop.chukwa.extraction.engine.RecordUtil;
import org.apache.hadoop.chukwa.util.RegexUtil.CheckedPatternSyntaxException;
import org.apache.log4j.Logger;


public class Filter {


static Logger log = Logger.getLogger(Filter.class);

private static final String[] SEARCH_TARGS =
{"datatype", "name", "host", "cluster", "content"};
static final String SEPARATOR="&";
@@ -56,6 +60,11 @@ boolean matches(Chunk chunk) {
return p.matcher(content).matches();
} else if(targ.startsWith("tags.")) {
String tagName = targ.substring("tags.".length());
if (!RegexUtil.isRegex(tagName)) {
log.warn("Error parsing 'tagName' as a regex: "
+ RegexUtil.regexError(tagName));
return false;
}
String tagVal = chunk.getTag(tagName);
if(tagVal == null)
return false;
@@ -73,25 +82,30 @@ public String toString() {

List<SearchRule> compiledPatterns;

public Filter(String listOfPatterns) throws PatternSyntaxException{
public Filter(String listOfPatterns) throws CheckedPatternSyntaxException {
compiledPatterns = new ArrayList<SearchRule>();
//FIXME: could escape these
String[] patterns = listOfPatterns.split(SEPARATOR);
for(String p: patterns) {
int equalsPos = p.indexOf('=');

if(equalsPos < 0 || equalsPos > (p.length() -2)) {
throw new PatternSyntaxException(
throw new CheckedPatternSyntaxException(
"pattern must be of form targ=pattern", p, -1);
}

String targ = p.substring(0, equalsPos);
if(!targ.startsWith("tags.") && !ArrayUtils.contains(SEARCH_TARGS, targ)) {
throw new PatternSyntaxException(
throw new CheckedPatternSyntaxException(
"pattern doesn't start with recognized search target", p, -1);
}

Pattern pat = Pattern.compile(p.substring(equalsPos+1), Pattern.DOTALL);
String regex = p.substring(equalsPos+1);
if (!RegexUtil.isRegex(regex)) {
throw new CheckedPatternSyntaxException(RegexUtil.regexException(regex));
}

Pattern pat = Pattern.compile(regex, Pattern.DOTALL);
compiledPatterns.add(new SearchRule(pat, targ));
}
}
@@ -119,7 +133,7 @@ public String toString() {
}

private static final class MatchAll extends Filter {
public MatchAll() {
public MatchAll() throws CheckedPatternSyntaxException {
super("datatype=.*");
}

@@ -131,6 +145,14 @@ public String toString() {
return "ALL";
}
}
public static final Filter ALL = new MatchAll();

public static final Filter ALL = newMatchAll();
private static Filter newMatchAll() {
try {
return new MatchAll();
} catch (CheckedPatternSyntaxException e) {
throw new RuntimeException("Illegal MatchAll regular expression.", e);
}
}

}//end class
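
With the constructor change above, new Filter(...) now throws the checked CheckedPatternSyntaxException rather than an unchecked PatternSyntaxException, so callers such as DumpChunks and the socket tee (earlier in this diff) handle it explicitly. A minimal, hypothetical caller for illustration:

import org.apache.hadoop.chukwa.util.Filter;
import org.apache.hadoop.chukwa.util.RegexUtil.CheckedPatternSyntaxException;

public class FilterUsageSketch {
  public static void main(String[] args) {
    try {
      // Rules are "target=regex" pairs joined with '&'; targets come from
      // SEARCH_TARGS above ("datatype", "name", "host", "cluster", "content").
      Filter rules = new Filter("datatype=Ts.*&host=node[0-9]+");
      System.out.println("Parsed filter: " + rules);
    } catch (CheckedPatternSyntaxException e) {
      System.err.println("Error parsing filter as a regex: " + e.getMessage());
    }
  }
}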

@@ -165,4 +165,24 @@ private static ChukwaRecordKey buildKey(Date date, String dataSource, String dat
return key;
}

public void testParseIllegalRegex() {
jobConf.set(TsProcessor.DEFAULT_TIME_REGEX, "(");

ChunkBuilder cb = new ChunkBuilder();
cb.addRecord("2012-10-25 00:18:44,818 some sample record data".getBytes());
Chunk chunk = cb.getChunk();

TsProcessor p = new TsProcessor();
p.reset(chunk);

ChukwaTestOutputCollector<ChukwaRecordKey, ChukwaRecord> output =
new ChukwaTestOutputCollector<ChukwaRecordKey, ChukwaRecord>();

p.process(null, chunk, output, Reporter.NULL);

assertEquals("Output data size not correct.", 1, output.data.size());
ChukwaRecordKey key = output.data.keySet().iterator().next();
ChukwaRecord record = output.data.get(key);
assertNull("Output should not be error.", record.getValue("cchunkData"));
}
}

@@ -20,6 +20,8 @@


import java.io.IOException;
import java.util.regex.PatternSyntaxException;

import org.apache.hadoop.mapred.Reporter;
import junit.framework.TestCase;
import org.apache.hadoop.chukwa.*;
Expand Down Expand Up @@ -82,4 +84,25 @@ public void testInputFormat() {
}
}

public void testInputFormatIllegalRegex() {
try {
JobConf conf = new JobConf();
conf.set("chukwa.inputfilter.datatype", "(");
String TMP_DIR = System.getProperty("test.build.data", "/tmp");
Path filename = new Path("file:///" + TMP_DIR + "/tmpSeqFile");
long len = FileSystem.getLocal(conf).getFileStatus(filename).getLen();
InputSplit split = new FileSplit(filename, 0, len, (String[]) null);

ChukwaInputFormat in = new ChukwaInputFormat();
RecordReader<LongWritable, Text> r = in.getRecordReader(split, conf,
Reporter.NULL);

} catch (PatternSyntaxException e) {
e.printStackTrace();
fail("Illegal regular expression caused PatternSyntaxException: " + e);
} catch (IOException e) {
e.printStackTrace();
fail("IO exception " + e);
}
}
}