HIVE-693. Add an AWS S3 log format deserializer
(Zheng Shao and Andraz Tori via namit)



git-svn-id: https://svn.apache.org/repos/asf/hadoop/hive/trunk@804035 13f79535-47bb-0310-9956-ffa450edef68
Namit Jain committed Aug 13, 2009
1 parent a712257 commit 986f58d
Showing 7 changed files with 294 additions and 8 deletions.
3 changes: 3 additions & 0 deletions CHANGES.txt
@@ -12,6 +12,9 @@ Trunk - Unreleased
HIVE-749. add hive.optimize.pruner
(Zheng Shao via namit)

HIVE-693. Add an AWS S3 log format deserializer
(Zheng Shao and Andraz Tori via namit)

IMPROVEMENTS

OPTIMIZATIONS
20 changes: 12 additions & 8 deletions contrib/build.xml
@@ -62,39 +62,43 @@

<mkdir dir="${test.build.src}/org/apache/hadoop/hive/ql/parse"/>
<mkdir dir="${test.build.src}/org/apache/hadoop/hive/cli"/>
<mkdir dir="${test.log.dir}/contribpositive"/>
<mkdir dir="${test.log.dir}/contribnegative"/>
<mkdir dir="${test.log.dir}/contribclientpositive"/>
<mkdir dir="${test.log.dir}/contribclientnegative"/>

<qtestgen outputDirectory="${test.build.src}/org/apache/hadoop/hive/ql/parse"
templatePath="${ql.test.template.dir}" template="TestParse.vm"
queryDirectory="${contrib.test.query.dir}/positive"
queryFile="${qfile}"
resultsDirectory="${contrib.test.results.dir}/compiler" className="TestContribParse"
- logFile="${test.log.dir}/testparsegen.log"
- logDirectory="${test.log.dir}/positive"/>
+ logFile="${test.log.dir}/testcontribparsegen.log"
+ logDirectory="${test.log.dir}/contribpositive"/>

<qtestgen outputDirectory="${test.build.src}/org/apache/hadoop/hive/ql/parse"
templatePath="${ql.test.template.dir}" template="TestParseNegative.vm"
queryDirectory="${contrib.test.query.dir}/negative"
queryFile="${qfile}"
resultsDirectory="${contrib.test.results.dir}/compiler/errors" className="TestContribParseNegative"
- logFile="${test.log.dir}/testparseneggen.log"
- logDirectory="${test.log.dir}/negative"/>
+ logFile="${test.log.dir}/testcontribparseneggen.log"
+ logDirectory="${test.log.dir}/contribnegative"/>

<qtestgen outputDirectory="${test.build.src}/org/apache/hadoop/hive/cli"
templatePath="${ql.test.template.dir}" template="TestCliDriver.vm"
queryDirectory="${contrib.test.query.dir}/clientpositive"
queryFile="${qfile}"
clusterMode="${clustermode}"
resultsDirectory="${contrib.test.results.dir}/clientpositive" className="TestContribCliDriver"
- logFile="${test.log.dir}/testclidrivergen.log"
- logDirectory="${test.log.dir}/clientpositive"/>
+ logFile="${test.log.dir}/testcontribclidrivergen.log"
+ logDirectory="${test.log.dir}/contribclientpositive"/>

<qtestgen outputDirectory="${test.build.src}/org/apache/hadoop/hive/cli"
templatePath="${ql.test.template.dir}" template="TestNegativeCliDriver.vm"
queryDirectory="${contrib.test.query.dir}/clientnegative"
queryFile="${qfile}"
resultsDirectory="${contrib.test.results.dir}/clientnegative" className="TestContribNegativeCliDriver"
- logFile="${test.log.dir}/testnegclidrivergen.log"
- logDirectory="${test.log.dir}/clientnegative"/>
+ logFile="${test.log.dir}/testcontribnegclidrivergen.log"
+ logDirectory="${test.log.dir}/contribclientnegative"/>

</target>

9 changes: 9 additions & 0 deletions contrib/data/files/s3.log
@@ -0,0 +1,9 @@
04ff331638adc13885d6c42059584deabbdeabcd55bf0bee491172a79a87b196 img.zemanta.com [09/Apr/2009:22:00:01 +0000] 212.143.99.188 65a011a29cdf8ec533ec3d1ccaae921c D987234E52141DE7 REST.GET.OBJECT pixy.gif "GET /pixy.gif?x-id=4560525f-2864-495c-842c-159ede7143f8 HTTP/1.1" 200 - 828 828 3 2 "http://www.mediafuturist.com/2009/02/marc-andreessen-on-the-charlie-rose-show-talking-about-mobile-technology-video.html" "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/1.0.154.53 Safari/525.19"
04ff331638adc13885d6c42059584deabbdeabcd55bf0bee491172a79a87b196 img.zemanta.com [09/Apr/2009:22:00:01 +0000] 74.244.182.35 65a011a29cdf8ec533ec3d1ccaae921c 626EECA20AB12A5C REST.GET.OBJECT pixy.gif "GET /pixy.gif HTTP/1.1" 200 - 828 828 20 20 "http://trueslant.com/" "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_6; en-us) AppleWebKit/528.16 (KHTML, like Gecko) Version/4 Public Beta Safari/528.16"
04ff331638adc13885d6c42059584deabbdeabcd55bf0bee491172a79a87b196 img.zemanta.com [09/Apr/2009:22:00:02 +0000] 62.149.175.120 65a011a29cdf8ec533ec3d1ccaae921c 3E93D70E69292C98 REST.GET.OBJECT pixy.gif "GET /pixy.gif?x-id=9fec752e-2318-4da3-864e-ac5b9e47c4ae HTTP/1.0" 200 - 828 828 4 3 "-" "-"
04ff331638adc13885d6c42059584deabbdeabcd55bf0bee491172a79a87b196 img.zemanta.com [09/Apr/2009:22:00:02 +0000] 77.116.56.145 65a011a29cdf8ec533ec3d1ccaae921c 2FD33BC70C93C97A REST.GET.OBJECT pixie.png "GET /pixie.png?x-id=3300b26b-4455-47cb-800f-8fe7d80a6b39 HTTP/1.1" 200 - 900 900 5 5 "http://greenerloudoun.wordpress.com/2008/05/29/swarming-behavior/" "Mozilla/5.0 (Windows; U; Windows NT 5.1; de; rv:1.9.0.8) Gecko/2009032609 Firefox/3.0.8"
04ff331638adc13885d6c42059584deabbdeabcd55bf0bee491172a79a87b196 img.zemanta.com [09/Apr/2009:22:00:03 +0000] 62.149.175.120 65a011a29cdf8ec533ec3d1ccaae921c 2AC19D72E1DD76E1 REST.GET.OBJECT pixy.gif "GET /pixy.gif?x-id=9fec752e-2318-4da3-864e-ac5b9e47c4ae HTTP/1.0" 200 - 828 828 7 6 "-" "-"
04ff331638adc13885d6c42059584deabbdeabcd55bf0bee491172a79a87b196 img.zemanta.com [09/Apr/2009:22:00:03 +0000] 62.149.175.120 65a011a29cdf8ec533ec3d1ccaae921c C6DB555CD238EEA0 REST.GET.OBJECT pixy.gif "GET /pixy.gif?x-id=9fec752e-2318-4da3-864e-ac5b9e47c4ae HTTP/1.0" 200 - 828 828 6 5 "-" "-"
04ff331638adc13885d6c42059584deabbdeabcd55bf0bee491172a79a87b196 img.zemanta.com [09/Apr/2009:22:00:06 +0000] 68.209.136.23 65a011a29cdf8ec533ec3d1ccaae921c 42C8E7EEF98D46BC REST.GET.OBJECT pixy.gif "GET /pixy.gif?x-id=78ad075e-832f-4006-a89e-7b5e2978674e HTTP/1.1" 200 - 828 828 5 4 "http://continuityblog.wordpress.com/category/ragdoll/" "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.8) Gecko/2009032609 Firefox/3.0.8"
04ff331638adc13885d6c42059584deabbdeabcd55bf0bee491172a79a87b196 img.zemanta.com [09/Apr/2009:22:00:06 +0000] 68.209.136.23 65a011a29cdf8ec533ec3d1ccaae921c E5781B471524E1BB REST.GET.OBJECT pixy.gif "GET /pixy.gif?x-id=f596477d-b317-8882-8d14-0b8f168f5e8e HTTP/1.1" 200 - 828 828 4 3 "http://continuityblog.wordpress.com/category/ragdoll/" "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.8) Gecko/2009032609 Firefox/3.0.8"
04ff331638adc13885d6c42059584deabbdeabcd55bf0bee491172a79a87b196 img.zemanta.com [09/Apr/2009:22:00:07 +0000] 190.225.84.114 65a011a29cdf8ec533ec3d1ccaae921c F4FC3FEAD8C00024 REST.GET.OBJECT pixy.gif "GET /pixy.gif?x-id=23d25db1-160b-48bb-a932-e7dc1e88c321 HTTP/1.1" 304 - - 828 3 - "http://www.viamujer.com/2009/03/horoscopo-acuario-abril-mayo-y-junio-2009/" "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)"
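
For reference (not part of this commit): a minimal standalone sketch of how the regular expression used by S3LogDeserializer below splits a line of this format into its 17 fields. The shortened sample line and the class name S3LogRegexDemo are illustrative assumptions.

import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class S3LogRegexDemo {
  // Same pattern as S3LogDeserializer.regexpat below.
  static final Pattern S3_LOG = Pattern.compile(
      "(\\S+) (\\S+) \\[(.*?)\\] (\\S+) (\\S+) (\\S+) (\\S+) (\\S+) \"(.+)\" "
      + "(\\S+) (\\S+) (\\S+) (\\S+) (\\S+) (\\S+) \"(.*)\" \"(.*)\"");

  public static void main(String[] args) {
    // Shortened sample line in the same layout as contrib/data/files/s3.log above.
    String row = "04ff33...87b196 img.zemanta.com [09/Apr/2009:22:00:01 +0000] 212.143.99.188 "
        + "65a011a29cdf8ec533ec3d1ccaae921c D987234E52141DE7 REST.GET.OBJECT pixy.gif "
        + "\"GET /pixy.gif HTTP/1.1\" 200 - 828 828 3 2 \"-\" \"Mozilla/5.0\"";
    Matcher m = S3_LOG.matcher(row);
    if (m.matches()) {
      // Groups 1..17 follow the assignment order in S3LogDeserializer.deserialize below.
      for (int i = 1; i <= m.groupCount(); i++) {
        System.out.println(i + ": " + m.group(i));
      }
    }
  }
}

Running it prints each numbered field on its own line, which shows how the quoted request URI, referer, and user-agent survive embedded spaces.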
196 changes: 196 additions & 0 deletions contrib/src/java/org/apache/hadoop/hive/contrib/serde2/s3/S3LogDeserializer.java
@@ -0,0 +1,196 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.hadoop.hive.contrib.serde2.s3;


import java.nio.charset.CharacterCodingException;
import java.util.Date;
import java.util.List;
import java.util.Properties;
import java.util.regex.*;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.contrib.serde2.s3.S3LogStruct;
import org.apache.hadoop.hive.serde2.Deserializer;
import org.apache.hadoop.hive.serde2.SerDeException;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.ReflectionStructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import java.text.SimpleDateFormat;


public class S3LogDeserializer implements Deserializer {

public static final Log LOG = LogFactory.getLog(S3LogDeserializer.class.getName());

static {
StackTraceElement[] sTrace = new Exception().getStackTrace();
String className = sTrace[0].getClassName();
}

private ObjectInspector cachedObjectInspector;

public String toString() {
return "S3ZemantaDeserializer[]";
}

public S3LogDeserializer() throws SerDeException {
}

// This regex is deliberately lax to compensate for the lack of escaping in Amazon S3 logs; for example, the user-agent string can itself contain double quotes.
static Pattern regexpat = Pattern.compile( "(\\S+) (\\S+) \\[(.*?)\\] (\\S+) (\\S+) (\\S+) (\\S+) (\\S+) \"(.+)\" (\\S+) (\\S+) (\\S+) (\\S+) (\\S+) (\\S+) \"(.*)\" \"(.*)\"");
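// Capture groups 1-17 map, in order, to the S3LogStruct fields: bucketowner, bucketname, rdatetime, rip, requester, requestid, operation, rkey, requesturi, httpstatus, errorcode, bytessent, objsize, totaltime, turnaroundtime, referer, useragent.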
//static Pattern regexrid = Pattern.compile("x-id=([-0-9a-f]{36})");
//static SimpleDateFormat dateparser = new SimpleDateFormat("dd/MMM/yyyy:hh:mm:ss ZZZZZ");

S3LogStruct deserializeCache = new S3LogStruct();
public void initialize(Configuration job, Properties tbl) throws SerDeException {

cachedObjectInspector = ObjectInspectorFactory.getReflectionObjectInspector(
S3LogStruct.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);

LOG.debug(getClass().getName() + ": initialized");
}

public static Integer toInt(String s)
{
if (s.compareTo("-") == 0)
return null;
else
return Integer.valueOf(s);
}

public static Object deserialize(S3LogStruct c, String row) throws Exception {
Matcher match = regexpat.matcher(row);
int t = 1;
try {
match.matches();
c.bucketowner = match.group(t++);
c.bucketname = match.group(t++);
} catch (Exception e) {
throw new SerDeException("S3 Log Regex did not match:" + row, e);
}
c.rdatetime = match.group(t++);

// TODO: should we convert the datetime to a format Hive understands by default - either yyyy-mm-dd HH:MM:SS or seconds since the epoch?
//Date d = dateparser.parse(c.rdatetime);
//c.rdatetimeepoch = d.getTime() / 1000;

c.rip = match.group(t++);
c.requester = match.group(t++);
c.requestid = match.group(t++);
c.operation = match.group(t++);
c.rkey = match.group(t++);
c.requesturi= match.group(t++);
// System.err.println(c.requesturi);
/*// Zemanta specific data extractor
try {
Matcher m2 = regexrid.matcher(c.requesturi);
m2.find();
c.rid = m2.group(1);
} catch (Exception e) {
c.rid = null;
}
*/
c.httpstatus = toInt(match.group(t++));
c.errorcode = match.group(t++);
c.bytessent = toInt(match.group(t++));
c.objsize = toInt(match.group(t++));
c.totaltime = toInt(match.group(t++));
c.turnaroundtime = toInt(match.group(t++));
c.referer = match.group(t++);
c.useragent = match.group(t++);


return (c);
}

public Object deserialize(Writable field) throws SerDeException {
String row = null;
if (field instanceof BytesWritable) {
BytesWritable b = (BytesWritable)field;
try {
row = Text.decode(b.get(), 0, b.getSize());
} catch (CharacterCodingException e) {
throw new SerDeException(e);
}
} else if (field instanceof Text) {
row = field.toString();
}
try {
deserialize(deserializeCache, row);
return deserializeCache;
} catch (ClassCastException e) {
throw new SerDeException( this.getClass().getName() + " expects Text or BytesWritable", e);
} catch (Exception e) {
throw new SerDeException(e);
}
}


public ObjectInspector getObjectInspector() throws SerDeException {
return cachedObjectInspector;
}



/**
* @param args
*/
public static void main(String[] args) {
System.err.println("This is only a test run");
try {
S3LogDeserializer serDe = new S3LogDeserializer();
Configuration conf = new Configuration();
Properties tbl = new Properties();
// Some nasty examples that show how the S3 log format can break, used to exercise the regex.
// These are all sourced from genuine S3 logs.
//Text sample = new Text("04ff331638adc13885d6c42059584deabbdeabcd55bf0bee491172a79a87b196 img.zemanta.com [09/Apr/2009:22:00:07 +0000] 190.225.84.114 65a011a29cdf8ec533ec3d1ccaae921c F4FC3FEAD8C00024 REST.GET.OBJECT pixy.gif \"GET /pixy.gif?x-id=23d25db1-160b-48bb-a932-e7dc1e88c321 HTTP/1.1\" 304 - - 828 3 - \"http://www.viamujer.com/2009/03/horoscopo-acuario-abril-mayo-y-junio-2009/\" \"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)\"");
//Text sample = new Text("04ff331638adc13885d6c42059584deabbdeabcd55bf0bee491172a79a87b196 img.zemanta.com [09/Apr/2009:22:19:49 +0000] 60.28.204.7 65a011a29cdf8ec533ec3d1ccaae921c 7D87B6835125671E REST.GET.OBJECT pixy.gif \"GET /pixy.gif?x-id=b50a4544-938b-4a63-992c-721d1a644b28 HTTP/1.1\" 200 - 828 828 4 3 \"\" \"ZhuaXia.com\"");
//Text sample = new Text("04ff331638adc13885d6c42059584deabbdeabcd55bf0bee491172a79a87b196 static.zemanta.com [09/Apr/2009:23:12:39 +0000] 65.94.12.181 65a011a29cdf8ec533ec3d1ccaae921c EEE6FFE9B9F9EA29 REST.HEAD.OBJECT readside/loader.js%22+defer%3D%22defer \"HEAD /readside/loader.js\"+defer=\"defer HTTP/1.0\" 403 AccessDenied 231 - 7 - \"-\" \"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0)\"");
Text sample = new Text("04ff331638adc13885d6c42059584deabbdeabcd55bf0bee491172a79a87b196 img.zemanta.com [10/Apr/2009:05:34:01 +0000] 70.32.81.92 65a011a29cdf8ec533ec3d1ccaae921c F939A7D698D27C63 REST.GET.OBJECT reblog_b.png \"GET /reblog_b.png?x-id=79ca9376-6326-41b7-9257-eea43d112eb2 HTTP/1.0\" 200 - 1250 1250 160 159 \"-\" \"Firefox 0.8 (Linux)\" useragent=\"Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.6) Gecko/20040614 Firefox/0.8\"");
serDe.initialize(conf, tbl);
Object row = serDe.deserialize(sample);
System.err.println(serDe.getObjectInspector().getClass().toString());
ReflectionStructObjectInspector oi = (ReflectionStructObjectInspector)serDe.getObjectInspector();
List<? extends StructField> fieldRefs = oi.getAllStructFieldRefs();
for (int i = 0; i < fieldRefs.size(); i++) {
System.err.println(fieldRefs.get(i).toString());
Object fieldData = oi.getStructFieldData(row, fieldRefs.get(i));
if (fieldData == null)
System.err.println("null");
else
System.err.println(fieldData.toString());
}

} catch (Exception e) {
System.err.println("Caught: " + e);
e.printStackTrace();
}

}



}
24 changes: 24 additions & 0 deletions contrib/src/java/org/apache/hadoop/hive/contrib/serde2/s3/S3LogStruct.java
@@ -0,0 +1,24 @@
package org.apache.hadoop.hive.contrib.serde2.s3;

public class S3LogStruct {

public String bucketowner;
public String bucketname;
public String rdatetime;
// public Long rdatetimeepoch; // The format Hive understands by default, should we convert?
public String rip;
public String requester;
public String requestid;
public String operation;
public String rkey;
public String requesturi;
public Integer httpstatus;
public String errorcode;
public Integer bytessent;
public Integer objsize;
public Integer totaltime;
public Integer turnaroundtime;
public String referer;
public String useragent;
// public String rid; // Zemanta-specific field
}
14 changes: 14 additions & 0 deletions contrib/src/test/queries/clientpositive/serde_s3.q
@@ -0,0 +1,14 @@
add jar ../build/contrib/hive_contrib.jar;

DROP TABLE s3log;
CREATE TABLE s3log
ROW FORMAT SERDE 'org.apache.hadoop.hive.contrib.serde2.s3.S3LogDeserializer'
STORED AS TEXTFILE;

DESCRIBE s3log;

LOAD DATA LOCAL INPATH '../contrib/data/files/s3.log' INTO TABLE s3log;

SELECT a.* FROM s3log a;

DROP TABLE s3log;
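
A rough usage sketch (not part of the test above): once the table is defined, individual parsed fields can be selected by name; the column names come from the public fields of S3LogStruct.

-- Hypothetical ad-hoc query against the s3log table created in serde_s3.q above
SELECT a.rip, a.httpstatus, a.requesturi, a.useragent
FROM s3log a
WHERE a.httpstatus = 200;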
