This repository has been archived by the owner on Jul 15, 2019. It is now read-only.
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
HIVE-693. Add an AWS S3 log format deserializer
(Zheng Shao and Andraz Tori via namit) git-svn-id: https://svn.apache.org/repos/asf/hadoop/hive/trunk@804035 13f79535-47bb-0310-9956-ffa450edef68
- Loading branch information
Namit Jain
committed
Aug 13, 2009
1 parent
a712257
commit 986f58d
Showing
7 changed files
with
294 additions
and
8 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
04ff331638adc13885d6c42059584deabbdeabcd55bf0bee491172a79a87b196 img.zemanta.com [09/Apr/2009:22:00:01 +0000] 212.143.99.188 65a011a29cdf8ec533ec3d1ccaae921c D987234E52141DE7 REST.GET.OBJECT pixy.gif "GET /pixy.gif?x-id=4560525f-2864-495c-842c-159ede7143f8 HTTP/1.1" 200 - 828 828 3 2 "http://www.mediafuturist.com/2009/02/marc-andreessen-on-the-charlie-rose-show-talking-about-mobile-technology-video.html" "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/1.0.154.53 Safari/525.19" | ||
04ff331638adc13885d6c42059584deabbdeabcd55bf0bee491172a79a87b196 img.zemanta.com [09/Apr/2009:22:00:01 +0000] 74.244.182.35 65a011a29cdf8ec533ec3d1ccaae921c 626EECA20AB12A5C REST.GET.OBJECT pixy.gif "GET /pixy.gif HTTP/1.1" 200 - 828 828 20 20 "http://trueslant.com/" "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_6; en-us) AppleWebKit/528.16 (KHTML, like Gecko) Version/4 Public Beta Safari/528.16" | ||
04ff331638adc13885d6c42059584deabbdeabcd55bf0bee491172a79a87b196 img.zemanta.com [09/Apr/2009:22:00:02 +0000] 62.149.175.120 65a011a29cdf8ec533ec3d1ccaae921c 3E93D70E69292C98 REST.GET.OBJECT pixy.gif "GET /pixy.gif?x-id=9fec752e-2318-4da3-864e-ac5b9e47c4ae HTTP/1.0" 200 - 828 828 4 3 "-" "-" | ||
04ff331638adc13885d6c42059584deabbdeabcd55bf0bee491172a79a87b196 img.zemanta.com [09/Apr/2009:22:00:02 +0000] 77.116.56.145 65a011a29cdf8ec533ec3d1ccaae921c 2FD33BC70C93C97A REST.GET.OBJECT pixie.png "GET /pixie.png?x-id=3300b26b-4455-47cb-800f-8fe7d80a6b39 HTTP/1.1" 200 - 900 900 5 5 "http://greenerloudoun.wordpress.com/2008/05/29/swarming-behavior/" "Mozilla/5.0 (Windows; U; Windows NT 5.1; de; rv:1.9.0.8) Gecko/2009032609 Firefox/3.0.8" | ||
04ff331638adc13885d6c42059584deabbdeabcd55bf0bee491172a79a87b196 img.zemanta.com [09/Apr/2009:22:00:03 +0000] 62.149.175.120 65a011a29cdf8ec533ec3d1ccaae921c 2AC19D72E1DD76E1 REST.GET.OBJECT pixy.gif "GET /pixy.gif?x-id=9fec752e-2318-4da3-864e-ac5b9e47c4ae HTTP/1.0" 200 - 828 828 7 6 "-" "-" | ||
04ff331638adc13885d6c42059584deabbdeabcd55bf0bee491172a79a87b196 img.zemanta.com [09/Apr/2009:22:00:03 +0000] 62.149.175.120 65a011a29cdf8ec533ec3d1ccaae921c C6DB555CD238EEA0 REST.GET.OBJECT pixy.gif "GET /pixy.gif?x-id=9fec752e-2318-4da3-864e-ac5b9e47c4ae HTTP/1.0" 200 - 828 828 6 5 "-" "-" | ||
04ff331638adc13885d6c42059584deabbdeabcd55bf0bee491172a79a87b196 img.zemanta.com [09/Apr/2009:22:00:06 +0000] 68.209.136.23 65a011a29cdf8ec533ec3d1ccaae921c 42C8E7EEF98D46BC REST.GET.OBJECT pixy.gif "GET /pixy.gif?x-id=78ad075e-832f-4006-a89e-7b5e2978674e HTTP/1.1" 200 - 828 828 5 4 "http://continuityblog.wordpress.com/category/ragdoll/" "Mozilla/5.0 (Windows; U; [] Windows NT 5.1; en-US; rv:1.9.0.8) Gecko/2009032609 Firefox/3.0.8" | ||
04ff331638adc13885d6c42059584deabbdeabcd55bf0bee491172a79a87b196 img.zemanta.com [09/Apr/2009:22:00:06 +0000] 68.209.136.23 65a011a29cdf8ec533ec3d1ccaae921c E5781B471524E1BB REST.GET.OBJECT pixy.gif "GET /pixy.gif?x-id=f596477d-b317-8882-8d14-0b8f168f5e8e HTTP/1.1" 200 - 828 828 4 3 "http://continuityblog.wordpress.com/category/ragdoll/" "Mozilla/5.0 (Windows; U; Win[ s]dows NT 5.1; en-US; rv:1.9.0.8) Gecko/2009032609 Firefox/3.0.8" | ||
04ff331638adc13885d6c42059584deabbdeabcd55bf0bee491172a79a87b196 img.zemanta.com [09/Apr/2009:22:00:07 +0000] 190.225.84.114 65a011a29cdf8ec533ec3d1ccaae921c F4FC3FEAD8C00024 REST.GET.OBJECT pixy.gif "GET /pixy.gif?x-id=23d25db1-160b-48bb-a932-e7dc1e88c321 HTTP/1.1" 304 - - 828 3 - "http://www.viamujer.com/2009/03/horoscopo-acuario-abril-mayo-y-junio-2009/" "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)" |
196 changes: 196 additions & 0 deletions
196
contrib/src/java/org/apache/hadoop/hive/contrib/serde2/s3/S3LogDeserializer.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,196 @@ | ||
/** | ||
* Licensed to the Apache Software Foundation (ASF) under one | ||
* or more contributor license agreements. See the NOTICE file | ||
* distributed with this work for additional information | ||
* regarding copyright ownership. The ASF licenses this file | ||
* to you under the Apache License, Version 2.0 (the | ||
* "License"); you may not use this file except in compliance | ||
* with the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
package org.apache.hadoop.hive.contrib.serde2.s3; | ||
|
||
|
||
import java.nio.charset.CharacterCodingException; | ||
import java.util.Date; | ||
import java.util.List; | ||
import java.util.Properties; | ||
import java.util.regex.*; | ||
|
||
import org.apache.commons.logging.Log; | ||
import org.apache.commons.logging.LogFactory; | ||
import org.apache.hadoop.conf.Configuration; | ||
import org.apache.hadoop.hive.contrib.serde2.s3.S3LogStruct; | ||
import org.apache.hadoop.hive.serde2.Deserializer; | ||
import org.apache.hadoop.hive.serde2.SerDeException; | ||
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; | ||
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; | ||
import org.apache.hadoop.hive.serde2.objectinspector.ReflectionStructObjectInspector; | ||
import org.apache.hadoop.hive.serde2.objectinspector.StructField; | ||
import org.apache.hadoop.io.BytesWritable; | ||
import org.apache.hadoop.io.Text; | ||
import org.apache.hadoop.io.Writable; | ||
import java.text.SimpleDateFormat; | ||
|
||
|
||
public class S3LogDeserializer implements Deserializer { | ||
|
||
public static final Log LOG = LogFactory.getLog(S3LogDeserializer.class.getName()); | ||
|
||
static { | ||
StackTraceElement[] sTrace = new Exception().getStackTrace(); | ||
String className = sTrace[0].getClassName(); | ||
} | ||
|
||
private ObjectInspector cachedObjectInspector; | ||
|
||
public String toString() { | ||
return "S3ZemantaDeserializer[]"; | ||
} | ||
|
||
public S3LogDeserializer() throws SerDeException { | ||
} | ||
|
||
// This regex is a bit lax in order to compensate for lack of any escaping done by Amazon S3 ... for example useragent string can have double quotes inside! | ||
static Pattern regexpat = Pattern.compile( "(\\S+) (\\S+) \\[(.*?)\\] (\\S+) (\\S+) (\\S+) (\\S+) (\\S+) \"(.+)\" (\\S+) (\\S+) (\\S+) (\\S+) (\\S+) (\\S+) \"(.*)\" \"(.*)\""); | ||
//static Pattern regexrid = Pattern.compile("x-id=([-0-9a-f]{36})"); | ||
//static SimpleDateFormat dateparser = new SimpleDateFormat("dd/MMM/yyyy:hh:mm:ss ZZZZZ"); | ||
|
||
S3LogStruct deserializeCache = new S3LogStruct(); | ||
public void initialize(Configuration job, Properties tbl) throws SerDeException { | ||
|
||
cachedObjectInspector = ObjectInspectorFactory.getReflectionObjectInspector( | ||
S3LogStruct.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA); | ||
|
||
LOG.debug(getClass().getName() + ": initialized"); | ||
} | ||
|
||
public static Integer toInt(String s) | ||
{ | ||
if (s.compareTo("-") == 0) | ||
return null; | ||
else | ||
return Integer.valueOf(s); | ||
} | ||
|
||
public static Object deserialize(S3LogStruct c, String row) throws Exception { | ||
Matcher match = regexpat.matcher(row); | ||
int t = 1; | ||
try { | ||
match.matches(); | ||
c.bucketowner = match.group(t++); | ||
c.bucketname = match.group(t++); | ||
} catch (Exception e) { | ||
throw new SerDeException("S3 Log Regex did not match:" + row, e); | ||
} | ||
c.rdatetime = match.group(t++); | ||
|
||
// Should we convert the datetime to the format Hive understands by default - either yyyy-mm-dd HH:MM:SS or seconds since epoch? | ||
//Date d = dateparser.parse(c.rdatetime); | ||
//c.rdatetimeepoch = d.getTime() / 1000; | ||
|
||
c.rip = match.group(t++); | ||
c.requester = match.group(t++); | ||
c.requestid = match.group(t++); | ||
c.operation = match.group(t++); | ||
c.rkey = match.group(t++); | ||
c.requesturi= match.group(t++); | ||
// System.err.println(c.requesturi); | ||
/*// Zemanta specific data extractor | ||
try { | ||
Matcher m2 = regexrid.matcher(c.requesturi); | ||
m2.find(); | ||
c.rid = m2.group(1); | ||
} catch (Exception e) { | ||
c.rid = null; | ||
} | ||
*/ | ||
c.httpstatus = toInt(match.group(t++)); | ||
c.errorcode = match.group(t++); | ||
c.bytessent = toInt(match.group(t++)); | ||
c.objsize = toInt(match.group(t++)); | ||
c.totaltime = toInt(match.group(t++)); | ||
c.turnaroundtime = toInt(match.group(t++)); | ||
c.referer = match.group(t++); | ||
c.useragent = match.group(t++); | ||
|
||
|
||
return (c); | ||
} | ||
|
||
public Object deserialize(Writable field) throws SerDeException { | ||
String row = null; | ||
if (field instanceof BytesWritable) { | ||
BytesWritable b = (BytesWritable)field; | ||
try { | ||
row = Text.decode(b.get(), 0, b.getSize()); | ||
} catch (CharacterCodingException e) { | ||
throw new SerDeException(e); | ||
} | ||
} else if (field instanceof Text) { | ||
row = field.toString(); | ||
} | ||
try { | ||
deserialize(deserializeCache, row); | ||
return deserializeCache; | ||
} catch (ClassCastException e) { | ||
throw new SerDeException( this.getClass().getName() + " expects Text or BytesWritable", e); | ||
} catch (Exception e) { | ||
throw new SerDeException(e); | ||
} | ||
} | ||
|
||
|
||
public ObjectInspector getObjectInspector() throws SerDeException { | ||
return cachedObjectInspector; | ||
} | ||
|
||
|
||
|
||
/** | ||
* @param args | ||
*/ | ||
public static void main(String[] args) { | ||
System.err.println("This is only a test run"); | ||
try { | ||
S3LogDeserializer serDe = new S3LogDeserializer(); | ||
Configuration conf = new Configuration(); | ||
Properties tbl = new Properties(); | ||
// Some nasty examples that show how S3 log format is broken ... and to test the regex | ||
// These are all sourced from genuine S3 logs | ||
//Text sample = new Text("04ff331638adc13885d6c42059584deabbdeabcd55bf0bee491172a79a87b196 img.zemanta.com [09/Apr/2009:22:00:07 +0000] 190.225.84.114 65a011a29cdf8ec533ec3d1ccaae921c F4FC3FEAD8C00024 REST.GET.OBJECT pixy.gif \"GET /pixy.gif?x-id=23d25db1-160b-48bb-a932-e7dc1e88c321 HTTP/1.1\" 304 - - 828 3 - \"http://www.viamujer.com/2009/03/horoscopo-acuario-abril-mayo-y-junio-2009/\" \"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)\""); | ||
//Text sample = new Text("04ff331638adc13885d6c42059584deabbdeabcd55bf0bee491172a79a87b196 img.zemanta.com [09/Apr/2009:22:19:49 +0000] 60.28.204.7 65a011a29cdf8ec533ec3d1ccaae921c 7D87B6835125671E REST.GET.OBJECT pixy.gif \"GET /pixy.gif?x-id=b50a4544-938b-4a63-992c-721d1a644b28 HTTP/1.1\" 200 - 828 828 4 3 \"\" \"ZhuaXia.com\""); | ||
//Text sample = new Text("04ff331638adc13885d6c42059584deabbdeabcd55bf0bee491172a79a87b196 static.zemanta.com [09/Apr/2009:23:12:39 +0000] 65.94.12.181 65a011a29cdf8ec533ec3d1ccaae921c EEE6FFE9B9F9EA29 REST.HEAD.OBJECT readside/loader.js%22+defer%3D%22defer \"HEAD /readside/loader.js\"+defer=\"defer HTTP/1.0\" 403 AccessDenied 231 - 7 - \"-\" \"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0)\""); | ||
Text sample = new Text("04ff331638adc13885d6c42059584deabbdeabcd55bf0bee491172a79a87b196 img.zemanta.com [10/Apr/2009:05:34:01 +0000] 70.32.81.92 65a011a29cdf8ec533ec3d1ccaae921c F939A7D698D27C63 REST.GET.OBJECT reblog_b.png \"GET /reblog_b.png?x-id=79ca9376-6326-41b7-9257-eea43d112eb2 HTTP/1.0\" 200 - 1250 1250 160 159 \"-\" \"Firefox 0.8 (Linux)\" useragent=\"Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.6) Gecko/20040614 Firefox/0.8\""); | ||
serDe.initialize(conf, tbl); | ||
Object row = serDe.deserialize(sample); | ||
System.err.println(serDe.getObjectInspector().getClass().toString()); | ||
ReflectionStructObjectInspector oi = (ReflectionStructObjectInspector)serDe.getObjectInspector(); | ||
List<? extends StructField> fieldRefs = oi.getAllStructFieldRefs(); | ||
for (int i = 0; i < fieldRefs.size(); i++) { | ||
System.err.println(fieldRefs.get(i).toString()); | ||
Object fieldData = oi.getStructFieldData(row, fieldRefs.get(i)); | ||
if (fieldData == null) | ||
System.err.println("null"); | ||
else | ||
System.err.println(fieldData.toString()); | ||
} | ||
|
||
} catch (Exception e) { | ||
System.err.println("Caught: " + e); | ||
e.printStackTrace(); | ||
} | ||
|
||
} | ||
|
||
|
||
|
||
} |
24 changes: 24 additions & 0 deletions
24
contrib/src/java/org/apache/hadoop/hive/contrib/serde2/s3/S3LogStruct.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
package org.apache.hadoop.hive.contrib.serde2.s3; | ||
|
||
public class S3LogStruct { | ||
|
||
public String bucketowner; | ||
public String bucketname; | ||
public String rdatetime; | ||
// public Long rdatetimeepoch; // The format Hive understands by default, should we convert? | ||
public String rip; | ||
public String requester; | ||
public String requestid; | ||
public String operation; | ||
public String rkey; | ||
public String requesturi; | ||
public Integer httpstatus; | ||
public String errorcode; | ||
public Integer bytessent; | ||
public Integer objsize; | ||
public Integer totaltime; | ||
public Integer turnaroundtime; | ||
public String referer; | ||
public String useragent; | ||
// public String rid; // Specific Zemanta use | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
add jar ../build/contrib/hive_contrib.jar;

-- Smoke test for the contrib S3LogDeserializer SerDe.
-- The CREATE TABLE below declares no columns of its own, so the DESCRIBE
-- output presumably reflects the fields reported by the SerDe (TODO confirm).
DROP TABLE s3log;
CREATE TABLE s3log
ROW FORMAT SERDE 'org.apache.hadoop.hive.contrib.serde2.s3.S3LogDeserializer'
STORED AS TEXTFILE;

DESCRIBE s3log;

-- Load the sample S3 access log lines shipped with the contrib test data.
LOAD DATA LOCAL INPATH '../contrib/data/files/s3.log' INTO TABLE s3log;

-- Read every row back through the deserializer.
SELECT a.* FROM s3log a;

-- Clean up so later tests start without this table.
DROP TABLE s3log;
Oops, something went wrong.