
Initial import; extracted from the rest of the project

1 parent 3ebd2b9 commit 194d8b9e4f83827dac01857f1139bd3a60f63743 @bsdfish committed Jul 18, 2010
Showing with 510 additions and 0 deletions.
  1. +51 −0 README
  2. +52 −0 build.xml
  3. +69 −0 src/Examples.scala
  4. +39 −0 src/ImplicitConversion.scala
  5. +291 −0 src/ScalaHadoop.scala
  6. +8 −0 src/ScalaHadoopTool.scala
51 README
@@ -0,0 +1,51 @@
+This code provides some syntactic sugar on top of Hadoop in order to make
+it more usable from Scala. Take a look at Examples.scala for more
+details.
+
+A basic mapper looks like this:
+
+object TokenizerMap extends TypedMapper[LongWritable, Text, Text, LongWritable] {
+ override def map(k: LongWritable, v: Text, context: ContextType) : Unit =
+ v split " |\t" foreach ((word) => context.write(word, 1L))
+}
+
+or, equivalently, you can write it as:
+
+object TokenizerMap1 extends TypedMapper[LongWritable, Text, Text, LongWritable] {
+ override def doMap: Unit = v split " |\t" foreach ((word) => context.write(word, 1L))
+}
+
+and a reducer looks like this:
+
+object SumReducer1 extends TypedReducer[Text, LongWritable, Text, LongWritable] {
+ override def doReduce: Unit = context.write(k, (0L /: v) ((total, next) => total+next))
+}
+
+Note that implicit conversions are used to convert between LongWritable and
+Long, as well as between Text and String. The input and output types only
+need to be declared once, as the type parameters of the class being extended.
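+
+For example, with ImplicitConversion._ in scope, the call
+
+  context.write(word, 1L)
+
+in the mapper above is roughly equivalent to the explicit boxing below
+(a sketch; word is assumed to be a String):
+
+  context.write(new Text(word), new LongWritable(1L))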
+
+These mappers and reducers can be chained together with the --> operator:
+
+object WordCount extends ScalaHadoopTool {
+ def run(args: Array[String]) : Int = {
+ (MapReduceTaskChain.init() -->
+ IO.Text[LongWritable, Text](args(0)).input -->
+ MapReduceTask.MapReduceTask(TokenizerMap1, SumReducer1) -->
+ IO.Text[Text, LongWritable](args(1)).output) execute;
+ return 0;
+ }
+}
+
+Multiple map/reduce runs can be chained together:
+
+object WordsWithSameCount extends ScalaHadoopTool {
+ def run(args: Array[String]) : Int = {
+ (MapReduceTaskChain.init() -->
+ IO.Text[LongWritable, Text](args(0)).input -->
+ MapReduceTask.MapReduceTask(TokenizerMap1, SumReducer1) -->
+ MapReduceTask.MapReduceTask(FlipKeyValueMap, WordListReducer) -->
+ IO.Text[LongWritable, Text](args(1)).output) execute;
+ return 0;
+ }
+}
52 build.xml
@@ -0,0 +1,52 @@
+<project default="jar">
+ <target name="clean">
+ <delete dir="build"/>
+ </target>
+
+
+ <target name="compile">
+ <mkdir dir="build/classes"/>
+ <scalac srcdir="src" destdir="build/classes" classpathref="classpath" deprecation="yes"/>
+ <javac srcdir="src" destdir="build/classes" classpathref="classpath"/>
+ <copy todir="build/classes/lib">
+ <fileset dir="/opt/local/share/scala-2.8/lib/" includes="scala-lib*"/>
+ </copy>
+ </target>
+
+ <target name="doc">
+ <mkdir dir="doc"/>
+ <scaladoc srcdir="src" destdir="doc" classpathref="classpath">
+ <include name="*.scala"/>
+ <include name="*.java"/>
+ </scaladoc>
+ </target>
+
+ <target name="jar" depends="compile">
+ <jar destfile="ScalaHadoop.jar"
+ basedir="build/classes"
+ />
+ </target>
+
+ <target name="run">
+ <exec executable="/Users/asimma/local/hadoop-0.20.2/bin/hadoop">
+ <arg line="jar ScalaHadoop.jar com.asimma.ScalaHadoop.WordCount"/>
+ </exec>
+ </target>
+
+ <path id="classpath">
+ <fileset dir="/Users/asimma/local/hadoop-0.20.2" includes="**/*.jar"/>
+ <fileset dir="/opt/local/share/scala-2.8/lib" includes="*.jar"/>
+ </path>
+
+
+
+ <property name="scala.lib.dir" location="/opt/local/share/scala-2.8/lib"/>
+ <taskdef resource="scala/tools/ant/antlib.xml">
+ <classpath>
+ <pathelement location="${scala.lib.dir}/scala-compiler.jar"/>
+ <pathelement location="${scala.lib.dir}/scala-library.jar"/>
+ <pathelement location="${scala.lib.dir}/scalap.jar"/>
+ </classpath>
+ </taskdef>
+
+</project>
69 src/Examples.scala
@@ -0,0 +1,69 @@
+package com.asimma.ScalaHadoop;
+
+import org.apache.hadoop.io._;
+import MapReduceTaskChain._;
+import ImplicitConversion._;
+import scala.reflect.Manifest;
+import scala.collection.JavaConversions._
+
+
+
+
+object TokenizerMap extends TypedMapper[LongWritable, Text, Text, LongWritable] {
+ override def map(k: LongWritable, v: Text, context: ContextType) : Unit =
+ v split " |\t" foreach ((word) => context.write(word, 1L))
+}
+
+object TokenizerMap1 extends TypedMapper[LongWritable, Text, Text, LongWritable] {
+ override def doMap: Unit = v split " |\t" foreach ((word) => context.write(word, 1L))
+}
+
+object FlipKeyValueMap extends TypedMapper[Text, LongWritable, LongWritable, Text] {
+ override def map(k: Text, v:LongWritable, context: ContextType) : Unit =
+ context.write(v, k);
+}
+
+object SumReducer extends TypedReducer[Text, LongWritable, Text, LongWritable] {
+ override def reduce(k: Text, v: java.lang.Iterable[LongWritable], context: ContextType) : Unit =
+ context.write(k, (0L /: v) ((total, next) => total+next))
+}
+
+
+object SumReducer1 extends TypedReducer[Text, LongWritable, Text, LongWritable] {
+ override def doReduce: Unit = context.write(k, (0L /: v) ((total, next) => total+next))
+}
+
+object WordListReducer extends TypedReducer[LongWritable, Text, LongWritable, Text] {
+ override def doReduce: Unit = context write (k, (new StringBuilder /: v) ((soFar, newString) => soFar.append(newString + " ")));
+
+ /* If you're not comfortable with folds, this could also be written as
+    val builder = new StringBuilder
+    v foreach (t => builder.append(t + " "))
+    context write (k, builder toString)
+ */
+}
+
+
+
+object WordCount extends ScalaHadoopTool {
+ def run(args: Array[String]) : Int = {
+ val c = MapReduceTaskChain.init() -->
+ IO.Text[LongWritable, Text](args(0)).input -->
+ MapReduceTask.MapReduceTask(TokenizerMap1, SumReducer) -->
+ IO.Text[Text, LongWritable](args(1)).output;
+ c.execute();
+ return 0;
+ }
+}
+
+object WordsWithSameCount extends ScalaHadoopTool {
+ def run(args: Array[String]) : Int = {
+ val c = MapReduceTaskChain.init() -->
+ IO.Text[LongWritable, Text](args(0)).input -->
+ MapReduceTask.MapReduceTask(TokenizerMap1, SumReducer) -->
+ MapReduceTask.MapReduceTask(FlipKeyValueMap, WordListReducer) -->
+ IO.Text[LongWritable, Text](args(1)).output;
+ c.execute();
+ return 0;
+ }
+}
39 src/ImplicitConversion.scala
@@ -0,0 +1,39 @@
+// This is inspired by Shadoop
+// (http://blog.jonhnnyweslley.net/2008/05/shadoop.html)
+package com.asimma.ScalaHadoop
+
+import org.apache.hadoop.io._
+
+object ImplicitConversion {
+ // Handle BooleanWritable
+ implicit def BooleanWritableUnbox(v: BooleanWritable) = v.get
+ implicit def BooleanWritableBox (v: Boolean) = new BooleanWritable(v)
+
+ // Handle DoubleWritable
+ implicit def DoubleWritableUnbox(v: DoubleWritable) = v.get
+ implicit def DoubleWritableBox (v: Double) = new DoubleWritable(v)
+
+ // Handle FloatWritable
+ implicit def FloatWritableUnbox(v: FloatWritable) = v.get
+ implicit def FloatWritableBox (v: Float) = new FloatWritable(v)
+
+ // Handle IntWritable
+ implicit def IntWritableUnbox(v: IntWritable) = v.get
+ implicit def IntWritableBox (v: Int) = new IntWritable(v)
+
+ // Handle LongWritable
+ implicit def LongWritableUnbox(v: LongWritable) = v.get
+ implicit def LongWritableBox (v: Long) = new LongWritable(v)
+
+ // Handle Text
+ implicit def TextUnbox(v: Text) = v.toString
+ implicit def TextBox (v: String) = new Text(v)
+ implicit def StringBuilderBox (v: StringBuilder) = new Text(v.toString)
+ implicit def StringBufferBox (v: StringBuffer) = new Text(v.toString)
+
+
+ implicit def MapWritableBox[X <: Writable, Y <: Writable](value: scala.collection.Map[X,Y]): MapWritable = {
+   val newMap = new MapWritable()
+   value.foreach { case (k, v) => newMap.put(k, v) }
+   newMap
+ }
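+
+ // Usage sketch (illustrative assumption, not code from this commit): with
+ // these conversions in scope, a Scala Map of Writables can be used wherever
+ // a MapWritable is expected, e.g.
+ //   val mw: MapWritable = Map(new Text("a") -> new LongWritable(1L))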
+}