Commit 194d8b9
Initial import; extracted from the rest of the project
bsdfish committed Jul 18, 2010
1 parent 3ebd2b9
Showing 6 changed files with 510 additions and 0 deletions.
51 changes: 51 additions & 0 deletions README
@@ -0,0 +1,51 @@
This code provides some syntactic sugar on top of Hadoop in order to make
it more usable from Scala. Take a look at Examples.scala for more
details.

A basic mapper looks like

object TokenizerMap extends TypedMapper[LongWritable, Text, Text, LongWritable] {
  override def map(k: LongWritable, v: Text, context: ContextType) : Unit =
    v split " |\t" foreach ((word) => context.write(word, 1L))
}

or, you can write it more compactly as

object TokenizerMap1 extends TypedMapper[LongWritable, Text, Text, LongWritable] {
  override def doMap : Unit = v split " |\t" foreach ((word) => context.write(word, 1L))
}

and a reducer looks like

object SumReducer1 extends TypedReducer[Text, LongWritable, Text, LongWritable] {
  override def doReduce : Unit = context.write(k, (0L /: v) ((total, next) => total + next))
}

Note that implicit conversions are used to convert between LongWritable and Long, as well as
between Text and String. The input and output types only need to be stated once, as the
generic type parameters of the class being extended.
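
For example, the implicits defined in ImplicitConversion.scala make the following
assignments compile (a minimal sketch; the value names are just for illustration):

import org.apache.hadoop.io._
import ImplicitConversion._

val text: Text          = "hello"   // TextBox: String => Text
val count: LongWritable = 1L        // LongWritableBox: Long => LongWritable
val total: Long         = count     // LongWritableUnbox: LongWritable => Long

This is why the mappers above can call context.write(word, 1L) without constructing
Text or LongWritable objects by hand.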

These mappers and reducers can be chained together with the --> operator

object WordCount extends ScalaHadoopTool {
  def run(args: Array[String]) : Int = {
    (MapReduceTaskChain.init() -->
     IO.Text[LongWritable, Text](args(0)).input -->
     MapReduceTask.MapReduceTask(TokenizerMap1, SumReducer1) -->
     IO.Text[Text, LongWritable](args(1)).output) execute;
    return 0;
  }
}

Multiple map/reduce runs can be chained together

object WordsWithSameCount extends ScalaHadoopTool {
  def run(args: Array[String]) : Int = {
    (MapReduceTaskChain.init() -->
     IO.Text[LongWritable, Text](args(0)).input -->
     MapReduceTask.MapReduceTask(TokenizerMap1, SumReducer1) -->
     MapReduceTask.MapReduceTask(FlipKeyValueMap, WordListReducer) -->
     IO.Text[LongWritable, Text](args(1)).output) execute;
    return 0;
  }
}
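
To try it out, build the jar with ant (the default target is jar) and launch a job through
the hadoop driver, the same way the run target in build.xml does. The input and output
paths below are placeholders, assuming ScalaHadoopTool forwards the program arguments
to run:

hadoop jar ScalaHadoop.jar com.asimma.ScalaHadoop.WordCount in.txt wordcount_out
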
52 changes: 52 additions & 0 deletions build.xml
@@ -0,0 +1,52 @@
<project default="jar">
<target name="clean">
<delete dir="build"/>
</target>


<target name="compile">
<mkdir dir="build/classes"/>
<scalac srcdir="src" destdir="build/classes" classpathref="classpath" deprecation="yes"/>
<javac srcdir="src" destdir="build/classes" classpathref="classpath"/>
<copy todir="build/classes/lib">
<fileset dir="/opt/local/share/scala-2.8/lib/" includes="scala-lib*"/>
</copy>
</target>

<target name="doc">
<mkdir dir="doc"/>
<scaladoc srcdir="src/ScalaHadoop" destdir="doc" classpathref="classpath">
<include name="*.scala"/>
<include name="*.java"/>
</scaladoc>
</target>

<target name="jar" depends="compile">
<jar destfile="ScalaHadoop.jar"
basedir="build/classes"
/>
</target>

<target name="run">
<exec executable="/Users/asimma/local/hadoop-0.20.2/bin/hadoop">
<arg line="jar ScalaHadoop.jar com.asimma.ScalaHadoop.WordCount"/>
</exec>
</target>

<path id="classpath">
<fileset dir="/Users/asimma/local/hadoop-0.20.2" includes="**/*.jar"/>
<fileset dir="/opt/local/share/scala-2.8/lib" includes="*.jar"/>
</path>



<property name="scala.lib.dir" location="/opt/local/share/scala-2.8/lib"/>
<taskdef resource="scala/tools/ant/antlib.xml">
<classpath>
<pathelement location="${scala.lib.dir}/scala-compiler.jar"/>
<pathelement location="${scala.lib.dir}/scala-library.jar"/>
<pathelement location="${scala.lib.dir}/scalap.jar"/>
</classpath>
</taskdef>

</project>
69 changes: 69 additions & 0 deletions src/Examples.scala
@@ -0,0 +1,69 @@
package com.asimma.ScalaHadoop;

import org.apache.hadoop.io._;
import MapReduceTaskChain._;
import ImplicitConversion._;
import scala.reflect.Manifest;
import scala.collection.JavaConversions._




object TokenizerMap extends TypedMapper[LongWritable, Text, Text, LongWritable] {
  override def map(k: LongWritable, v: Text, context: ContextType) : Unit =
    v split " |\t" foreach ((word) => context.write(word, 1L))
}

object TokenizerMap1 extends TypedMapper[LongWritable, Text, Text, LongWritable] {
  override def doMap : Unit = v split " |\t" foreach ((word) => context.write(word, 1L))
}

object FlipKeyValueMap extends TypedMapper[Text, LongWritable, LongWritable, Text] {
  override def map(k: Text, v: LongWritable, context: ContextType) : Unit =
    context.write(v, k);
}

object SumReducer extends TypedReducer[Text, LongWritable, Text, LongWritable] {
  override def reduce(k: Text, v: java.lang.Iterable[LongWritable], context: ContextType) : Unit =
    context.write(k, (0L /: v) ((total, next) => total + next))
}


object SumReducer1 extends TypedReducer[Text, LongWritable, Text, LongWritable] {
  override def doReduce : Unit = context.write(k, (0L /: v) ((total, next) => total + next))
}

object WordListReducer extends TypedReducer[LongWritable, Text, LongWritable, Text] {
  override def doReduce : Unit =
    context write (k, (new StringBuilder /: v) ((soFar, newString) => soFar.append(newString + " ")));

  /* If you're not comfortable with folds, this could also be written as:
       val builder = new StringBuilder
       v foreach (t => builder.append(t + " "))
       context write (k, builder.toString)
  */
}



object WordCount extends ScalaHadoopTool {
  def run(args: Array[String]) : Int = {
    val c = MapReduceTaskChain.init() -->
            IO.Text[LongWritable, Text](args(0)).input -->
            MapReduceTask.MapReduceTask(TokenizerMap1, SumReducer) -->
            IO.Text[Text, LongWritable](args(1)).output;
    c.execute();
    return 0;
  }
}

object WordsWithSameCount extends ScalaHadoopTool {
  def run(args: Array[String]) : Int = {
    val c = MapReduceTaskChain.init() -->
            IO.Text[LongWritable, Text](args(0)).input -->
            MapReduceTask.MapReduceTask(TokenizerMap1, SumReducer) -->
            MapReduceTask.MapReduceTask(FlipKeyValueMap, WordListReducer) -->
            IO.Text[LongWritable, Text](args(1)).output;
    c.execute();
    return 0;
  }
}
39 changes: 39 additions & 0 deletions src/ImplicitConversion.scala
@@ -0,0 +1,39 @@
// This is inspired by Shadoop
// (http://blog.jonhnnyweslley.net/2008/05/shadoop.html)
package com.asimma.ScalaHadoop

import org.apache.hadoop.io._

object ImplicitConversion {
  // Handle BooleanWritable
  implicit def BooleanWritableUnbox(v: BooleanWritable) = v.get
  implicit def BooleanWritableBox (v: Boolean) = new BooleanWritable(v)

  // Handle DoubleWritable
  implicit def DoubleWritableUnbox(v: DoubleWritable) = v.get
  implicit def DoubleWritableBox (v: Double) = new DoubleWritable(v)

  // Handle FloatWritable
  implicit def FloatWritableUnbox(v: FloatWritable) = v.get
  implicit def FloatWritableBox (v: Float) = new FloatWritable(v)

  // Handle IntWritable
  implicit def IntWritableUnbox(v: IntWritable) = v.get
  implicit def IntWritableBox (v: Int) = new IntWritable(v)

  // Handle LongWritable
  implicit def LongWritableUnbox(v: LongWritable) = v.get
  implicit def LongWritableBox (v: Long) = new LongWritable(v)

  // Handle Text
  implicit def TextUnbox(v: Text) = v.toString
  implicit def TextBox (v: String) = new Text(v)
  implicit def StringBuilderBox (v: StringBuilder) = new Text(v.toString)
  implicit def StringBufferBox (v: StringBuffer) = new Text(v.toString)

  // Handle Scala Maps of Writables
  implicit def MapWritableBox[X <: Writable, Y <: Writable](value: scala.collection.Map[X, Y]): MapWritable = {
    val newMap = new MapWritable()
    value.foreach { case (k, v) => newMap.put(k, v) }
    newMap
  }
}
