Commit 194d8b9
Initial import; extracted from the rest of the project
bsdfish committed Jul 18, 2010
1 parent 3ebd2b9
Showing 6 changed files with 510 additions and 0 deletions.
51 changes: 51 additions & 0 deletions README
@@ -0,0 +1,51 @@
This code provides some syntactic sugar on top of Hadoop in order to make
it more usable from Scala. Take a look at Examples.scala for more
details.

A basic mapper looks like

object TokenizerMap extends TypedMapper[LongWritable, Text, Text, LongWritable] {
  override def map(k: LongWritable, v: Text, context: ContextType) : Unit =
    v split " |\t" foreach ((word) => context.write(word, 1L))
}

or, you can write it more compactly as

object TokenizerMap1 extends TypedMapper[LongWritable, Text, Text, LongWritable] {
  override def doMap : Unit = v split " |\t" foreach ((word) => context.write(word, 1L))
}

and a reducer looks like

object SumReducer1 extends TypedReducer[Text, LongWritable, Text, LongWritable] {
  override def doReduce : Unit = context.write(k, (0L /: v) ((total, next) => total + next))
}

Note that implicit conversions are used to convert between LongWritable and Long, as well as
between Text and String. The input and output types only need to be stated once, as the
generic type parameters of the class being extended.
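
For example, the implicits defined in ImplicitConversion.scala make the following
assignments compile (a minimal sketch; the value names are just for illustration):

import org.apache.hadoop.io._
import ImplicitConversion._

val text: Text          = "hello"   // TextBox: String => Text
val count: LongWritable = 1L        // LongWritableBox: Long => LongWritable
val total: Long         = count     // LongWritableUnbox: LongWritable => Long

This is why the mappers above can call context.write(word, 1L) without constructing
Text or LongWritable objects by hand.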

These mappers and reducers can be chained together with the --> operator

object WordCount extends ScalaHadoopTool {
  def run(args: Array[String]) : Int = {
    (MapReduceTaskChain.init() -->
     IO.Text[LongWritable, Text](args(0)).input -->
     MapReduceTask.MapReduceTask(TokenizerMap1, SumReducer1) -->
     IO.Text[Text, LongWritable](args(1)).output) execute;
    return 0;
  }
}

Multiple map/reduce runs can be chained together

object WordsWithSameCount extends ScalaHadoopTool {
  def run(args: Array[String]) : Int = {
    (MapReduceTaskChain.init() -->
     IO.Text[LongWritable, Text](args(0)).input -->
     MapReduceTask.MapReduceTask(TokenizerMap1, SumReducer1) -->
     MapReduceTask.MapReduceTask(FlipKeyValueMap, WordListReducer) -->
     IO.Text[LongWritable, Text](args(1)).output) execute;
    return 0;
  }
}
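
To try it out, build the jar with ant (the default target is jar) and launch a job through
the hadoop driver, the same way the run target in build.xml does. The input and output
paths below are placeholders, assuming ScalaHadoopTool forwards the program arguments
to run:

hadoop jar ScalaHadoop.jar com.asimma.ScalaHadoop.WordCount in.txt wordcount_out
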
52 changes: 52 additions & 0 deletions build.xml
@@ -0,0 +1,52 @@
<project default="jar">
<target name="clean">
<delete dir="build"/>
</target>


<target name="compile">
<mkdir dir="build/classes"/>
<scalac srcdir="src" destdir="build/classes" classpathref="classpath" deprecation="yes"/>
<javac srcdir="src" destdir="build/classes" classpathref="classpath"/>
<copy todir="build/classes/lib">
<fileset dir="/opt/local/share/scala-2.8/lib/" includes="scala-lib*"/>
</copy>
</target>

<target name="doc">
<mkdir dir="doc"/>
<scaladoc srcdir="src/ScalaHadoop" destdir="doc" classpathref="classpath">
<include name="*.scala"/>
<include name="*.java"/>
</scaladoc>
</target>

<target name="jar" depends="compile">
<jar destfile="ScalaHadoop.jar"
basedir="build/classes"
/>
</target>

<target name="run">
<exec executable="/Users/asimma/local/hadoop-0.20.2/bin/hadoop">
<arg line="jar ScalaHadoop.jar com.asimma.ScalaHadoop.WordCount"/>
</exec>
</target>

<path id="classpath">
<fileset dir="/Users/asimma/local/hadoop-0.20.2" includes="**/*.jar"/>
<fileset dir="/opt/local/share/scala-2.8/lib" includes="*.jar"/>
</path>



<property name="scala.lib.dir" location="/opt/local/share/scala-2.8/lib"/>
<taskdef resource="scala/tools/ant/antlib.xml">
<classpath>
<pathelement location="${scala.lib.dir}/scala-compiler.jar"/>
<pathelement location="${scala.lib.dir}/scala-library.jar"/>
<pathelement location="${scala.lib.dir}/scalap.jar"/>
</classpath>
</taskdef>

</project>
69 changes: 69 additions & 0 deletions src/Examples.scala
@@ -0,0 +1,69 @@
package com.asimma.ScalaHadoop;

import org.apache.hadoop.io._;
import MapReduceTaskChain._;
import ImplicitConversion._;
import scala.reflect.Manifest;
import scala.collection.JavaConversions._




object TokenizerMap extends TypedMapper[LongWritable, Text, Text, LongWritable] {
  override def map(k: LongWritable, v: Text, context: ContextType) : Unit =
    v split " |\t" foreach ((word) => context.write(word, 1L))
}

object TokenizerMap1 extends TypedMapper[LongWritable, Text, Text, LongWritable] {
  override def doMap : Unit = v split " |\t" foreach ((word) => context.write(word, 1L))
}

object FlipKeyValueMap extends TypedMapper[Text, LongWritable, LongWritable, Text] {
  override def map(k: Text, v: LongWritable, context: ContextType) : Unit =
    context.write(v, k);
}

object SumReducer extends TypedReducer[Text, LongWritable, Text, LongWritable] {
  override def reduce(k: Text, v: java.lang.Iterable[LongWritable], context: ContextType) : Unit =
    context.write(k, (0L /: v) ((total, next) => total + next))
}


object SumReducer1 extends TypedReducer[Text, LongWritable, Text, LongWritable] {
  override def doReduce : Unit = context.write(k, (0L /: v) ((total, next) => total + next))
}

object WordListReducer extends TypedReducer[LongWritable, Text, LongWritable, Text] {
  override def doReduce : Unit =
    context write (k, (new StringBuilder /: v) ((soFar, newString) => soFar.append(newString + " ")));

  /* If you're not comfortable with folds, this could also be written as:
       val builder = new StringBuilder
       v foreach (t => builder.append(t + " "))
       context write (k, builder.toString)
  */
}



object WordCount extends ScalaHadoopTool {
  def run(args: Array[String]) : Int = {
    val c = MapReduceTaskChain.init() -->
            IO.Text[LongWritable, Text](args(0)).input -->
            MapReduceTask.MapReduceTask(TokenizerMap1, SumReducer) -->
            IO.Text[Text, LongWritable](args(1)).output;
    c.execute();
    return 0;
  }
}

object WordsWithSameCount extends ScalaHadoopTool {
  def run(args: Array[String]) : Int = {
    val c = MapReduceTaskChain.init() -->
            IO.Text[LongWritable, Text](args(0)).input -->
            MapReduceTask.MapReduceTask(TokenizerMap1, SumReducer) -->
            MapReduceTask.MapReduceTask(FlipKeyValueMap, WordListReducer) -->
            IO.Text[LongWritable, Text](args(1)).output;
    c.execute();
    return 0;
  }
}
39 changes: 39 additions & 0 deletions src/ImplicitConversion.scala
@@ -0,0 +1,39 @@
// This is inspired by Shadoop
// (http://blog.jonhnnyweslley.net/2008/05/shadoop.html)
package com.asimma.ScalaHadoop

import org.apache.hadoop.io._

object ImplicitConversion {
  // Handle BooleanWritable
  implicit def BooleanWritableUnbox(v: BooleanWritable) = v.get
  implicit def BooleanWritableBox (v: Boolean) = new BooleanWritable(v)

  // Handle DoubleWritable
  implicit def DoubleWritableUnbox(v: DoubleWritable) = v.get
  implicit def DoubleWritableBox (v: Double) = new DoubleWritable(v)

  // Handle FloatWritable
  implicit def FloatWritableUnbox(v: FloatWritable) = v.get
  implicit def FloatWritableBox (v: Float) = new FloatWritable(v)

  // Handle IntWritable
  implicit def IntWritableUnbox(v: IntWritable) = v.get
  implicit def IntWritableBox (v: Int) = new IntWritable(v)

  // Handle LongWritable
  implicit def LongWritableUnbox(v: LongWritable) = v.get
  implicit def LongWritableBox (v: Long) = new LongWritable(v)

  // Handle Text
  implicit def TextUnbox(v: Text) = v.toString
  implicit def TextBox (v: String) = new Text(v)
  implicit def StringBuilderBox (v: StringBuilder) = new Text(v.toString)
  implicit def StringBufferBox (v: StringBuffer) = new Text(v.toString)

  // Handle Scala Maps of Writables
  implicit def MapWritableBox[X <: Writable, Y <: Writable](value: scala.collection.Map[X, Y]): MapWritable = {
    val newMap = new MapWritable()
    value.foreach { case (k, v) => newMap.put(k, v) }
    newMap
  }
}
