add

amumu-dev · Sep 1, 2011 · 33f8cfd · 33f8cfd
1 parent a9bc9a5
commit 33f8cfd
Show file tree

Hide file tree

Showing 5 changed files with 207 additions and 0 deletions.
diff --git a/python/mapred/perceptron/local.sh b/python/mapred/perceptron/local.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+if [ $# != '2' ]
+then
+    echo "local.sh input_path output_path"
+    exit
+fi
+
+for i in `seq 1 10`; do
+    cat $1 | ./perceptron.py -w $2/weight`expr $i - 1`.txt | sort | ./reduce.py > $2/weight$i.txt;
+done;
+
diff --git a/python/mapred/perceptron/perceptron.py b/python/mapred/perceptron/perceptron.py
@@ -0,0 +1,89 @@
+#!/usr/bin/env python
+#encoding: utf-8
+
+from sys import stdin
+from optparse import OptionParser
+from collections import defaultdict
+from random import shuffle, seed
+
+
+def parse(line):
+    pair = line.strip().split(" ", 1)
+    if len(pair) != 2: return None
+    label, document = pair
+
+    features = defaultdict(float)
+    for feature in document.split(" "):
+        key, value = feature.split(":", 1)
+        features[key] = float(value)
+
+    return (label, features)
+
+def dot(x, y):
+    return sum(x.get(i, 0.) * y[i] for i in y.iterkeys())
+
+class Perceptron:
+    def __init__(self, weight):
+        self.weight = weight
+
+    def train(self, documents, iteration, learn):
+        for label, features in documents:
+            if not label in self.weight:
+                self.weight[label] = defaultdict(float)
+        for i in range(iteration):
+            shuffle(documents)
+            for label, features in documents:
+                # prediction
+                prediction, scores = self.predict(features)
+
+                # update
+                if prediction != label:
+                    for key, value in features.iteritems(): 
+                        self.weight[label][key] += learn * value
+                        self.weight[prediction][key] -= learn * value
+
+
+    def predict(self, features):
+        max_label = None
+        scores = {}
+        for label, weight in self.weight.iteritems():
+            scores[label] = dot(weight, features)
+            if max_label == None or scores[label] > scores[max_label]:
+                max_label = label
+        return (max_label, scores)
+
+    def output(self):
+        result = ""
+        for label, weight in self.weight.iteritems():
+            weight_str = " ".join(key + ":" + str(value) for key, value in weight.iteritems())
+            result += label + " " + weight_str + "\n"
+        return result
+
+if __name__ == '__main__':
+    parser = OptionParser()
+    parser.add_option("-i", dest="iteration", type="int", default=1)
+    parser.add_option("-e", dest="eta", type="float", default=0.001)
+    parser.add_option("-w", dest="weight")
+    (options, args) = parser.parse_args()
+
+    # Load file
+    documents = []
+    for line in stdin:
+        documents.append(parse(line))
+
+    # Load weights from Distributed Cache
+    weight = defaultdict(lambda: defaultdict(float))
+    if options.weight != None:
+        for line in open(options.weight):
+            parsed = parse(line)
+            if parsed == None: continue
+            key, value = parsed
+            weight[key] = value
+
+    # Train perceptron
+    perceptron = Perceptron(weight)
+    perceptron.train(documents, options.iteration, options.eta)
+
+    # Output weight file
+    print perceptron.output(),
+
diff --git a/python/mapred/perceptron/predict.py b/python/mapred/perceptron/predict.py
@@ -0,0 +1,36 @@
+#!/usr/bin/env python
+#encoding: utf-8
+
+from perceptron import *
+
+if __name__ == '__main__':
+    parser = OptionParser()
+    parser.add_option("-w", dest="weight")
+    (options, args) = parser.parse_args()
+
+    # Load file
+    documents = []
+    for line in stdin:
+        documents.append(parse(line))
+
+    # Load weights from Distributed Cache
+    weight = defaultdict(lambda: defaultdict(float))
+    if options.weight != None:
+        for line in open(options.weight):
+            parsed = parse(line)
+            if parsed == None: continue
+            key, value = parsed
+            weight[key] = value
+
+    perceptron = Perceptron(weight)
+
+    # Test prediction
+    correct = 0.
+    for label, features in documents:
+        prediction, scores = perceptron.predict(features)
+        if prediction == label:
+            correct += 1.
+
+    # Output accuracy
+    print correct / len(documents)
+
diff --git a/python/mapred/perceptron/reduce.py b/python/mapred/perceptron/reduce.py
@@ -0,0 +1,41 @@
+#!/usr/bin/env python
+from sys import stdin
+from optparse import OptionParser
+from collections import defaultdict
+
+def reduce(label, values):
+    weight = defaultdict(float)
+
+    # summation
+    for value in values:
+        for feature in value.split(" "):
+            pair = feature.split(":")
+            if len(pair) != 2: continue
+            k, v = pair
+            weight[k] += float(v)
+
+    # averaging
+    for k in weight.iterkeys():
+        weight[k] /= len(values)
+
+    weight_str = " ".join(k + ":" + str(v) for k, v in weight.iteritems())
+    print "%s %s" % (label, weight_str)
+
+if __name__ == "__main__":
+    current = None
+    values = []
+
+    for line in stdin:
+        splited = line.strip().split(" ", 1)
+        if len(splited) != 2:
+            continue
+        key, value = splited
+
+        if current != None and current != key:
+            reduce(current, values)
+            values = []
+
+        current = key
+        values += [value]
+
+    reduce(current, values)
diff --git a/python/mapred/perceptron/stream.sh b/python/mapred/perceptron/stream.sh
@@ -0,0 +1,30 @@
+#!/bin/bash
+# Run ten rounds of perceptron training as Hadoop streaming jobs.
+#
+# Usage: stream.sh input_path output_path
+# Round i writes to output_path/weight<i>. Rounds 2-10 ship the previous
+# round's part-00000 to the mappers and pass it to perceptron.py via -w.
+#
+# HDFS_USER_DIR overrides the HDFS directory that output_path is
+# resolved against when fetching the previous weights
+# (default: hdfs:///user/yookuno).
+if [ "$#" -ne 2 ]
+then
+    echo "stream.sh input_path output_path" >&2
+    # A usage error must not look like success to the caller.
+    exit 1
+fi
+
+hdfs_user_dir="${HDFS_USER_DIR:-hdfs:///user/yookuno}"
+
+# First iteration
+echo "Iteration 1"
+hadoop jar \
+    "$HADOOP_HOME/hadoop-streaming.jar" \
+    -files perceptron.py,reduce.py \
+    -input "$1" \
+    -output "$2/weight1" \
+    -mapper ./perceptron.py \
+    -reducer ./reduce.py || exit 1
+
+# Run iteration
+for i in $(seq 2 10)
+do
+    echo "Iteration $i"
+    # Stop at the first failed job: the next round needs this round's
+    # part-00000 as its starting weights.
+    hadoop jar \
+        "$HADOOP_HOME/hadoop-streaming.jar" \
+        -files "perceptron.py,reduce.py,$hdfs_user_dir/$2/weight$((i - 1))/part-00000" \
+        -input "$1" \
+        -output "$2/weight$i" \
+        -mapper "./perceptron.py -w part-00000" \
+        -reducer ./reduce.py || exit 1
+done
+