Skip to content

Commit

Permalink
add
Browse files Browse the repository at this point in the history
  • Loading branch information
nokuno committed Sep 1, 2011
1 parent a9bc9a5 commit 33f8cfd
Show file tree
Hide file tree
Showing 5 changed files with 207 additions and 0 deletions.
11 changes: 11 additions & 0 deletions python/mapred/perceptron/local.sh
@@ -0,0 +1,11 @@
#!/bin/bash
# Run 10 local iterations of the distributed perceptron:
# each pass maps stdin with the previous round's weights and
# reduces the sorted output into the next weight file.
# NOTE(review): iteration 1 reads $2/weight0.txt, which must be
# pre-seeded (perceptron.py tolerates a missing -w only if unset).
if [ $# -ne 2 ]; then
    echo "local.sh input_path output_path" >&2
    exit 1   # non-zero so callers can detect the usage error
fi

for i in $(seq 1 10); do
    # quote paths so inputs/outputs with spaces work
    ./perceptron.py -w "$2/weight$((i - 1)).txt" < "$1" | sort | ./reduce.py > "$2/weight$i.txt"
done

89 changes: 89 additions & 0 deletions python/mapred/perceptron/perceptron.py
@@ -0,0 +1,89 @@
#!/usr/bin/env python
#encoding: utf-8

from sys import stdin
from optparse import OptionParser
from collections import defaultdict
from random import shuffle, seed


def parse(line):
    """Parse one "label key:value key:value ..." line.

    Returns (label, features) where features is a defaultdict(float),
    or None when the line has no feature section (blank/one-token line).
    Malformed feature tokens without a ':' are skipped instead of
    raising ValueError as before.
    """
    pair = line.strip().split(" ", 1)
    if len(pair) != 2:
        return None
    label, document = pair

    features = defaultdict(float)
    for feature in document.split(" "):
        # split on the first ':' only, so values may not contain one
        key, sep, value = feature.partition(":")
        if not sep:
            continue  # malformed token: no ':' -- skip, don't crash
        features[key] = float(value)

    return (label, features)

def dot(x, y):
    """Sparse dot product of two {key: float} vectors.

    Keys missing from x contribute 0; iterates y once (items() works
    on both Python 2 and 3, unlike the previous iterkeys()).
    """
    return sum(x.get(key, 0.) * value for key, value in y.items())

class Perceptron:
    """Multi-class perceptron over sparse {key: value} feature dicts.

    weight maps label -> {feature: weight}; the dict is shared with the
    caller (updates are visible outside). iteritems/iterkeys were
    replaced with items() so the class runs on both Python 2 and 3,
    and the sparse dot product is computed inline in predict().
    """

    def __init__(self, weight):
        # label -> {feature key: weight}
        self.weight = weight

    def train(self, documents, iteration, learn):
        """Run `iteration` epochs of mistake-driven updates with step `learn`.

        documents is a list of (label, features) pairs; it is shuffled
        in place once per epoch.
        """
        # Ensure every label seen in the data has a weight vector.
        for label, features in documents:
            if label not in self.weight:
                self.weight[label] = defaultdict(float)
        for _ in range(iteration):
            shuffle(documents)
            for label, features in documents:
                prediction, scores = self.predict(features)
                # On a mistake: pull toward the gold label, push away
                # from the wrongly predicted one.
                if prediction != label:
                    for key, value in features.items():
                        self.weight[label][key] += learn * value
                        self.weight[prediction][key] -= learn * value

    def predict(self, features):
        """Return (best_label, {label: score}) for one feature dict.

        best_label is None when no labels are known; ties keep the
        first-seen label (strict > comparison).
        """
        max_label = None
        scores = {}
        for label, weight in self.weight.items():
            # Sparse dot product: missing weight keys contribute 0.
            scores[label] = sum(weight.get(key, 0.) * value
                                for key, value in features.items())
            if max_label is None or scores[label] > scores[max_label]:
                max_label = label
        return (max_label, scores)

    def output(self):
        """Serialize weights, one "label key:value ..." line per label.

        Built with join instead of repeated += (avoids quadratic
        string concatenation).
        """
        lines = []
        for label, weight in self.weight.items():
            weight_str = " ".join(key + ":" + str(value)
                                  for key, value in weight.items())
            lines.append(label + " " + weight_str + "\n")
        return "".join(lines)

if __name__ == '__main__':
    parser = OptionParser()
    parser.add_option("-i", dest="iteration", type="int", default=1)
    parser.add_option("-e", dest="eta", type="float", default=0.001)
    parser.add_option("-w", dest="weight")
    (options, args) = parser.parse_args()

    # Load training documents from stdin. parse() returns None for
    # blank/malformed lines; skipping them here fixes a crash where
    # None was appended and later unpacked inside train().
    documents = []
    for line in stdin:
        parsed = parse(line)
        if parsed is None:
            continue
        documents.append(parsed)

    # Load initial weights (e.g. shipped via Hadoop Distributed Cache).
    weight = defaultdict(lambda: defaultdict(float))
    if options.weight is not None:
        with open(options.weight) as weight_file:  # close the file
            for line in weight_file:
                parsed = parse(line)
                if parsed is None:
                    continue
                key, value = parsed
                weight[key] = value

    # Train perceptron
    perceptron = Perceptron(weight)
    perceptron.train(documents, options.iteration, options.eta)

    # Output weight file without a trailing newline (replaces the
    # Python-2-only `print expr,` form).
    from sys import stdout
    stdout.write(perceptron.output())

36 changes: 36 additions & 0 deletions python/mapred/perceptron/predict.py
@@ -0,0 +1,36 @@
#!/usr/bin/env python
#encoding: utf-8

from perceptron import *

if __name__ == '__main__':
    parser = OptionParser()
    parser.add_option("-w", dest="weight")
    (options, args) = parser.parse_args()

    # Load test documents, skipping lines parse() rejects (previously a
    # None entry would crash the unpacking loop below).
    documents = []
    for line in stdin:
        parsed = parse(line)
        if parsed is None:
            continue
        documents.append(parsed)

    # Load weights from Distributed Cache
    weight = defaultdict(lambda: defaultdict(float))
    if options.weight is not None:
        with open(options.weight) as weight_file:  # close the file
            for line in weight_file:
                parsed = parse(line)
                if parsed is None:
                    continue
                key, value = parsed
                weight[key] = value

    perceptron = Perceptron(weight)

    # Test prediction
    correct = 0.
    for label, features in documents:
        prediction, scores = perceptron.predict(features)
        if prediction == label:
            correct += 1.

    # Output accuracy; report 0.0 on empty input instead of dividing
    # by zero.
    if documents:
        print(correct / len(documents))
    else:
        print(0.0)

41 changes: 41 additions & 0 deletions python/mapred/perceptron/reduce.py
@@ -0,0 +1,41 @@
#!/usr/bin/env python
from sys import stdin
from optparse import OptionParser
from collections import defaultdict

def reduce(label, values):
    """Average the feature strings in `values` and print one output line.

    values is a list of "key:value key:value ..." strings accumulated
    for `label`. Tokens that do not split into exactly key:value are
    skipped. Emits nothing for an empty values list (previously this
    printed a bogus "<label> " line on empty reducer input).

    NOTE(review): the name shadows the `reduce` builtin/functools name;
    kept because the streaming driver below calls it by this name.
    """
    if not values:
        return

    weight = defaultdict(float)

    # summation
    for value in values:
        for feature in value.split(" "):
            pair = feature.split(":")
            if len(pair) != 2:
                continue
            k, v = pair
            weight[k] += float(v)

    # averaging (plain iteration works on Python 2 and 3, unlike iterkeys)
    count = len(values)
    for k in weight:
        weight[k] /= count

    weight_str = " ".join(k + ":" + str(v) for k, v in weight.items())
    print("%s %s" % (label, weight_str))

if __name__ == "__main__":
current = None
values = []

for line in stdin:
splited = line.strip().split(" ", 1)
if len(splited) != 2:
continue
key, value = splited

if current != None and current != key:
reduce(current, values)
values = []

current = key
values += [value]

reduce(current, values)
30 changes: 30 additions & 0 deletions python/mapred/perceptron/stream.sh
@@ -0,0 +1,30 @@
#!/bin/bash
# Run 10 Hadoop Streaming iterations of the distributed perceptron.
# Each round maps with the previous round's averaged weights, which
# are shipped to the mappers via -files from HDFS.
if [ $# -ne 2 ]; then
    echo "stream.sh input_path output_path" >&2
    exit 1   # non-zero so callers can detect the usage error
fi

# First iteration: no previous weights, the mapper starts from scratch
echo "Iteration 1"
hadoop jar \
    "$HADOOP_HOME/hadoop-streaming.jar" \
    -files perceptron.py,reduce.py \
    -input "$1" \
    -output "$2/weight1" \
    -mapper ./perceptron.py \
    -reducer ./reduce.py

# Run iteration 2..10, distributing the previous part-00000 weights.
# The HDFS user directory is derived from $USER instead of being
# hard-coded to one account.
for i in $(seq 2 10); do
    echo "Iteration $i"
    hadoop jar \
        "$HADOOP_HOME/hadoop-streaming.jar" \
        -files "perceptron.py,reduce.py,hdfs:///user/${USER}/$2/weight$((i - 1))/part-00000" \
        -input "$1" \
        -output "$2/weight$i" \
        -mapper "./perceptron.py -w part-00000" \
        -reducer ./reduce.py
done

0 comments on commit 33f8cfd

Please sign in to comment.