From 43a4060d8fccc22c65295d00f32a8458ff19391f Mon Sep 17 00:00:00 2001
From: roengram
Date: Mon, 4 Aug 2014 15:38:56 +0900
Subject: [PATCH 1/3] Use HDFS instead of local dir

---
 examples/bin/factorize-movielens-1M.sh | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/examples/bin/factorize-movielens-1M.sh b/examples/bin/factorize-movielens-1M.sh
index 8c6aa68053..dd9c277306 100644
--- a/examples/bin/factorize-movielens-1M.sh
+++ b/examples/bin/factorize-movielens-1M.sh
@@ -46,32 +46,40 @@ mkdir -p ${WORK_DIR}/movielens
 
 echo "Converting ratings..."
 cat $1 |sed -e s/::/,/g| cut -d, -f1,2,3 > ${WORK_DIR}/movielens/ratings.csv
+hadoop dfs -rm -r ${WORK_DIR}/movielens
+hadoop dfs -mkdir -p ${WORK_DIR}/movielens
+hadoop dfs -copyFromLocal ${WORK_DIR}/movielens/ratings.csv ${WORK_DIR}/movielens/ratings.csv
 
 # create a 90% percent training set and a 10% probe set
+hadoop dfs -rm -r ${WORK_DIR}/dataset
 $MAHOUT splitDataset --input ${WORK_DIR}/movielens/ratings.csv --output ${WORK_DIR}/dataset \
   --trainingPercentage 0.9 --probePercentage 0.1 --tempDir ${WORK_DIR}/dataset/tmp
 
 # run distributed ALS-WR to factorize the rating matrix defined by the training set
+hadoop dfs -rm -r ${WORK_DIR}/als
 $MAHOUT parallelALS --input ${WORK_DIR}/dataset/trainingSet/ --output ${WORK_DIR}/als/out \
   --tempDir ${WORK_DIR}/als/tmp --numFeatures 20 --numIterations 10 --lambda 0.065 --numThreadsPerSolver 2
 
 # compute predictions against the probe set, measure the error
+hadoop dfs -rm -r ${WORK_DIR}/als/rmse
 $MAHOUT evaluateFactorization --input ${WORK_DIR}/dataset/probeSet/ --output ${WORK_DIR}/als/rmse/ \
   --userFeatures ${WORK_DIR}/als/out/U/ --itemFeatures ${WORK_DIR}/als/out/M/ --tempDir ${WORK_DIR}/als/tmp
 
 # compute recommendations
+hadoop dfs -rm -r ${WORK_DIR}/recommendations
 $MAHOUT recommendfactorized --input ${WORK_DIR}/als/out/userRatings/ --output ${WORK_DIR}/recommendations/ \
   --userFeatures ${WORK_DIR}/als/out/U/ --itemFeatures ${WORK_DIR}/als/out/M/ \
   --numRecommendations 6 --maxRating 5 --numThreads 2
 
 
 # print the error
 echo -e "\nRMSE is:\n"
-cat ${WORK_DIR}/als/rmse/rmse.txt
+hadoop dfs -cat ${WORK_DIR}/als/rmse/rmse.txt
 echo -e "\n"
 echo -e "\nSample recommendations:\n"
-shuf ${WORK_DIR}/recommendations/part-m-00000 |head
+hadoop dfs -cat ${WORK_DIR}/recommendations/part-m-00000 |shuf |head
 echo -e "\n\n"
 
 echo "removing work directory"
-rm -rf ${WORK_DIR}
\ No newline at end of file
+rm -rf ${WORK_DIR}
+hadoop dfs -rm -r ${WORK_DIR}

From b5a225aa66a4b58590e6beb1967c3cc7903817f0 Mon Sep 17 00:00:00 2001
From: JAEHOON Ko
Date: Mon, 4 Aug 2014 22:30:25 -0700
Subject: [PATCH 2/3] add MAHOUT_LOCAL flag

---
 examples/bin/factorize-movielens-1M.sh | 37 +++++++++++++++++++++++++------------
 1 file changed, 25 insertions(+), 12 deletions(-)

diff --git a/examples/bin/factorize-movielens-1M.sh b/examples/bin/factorize-movielens-1M.sh
index dd9c277306..1b821d1358 100644
--- a/examples/bin/factorize-movielens-1M.sh
+++ b/examples/bin/factorize-movielens-1M.sh
@@ -39,47 +39,60 @@ then
 fi
 
 MAHOUT="../../bin/mahout"
-
+MAHOUT_LOCAL=1
 WORK_DIR=/tmp/mahout-work-${USER}
+HDFS_WORK_DIR=/tmp/mahout-work-${USER}
+
 echo "creating work directory at ${WORK_DIR}"
 mkdir -p ${WORK_DIR}/movielens
 
 echo "Converting ratings..."
 cat $1 |sed -e s/::/,/g| cut -d, -f1,2,3 > ${WORK_DIR}/movielens/ratings.csv
-hadoop dfs -rm -r ${WORK_DIR}/movielens
-hadoop dfs -mkdir -p ${WORK_DIR}/movielens
-hadoop dfs -copyFromLocal ${WORK_DIR}/movielens/ratings.csv ${WORK_DIR}/movielens/ratings.csv
+
+if [ $MAHOUT_LOCAL -eq 1 ]
+then
+  CAT='cat'
+else
+  hadoop dfs -rm -r ${HDFS_WORK_DIR}
+  echo "creating hdfs work directory at ${HDFS_WORK_DIR}"
+  hadoop dfs -mkdir -p ${HDFS_WORK_DIR}/movielens
+  hadoop dfs -copyFromLocal ${WORK_DIR}/movielens/ratings.csv ${HDFS_WORK_DIR}/movielens/ratings.csv
+  rm -rf ${WORK_DIR}
+  WORK_DIR=$HDFS_WORK_DIR
+  CAT='hadoop dfs -cat'
+fi
+
 
 # create a 90% percent training set and a 10% probe set
-hadoop dfs -rm -r ${WORK_DIR}/dataset
 $MAHOUT splitDataset --input ${WORK_DIR}/movielens/ratings.csv --output ${WORK_DIR}/dataset \
   --trainingPercentage 0.9 --probePercentage 0.1 --tempDir ${WORK_DIR}/dataset/tmp
 
 # run distributed ALS-WR to factorize the rating matrix defined by the training set
-hadoop dfs -rm -r ${WORK_DIR}/als
 $MAHOUT parallelALS --input ${WORK_DIR}/dataset/trainingSet/ --output ${WORK_DIR}/als/out \
   --tempDir ${WORK_DIR}/als/tmp --numFeatures 20 --numIterations 10 --lambda 0.065 --numThreadsPerSolver 2
 
 # compute predictions against the probe set, measure the error
-hadoop dfs -rm -r ${WORK_DIR}/als/rmse
 $MAHOUT evaluateFactorization --input ${WORK_DIR}/dataset/probeSet/ --output ${WORK_DIR}/als/rmse/ \
   --userFeatures ${WORK_DIR}/als/out/U/ --itemFeatures ${WORK_DIR}/als/out/M/ --tempDir ${WORK_DIR}/als/tmp
 
 # compute recommendations
-hadoop dfs -rm -r ${WORK_DIR}/recommendations
 $MAHOUT recommendfactorized --input ${WORK_DIR}/als/out/userRatings/ --output ${WORK_DIR}/recommendations/ \
   --userFeatures ${WORK_DIR}/als/out/U/ --itemFeatures ${WORK_DIR}/als/out/M/ \
   --numRecommendations 6 --maxRating 5 --numThreads 2
 
 
 # print the error
 echo -e "\nRMSE is:\n"
-hadoop dfs -cat ${WORK_DIR}/als/rmse/rmse.txt
+$CAT ${WORK_DIR}/als/rmse/rmse.txt
 echo -e "\n"
 echo -e "\nSample recommendations:\n"
-hadoop dfs -cat ${WORK_DIR}/recommendations/part-m-00000 |shuf |head
+$CAT ${WORK_DIR}/recommendations/part-m-00000 |shuf |head
 echo -e "\n\n"
 
 echo "removing work directory"
-rm -rf ${WORK_DIR}
-hadoop dfs -rm -r ${WORK_DIR}
+if [ $MAHOUT_LOCAL -eq 1 ]
+then
+  rm -rf ${WORK_DIR}
+else
+  hadoop dfs -rm -r ${WORK_DIR}
+fi

From 4a66b31da8287bcb9bf2df7538bd2fd2ecaeba50 Mon Sep 17 00:00:00 2001
From: JAEHOON Ko
Date: Fri, 8 Aug 2014 01:14:03 -0700
Subject: [PATCH 3/3] Check existence of MAHOUT_LOCAL env var, not its value

---
 examples/bin/factorize-movielens-1M.sh | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/examples/bin/factorize-movielens-1M.sh b/examples/bin/factorize-movielens-1M.sh
index 1b821d1358..81479e1ae1 100644
--- a/examples/bin/factorize-movielens-1M.sh
+++ b/examples/bin/factorize-movielens-1M.sh
@@ -39,7 +39,6 @@ then
 fi
 
 MAHOUT="../../bin/mahout"
-MAHOUT_LOCAL=1
 WORK_DIR=/tmp/mahout-work-${USER}
 HDFS_WORK_DIR=/tmp/mahout-work-${USER}
 
@@ -49,10 +48,8 @@ mkdir -p ${WORK_DIR}/movielens
 echo "Converting ratings..."
 cat $1 |sed -e s/::/,/g| cut -d, -f1,2,3 > ${WORK_DIR}/movielens/ratings.csv
 
-if [ $MAHOUT_LOCAL -eq 1 ]
+if [ "$MAHOUT_LOCAL" == "" ]
 then
-  CAT='cat'
-else
   hadoop dfs -rm -r ${HDFS_WORK_DIR}
   echo "creating hdfs work directory at ${HDFS_WORK_DIR}"
   hadoop dfs -mkdir -p ${HDFS_WORK_DIR}/movielens
@@ -60,6 +57,8 @@ else
   rm -rf ${WORK_DIR}
   WORK_DIR=$HDFS_WORK_DIR
   CAT='hadoop dfs -cat'
+else
+  CAT='cat'
 fi
 
 
@@ -90,9 +89,9 @@ $CAT ${WORK_DIR}/recommendations/part-m-00000 |shuf |head
 echo -e "\n\n"
 
 echo "removing work directory"
-if [ $MAHOUT_LOCAL -eq 1 ]
+if [ "$MAHOUT_LOCAL" == "" ]
 then
-  rm -rf ${WORK_DIR}
-else
   hadoop dfs -rm -r ${WORK_DIR}
+else
+  rm -rf ${WORK_DIR}
 fi
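
Usage note (not part of the patches): after PATCH 3/3, the script selects local
vs. HDFS mode by whether MAHOUT_LOCAL is set at all, not by its value, which
matches the convention of bin/mahout itself (any non-empty MAHOUT_LOCAL forces
local execution). A minimal sketch of invoking the patched script both ways,
run from examples/bin; the MovieLens ratings path is illustrative only:

    # HDFS mode: with MAHOUT_LOCAL unset, ratings.csv is staged into
    # ${HDFS_WORK_DIR} and all Mahout jobs run against the cluster.
    ./factorize-movielens-1M.sh /path/to/ml-1m/ratings.dat

    # Local mode: any non-empty value of MAHOUT_LOCAL keeps the work
    # directory on the local filesystem and skips the hadoop dfs calls.
    MAHOUT_LOCAL=true ./factorize-movielens-1M.sh /path/to/ml-1m/ratings.dat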