From f2875ac5890564213d5f055d710976d1fede3962 Mon Sep 17 00:00:00 2001 From: p4nna Date: Mon, 27 Mar 2017 11:47:39 +0200 Subject: [PATCH 01/12] Add files via upload --- .../flink-ml/src/main/java/Imputer.java | 374 ++++++++++++++++++ .../flink-ml/src/main/java/Strategy.java | 5 + 2 files changed, 379 insertions(+) create mode 100644 flink-libraries/flink-ml/src/main/java/Imputer.java create mode 100644 flink-libraries/flink-ml/src/main/java/Strategy.java diff --git a/flink-libraries/flink-ml/src/main/java/Imputer.java b/flink-libraries/flink-ml/src/main/java/Imputer.java new file mode 100644 index 0000000000000..69e4246e01b46 --- /dev/null +++ b/flink-libraries/flink-ml/src/main/java/Imputer.java @@ -0,0 +1,374 @@ +package Imputer; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +//import java.util.Set; +//import java.util.SortedSet; +import java.util.Set; +import java.util.concurrent.ConcurrentHashMap; + +import org.apache.commons.collections.ListUtils; +import org.apache.flink.api.common.functions.FlatMapFunction; +import org.apache.flink.api.common.functions.GroupReduceFunction; +import org.apache.flink.api.common.functions.MapFunction; +import org.apache.flink.api.common.functions.ReduceFunction; +import org.apache.flink.api.java.DataSet; +import org.apache.flink.api.java.ExecutionEnvironment; +import org.apache.flink.api.java.operators.AggregateOperator; +import org.apache.flink.api.java.operators.MapOperator; +import org.apache.flink.api.java.tuple.Tuple2; +import org.apache.flink.api.java.tuple.Tuple3; +import org.apache.flink.ml.math.DenseVector; +import org.apache.flink.shaded.com.google.common.collect.Lists; +import org.apache.flink.util.Collector; +import org.apache.hadoop.mapreduce.Reducer; + +import scala.collection.mutable.LinkedList; +import scala.reflect.internal.Trees.New; + +public class Imputer { + + + static DenseVector testvec1= new DenseVector(new double[]{Double.NaN,3.0,1.0, 3.0}); + static DenseVector testvec2= new DenseVector(new double[]{1.0,7.0,Double.NaN, 1.0}); + static DenseVector testvec3= new DenseVector(new double[]{0.0,5.0,Double.NaN, 2.0}); + static DenseVector testvec4= new DenseVector(new double[]{6.5,Double.NaN,0.5, 0.5}); + static DenseVector testvec5= new DenseVector(new double[]{6.5,1.0,0.5, 0.5}); + + static ExecutionEnvironment env= ExecutionEnvironment.getExecutionEnvironment(); + + static DataSet ds = env.fromElements(testvec1, testvec2, testvec3, testvec4, testvec5); +// static DataSet ds = env.fromElements( testvec2, testvec3); + private static double[] meansA; + private static double[] medians; + private static double[] mostValues; +// final static ConcurrentHashMap meansHM= new ConcurrentHashMap<>(); + + + public static void main(String[] args){ + try { +// DataSet dsMean = impute(ds, Strategy.MEAN, 1); +// System.out.println("data set mean "); +// dsMean.print(); +// + DataSet dsMedian = impute(ds, Strategy.MEDIAN, 1); + System.out.println("data set median "); + dsMedian.print(); +// +// DataSet dsMost = impute(ds, Strategy.MOST_FREQUENT, 1); +// System.out.println("data set most frequent "); +// dsMost.print(); +// +// DataSet dsMean0 = impute(ds, Strategy.MEAN, 0); +// System.out.println("data set mean "); +// dsMean0.print(); +// +// DataSet dsMedian0 = impute(ds, Strategy.MEDIAN, 0); +// System.out.println("data set median "); +// dsMedian0.print(); +// +// DataSet dsMax0 = impute(ds, Strategy.MOST_FREQUENT, 0); +// 
System.out.println("data set max "); +// dsMax0.print(); + + } catch (Exception e) { + System.out.println("here happened an exception"); + e.printStackTrace(); + } + } + + /** + * + * @param sparseData + * @param strategy use MEAN, MEDIAN or the MOST_FREQUENT value to impute missing values + * @param axis 0: impute along columns, 1: imput along rows + * @return dataset without zeroes / missing values + * @throws Exception + */ + public static DataSet impute(DataSet sparseData, Strategy strategy, int axis) throws Exception{ + double val; + DataSet ret = sparseData; + if(axis==0){ //columnwise + switch (strategy){ + case MEAN: + ret=sparseData.map(new MapFunction() { + @Override + public DenseVector map(DenseVector vec) throws Exception { + for(int i = 0; i() { + @Override + public DenseVector map(DenseVector vec) throws Exception { + for(int i = 0; i() { + @Override + public DenseVector map(DenseVector vec) throws Exception { + for(int i = 0; i() { + double v; + @Override + public DenseVector map(DenseVector vec) { + for(int i = 0; i() { + @Override + public DenseVector map(DenseVector vec) throws Exception { + for(int i = 0; i() { + @Override + public DenseVector map(DenseVector vec) throws Exception { + for(int i = 0; i numArray = new ArrayList<>(); + double val; + for(int i =0; i< vec.size(); i++){ + val=vec.apply(i); + if(Double.compare(Double.NaN, val)!=0){ + numArray.add(val); + } + } + Collections.sort(numArray); + int middle = numArray.size() / 2; + if(numArray.size() % 2 == 0){ + double medianA = numArray.get(middle); + double medianB = numArray.get(middle-1); + ret = (medianA + medianB) / 2d; + } else{ + ret = numArray.get(middle); + } + return ret; + } + public static double getValueMOST(DenseVector vec){ + double ret=0; + HashMap frequencies= new HashMap<>(); + for(int i =0; imax){ + max=frequencies.get(key); + maxKey=key; + } + } + ret=maxKey; + return ret; + + } + + public static int numOfElementsNotZero(DenseVector vec){ + int zeros=0; + for(int i=0; i ds, Strategy strategy) throws Exception{ + //(entry, 1, dimension) + DataSet> nonZeroTuples= ds.flatMap(new FlatMapFunction>() { + @Override + public void flatMap(DenseVector vec, Collector> col) throws Exception { + double[] entries= vec.data(); + double entry; + for(int i = 0; i (entry, 1, i) ); + } + } + } + }); + + List>> lists; + DataSet>> nonZeros2= nonZeroTuples.map(new MapFunction, Tuple2> >() { + @Override + public Tuple2> map(Tuple3 t) throws Exception { + return new Tuple2>(t.f2, Lists.newArrayList(t.f0)); + } + }); + + lists= nonZeros2.groupBy(0).reduce(new ReduceFunction>>() { + List ret= new java.util.LinkedList<>(); + @Override + public Tuple2> reduce(Tuple2> t1, + Tuple2> t2) throws Exception { + ret=ListUtils.union(t1.f1,t2.f1); + return new Tuple2>(t1.f0, ret); + } + }).collect(); + + switch(strategy){ + case MEAN: + DataSet> infos= nonZeroTuples.groupBy(2).sum(0).andSum(1); + meansA= new double[lists.size()]; + final String s = "hello"; + List> means= infos.map(new MapFunction, Tuple2>() { + @Override + public Tuple2 map(Tuple3 t) throws Exception { + double mean= (double) t.f0/(double) t.f1; + return new Tuple2(t.f2, mean); + } + }).collect(); + for(Tuple2 t: means){ + meansA[t.f0]=t.f1; + } + break; + case MEDIAN: + double median; + int size; + //lists contains a list for every dimension in which all the values of the data set are written. + // we will later sort every of those lists in order to determine the median. 
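+                // Worked example of the rule applied below (values illustrative only): the sorted list
+                // [0.5, 1.0, 3.0, 7.0] has even length, so its median is the mean of the two middle
+                // elements, (1.0 + 3.0) / 2 = 2.0; the sorted list [1.0, 3.0, 5.0] has odd length, so its
+                // median is the middle element 3.0. The loop below applies this rule to each per-dimension
+                // value list and stores the result in medians[dimension].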
+ + medians= new double[lists.size()]; + List l; + for(Tuple2> t: lists){ + l= t.f1; + Collections.sort(l); + System.out.println("list " + t.f0 + " ist "+ l.toString()); + size=l.size(); + if(size%2==0){ + median=(l.get(size/2) + l.get(size/2-1))/2d; + }else{ + median=l.get(size/2); + } + medians[t.f0]=median; + } + for(int i = 0; i frequencies= new HashMap<>(); + mostValues= new double[lists.size()]; + for(Tuple2> t: lists){ + int max=0; + double maxKey=0; + // calculate frequencie hashmap for each dimension + List list=t.f1; + frequencies.clear(); + for(int j=0; jmax){ + max=frequencies.get(k); + maxKey=k; + } + } + mostValues[t.f0]=maxKey; + + } + break; + } + } + + +} diff --git a/flink-libraries/flink-ml/src/main/java/Strategy.java b/flink-libraries/flink-ml/src/main/java/Strategy.java new file mode 100644 index 0000000000000..e29d389a0cbd6 --- /dev/null +++ b/flink-libraries/flink-ml/src/main/java/Strategy.java @@ -0,0 +1,5 @@ +package Imputer; + +public enum Strategy { +MEAN, MEDIAN, MOST_FREQUENT; +} From 8e6909b52dad34d6c4cd6c84618616ac50cd83d1 Mon Sep 17 00:00:00 2001 From: p4nna Date: Mon, 27 Mar 2017 11:49:59 +0200 Subject: [PATCH 02/12] Test for Imputer class Two testclasses which test the functions implemented in the new imputer class. One for the rowwise imputing over all vectors and one for the vectorwise imputing --- .../src/test/Imputer/columnwiseTest.java | 120 ++++++++++++++++ .../src/test/Imputer/rowwiseTest.java | 134 ++++++++++++++++++ 2 files changed, 254 insertions(+) create mode 100644 flink-libraries/flink-ml/src/test/Imputer/columnwiseTest.java create mode 100644 flink-libraries/flink-ml/src/test/Imputer/rowwiseTest.java diff --git a/flink-libraries/flink-ml/src/test/Imputer/columnwiseTest.java b/flink-libraries/flink-ml/src/test/Imputer/columnwiseTest.java new file mode 100644 index 0000000000000..71fa77418ee5c --- /dev/null +++ b/flink-libraries/flink-ml/src/test/Imputer/columnwiseTest.java @@ -0,0 +1,120 @@ +package Imputer; + +import static org.junit.Assert.*; + +import org.apache.flink.api.java.DataSet; +import org.apache.flink.api.java.ExecutionEnvironment; +import org.apache.flink.ml.math.DenseVector; +import org.junit.Test; + +public class columnwiseTest { + + static DenseVector testvec1= new DenseVector(new double[]{Double.NaN,3.0,1.0, 3.0}); + static DenseVector testvec2= new DenseVector(new double[]{1.0,7.0,Double.NaN, 1.0}); + static DenseVector testvec3= new DenseVector(new double[]{0.0,5.0,Double.NaN, 2.0}); + static DenseVector testvec4= new DenseVector(new double[]{6.5,Double.NaN,0.5, 0.5}); + static DenseVector testvec5= new DenseVector(new double[]{6.5,1.0,0.5, 0.5}); + + static ExecutionEnvironment env= ExecutionEnvironment.getExecutionEnvironment(); + static DataSet ds = env.fromElements( testvec3, testvec4); + + + @Test + public void testMEAN() throws Exception { + DataSet dsMean = ds; + +// DenseVector testvec1e= new DenseVector(new double[]{4.0,3.0,1.0, 3.0}); +// DenseVector testvec2e= new DenseVector(new double[]{1.0,7.0,6.0, 1.0}); + DenseVector testvec3e= new DenseVector(new double[]{0.0,5.0,(5.0+2.0)/3, 2.0}); + DenseVector testvec4e= new DenseVector(new double[]{6.5,2.5,0.5, 0.5}); +// DenseVector testvec5e= new DenseVector(new double[]{6.5,1.0,0.5, 0.5}); + + + DataSet dsExpected = env.fromElements( testvec3e, testvec4e); + + try { + dsMean = Imputer.impute(ds, Strategy.MEAN, 0); + } catch (Exception e) { + fail("MEAN could not be calculated"); + + } + boolean fails=false; + for(DenseVector v: dsExpected.collect()){ + 
if(!dsMean.collect().contains(v)){ + fails=true; + }; + } + + if(fails){ + fail("MEAN could not be calculated columnwise"); + System.out.println("dsmean: "); dsMean.print(); + System.out.println("dsexpected: " );dsExpected.print(); + } + } + + + + @Test + public void testMEDIAN() throws Exception { + DataSet dsMedian = ds; + + DenseVector testvec3e= new DenseVector(new double[]{0.0,5.0,2.0, 2.0}); + DenseVector testvec4e= new DenseVector(new double[]{6.5,0.5,0.5, 0.5}); + + + DataSet dsExpected = env.fromElements( testvec3e, testvec4e); + + try { + dsMedian = Imputer.impute(ds, Strategy.MEDIAN, 0); + } catch (Exception e) { + fail("MEDIAN could not be calculated"); + + } + boolean fails=false; + for(DenseVector v: dsExpected.collect()){ + if(!dsMedian.collect().contains(v)){ + fails=true; + }; + } + + if(fails){ + fail("MEDIAN could not be calculated columnwise"); + System.out.println("dsmedian: "); dsMedian.print(); + System.out.println("dsexpected: " );dsExpected.print(); + } + } + + + @Test + public void testMOSTFREQUENT() throws Exception { + DataSet dsMost = env.fromElements( testvec2, testvec1, testvec4);; + + DenseVector testvec2e= new DenseVector(new double[]{1.0,7.0,1.0, 1.0}); + DenseVector testvec1e= new DenseVector(new double[]{3.0,3.0,1.0, 3.0}); + DenseVector testvec4e= new DenseVector(new double[]{6.5,0.5,0.5, 0.5}); + + + DataSet dsExpected = env.fromElements( testvec2e, testvec1e, testvec4e); + + try { + dsMost = Imputer.impute(dsMost, Strategy.MOST_FREQUENT, 0); + } catch (Exception e) { + fail("MOSTFREQUENT could not be calculated"); + + } + boolean fails=false; + for(DenseVector v: dsExpected.collect()){ + if(!dsMost.collect().contains(v)){ + fails=true; + }; + } + + if(fails){ + System.out.println("dsmostfrequent: "); dsMost.print(); + System.out.println("dsexpected: " );dsExpected.print(); + fail("MOSTFREQUENT could not be calculated columnwise"); + } + } + + +} diff --git a/flink-libraries/flink-ml/src/test/Imputer/rowwiseTest.java b/flink-libraries/flink-ml/src/test/Imputer/rowwiseTest.java new file mode 100644 index 0000000000000..0d98e23aab0e3 --- /dev/null +++ b/flink-libraries/flink-ml/src/test/Imputer/rowwiseTest.java @@ -0,0 +1,134 @@ +package Imputer; + +import static org.junit.Assert.*; + +import org.apache.flink.api.java.DataSet; +import org.apache.flink.api.java.ExecutionEnvironment; +import org.apache.flink.ml.math.DenseVector; +import org.junit.Test; + +public class rowwiseTest { + + + static DenseVector testvec1= new DenseVector(new double[]{Double.NaN,3.0,1.0, 3.0}); + static DenseVector testvec2= new DenseVector(new double[]{1.0,7.0,Double.NaN, 1.0}); + static DenseVector testvec3= new DenseVector(new double[]{0.0,5.0,Double.NaN, 2.0}); + static DenseVector testvec4= new DenseVector(new double[]{6.5,Double.NaN,0.5, 0.5}); + static DenseVector testvec5= new DenseVector(new double[]{6.5,1.0,0.5, 0.5}); + + static ExecutionEnvironment env= ExecutionEnvironment.getExecutionEnvironment(); + static DataSet ds = env.fromElements( testvec1, testvec2, testvec3, testvec4, testvec5); + + + @Test + public void testMEAN() throws Exception { + DataSet dsMean = ds; + + DenseVector testvec1e= new DenseVector(new double[]{14.0/4.0,3.0,1.0, 3.0}); + DenseVector testvec2e= new DenseVector(new double[]{1.0,7.0,2.0/3.0, 1.0}); + DenseVector testvec3e= new DenseVector(new double[]{0.0,5.0,2.0/3.0, 2.0}); + DenseVector testvec4e= new DenseVector(new double[]{6.5,4.0,0.5, 0.5}); + DenseVector testvec5e= new DenseVector(new double[]{6.5,1.0,0.5, 0.5}); + + + DataSet 
dsExpected = env.fromElements(testvec1e, testvec2e, testvec3e, testvec4e, testvec5e); + + try { + dsMean = Imputer.impute(ds, Strategy.MEAN, 1); + } catch (Exception e) { + fail("MEAN could not be calculated"); + + } + boolean fails=false; + for(DenseVector v: dsExpected.collect()){ + if(!dsMean.collect().contains(v)){ + fails=true; + }; + } + + if(fails){ + fail("MEAN could not be calculated rowwise"); + System.out.println("dsmean: "); dsMean.print(); + System.out.println("dsexpected: " );dsExpected.print(); + } + } + + + + @Test + public void testMEDIAN() throws Exception { + DataSet dsMedian = ds; + + DenseVector testvec1e= new DenseVector(new double[]{(7.5/2.0),3.0,1.0, 3.0}); + DenseVector testvec2e= new DenseVector(new double[]{1.0,7.0,0.5, 1.0}); + DenseVector testvec3e= new DenseVector(new double[]{0.0,5.0,0.5, 2.0}); + DenseVector testvec4e= new DenseVector(new double[]{6.5,4.0,0.5, 0.5}); + DenseVector testvec5e= new DenseVector(new double[]{6.5,1.0,0.5, 0.5}); + + + DataSet dsExpected = env.fromElements(testvec1e, testvec2e, testvec3e, testvec4e, testvec5e); + + try { + dsMedian = Imputer.impute(ds, Strategy.MEDIAN, 1); + } catch (Exception e) { + fail("MEDIAN could not be calculated"); + + } + boolean fails=false; + for(DenseVector v: dsExpected.collect()){ + if(!dsMedian.collect().contains(v)){ + fails=true; + }; + } + + if(fails){ + System.out.println("dsmedian: "); dsMedian.print(); + System.out.println("dsexpected: " );dsExpected.print(); + fail("MEDIAN could not be calculated rowwise"); + } + } + + + + @Test + public void testMOSTFREQUENT() throws Exception { + + DenseVector testvec1= new DenseVector(new double[]{Double.NaN,3.0,1.0, Double.NaN}); + DenseVector testvec2= new DenseVector(new double[]{1.0,7.0,Double.NaN, 1.0}); + DenseVector testvec3= new DenseVector(new double[]{0.0,5.0,Double.NaN, 2.0}); + DenseVector testvec4= new DenseVector(new double[]{6.5,2.0,0.5, 0.5}); + DenseVector testvec5= new DenseVector(new double[]{6.5,1.0,0.5, 0.5}); + DataSet dsMost = env.fromElements(testvec1, testvec2, testvec3, testvec4, testvec5);; + + + DenseVector testvec1e= new DenseVector(new double[]{6.5,3.0,1.0, 0.5}); + DenseVector testvec2e= new DenseVector(new double[]{1.0,7.0,0.5, 1.0}); + DenseVector testvec3e= new DenseVector(new double[]{0.0,5.0,0.5, 2.0}); + DenseVector testvec4e= new DenseVector(new double[]{6.5,2.0,0.5, 0.5}); + DenseVector testvec5e= new DenseVector(new double[]{6.5,1.0,0.5, 0.5}); + + + DataSet dsExpected = env.fromElements(testvec1e, testvec2e, testvec3e, testvec4e, testvec5e); + + try { + dsMost = Imputer.impute(dsMost, Strategy.MOST_FREQUENT, 1); + } catch (Exception e) { + fail("MOSTFREQUENT could not be calculated"); + + } + boolean fails=false; + for(DenseVector v: dsExpected.collect()){ + if(!dsMost.collect().contains(v)){ + fails=true; + }; + } + + if(fails){ + System.out.println("dsMost: "); dsMost.print(); + System.out.println("dsexpected: " );dsExpected.print(); + fail("MOSTFREQUENT could not be calculated rowwise"); + } + } + + +} From 0c420a84c136b330135ce180db04d899b5a6f54c Mon Sep 17 00:00:00 2001 From: p4nna Date: Mon, 27 Mar 2017 11:56:51 +0200 Subject: [PATCH 03/12] removed unused imports and methods --- .../flink-ml/src/main/java/Imputer.java | 43 ------------------- 1 file changed, 43 deletions(-) diff --git a/flink-libraries/flink-ml/src/main/java/Imputer.java b/flink-libraries/flink-ml/src/main/java/Imputer.java index 69e4246e01b46..91d288353064d 100644 --- a/flink-libraries/flink-ml/src/main/java/Imputer.java +++ 
b/flink-libraries/flink-ml/src/main/java/Imputer.java @@ -5,8 +5,6 @@ import java.util.Collections; import java.util.HashMap; import java.util.List; -//import java.util.Set; -//import java.util.SortedSet; import java.util.Set; import java.util.concurrent.ConcurrentHashMap; @@ -32,54 +30,13 @@ public class Imputer { - static DenseVector testvec1= new DenseVector(new double[]{Double.NaN,3.0,1.0, 3.0}); - static DenseVector testvec2= new DenseVector(new double[]{1.0,7.0,Double.NaN, 1.0}); - static DenseVector testvec3= new DenseVector(new double[]{0.0,5.0,Double.NaN, 2.0}); - static DenseVector testvec4= new DenseVector(new double[]{6.5,Double.NaN,0.5, 0.5}); - static DenseVector testvec5= new DenseVector(new double[]{6.5,1.0,0.5, 0.5}); - static ExecutionEnvironment env= ExecutionEnvironment.getExecutionEnvironment(); - static DataSet ds = env.fromElements(testvec1, testvec2, testvec3, testvec4, testvec5); -// static DataSet ds = env.fromElements( testvec2, testvec3); private static double[] meansA; private static double[] medians; private static double[] mostValues; -// final static ConcurrentHashMap meansHM= new ConcurrentHashMap<>(); - public static void main(String[] args){ - try { -// DataSet dsMean = impute(ds, Strategy.MEAN, 1); -// System.out.println("data set mean "); -// dsMean.print(); -// - DataSet dsMedian = impute(ds, Strategy.MEDIAN, 1); - System.out.println("data set median "); - dsMedian.print(); -// -// DataSet dsMost = impute(ds, Strategy.MOST_FREQUENT, 1); -// System.out.println("data set most frequent "); -// dsMost.print(); -// -// DataSet dsMean0 = impute(ds, Strategy.MEAN, 0); -// System.out.println("data set mean "); -// dsMean0.print(); -// -// DataSet dsMedian0 = impute(ds, Strategy.MEDIAN, 0); -// System.out.println("data set median "); -// dsMedian0.print(); -// -// DataSet dsMax0 = impute(ds, Strategy.MOST_FREQUENT, 0); -// System.out.println("data set max "); -// dsMax0.print(); - - } catch (Exception e) { - System.out.println("here happened an exception"); - e.printStackTrace(); - } - } - /** * * @param sparseData From 9136607e84a0297bb4fb24a53bad9950b86bf116 Mon Sep 17 00:00:00 2001 From: p4nna Date: Mon, 27 Mar 2017 17:58:37 +0200 Subject: [PATCH 04/12] Imputer was added adds missing values in sparse DataSets of Vectors --- .../scala/org/apache/flink/ml/MLUtils.scala | 126 ++ .../apache/flink/ml/classification/SVM.scala | 552 +++++++++ .../org/apache/flink/ml/common/Block.scala | 29 + .../apache/flink/ml/common/FlinkMLTools.scala | 424 +++++++ .../flink/ml/common/LabeledVector.scala | 42 + .../apache/flink/ml/common/ParameterMap.scala | 121 ++ .../apache/flink/ml/common/WeightVector.scala | 32 + .../flink/ml/common/WithParameters.scala | 26 + .../scala/org/apache/flink/ml/math/BLAS.scala | 291 +++++ .../org/apache/flink/ml/math/Breeze.scala | 88 ++ .../flink/ml/math/BreezeVectorConverter.scala | 34 + .../apache/flink/ml/math/DenseMatrix.scala | 191 +++ .../apache/flink/ml/math/DenseVector.scala | 187 +++ .../org/apache/flink/ml/math/Matrix.scala | 69 ++ .../apache/flink/ml/math/SparseMatrix.scala | 267 +++++ .../apache/flink/ml/math/SparseVector.scala | 283 +++++ .../org/apache/flink/ml/math/Vector.scala | 103 ++ .../apache/flink/ml/math/VectorBuilder.scala | 57 + .../math/distributed/DistributedMatrix.scala | 39 + .../distributed/DistributedRowMatrix.scala | 172 +++ .../org/apache/flink/ml/math/package.scala | 110 ++ .../distances/ChebyshevDistanceMetric.scala | 37 + .../distances/CosineDistanceMetric.scala | 45 + 
.../ml/metrics/distances/DistanceMetric.scala | 37 + .../distances/EuclideanDistanceMetric.scala | 41 + .../distances/ManhattanDistanceMetric.scala | 37 + .../distances/MinkowskiDistanceMetric.scala | 41 + .../SquaredEuclideanDistanceMetric.scala | 37 + .../distances/TanimotoDistanceMetric.scala | 40 + .../scala/org/apache/flink/ml/nn/KNN.scala | 359 ++++++ .../org/apache/flink/ml/nn/QuadTree.scala | 323 +++++ .../ml/optimization/GradientDescent.scala | 297 +++++ .../flink/ml/optimization/LossFunction.scala | 96 ++ .../ml/optimization/PartialLossFunction.scala | 154 +++ .../ml/optimization/PredictionFunction.scala | 40 + .../optimization/RegularizationPenalty.scala | 219 ++++ .../apache/flink/ml/optimization/Solver.scala | 233 ++++ .../outlier/StochasticOutlierSelection.scala | 383 ++++++ .../scala/org/apache/flink/ml/package.scala | 119 ++ .../flink/ml/pipeline/ChainedPredictor.scala | 139 +++ .../ml/pipeline/ChainedTransformer.scala | 110 ++ .../apache/flink/ml/pipeline/Estimator.scala | 181 +++ .../apache/flink/ml/pipeline/Predictor.scala | 258 ++++ .../flink/ml/pipeline/Transformer.scala | 164 +++ .../flink/ml/preprocessing/MinMaxScaler.scala | 265 +++++ .../ml/preprocessing/PolynomialFeatures.scala | 209 ++++ .../flink/ml/preprocessing/Splitter.scala | 210 ++++ .../ml/preprocessing/StandardScaler.scala | 302 +++++ .../apache/flink/ml/recommendation/ALS.scala | 1060 +++++++++++++++++ .../regression/MultipleLinearRegression.scala | 234 ++++ src/test/resources/log4j-test.properties | 38 + src/test/resources/logback-test.xml | 42 + .../org/apache/flink/ml/MLUtilsSuite.scala | 108 ++ .../ml/classification/Classification.scala | 133 +++ .../flink/ml/classification/SVMITSuite.scala | 104 ++ .../flink/ml/common/FlinkMLToolsSuite.scala | 60 + .../flink/ml/math/BreezeMathSuite.scala | 98 ++ .../flink/ml/math/DenseMatrixSuite.scala | 86 ++ .../flink/ml/math/DenseVectorSuite.scala | 176 +++ .../flink/ml/math/SparseMatrixSuite.scala | 134 +++ .../flink/ml/math/SparseVectorSuite.scala | 227 ++++ .../DistributedRowMatrixSuite.scala | 104 ++ .../distances/DistanceMetricSuite.scala | 95 ++ .../org/apache/flink/ml/nn/KNNITSuite.scala | 117 ++ .../apache/flink/ml/nn/QuadTreeSuite.scala | 93 ++ .../optimization/GradientDescentITSuite.scala | 278 +++++ .../ml/optimization/LossFunctionTest.scala | 102 ++ .../PredictionFunctionITSuite.scala | 62 + .../RegularizationPenaltyTest.scala | 64 + .../StochasticOutlierSelectionITSuite.scala | 240 ++++ .../flink/ml/pipeline/PipelineITSuite.scala | 211 ++++ .../preprocessing/MinMaxScalerITSuite.scala | 243 ++++ .../PolynomialFeaturesITSuite.scala | 124 ++ .../ml/preprocessing/SplitterITSuite.scala | 97 ++ .../preprocessing/StandardScalerITSuite.scala | 166 +++ .../flink/ml/recommendation/ALSITSuite.scala | 116 ++ .../ml/recommendation/Recommendation.scala | 153 +++ .../MultipleLinearRegressionITSuite.scala | 159 +++ .../flink/ml/regression/RegressionData.scala | 206 ++++ .../apache/flink/ml/util/FlinkTestBase.scala | 75 ++ 80 files changed, 12824 insertions(+) create mode 100644 src/main/scala/org/apache/flink/ml/MLUtils.scala create mode 100644 src/main/scala/org/apache/flink/ml/classification/SVM.scala create mode 100644 src/main/scala/org/apache/flink/ml/common/Block.scala create mode 100644 src/main/scala/org/apache/flink/ml/common/FlinkMLTools.scala create mode 100644 src/main/scala/org/apache/flink/ml/common/LabeledVector.scala create mode 100644 src/main/scala/org/apache/flink/ml/common/ParameterMap.scala create mode 100644 
src/main/scala/org/apache/flink/ml/common/WeightVector.scala create mode 100644 src/main/scala/org/apache/flink/ml/common/WithParameters.scala create mode 100644 src/main/scala/org/apache/flink/ml/math/BLAS.scala create mode 100644 src/main/scala/org/apache/flink/ml/math/Breeze.scala create mode 100644 src/main/scala/org/apache/flink/ml/math/BreezeVectorConverter.scala create mode 100644 src/main/scala/org/apache/flink/ml/math/DenseMatrix.scala create mode 100644 src/main/scala/org/apache/flink/ml/math/DenseVector.scala create mode 100644 src/main/scala/org/apache/flink/ml/math/Matrix.scala create mode 100644 src/main/scala/org/apache/flink/ml/math/SparseMatrix.scala create mode 100644 src/main/scala/org/apache/flink/ml/math/SparseVector.scala create mode 100644 src/main/scala/org/apache/flink/ml/math/Vector.scala create mode 100644 src/main/scala/org/apache/flink/ml/math/VectorBuilder.scala create mode 100644 src/main/scala/org/apache/flink/ml/math/distributed/DistributedMatrix.scala create mode 100644 src/main/scala/org/apache/flink/ml/math/distributed/DistributedRowMatrix.scala create mode 100644 src/main/scala/org/apache/flink/ml/math/package.scala create mode 100644 src/main/scala/org/apache/flink/ml/metrics/distances/ChebyshevDistanceMetric.scala create mode 100644 src/main/scala/org/apache/flink/ml/metrics/distances/CosineDistanceMetric.scala create mode 100644 src/main/scala/org/apache/flink/ml/metrics/distances/DistanceMetric.scala create mode 100644 src/main/scala/org/apache/flink/ml/metrics/distances/EuclideanDistanceMetric.scala create mode 100644 src/main/scala/org/apache/flink/ml/metrics/distances/ManhattanDistanceMetric.scala create mode 100644 src/main/scala/org/apache/flink/ml/metrics/distances/MinkowskiDistanceMetric.scala create mode 100644 src/main/scala/org/apache/flink/ml/metrics/distances/SquaredEuclideanDistanceMetric.scala create mode 100644 src/main/scala/org/apache/flink/ml/metrics/distances/TanimotoDistanceMetric.scala create mode 100644 src/main/scala/org/apache/flink/ml/nn/KNN.scala create mode 100644 src/main/scala/org/apache/flink/ml/nn/QuadTree.scala create mode 100644 src/main/scala/org/apache/flink/ml/optimization/GradientDescent.scala create mode 100644 src/main/scala/org/apache/flink/ml/optimization/LossFunction.scala create mode 100644 src/main/scala/org/apache/flink/ml/optimization/PartialLossFunction.scala create mode 100644 src/main/scala/org/apache/flink/ml/optimization/PredictionFunction.scala create mode 100644 src/main/scala/org/apache/flink/ml/optimization/RegularizationPenalty.scala create mode 100644 src/main/scala/org/apache/flink/ml/optimization/Solver.scala create mode 100644 src/main/scala/org/apache/flink/ml/outlier/StochasticOutlierSelection.scala create mode 100644 src/main/scala/org/apache/flink/ml/package.scala create mode 100644 src/main/scala/org/apache/flink/ml/pipeline/ChainedPredictor.scala create mode 100644 src/main/scala/org/apache/flink/ml/pipeline/ChainedTransformer.scala create mode 100644 src/main/scala/org/apache/flink/ml/pipeline/Estimator.scala create mode 100644 src/main/scala/org/apache/flink/ml/pipeline/Predictor.scala create mode 100644 src/main/scala/org/apache/flink/ml/pipeline/Transformer.scala create mode 100644 src/main/scala/org/apache/flink/ml/preprocessing/MinMaxScaler.scala create mode 100644 src/main/scala/org/apache/flink/ml/preprocessing/PolynomialFeatures.scala create mode 100644 src/main/scala/org/apache/flink/ml/preprocessing/Splitter.scala create mode 100644 
src/main/scala/org/apache/flink/ml/preprocessing/StandardScaler.scala create mode 100644 src/main/scala/org/apache/flink/ml/recommendation/ALS.scala create mode 100644 src/main/scala/org/apache/flink/ml/regression/MultipleLinearRegression.scala create mode 100644 src/test/resources/log4j-test.properties create mode 100644 src/test/resources/logback-test.xml create mode 100644 src/test/scala/org/apache/flink/ml/MLUtilsSuite.scala create mode 100644 src/test/scala/org/apache/flink/ml/classification/Classification.scala create mode 100644 src/test/scala/org/apache/flink/ml/classification/SVMITSuite.scala create mode 100644 src/test/scala/org/apache/flink/ml/common/FlinkMLToolsSuite.scala create mode 100644 src/test/scala/org/apache/flink/ml/math/BreezeMathSuite.scala create mode 100644 src/test/scala/org/apache/flink/ml/math/DenseMatrixSuite.scala create mode 100644 src/test/scala/org/apache/flink/ml/math/DenseVectorSuite.scala create mode 100644 src/test/scala/org/apache/flink/ml/math/SparseMatrixSuite.scala create mode 100644 src/test/scala/org/apache/flink/ml/math/SparseVectorSuite.scala create mode 100644 src/test/scala/org/apache/flink/ml/math/distributed/DistributedRowMatrixSuite.scala create mode 100644 src/test/scala/org/apache/flink/ml/metrics/distances/DistanceMetricSuite.scala create mode 100644 src/test/scala/org/apache/flink/ml/nn/KNNITSuite.scala create mode 100644 src/test/scala/org/apache/flink/ml/nn/QuadTreeSuite.scala create mode 100644 src/test/scala/org/apache/flink/ml/optimization/GradientDescentITSuite.scala create mode 100644 src/test/scala/org/apache/flink/ml/optimization/LossFunctionTest.scala create mode 100644 src/test/scala/org/apache/flink/ml/optimization/PredictionFunctionITSuite.scala create mode 100644 src/test/scala/org/apache/flink/ml/optimization/RegularizationPenaltyTest.scala create mode 100644 src/test/scala/org/apache/flink/ml/outlier/StochasticOutlierSelectionITSuite.scala create mode 100644 src/test/scala/org/apache/flink/ml/pipeline/PipelineITSuite.scala create mode 100644 src/test/scala/org/apache/flink/ml/preprocessing/MinMaxScalerITSuite.scala create mode 100644 src/test/scala/org/apache/flink/ml/preprocessing/PolynomialFeaturesITSuite.scala create mode 100644 src/test/scala/org/apache/flink/ml/preprocessing/SplitterITSuite.scala create mode 100644 src/test/scala/org/apache/flink/ml/preprocessing/StandardScalerITSuite.scala create mode 100644 src/test/scala/org/apache/flink/ml/recommendation/ALSITSuite.scala create mode 100644 src/test/scala/org/apache/flink/ml/recommendation/Recommendation.scala create mode 100644 src/test/scala/org/apache/flink/ml/regression/MultipleLinearRegressionITSuite.scala create mode 100644 src/test/scala/org/apache/flink/ml/regression/RegressionData.scala create mode 100644 src/test/scala/org/apache/flink/ml/util/FlinkTestBase.scala diff --git a/src/main/scala/org/apache/flink/ml/MLUtils.scala b/src/main/scala/org/apache/flink/ml/MLUtils.scala new file mode 100644 index 0000000000000..051544f79d937 --- /dev/null +++ b/src/main/scala/org/apache/flink/ml/MLUtils.scala @@ -0,0 +1,126 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.ml + +import org.apache.flink.api.common.functions.{RichFlatMapFunction, RichMapFunction} +import org.apache.flink.api.java.operators.DataSink +import org.apache.flink.api.scala._ +import org.apache.flink.configuration.Configuration +import org.apache.flink.ml.common.LabeledVector +import org.apache.flink.ml.math.SparseVector +import org.apache.flink.util.Collector + +/** Convenience functions for machine learning tasks + * + * This object contains convenience functions for machine learning tasks: + * + * - readLibSVM: + * Reads a libSVM/SVMLight input file and returns a data set of [[LabeledVector]]. + * The file format is specified [http://svmlight.joachims.org/ here]. + * + * - writeLibSVM: + * Writes a data set of [[LabeledVector]] in libSVM/SVMLight format to disk. THe file format + * is specified [http://svmlight.joachims.org/ here]. + */ +object MLUtils { + + val DIMENSION = "dimension" + + /** Reads a file in libSVM/SVMLight format and converts the data into a data set of + * [[LabeledVector]]. The dimension of the [[LabeledVector]] is determined automatically. + * + * Since the libSVM/SVMLight format stores a vector in its sparse form, the [[LabeledVector]] + * will also be instantiated with a [[SparseVector]]. + * + * @param env executionEnvironment [[ExecutionEnvironment]] + * @param filePath Path to the input file + * @return [[DataSet]] of [[LabeledVector]] containing the information of the libSVM/SVMLight + * file + */ + def readLibSVM(env: ExecutionEnvironment, filePath: String): DataSet[LabeledVector] = { + val labelCOODS = env.readTextFile(filePath).flatMap( + new RichFlatMapFunction[String, (Double, Array[(Int, Double)])] { + val splitPattern = "\\s+".r + + override def flatMap( + line: String, + out: Collector[(Double, Array[(Int, Double)])] + ): Unit = { + val commentFreeLine = line.takeWhile(_ != '#').trim + + if (commentFreeLine.nonEmpty) { + val splits = splitPattern.split(commentFreeLine) + val label = splits.head.toDouble + val sparseFeatures = splits.tail + val coos = sparseFeatures.flatMap { str => + val pair = str.split(':') + require(pair.length == 2, "Each feature entry has to have the form :") + + // libSVM index is 1-based, but we expect it to be 0-based + val index = pair(0).toInt - 1 + val value = pair(1).toDouble + + Some((index, value)) + } + + out.collect((label, coos)) + } + } + }) + + // Calculate maximum dimension of vectors + val dimensionDS = labelCOODS.map { + labelCOO => + labelCOO._2.map( _._1 + 1 ).max + }.reduce(scala.math.max(_, _)) + + labelCOODS.map{ new RichMapFunction[(Double, Array[(Int, Double)]), LabeledVector] { + var dimension = 0 + + override def open(configuration: Configuration): Unit = { + dimension = getRuntimeContext.getBroadcastVariable(DIMENSION).get(0) + } + + override def map(value: (Double, Array[(Int, Double)])): LabeledVector = { + new LabeledVector(value._1, SparseVector.fromCOO(dimension, value._2)) + } + }}.withBroadcastSet(dimensionDS, DIMENSION) + } + + /** Writes a [[DataSet]] of [[LabeledVector]] to a file using the libSVM/SVMLight format. 
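+ *
+ * A minimal round-trip sketch (the execution environment and file paths are placeholders,
+ * not values prescribed by the API):
+ * {{{
+ *   val env = ExecutionEnvironment.getExecutionEnvironment
+ *   val data: DataSet[LabeledVector] = MLUtils.readLibSVM(env, "/tmp/input.libsvm")
+ *   MLUtils.writeLibSVM("/tmp/output.libsvm", data)
+ * }}}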
+ * + * @param filePath Path to output file + * @param labeledVectors [[DataSet]] of [[LabeledVector]] to write to disk + * @return + */ + def writeLibSVM(filePath: String, labeledVectors: DataSet[LabeledVector]): DataSink[String] = { + val stringRepresentation = labeledVectors.map{ + labeledVector => + val vectorStr = labeledVector.vector. + // remove zero entries + filter( _._2 != 0). + map{case (idx, value) => (idx + 1) + ":" + value}. + mkString(" ") + + labeledVector.label + " " + vectorStr + } + + stringRepresentation.writeAsText(filePath) + } +} diff --git a/src/main/scala/org/apache/flink/ml/classification/SVM.scala b/src/main/scala/org/apache/flink/ml/classification/SVM.scala new file mode 100644 index 0000000000000..eff9fbd258585 --- /dev/null +++ b/src/main/scala/org/apache/flink/ml/classification/SVM.scala @@ -0,0 +1,552 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.ml.classification + +import org.apache.flink.api.common.functions.RichMapFunction +import org.apache.flink.api.scala._ +import org.apache.flink.configuration.Configuration +import org.apache.flink.ml.common.FlinkMLTools.ModuloKeyPartitioner +import org.apache.flink.ml.common._ +import org.apache.flink.ml.math.Breeze._ +import org.apache.flink.ml.math.{DenseVector, Vector} +import org.apache.flink.ml.pipeline.{FitOperation, PredictOperation, Predictor} + +import scala.collection.mutable.ArrayBuffer +import scala.util.Random + +import breeze.linalg.{DenseVector => BreezeDenseVector, Vector => BreezeVector} + +/** Implements a soft-margin SVM using the communication-efficient distributed dual coordinate + * ascent algorithm (CoCoA) with hinge-loss function. + * + * It can be used for binary classification problems, with the labels set as +1.0 to indiciate a + * positive example and -1.0 to indicate a negative example. + * + * The algorithm solves the following minimization problem: + * + * `min_{w in bbb"R"^d} lambda/2 ||w||^2 + 1/n sum_(i=1)^n l_{i}(w^Tx_i)` + * + * with `w` being the weight vector, `lambda` being the regularization constant, + * `x_{i} in bbb"R"^d` being the data points and `l_{i}` being the convex loss functions, which + * can also depend on the labels `y_{i} in bbb"R"`. + * In the current implementation the regularizer is the 2-norm and the loss functions are the + * hinge-loss functions: + * + * `l_{i} = max(0, 1 - y_{i} * w^Tx_i` + * + * With these choices, the problem definition is equivalent to a SVM with soft-margin. + * Thus, the algorithm allows us to train a SVM with soft-margin. + * + * The minimization problem is solved by applying stochastic dual coordinate ascent (SDCA). 
+ * In order to make the algorithm efficient in a distributed setting, the CoCoA algorithm + * calculates several iterations of SDCA locally on a data block before merging the local + * updates into a valid global state. + * This state is redistributed to the different data partitions where the next round of local + * SDCA iterations is then executed. + * The number of outer iterations and local SDCA iterations control the overall network costs, + * because there is only network communication required for each outer iteration. + * The local SDCA iterations are embarrassingly parallel once the individual data partitions have + * been distributed across the cluster. + * + * Further details of the algorithm can be found [[http://arxiv.org/abs/1409.1458 here]]. + * + * @example + * {{{ + * val trainingDS: DataSet[LabeledVector] = env.readLibSVM(pathToTrainingFile) + * + * val svm = SVM() + * .setBlocks(10) + * + * svm.fit(trainingDS) + * + * val testingDS: DataSet[Vector] = env.readLibSVM(pathToTestingFile) + * .map(lv => lv.vector) + * + * val predictionDS: DataSet[(Vector, Double)] = svm.predict(testingDS) + * }}} + * + * =Parameters= + * + * - [[org.apache.flink.ml.classification.SVM.Blocks]]: + * Sets the number of blocks into which the input data will be split. On each block the local + * stochastic dual coordinate ascent method is executed. This number should be set at least to + * the degree of parallelism. If no value is specified, then the parallelism of the input + * [[DataSet]] is used as the number of blocks. (Default value: '''None''') + * + * - [[org.apache.flink.ml.classification.SVM.Iterations]]: + * Defines the maximum number of iterations of the outer loop method. In other words, it defines + * how often the SDCA method is applied to the blocked data. After each iteration, the locally + * computed weight vector updates have to be reduced to update the global weight vector value. + * The new weight vector is broadcast to all SDCA tasks at the beginning of each iteration. + * (Default value: '''10''') + * + * - [[org.apache.flink.ml.classification.SVM.LocalIterations]]: + * Defines the maximum number of SDCA iterations. In other words, it defines how many data points + * are drawn from each local data block to calculate the stochastic dual coordinate ascent. + * (Default value: '''10''') + * + * - [[org.apache.flink.ml.classification.SVM.Regularization]]: + * Defines the regularization constant of the SVM algorithm. The higher the value, the smaller + * will the 2-norm of the weight vector be. In case of a SVM with hinge loss this means that the + * SVM margin will be wider even though it might contain some false classifications. + * (Default value: '''1.0''') + * + * - [[org.apache.flink.ml.classification.SVM.Stepsize]]: + * Defines the initial step size for the updates of the weight vector. The larger the step size + * is, the larger will be the contribution of the weight vector updates to the next weight vector + * value. The effective scaling of the updates is `stepsize/blocks`. This value has to be tuned + * in case that the algorithm becomes instable. (Default value: '''1.0''') + * + * - [[org.apache.flink.ml.classification.SVM.Seed]]: + * Defines the seed to initialize the random number generator. The seed directly controls which + * data points are chosen for the SDCA method. 
(Default value: '''Random value''') + * + * - [[org.apache.flink.ml.classification.SVM.ThresholdValue]]: + * Defines the limiting value for the decision function above which examples are labeled as + * positive (+1.0). Examples with a decision function value below this value are classified as + * negative(-1.0). In order to get the raw decision function values you need to indicate it by + * using the [[org.apache.flink.ml.classification.SVM.OutputDecisionFunction]]. + * (Default value: '''0.0''') + * + * - [[org.apache.flink.ml.classification.SVM.OutputDecisionFunction]]: + * Determines whether the predict and evaluate functions of the SVM should return the distance + * to the separating hyperplane, or binary class labels. Setting this to true will return the raw + * distance to the hyperplane for each example. Setting it to false will return the binary + * class label (+1.0, -1.0) (Default value: '''false''') + */ +class SVM extends Predictor[SVM] { + + import SVM._ + + /** Stores the learned weight vector after the fit operation */ + var weightsOption: Option[DataSet[DenseVector]] = None + + /** Sets the number of data blocks/partitions + * + * @param blocks the number of blocks into which the input data will be split. + * @return itself + */ + def setBlocks(blocks: Int): SVM = { + parameters.add(Blocks, blocks) + this + } + + /** Sets the number of outer iterations + * + * @param iterations the maximum number of iterations of the outer loop method + * @return itself + */ + def setIterations(iterations: Int): SVM = { + parameters.add(Iterations, iterations) + this + } + + /** Sets the number of local SDCA iterations + * + * @param localIterations the maximum number of SDCA iterations + * @return itself + */ + def setLocalIterations(localIterations: Int): SVM = { + parameters.add(LocalIterations, localIterations) + this + } + + /** Sets the regularization constant + * + * @param regularization the regularization constant of the SVM algorithm + * @return itself + */ + def setRegularization(regularization: Double): SVM = { + parameters.add(Regularization, regularization) + this + } + + /** Sets the stepsize for the weight vector updates + * + * @param stepsize the initial step size for the updates of the weight vector + * @return itself + */ + def setStepsize(stepsize: Double): SVM = { + parameters.add(Stepsize, stepsize) + this + } + + /** Sets the seed value for the random number generator + * + * @param seed the seed to initialize the random number generator + * @return itself + */ + def setSeed(seed: Long): SVM = { + parameters.add(Seed, seed) + this + } + + /** Sets the threshold above which elements are classified as positive. + * + * The [[predict ]] and [[evaluate]] functions will return +1.0 for items with a decision + * function value above this threshold, and -1.0 for items below it. + * @param threshold the limiting value for the decision function above which examples are + * labeled as positive + * @return itself + */ + def setThreshold(threshold: Double): SVM = { + parameters.add(ThresholdValue, threshold) + this + } + + /** Sets whether the predictions should return the raw decision function value or the + * thresholded binary value. + * + * When setting this to true, predict and evaluate return the raw decision value, which is + * the distance from the separating hyperplane. + * When setting this to false, they return thresholded (+1.0, -1.0) values. 
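+ *
+ * A configuration sketch (the chosen values are illustrative only):
+ * {{{
+ *   val svm = SVM()
+ *     .setThreshold(0.0)
+ *     .setOutputDecisionFunction(true) // predict/evaluate now return raw hyperplane distances
+ * }}}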
+ * + * @param outputDecisionFunction When set to true, [[predict ]] and [[evaluate]] return the raw + * decision function values. When set to false, they return the + * thresholded binary values (+1.0, -1.0). + * @return itself + */ + def setOutputDecisionFunction(outputDecisionFunction: Boolean): SVM = { + parameters.add(OutputDecisionFunction, outputDecisionFunction) + this + } +} + +/** Companion object of SVM. Contains convenience functions and the parameter type definitions + * of the algorithm. + */ +object SVM{ + + val WEIGHT_VECTOR_BROADCAST_NAME = "weightVector" + + // ========================================== Parameters ========================================= + + case object Blocks extends Parameter[Int] { + val defaultValue: Option[Int] = None + } + + case object Iterations extends Parameter[Int] { + val defaultValue = Some(10) + } + + case object LocalIterations extends Parameter[Int] { + val defaultValue = Some(10) + } + + case object Regularization extends Parameter[Double] { + val defaultValue = Some(1.0) + } + + case object Stepsize extends Parameter[Double] { + val defaultValue = Some(1.0) + } + + case object Seed extends Parameter[Long] { + val defaultValue = Some(Random.nextLong()) + } + + case object ThresholdValue extends Parameter[Double] { + val defaultValue = Some(0.0) + } + + case object OutputDecisionFunction extends Parameter[Boolean] { + val defaultValue = Some(false) + } + + // ========================================== Factory methods ==================================== + + def apply(): SVM = { + new SVM() + } + + // ========================================== Operations ========================================= + + /** Provides the operation that makes the predictions for individual examples. + * + * @tparam T Input data type which is a subtype of [[Vector]] + * @return A PredictOperation, through which it is possible to predict a value, given a + * feature vector + */ + implicit def predictVectors[T <: Vector] = { + new PredictOperation[SVM, DenseVector, T, Double](){ + + var thresholdValue: Double = _ + var outputDecisionFunction: Boolean = _ + + override def getModel(self: SVM, predictParameters: ParameterMap): DataSet[DenseVector] = { + thresholdValue = predictParameters(ThresholdValue) + outputDecisionFunction = predictParameters(OutputDecisionFunction) + self.weightsOption match { + case Some(model) => model + case None => { + throw new RuntimeException("The SVM model has not been trained. Call first fit" + + "before calling the predict operation.") + } + } + } + + override def predict(value: T, model: DenseVector): Double = { + val rawValue = value.asBreeze dot model.asBreeze + + if (outputDecisionFunction) { + rawValue + } else { + if (rawValue > thresholdValue) 1.0 else -1.0 + } + } + } + } + + /** [[FitOperation]] which trains a SVM with soft-margin based on the given training data set. 
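+ *
+ * The operation is resolved implicitly when `fit` is called on an [[SVM]] instance; a hedged
+ * sketch (the training DataSet and parameter values are placeholders):
+ * {{{
+ *   val svm = SVM().setBlocks(4).setIterations(20)
+ *   svm.fit(trainingDS) // trainingDS: DataSet[LabeledVector]
+ * }}}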
+ * + */ + implicit val fitSVM = { + new FitOperation[SVM, LabeledVector] { + override def fit( + instance: SVM, + fitParameters: ParameterMap, + input: DataSet[LabeledVector]) + : Unit = { + val resultingParameters = instance.parameters ++ fitParameters + + // Check if the number of blocks/partitions has been specified + val blocks = resultingParameters.get(Blocks) match { + case Some(value) => value + case None => input.getParallelism + } + + val scaling = resultingParameters(Stepsize)/blocks + val iterations = resultingParameters(Iterations) + val localIterations = resultingParameters(LocalIterations) + val regularization = resultingParameters(Regularization) + val seed = resultingParameters(Seed) + + // Obtain DataSet with the dimension of the data points + val dimension = input.map{_.vector.size}.reduce{ + (a, b) => { + require(a == b, "Dimensions of feature vectors have to be equal.") + a + } + } + + val initialWeights = createInitialWeights(dimension) + + // Count the number of vectors, but keep the value in a DataSet to broadcast it later + // TODO: Once efficient count and intermediate result partitions are implemented, use count + val numberVectors = input map { x => 1 } reduce { _ + _ } + + // Group the input data into blocks in round robin fashion + val blockedInputNumberElements = FlinkMLTools.block( + input, + blocks, + Some(ModuloKeyPartitioner)). + cross(numberVectors). + map { x => x } + + val resultingWeights = initialWeights.iterate(iterations) { + weights => { + // compute the local SDCA to obtain the weight vector updates + val deltaWs = localDualMethod( + weights, + blockedInputNumberElements, + localIterations, + regularization, + scaling, + seed + ) + + // scale the weight vectors + val weightedDeltaWs = deltaWs map { + deltaW => { + deltaW :*= scaling + } + } + + // calculate the new weight vector by adding the weight vector updates to the weight + // vector value + weights.union(weightedDeltaWs).reduce { _ + _ } + } + } + + // Store the learned weight vector in hte given instance + instance.weightsOption = Some(resultingWeights.map(_.fromBreeze[DenseVector])) + } + } + } + + /** Creates a zero vector of length dimension + * + * @param dimension [[DataSet]] containing the dimension of the initial weight vector + * @return Zero vector of length dimension + */ + private def createInitialWeights(dimension: DataSet[Int]): DataSet[BreezeDenseVector[Double]] = { + dimension.map { + d => BreezeDenseVector.zeros[Double](d) + } + } + + /** Computes the local SDCA on the individual data blocks/partitions + * + * @param w Current weight vector + * @param blockedInputNumberElements Blocked/Partitioned input data + * @param localIterations Number of local SDCA iterations + * @param regularization Regularization constant + * @param scaling Scaling value for new weight vector updates + * @param seed Random number generator seed + * @return [[DataSet]] of weight vector updates. The weight vector updates are double arrays + */ + private def localDualMethod( + w: DataSet[BreezeDenseVector[Double]], + blockedInputNumberElements: DataSet[(Block[LabeledVector], Int)], + localIterations: Int, + regularization: Double, + scaling: Double, + seed: Long) + : DataSet[BreezeDenseVector[Double]] = { + /* + Rich mapper calculating for each data block the local SDCA. We use a RichMapFunction here, + because we broadcast the current value of the weight vector to all mappers. 
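+
+    Note: the broadcast value is attached at the call site below via
+    withBroadcastSet(w, WEIGHT_VECTOR_BROADCAST_NAME) and fetched in open() through
+    getRuntimeContext.getBroadcastVariable(WEIGHT_VECTOR_BROADCAST_NAME).get(0), so every
+    parallel task starts its local SDCA iterations from the same global weight vector.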
+ */ + val localSDCA = new RichMapFunction[(Block[LabeledVector], Int), BreezeDenseVector[Double]] { + var originalW: BreezeDenseVector[Double] = _ + // we keep the alphas across the outer loop iterations + val alphasArray = ArrayBuffer[BreezeDenseVector[Double]]() + // there might be several data blocks in one Flink partition, therefore store mapping + val idMapping = scala.collection.mutable.HashMap[Int, Int]() + var counter = 0 + + var r: Random = _ + + override def open(parameters: Configuration): Unit = { + originalW = getRuntimeContext.getBroadcastVariable(WEIGHT_VECTOR_BROADCAST_NAME).get(0) + + if(r == null){ + r = new Random(seed ^ getRuntimeContext.getIndexOfThisSubtask) + } + } + + override def map(blockNumberElements: (Block[LabeledVector], Int)) + : BreezeDenseVector[Double] = { + val (block, numberElements) = blockNumberElements + + // check if we already processed a data block with the corresponding block index + val localIndex = idMapping.get(block.index) match { + case Some(idx) => idx + case None => + idMapping += (block.index -> counter) + counter += 1 + + alphasArray += BreezeDenseVector.zeros[Double](block.values.length) + + counter - 1 + } + + // create temporary alpha array for the local SDCA iterations + val tempAlphas = alphasArray(localIndex).copy + + val numLocalDatapoints = tempAlphas.length + val deltaAlphas = BreezeDenseVector.zeros[Double](numLocalDatapoints) + + val w = originalW.copy + + val deltaW = BreezeDenseVector.zeros[Double](originalW.length) + + for(i <- 1 to localIterations) { + // pick random data point for SDCA + val idx = r.nextInt(numLocalDatapoints) + + val LabeledVector(label, vector) = block.values(idx) + val alpha = tempAlphas(idx) + + // maximize the dual problem and retrieve alpha and weight vector updates + val (deltaAlpha, deltaWUpdate) = maximize( + vector.asBreeze, + label, + regularization, + alpha, + w, + numberElements) + + // update alpha values + tempAlphas(idx) += deltaAlpha + deltaAlphas(idx) += deltaAlpha + + // deltaWUpdate is already scaled with 1/lambda/n + w += deltaWUpdate + deltaW += deltaWUpdate + } + + // update local alpha values + alphasArray(localIndex) += deltaAlphas * scaling + + deltaW + } + } + + blockedInputNumberElements.map(localSDCA).withBroadcastSet(w, WEIGHT_VECTOR_BROADCAST_NAME) + } + + /** Maximizes the dual problem using hinge loss functions. It returns the alpha and weight + * vector updates. 
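+ *
+ * Restating the closed-form update computed below (with `lambda` = regularization, `n` =
+ * numberElements and `q_ii = x^T x`): the hinge-loss gradient is
+ * `g = (y * (w^T x) - 1) * lambda * n`, the new dual variable (for `q_ii != 0`) is
+ * `alpha' = min(max(alpha - g / q_ii, 0), 1)`, and the resulting weight update is
+ * `deltaW = (alpha' - alpha) * y * x / (lambda * n)`.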
+ * + * @param x Selected data point + * @param y Label of selected data point + * @param regularization Regularization constant + * @param alpha Alpha value of selected data point + * @param w Current weight vector value + * @param numberElements Number of elements in the training data set + * @return Alpha and weight vector updates + */ + private def maximize( + x: BreezeVector[Double], + y: Double, regularization: Double, + alpha: Double, + w: BreezeVector[Double], + numberElements: Int) + : (Double, BreezeVector[Double]) = { + // compute hinge loss gradient + val dotProduct = x dot w + val grad = (y * dotProduct - 1.0) * (regularization * numberElements) + + // compute projected gradient + var proj_grad = if(alpha <= 0.0){ + scala.math.min(grad, 0) + } else if(alpha >= 1.0) { + scala.math.max(grad, 0) + } else { + grad + } + + if(scala.math.abs(grad) != 0.0){ + val qii = x dot x + val newAlpha = if(qii != 0.0){ + scala.math.min(scala.math.max(alpha - (grad / qii), 0.0), 1.0) + } else { + 1.0 + } + + val deltaW = x * y * (newAlpha - alpha) / (regularization * numberElements) + + (newAlpha - alpha, deltaW) + } else { + (0.0 , BreezeVector.zeros(w.length)) + } + } + +} diff --git a/src/main/scala/org/apache/flink/ml/common/Block.scala b/src/main/scala/org/apache/flink/ml/common/Block.scala new file mode 100644 index 0000000000000..01c680690702e --- /dev/null +++ b/src/main/scala/org/apache/flink/ml/common/Block.scala @@ -0,0 +1,29 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.ml.common + +/** Base class for blocks of elements. + * + * TODO: Replace Vector type by Array type once Flink supports generic arrays + * + * @param index + * @param values + * @tparam T + */ +case class Block[T](index: Int, values: Vector[T]) diff --git a/src/main/scala/org/apache/flink/ml/common/FlinkMLTools.scala b/src/main/scala/org/apache/flink/ml/common/FlinkMLTools.scala new file mode 100644 index 0000000000000..2460061809db1 --- /dev/null +++ b/src/main/scala/org/apache/flink/ml/common/FlinkMLTools.scala @@ -0,0 +1,424 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.ml.common + +import org.apache.flink.api.common.functions.Partitioner +import org.apache.flink.api.common.io.FileOutputFormat.OutputDirectoryMode +import org.apache.flink.api.common.typeinfo.TypeInformation +import org.apache.flink.api.java.io.{TypeSerializerInputFormat, TypeSerializerOutputFormat} +import org.apache.flink.api.scala._ +import org.apache.flink.core.fs.FileSystem.WriteMode +import org.apache.flink.core.fs.Path + +import scala.reflect.ClassTag + +/** FlinkMLTools contains a set of convenience functions for Flink's machine learning library: + * + * - persist: + * Takes up to 5 [[DataSet]]s and file paths. Each [[DataSet]] is written to the specified + * path and subsequently re-read from disk. This method can be used to effectively split the + * execution graph at the given [[DataSet]]. Writing it to disk triggers its materialization + * and specifying it as a source will prevent the re-execution of it. + * + * - block: + * Takes a DataSet of elements T and groups them in n blocks. + * + */ +object FlinkMLTools { + val EXECUTION_ENVIRONMENT_NAME = "FlinkMLTools persist" + + /** Registers the different FlinkML related types for Kryo serialization + * + * @param env The Flink execution environment where the types need to be registered + */ + def registerFlinkMLTypes(env: ExecutionEnvironment): Unit = { + + // Vector types + env.registerType(classOf[org.apache.flink.ml.math.DenseVector]) + env.registerType(classOf[org.apache.flink.ml.math.SparseVector]) + + // Matrix types + env.registerType(classOf[org.apache.flink.ml.math.DenseMatrix]) + env.registerType(classOf[org.apache.flink.ml.math.SparseMatrix]) + + // Breeze Vector types + env.registerType(classOf[breeze.linalg.DenseVector[_]]) + env.registerType(classOf[breeze.linalg.SparseVector[_]]) + + // Breeze specialized types + env.registerType(breeze.linalg.DenseVector.zeros[Double](0).getClass) + env.registerType(breeze.linalg.SparseVector.zeros[Double](0).getClass) + + // Breeze Matrix types + env.registerType(classOf[breeze.linalg.DenseMatrix[Double]]) + env.registerType(classOf[breeze.linalg.CSCMatrix[Double]]) + + // Breeze specialized types + env.registerType(breeze.linalg.DenseMatrix.zeros[Double](0, 0).getClass) + env.registerType(breeze.linalg.CSCMatrix.zeros[Double](0, 0).getClass) + } + + /** Writes a [[DataSet]] to the specified path and returns it as a DataSource for subsequent + * operations. + * + * @param dataset [[DataSet]] to write to disk + * @param path File path to write dataset to + * @tparam T Type of the [[DataSet]] elements + * @return [[DataSet]] reading the just written file + */ + def persist[T: ClassTag: TypeInformation](dataset: DataSet[T], path: String): DataSet[T] = { + val env = dataset.getExecutionEnvironment + val outputFormat = new TypeSerializerOutputFormat[T] + + val filePath = new Path(path) + + outputFormat.setOutputFilePath(filePath) + outputFormat.setWriteMode(WriteMode.OVERWRITE) + + dataset.output(outputFormat) + env.execute(EXECUTION_ENVIRONMENT_NAME) + + val inputFormat = new TypeSerializerInputFormat[T](dataset.getType) + inputFormat.setFilePath(filePath) + + env.createInput(inputFormat) + } + + /** Writes multiple [[DataSet]]s to the specified paths and returns them as DataSources for + * subsequent operations. 
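+ *
+ * A minimal usage sketch (the data set and path names are purely illustrative):
+ * {{{
+ *   val (persistedA, persistedB) =
+ *     FlinkMLTools.persist(dataSetA, dataSetB, "hdfs:///tmp/dsA", "hdfs:///tmp/dsB")
+ *   // both data sets are materialized on disk and re-read from there, so downstream
+ *   // operators no longer trigger a re-execution of the producing plans
+ * }}}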
+ * + * @param ds1 First [[DataSet]] to write to disk + * @param ds2 Second [[DataSet]] to write to disk + * @param path1 Path for ds1 + * @param path2 Path for ds2 + * @tparam A Type of the first [[DataSet]]'s elements + * @tparam B Type of the second [[DataSet]]'s elements + * @return Tuple of [[DataSet]]s reading the just written files + */ + def persist[A: ClassTag: TypeInformation ,B: ClassTag: TypeInformation](ds1: DataSet[A], ds2: + DataSet[B], path1: String, path2: String): (DataSet[A], DataSet[B]) = { + val env = ds1.getExecutionEnvironment + + val f1 = new Path(path1) + + val of1 = new TypeSerializerOutputFormat[A] + of1.setOutputFilePath(f1) + of1.setWriteMode(WriteMode.OVERWRITE) + + ds1.output(of1) + + val f2 = new Path(path2) + + val of2 = new TypeSerializerOutputFormat[B] + of2.setOutputFilePath(f2) + of2.setWriteMode(WriteMode.OVERWRITE) + + ds2.output(of2) + + env.execute(EXECUTION_ENVIRONMENT_NAME) + + val if1 = new TypeSerializerInputFormat[A](ds1.getType) + if1.setFilePath(f1) + + val if2 = new TypeSerializerInputFormat[B](ds2.getType) + if2.setFilePath(f2) + + (env.createInput(if1), env.createInput(if2)) + } + + /** Writes multiple [[DataSet]]s to the specified paths and returns them as DataSources for + * subsequent operations. + * + * @param ds1 First [[DataSet]] to write to disk + * @param ds2 Second [[DataSet]] to write to disk + * @param ds3 Third [[DataSet]] to write to disk + * @param path1 Path for ds1 + * @param path2 Path for ds2 + * @param path3 Path for ds3 + * @tparam A Type of first [[DataSet]]'s elements + * @tparam B Type of second [[DataSet]]'s elements + * @tparam C Type of third [[DataSet]]'s elements + * @return Tuple of [[DataSet]]s reading the just written files + */ + def persist[A: ClassTag: TypeInformation ,B: ClassTag: TypeInformation, + C: ClassTag: TypeInformation](ds1: DataSet[A], ds2: DataSet[B], ds3: DataSet[C], path1: + String, path2: String, path3: String): (DataSet[A], DataSet[B], DataSet[C]) = { + val env = ds1.getExecutionEnvironment + + val f1 = new Path(path1) + + val of1 = new TypeSerializerOutputFormat[A] + of1.setOutputFilePath(f1) + of1.setWriteMode(WriteMode.OVERWRITE) + + ds1.output(of1) + + val f2 = new Path(path2) + + val of2 = new TypeSerializerOutputFormat[B] + of2.setOutputFilePath(f2) + of2.setWriteMode(WriteMode.OVERWRITE) + + ds2.output(of2) + + val f3 = new Path(path3) + + val of3 = new TypeSerializerOutputFormat[C] + of3.setOutputFilePath(f3) + of3.setWriteMode(WriteMode.OVERWRITE) + + ds3.output(of3) + + env.execute(EXECUTION_ENVIRONMENT_NAME) + + val if1 = new TypeSerializerInputFormat[A](ds1.getType) + if1.setFilePath(f1) + + val if2 = new TypeSerializerInputFormat[B](ds2.getType) + if2.setFilePath(f2) + + val if3 = new TypeSerializerInputFormat[C](ds3.getType) + if3.setFilePath(f3) + + (env.createInput(if1), env.createInput(if2), env.createInput(if3)) + } + + /** Writes multiple [[DataSet]]s to the specified paths and returns them as DataSources for + * subsequent operations. 
+ * + * @param ds1 First [[DataSet]] to write to disk + * @param ds2 Second [[DataSet]] to write to disk + * @param ds3 Third [[DataSet]] to write to disk + * @param ds4 Fourth [[DataSet]] to write to disk + * @param path1 Path for ds1 + * @param path2 Path for ds2 + * @param path3 Path for ds3 + * @param path4 Path for ds4 + * @tparam A Type of first [[DataSet]]'s elements + * @tparam B Type of second [[DataSet]]'s elements + * @tparam C Type of third [[DataSet]]'s elements + * @tparam D Type of fourth [[DataSet]]'s elements + * @return Tuple of [[DataSet]]s reading the just written files + */ + def persist[A: ClassTag: TypeInformation ,B: ClassTag: TypeInformation, + C: ClassTag: TypeInformation, D: ClassTag: TypeInformation](ds1: DataSet[A], ds2: DataSet[B], + ds3: DataSet[C], ds4: DataSet[D], + path1: String, path2: String, path3: + String, path4: String): + (DataSet[A], DataSet[B], DataSet[C], DataSet[D]) = { + val env = ds1.getExecutionEnvironment + + val f1 = new Path(path1) + + val of1 = new TypeSerializerOutputFormat[A] + of1.setOutputFilePath(f1) + of1.setWriteMode(WriteMode.OVERWRITE) + + ds1.output(of1) + + val f2 = new Path(path2) + + val of2 = new TypeSerializerOutputFormat[B] + of2.setOutputFilePath(f2) + of2.setWriteMode(WriteMode.OVERWRITE) + + ds2.output(of2) + + val f3 = new Path(path3) + + val of3 = new TypeSerializerOutputFormat[C] + of3.setOutputFilePath(f3) + of3.setWriteMode(WriteMode.OVERWRITE) + + ds3.output(of3) + + val f4 = new Path(path4) + + val of4 = new TypeSerializerOutputFormat[D] + of4.setOutputFilePath(f4) + of4.setWriteMode(WriteMode.OVERWRITE) + + ds4.output(of4) + + env.execute(EXECUTION_ENVIRONMENT_NAME) + + val if1 = new TypeSerializerInputFormat[A](ds1.getType) + if1.setFilePath(f1) + + val if2 = new TypeSerializerInputFormat[B](ds2.getType) + if2.setFilePath(f2) + + val if3 = new TypeSerializerInputFormat[C](ds3.getType) + if3.setFilePath(f3) + + val if4 = new TypeSerializerInputFormat[D](ds4.getType) + if4.setFilePath(f4) + + (env.createInput(if1), env.createInput(if2), env.createInput(if3), env.createInput(if4)) + } + + /** Writes multiple [[DataSet]]s to the specified paths and returns them as DataSources for + * subsequent operations. 
+ * + * @param ds1 First [[DataSet]] to write to disk + * @param ds2 Second [[DataSet]] to write to disk + * @param ds3 Third [[DataSet]] to write to disk + * @param ds4 Fourth [[DataSet]] to write to disk + * @param ds5 Fifth [[DataSet]] to write to disk + * @param path1 Path for ds1 + * @param path2 Path for ds2 + * @param path3 Path for ds3 + * @param path4 Path for ds4 + * @param path5 Path for ds5 + * @tparam A Type of first [[DataSet]]'s elements + * @tparam B Type of second [[DataSet]]'s elements + * @tparam C Type of third [[DataSet]]'s elements + * @tparam D Type of fourth [[DataSet]]'s elements + * @tparam E Type of fifth [[DataSet]]'s elements + * @return Tuple of [[DataSet]]s reading the just written files + */ + def persist[A: ClassTag: TypeInformation ,B: ClassTag: TypeInformation, + C: ClassTag: TypeInformation, D: ClassTag: TypeInformation, E: ClassTag: TypeInformation] + (ds1: DataSet[A], ds2: DataSet[B], ds3: DataSet[C], ds4: DataSet[D], ds5: DataSet[E], path1: + String, path2: String, path3: String, path4: String, path5: String): (DataSet[A], DataSet[B], + DataSet[C], DataSet[D], DataSet[E]) = { + val env = ds1.getExecutionEnvironment + + val f1 = new Path(path1) + + val of1 = new TypeSerializerOutputFormat[A] + of1.setOutputFilePath(f1) + of1.setWriteMode(WriteMode.OVERWRITE) + + ds1.output(of1) + + val f2 = new Path(path2) + + val of2 = new TypeSerializerOutputFormat[B] + of2.setOutputFilePath(f2) + of2.setOutputDirectoryMode(OutputDirectoryMode.ALWAYS) + of2.setWriteMode(WriteMode.OVERWRITE) + + ds2.output(of2) + + val f3 = new Path(path3) + + val of3 = new TypeSerializerOutputFormat[C] + of3.setOutputFilePath(f3) + of3.setWriteMode(WriteMode.OVERWRITE) + + ds3.output(of3) + + val f4 = new Path(path4) + + val of4 = new TypeSerializerOutputFormat[D] + of4.setOutputFilePath(f4) + of4.setWriteMode(WriteMode.OVERWRITE) + + ds4.output(of4) + + val f5 = new Path(path5) + + val of5 = new TypeSerializerOutputFormat[E] + of5.setOutputFilePath(f5) + of5.setWriteMode(WriteMode.OVERWRITE) + + ds5.output(of5) + + env.execute(EXECUTION_ENVIRONMENT_NAME) + + val if1 = new TypeSerializerInputFormat[A](ds1.getType) + if1.setFilePath(f1) + + val if2 = new TypeSerializerInputFormat[B](ds2.getType) + if2.setFilePath(f2) + + val if3 = new TypeSerializerInputFormat[C](ds3.getType) + if3.setFilePath(f3) + + val if4 = new TypeSerializerInputFormat[D](ds4.getType) + if4.setFilePath(f4) + + val if5 = new TypeSerializerInputFormat[E](ds5.getType) + if5.setFilePath(f5) + + (env.createInput(if1), env.createInput(if2), env.createInput(if3), env.createInput(if4), env + .createInput(if5)) + } + + /** Groups the DataSet input into numBlocks blocks. 
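+ *
+ * Each element is assigned to the block with ID element.hashCode() % numBlocks (shifted into
+ * the non-negative range). A short usage sketch with an illustrative input data set:
+ * {{{
+ *   val blocked: DataSet[Block[LabeledVector]] =
+ *     FlinkMLTools.block(labeledData, 4, Some(FlinkMLTools.ModuloKeyPartitioner))
+ * }}}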
+ * + * @param input the input dataset + * @param numBlocks Number of Blocks + * @param partitionerOption Optional partitioner to control the partitioning + * @tparam T Type of the [[DataSet]]'s elements + * @return The different datasets grouped into blocks + */ + def block[T: TypeInformation: ClassTag]( + input: DataSet[T], + numBlocks: Int, + partitionerOption: Option[Partitioner[Int]] = None) + : DataSet[Block[T]] = { + val blockIDInput = input map { + element => + val blockID = element.hashCode() % numBlocks + + val blockIDResult = if(blockID < 0){ + blockID + numBlocks + } else { + blockID + } + + (blockIDResult, element) + } + + val preGroupBlockIDInput = partitionerOption match { + case Some(partitioner) => + blockIDInput partitionCustom(partitioner, 0) + + case None => blockIDInput + } + + preGroupBlockIDInput.groupBy(0).reduceGroup { + iterator => { + val array = iterator.toVector + + val blockID = array.head._1 + val elements = array.map(_._2) + + Block[T](blockID, elements) + } + }.withForwardedFields("0 -> index") + } + + /** Distributes the elements by taking the modulo of their keys and assigning it to this channel + * + */ + object ModuloKeyPartitioner extends Partitioner[Int] { + override def partition(key: Int, numPartitions: Int): Int = { + val result = key % numPartitions + + if(result < 0) { + result + numPartitions + } else { + result + } + } + } +} diff --git a/src/main/scala/org/apache/flink/ml/common/LabeledVector.scala b/src/main/scala/org/apache/flink/ml/common/LabeledVector.scala new file mode 100644 index 0000000000000..ddf9b6c277d29 --- /dev/null +++ b/src/main/scala/org/apache/flink/ml/common/LabeledVector.scala @@ -0,0 +1,42 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.ml.common + +import org.apache.flink.ml.math.Vector + +/** This class represents a vector with an associated label as it is required for many supervised + * learning tasks. + * + * @param label Label of the data point + * @param vector Data point + */ +case class LabeledVector(label: Double, vector: Vector) extends Serializable { + + override def equals(obj: Any): Boolean = { + obj match { + case labeledVector: LabeledVector => + vector.equals(labeledVector.vector) && label.equals(labeledVector.label) + case _ => false + } + } + + override def toString: String = { + s"LabeledVector($label, $vector)" + } +} diff --git a/src/main/scala/org/apache/flink/ml/common/ParameterMap.scala b/src/main/scala/org/apache/flink/ml/common/ParameterMap.scala new file mode 100644 index 0000000000000..91eada252ea81 --- /dev/null +++ b/src/main/scala/org/apache/flink/ml/common/ParameterMap.scala @@ -0,0 +1,121 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.ml.common + +import scala.collection.mutable + +/** + * Map used to store configuration parameters for algorithms. The parameter + * values are stored in a [[Map]] being identified by a [[Parameter]] object. ParameterMaps can + * be fused. This operation is left associative, meaning that latter ParameterMaps can override + * parameter values defined in a preceding ParameterMap. + * + * @param map Map containing parameter settings + */ +class ParameterMap(val map: mutable.Map[Parameter[_], Any]) extends Serializable { + + def this() = { + this(new mutable.HashMap[Parameter[_], Any]()) + } + + /** + * Adds a new parameter value to the ParameterMap. + * + * @param parameter Key + * @param value Value associated with the given key + * @tparam T Type of value + */ + def add[T](parameter: Parameter[T], value: T): ParameterMap = { + map += (parameter -> value) + this + } + + /** + * Retrieves a parameter value associated to a given key. The value is returned as an Option. + * If there is no value associated to the given key, then the default value of the [[Parameter]] + * is returned. + * + * @param parameter Key + * @tparam T Type of the value to retrieve + * @return Some(value) if an value is associated to the given key, otherwise the default value + * defined by parameter + */ + def get[T](parameter: Parameter[T]): Option[T] = { + if(map.isDefinedAt(parameter)) { + map.get(parameter).asInstanceOf[Option[T]] + } else { + parameter.defaultValue + } + } + + /** + * Retrieves a parameter value associated to a given key. If there is no value contained in the + * map, then the default value of the [[Parameter]] is checked. If the default value is defined, + * then it is returned. If the default is undefined, then a [[NoSuchElementException]] is thrown. + * + * @param parameter Key + * @tparam T Type of value + * @return Value associated with the given key or its default value + */ + def apply[T](parameter: Parameter[T]): T = { + if(map.isDefinedAt(parameter)) { + map(parameter).asInstanceOf[T] + } else { + parameter.defaultValue match { + case Some(value) => value + case None => throw new NoSuchElementException(s"Could not retrieve " + + s"parameter value $parameter.") + } + } + } + + /** + * Adds the parameter values contained in parameters to itself. 
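+ *
+ * Values contained in the argument override values already stored in this map. A short
+ * sketch with a hypothetical parameter key:
+ * {{{
+ *   case object Iterations extends Parameter[Int] { val defaultValue = Some(10) }
+ *
+ *   val defaults  = ParameterMap().add(Iterations, 10)
+ *   val overrides = ParameterMap().add(Iterations, 100)
+ *   (defaults ++ overrides)(Iterations) // == 100, the latter map wins
+ * }}}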
+ * + * @param parameters [[ParameterMap]] containing the parameter values to be added + * @return this after inserting the parameter values from parameters + */ + def ++(parameters: ParameterMap): ParameterMap = { + val result = new ParameterMap(map) + result.map ++= parameters.map + + result + } +} + +object ParameterMap { + val Empty = new ParameterMap + + def apply(): ParameterMap = { + new ParameterMap + } +} + +/** + * Base trait for parameter keys + * + * @tparam T Type of parameter value associated to this parameter key + */ +trait Parameter[T] { + + /** + * Default value of parameter. If no such value exists, then returns [[None]] + */ + val defaultValue: Option[T] +} diff --git a/src/main/scala/org/apache/flink/ml/common/WeightVector.scala b/src/main/scala/org/apache/flink/ml/common/WeightVector.scala new file mode 100644 index 0000000000000..f21c60f9c9f7e --- /dev/null +++ b/src/main/scala/org/apache/flink/ml/common/WeightVector.scala @@ -0,0 +1,32 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.ml.common + +import org.apache.flink.ml.math.Vector + +// TODO(tvas): This provides an abstraction for the weights +// but at the same time it leads to the creation of many objects as we have to pack and unpack +// the weights and the intercept often during SGD. + +/** This class represents a weight vector with an intercept, as it is required for many supervised + * learning tasks + * @param weights The vector of weights + * @param intercept The intercept (bias) weight + */ +case class WeightVector(weights: Vector, intercept: Double) extends Serializable {} diff --git a/src/main/scala/org/apache/flink/ml/common/WithParameters.scala b/src/main/scala/org/apache/flink/ml/common/WithParameters.scala new file mode 100644 index 0000000000000..ded8d36c13a6e --- /dev/null +++ b/src/main/scala/org/apache/flink/ml/common/WithParameters.scala @@ -0,0 +1,26 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.flink.ml.common + +/** + * Adds a [[ParameterMap]] which can be used to store configuration values + */ +trait WithParameters { + val parameters = new ParameterMap +} diff --git a/src/main/scala/org/apache/flink/ml/math/BLAS.scala b/src/main/scala/org/apache/flink/ml/math/BLAS.scala new file mode 100644 index 0000000000000..df1ec3ae508a6 --- /dev/null +++ b/src/main/scala/org/apache/flink/ml/math/BLAS.scala @@ -0,0 +1,291 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.ml.math + +import com.github.fommil.netlib.{BLAS => NetlibBLAS, F2jBLAS} +import com.github.fommil.netlib.BLAS.{getInstance => NativeBLAS} + +/** + * BLAS routines for vectors and matrices. + * + * Original code from the Apache Spark project: + * http://git.io/vfZUe + */ +object BLAS extends Serializable { + + @transient private var _f2jBLAS: NetlibBLAS = _ + @transient private var _nativeBLAS: NetlibBLAS = _ + + // For level-1 routines, we use Java implementation. + private def f2jBLAS: NetlibBLAS = { + if (_f2jBLAS == null) { + _f2jBLAS = new F2jBLAS + } + _f2jBLAS + } + + /** + * y += a * x + */ + def axpy(a: Double, x: Vector, y: Vector): Unit = { + require(x.size == y.size) + y match { + case dy: DenseVector => + x match { + case sx: SparseVector => + axpy(a, sx, dy) + case dx: DenseVector => + axpy(a, dx, dy) + case _ => + throw new UnsupportedOperationException( + s"axpy doesn't support x type ${x.getClass}.") + } + case _ => + throw new IllegalArgumentException( + s"axpy only supports adding to a dense vector but got type ${y.getClass}.") + } + } + + /** + * y += a * x + */ + private def axpy(a: Double, x: DenseVector, y: DenseVector): Unit = { + val n = x.size + f2jBLAS.daxpy(n, a, x.data, 1, y.data, 1) + } + + /** + * y += a * x + */ + private def axpy(a: Double, x: SparseVector, y: DenseVector): Unit = { + val xValues = x.data + val xIndices = x.indices + val yValues = y.data + val nnz = xIndices.length + + if (a == 1.0) { + var k = 0 + while (k < nnz) { + yValues(xIndices(k)) += xValues(k) + k += 1 + } + } else { + var k = 0 + while (k < nnz) { + yValues(xIndices(k)) += a * xValues(k) + k += 1 + } + } + } + + /** + * dot(x, y) + */ + def dot(x: Vector, y: Vector): Double = { + require(x.size == y.size, + "BLAS.dot(x: Vector, y:Vector) was given Vectors with non-matching sizes:" + + " x.size = " + x.size + ", y.size = " + y.size) + (x, y) match { + case (dx: DenseVector, dy: DenseVector) => + dot(dx, dy) + case (sx: SparseVector, dy: DenseVector) => + dot(sx, dy) + case (dx: DenseVector, sy: SparseVector) => + dot(sy, dx) + case (sx: SparseVector, sy: SparseVector) => + dot(sx, sy) + case _ => + throw new IllegalArgumentException(s"dot doesn't support (${x.getClass}, ${y.getClass}).") + } + } + + /** + * 
dot(x, y) + */ + private def dot(x: DenseVector, y: DenseVector): Double = { + val n = x.size + f2jBLAS.ddot(n, x.data, 1, y.data, 1) + } + + /** + * dot(x, y) + */ + private def dot(x: SparseVector, y: DenseVector): Double = { + val xValues = x.data + val xIndices = x.indices + val yValues = y.data + val nnz = xIndices.length + + var sum = 0.0 + var k = 0 + while (k < nnz) { + sum += xValues(k) * yValues(xIndices(k)) + k += 1 + } + sum + } + + /** + * dot(x, y) + */ + private def dot(x: SparseVector, y: SparseVector): Double = { + val xValues = x.data + val xIndices = x.indices + val yValues = y.data + val yIndices = y.indices + val nnzx = xIndices.length + val nnzy = yIndices.length + + var kx = 0 + var ky = 0 + var sum = 0.0 + // y catching x + while (kx < nnzx && ky < nnzy) { + val ix = xIndices(kx) + while (ky < nnzy && yIndices(ky) < ix) { + ky += 1 + } + if (ky < nnzy && yIndices(ky) == ix) { + sum += xValues(kx) * yValues(ky) + ky += 1 + } + kx += 1 + } + sum + } + + /** + * y = x + */ + def copy(x: Vector, y: Vector): Unit = { + val n = y.size + require(x.size == n) + y match { + case dy: DenseVector => + x match { + case sx: SparseVector => + val sxIndices = sx.indices + val sxValues = sx.data + val dyValues = dy.data + val nnz = sxIndices.length + + var i = 0 + var k = 0 + while (k < nnz) { + val j = sxIndices(k) + while (i < j) { + dyValues(i) = 0.0 + i += 1 + } + dyValues(i) = sxValues(k) + i += 1 + k += 1 + } + while (i < n) { + dyValues(i) = 0.0 + i += 1 + } + case dx: DenseVector => + Array.copy(dx.data, 0, dy.data, 0, n) + } + case _ => + throw new IllegalArgumentException(s"y must be dense in copy but got ${y.getClass}") + } + } + + /** + * x = a * x + */ + def scal(a: Double, x: Vector): Unit = { + x match { + case sx: SparseVector => + f2jBLAS.dscal(sx.data.length, a, sx.data, 1) + case dx: DenseVector => + f2jBLAS.dscal(dx.data.length, a, dx.data, 1) + case _ => + throw new IllegalArgumentException(s"scal doesn't support vector type ${x.getClass}.") + } + } + + // For level-3 routines, we use the native BLAS. + private def nativeBLAS: NetlibBLAS = { + if (_nativeBLAS == null) { + _nativeBLAS = NativeBLAS + } + _nativeBLAS + } + + /** + * A := alpha * x * x^T^ + A + * @param alpha a real scalar that will be multiplied to x * x^T^. + * @param x the vector x that contains the n elements. + * @param A the symmetric matrix A. Size of n x n. + */ + def syr(alpha: Double, x: Vector, A: DenseMatrix) { + val mA = A.numRows + val nA = A.numCols + require(mA == nA, s"A is not a square matrix (and hence is not symmetric). A: $mA x $nA") + require(mA == x.size, s"The size of x doesn't match the rank of A. 
A: $mA x $nA, x: ${x.size}") + + x match { + case dv: DenseVector => syr(alpha, dv, A) + case sv: SparseVector => syr(alpha, sv, A) + case _ => + throw new IllegalArgumentException(s"syr doesn't support vector type ${x.getClass}.") + } + } + + private def syr(alpha: Double, x: DenseVector, A: DenseMatrix) { + val nA = A.numRows + val mA = A.numCols + + nativeBLAS.dsyr("U", x.size, alpha, x.data, 1, A.data, nA) + + // Fill lower triangular part of A + var i = 0 + while (i < mA) { + var j = i + 1 + while (j < nA) { + A(j, i) = A(i, j) + j += 1 + } + i += 1 + } + } + + private def syr(alpha: Double, x: SparseVector, A: DenseMatrix) { + val mA = A.numCols + val xIndices = x.indices + val xValues = x.data + val nnz = xValues.length + val Avalues = A.data + + var i = 0 + while (i < nnz) { + val multiplier = alpha * xValues(i) + val offset = xIndices(i) * mA + var j = 0 + while (j < nnz) { + Avalues(xIndices(j) + offset) += multiplier * xValues(j) + j += 1 + } + i += 1 + } + } +} diff --git a/src/main/scala/org/apache/flink/ml/math/Breeze.scala b/src/main/scala/org/apache/flink/ml/math/Breeze.scala new file mode 100644 index 0000000000000..91caf1dab9bda --- /dev/null +++ b/src/main/scala/org/apache/flink/ml/math/Breeze.scala @@ -0,0 +1,88 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.ml.math + +import breeze.linalg.{Matrix => BreezeMatrix, DenseMatrix => BreezeDenseMatrix, +CSCMatrix => BreezeCSCMatrix, DenseVector => BreezeDenseVector, SparseVector => BreezeSparseVector, +Vector => BreezeVector} + +/** This class contains convenience function to wrap a matrix/vector into a breeze matrix/vector + * and to unwrap it again. 
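+ *
+ * A small usage sketch (the value names are illustrative):
+ * {{{
+ *   import org.apache.flink.ml.math.Breeze._
+ *
+ *   val flinkVector  = DenseVector(1.0, 2.0, 3.0)
+ *   val breezeVector = flinkVector.asBreeze                  // wrap as a Breeze vector
+ *   val roundTrip    = breezeVector.fromBreeze[DenseVector]  // and unwrap it again
+ * }}}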
+ * + */ +object Breeze { + + implicit class Matrix2BreezeConverter(matrix: Matrix) { + def asBreeze: BreezeMatrix[Double] = { + matrix match { + case dense: DenseMatrix => + new BreezeDenseMatrix[Double]( + dense.numRows, + dense.numCols, + dense.data) + + case sparse: SparseMatrix => + new BreezeCSCMatrix[Double]( + sparse.data, + sparse.numRows, + sparse.numCols, + sparse.colPtrs, + sparse.rowIndices + ) + } + } + } + + implicit class Breeze2MatrixConverter(matrix: BreezeMatrix[Double]) { + def fromBreeze: Matrix = { + matrix match { + case dense: BreezeDenseMatrix[Double] => + new DenseMatrix(dense.rows, dense.cols, dense.data) + + case sparse: BreezeCSCMatrix[Double] => + new SparseMatrix(sparse.rows, sparse.cols, sparse.rowIndices, sparse.colPtrs, sparse.data) + } + } + } + + implicit class BreezeArrayConverter[T](array: Array[T]) { + def asBreeze: BreezeDenseVector[T] = { + new BreezeDenseVector[T](array) + } + } + + implicit class Breeze2VectorConverter(vector: BreezeVector[Double]) { + def fromBreeze[T <: Vector: BreezeVectorConverter]: T = { + val converter = implicitly[BreezeVectorConverter[T]] + converter.convert(vector) + } + } + + implicit class Vector2BreezeConverter(vector: Vector) { + def asBreeze: BreezeVector[Double] = { + vector match { + case dense: DenseVector => + new breeze.linalg.DenseVector(dense.data) + + case sparse: SparseVector => + new BreezeSparseVector(sparse.indices, sparse.data, sparse.size) + } + } + } +} diff --git a/src/main/scala/org/apache/flink/ml/math/BreezeVectorConverter.scala b/src/main/scala/org/apache/flink/ml/math/BreezeVectorConverter.scala new file mode 100644 index 0000000000000..d82cfa5f2a8ab --- /dev/null +++ b/src/main/scala/org/apache/flink/ml/math/BreezeVectorConverter.scala @@ -0,0 +1,34 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.ml.math + +import breeze.linalg.{Vector => BreezeVector} + +/** Type class which allows the conversion from Breeze vectors to Flink vectors + * + * @tparam T Resulting type of the conversion, subtype of [[Vector]] + */ +trait BreezeVectorConverter[T <: Vector] extends Serializable { + /** Converts a Breeze vector into a Flink vector of type T + * + * @param vector Breeze vector + * @return Flink vector of type T + */ + def convert(vector: BreezeVector[Double]): T +} diff --git a/src/main/scala/org/apache/flink/ml/math/DenseMatrix.scala b/src/main/scala/org/apache/flink/ml/math/DenseMatrix.scala new file mode 100644 index 0000000000000..d66ee262e8e6f --- /dev/null +++ b/src/main/scala/org/apache/flink/ml/math/DenseMatrix.scala @@ -0,0 +1,191 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.ml.math + +/** + * Dense matrix implementation of [[Matrix]]. Stores data in column major order in a continuous + * double array. + * + * @param numRows Number of rows + * @param numCols Number of columns + * @param data Array of matrix elements in column major order + */ +case class DenseMatrix(numRows: Int, numCols: Int, data: Array[Double]) + extends Matrix with Serializable { + + import DenseMatrix._ + + require(numRows * numCols == data.length, s"The number of values ${data.length} does " + + s"not correspond to its dimensions ($numRows, $numCols).") + + /** + * Element wise access function + * + * @param row row index + * @param col column index + * @return matrix entry at (row, col) + */ + override def apply(row: Int, col: Int): Double = { + val index = locate(row, col) + + data(index) + } + + override def toString: String = { + val result = StringBuilder.newBuilder + result.append(s"DenseMatrix($numRows, $numCols)\n") + + val columnsFieldWidths = for(row <- 0 until math.min(numRows, MAX_ROWS)) yield { + var column = 0 + var maxFieldWidth = 0 + + while(column * maxFieldWidth < LINE_WIDTH && column < numCols) { + val fieldWidth = printEntry(row, column).length + 2 + + if(fieldWidth > maxFieldWidth) { + maxFieldWidth = fieldWidth + } + + if(column * maxFieldWidth < LINE_WIDTH) { + column += 1 + } + } + + (column, maxFieldWidth) + } + + val (columns, fieldWidths) = columnsFieldWidths.unzip + + val maxColumns = columns.min + val fieldWidth = fieldWidths.max + + for(row <- 0 until math.min(numRows, MAX_ROWS)) { + for(col <- 0 until maxColumns) { + val str = printEntry(row, col) + + result.append(" " * (fieldWidth - str.length) + str) + } + + if(maxColumns < numCols) { + result.append("...") + } + + result.append("\n") + } + + if(numRows > MAX_ROWS) { + result.append("...\n") + } + + result.toString() + } + + override def equals(obj: Any): Boolean = { + obj match { + case dense: DenseMatrix => + numRows == dense.numRows && numCols == dense.numCols && data.sameElements(dense.data) + case _ => false + } + } + + override def hashCode: Int = { + val hashCodes = List(numRows.hashCode(), numCols.hashCode(), java.util.Arrays.hashCode(data)) + + hashCodes.foldLeft(3){(left, right) => left * 41 + right} + } + + /** Element wise update function + * + * @param row row index + * @param col column index + * @param value value to set at (row, col) + */ + override def update(row: Int, col: Int, value: Double): Unit = { + val index = locate(row, col) + + data(index) = value + } + + /** Converts the DenseMatrix to a SparseMatrix + * + * @return SparseMatrix build from all the non-null values + */ + def toSparseMatrix: SparseMatrix = { + val entries = for(row <- 0 until numRows; col <- 0 until numCols) yield { + (row, col, apply(row, col)) + } + + SparseMatrix.fromCOO(numRows, numCols, 
entries.filter(_._3 != 0)) + } + + /** Calculates the linear index of the respective matrix entry + * + * @param row row index + * @param col column index + * @return the index of the value according to the row and index + */ + private def locate(row: Int, col: Int): Int = { + require(0 <= row && row < numRows && 0 <= col && col < numCols, + (row, col) + " not in [0, " + numRows + ") x [0, " + numCols + ")") + + row + col * numRows + } + + /** Converts the entry at (row, col) to string + * + * @param row row index + * @param col column index + * @return Takes the value according to the row and index and convert it to string + */ + private def printEntry(row: Int, col: Int): String = { + val index = locate(row, col) + + data(index).toString + } + + /** Copies the matrix instance + * + * @return Copy of itself + */ + override def copy: DenseMatrix = { + new DenseMatrix(numRows, numCols, data.clone) + } +} + +object DenseMatrix { + + val LINE_WIDTH = 100 + val MAX_ROWS = 50 + + def apply(numRows: Int, numCols: Int, values: Array[Int]): DenseMatrix = { + new DenseMatrix(numRows, numCols, values.map(_.toDouble)) + } + + def apply(numRows: Int, numCols: Int, values: Double*): DenseMatrix = { + new DenseMatrix(numRows, numCols, values.toArray) + } + + def zeros(numRows: Int, numCols: Int): DenseMatrix = { + new DenseMatrix(numRows, numCols, Array.fill(numRows * numCols)(0.0)) + } + + def eye(numRows: Int, numCols: Int): DenseMatrix = { + new DenseMatrix(numRows, numCols, Array.fill(numRows * numCols)(1.0)) + } +} diff --git a/src/main/scala/org/apache/flink/ml/math/DenseVector.scala b/src/main/scala/org/apache/flink/ml/math/DenseVector.scala new file mode 100644 index 0000000000000..cc3d2859e592d --- /dev/null +++ b/src/main/scala/org/apache/flink/ml/math/DenseVector.scala @@ -0,0 +1,187 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.ml.math + +import breeze.linalg.{SparseVector => BreezeSparseVector, DenseVector => BreezeDenseVector, Vector => BreezeVector} + +/** + * Dense vector implementation of [[Vector]]. The data is represented in a continuous array of + * doubles. 
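+ *
+ * For example, construction via the companion object with element-wise access and update:
+ * {{{
+ *   val v = DenseVector(1.0, 2.0, 3.0)
+ *   v(1)        // 2.0
+ *   v(1) = 5.0  // in-place update
+ * }}}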
+ * + * @param data Array of doubles to store the vector elements + */ +case class DenseVector(data: Array[Double]) extends Vector with Serializable { + + /** + * Number of elements in a vector + * @return the number of the elements in the vector + */ + override def size: Int = { + data.length + } + + /** + * Element wise access function + * + * @param index index of the accessed element + * @return element at the given index + */ + override def apply(index: Int): Double = { + require(0 <= index && index < data.length, index + " not in [0, " + data.length + ")") + data(index) + } + + override def toString: String = { + s"DenseVector(${data.mkString(", ")})" + } + + override def equals(obj: Any): Boolean = { + obj match { + case dense: DenseVector => data.length == dense.data.length && data.sameElements(dense.data) + case _ => false + } + } + + override def hashCode: Int = { + java.util.Arrays.hashCode(data) + } + + /** + * Copies the vector instance + * + * @return Copy of the vector instance + */ + override def copy: DenseVector = { + DenseVector(data.clone()) + } + + /** Updates the element at the given index with the provided value + * + * @param index Index whose value is updated. + * @param value The value used to update the index. + */ + override def update(index: Int, value: Double): Unit = { + require(0 <= index && index < data.length, index + " not in [0, " + data.length + ")") + + data(index) = value + } + + /** Returns the dot product of the recipient and the argument + * + * @param other a Vector + * @return a scalar double of dot product + */ + override def dot(other: Vector): Double = { + require(size == other.size, "The size of vector must be equal.") + + other match { + case SparseVector(_, otherIndices, otherData) => + otherIndices.zipWithIndex.map { + case (idx, sparseIdx) => data(idx) * otherData(sparseIdx) + }.sum + case _ => (0 until size).map(i => data(i) * other(i)).sum + } + } + + /** Returns the outer product (a.k.a. Kronecker product) of `this` + * with `other`. The result will given in [[org.apache.flink.ml.math.SparseMatrix]] + * representation if `other` is sparse and as [[org.apache.flink.ml.math.DenseMatrix]] otherwise. 
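+ *
+ * For example, a small self-outer product where entry (i, j) equals this(i) * other(j):
+ * {{{
+ *   val v = DenseVector(1.0, 2.0)
+ *   v outer v
+ *   //  1.0  2.0
+ *   //  2.0  4.0
+ * }}}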
+ * + * @param other a Vector + * @return the [[org.apache.flink.ml.math.Matrix]] which equals the outer product of `this` + * with `other.` + */ + override def outer(other: Vector): Matrix = { + val numRows = size + val numCols = other.size + + other match { + case sv: SparseVector => + val entries = for { + i <- 0 until numRows + (j, k) <- sv.indices.zipWithIndex + value = this(i) * sv.data(k) + if value != 0 + } yield (i, j, value) + + SparseMatrix.fromCOO(numRows, numCols, entries) + case _ => + val values = for { + i <- 0 until numRows + j <- 0 until numCols + } yield this(i) * other(j) + + DenseMatrix(numRows, numCols, values.toArray) + } + } + + /** Magnitude of a vector + * + * @return The length of the vector + */ + override def magnitude: Double = { + math.sqrt(data.map(x => x * x).sum) + } + + /** Convert to a [[SparseVector]] + * + * @return Creates a SparseVector from the DenseVector + */ + def toSparseVector: SparseVector = { + val nonZero = (0 until size).zip(data).filter(_._2 != 0) + + SparseVector.fromCOO(size, nonZero) + } +} + +object DenseVector { + + def apply(values: Double*): DenseVector = { + new DenseVector(values.toArray) + } + + def apply(values: Array[Int]): DenseVector = { + new DenseVector(values.map(_.toDouble)) + } + + def zeros(size: Int): DenseVector = { + init(size, 0.0) + } + + def eye(size: Int): DenseVector = { + init(size, 1.0) + } + + def init(size: Int, value: Double): DenseVector = { + new DenseVector(Array.fill(size)(value)) + } + + /** BreezeVectorConverter implementation for [[org.apache.flink.ml.math.DenseVector]] + * + * This allows to convert Breeze vectors into [[DenseVector]]. + */ + implicit val denseVectorConverter = new BreezeVectorConverter[DenseVector] { + override def convert(vector: BreezeVector[Double]): DenseVector = { + vector match { + case dense: BreezeDenseVector[Double] => new DenseVector(dense.data) + case sparse: BreezeSparseVector[Double] => new DenseVector(sparse.toDenseVector.data) + } + } + } +} diff --git a/src/main/scala/org/apache/flink/ml/math/Matrix.scala b/src/main/scala/org/apache/flink/ml/math/Matrix.scala new file mode 100644 index 0000000000000..788eeae5cd7a2 --- /dev/null +++ b/src/main/scala/org/apache/flink/ml/math/Matrix.scala @@ -0,0 +1,69 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.flink.ml.math + +/** Base trait for a matrix representation + * + */ +trait Matrix { + + /** Number of rows + * + * @return number of rows in the matrix + */ + def numRows: Int + + /** Number of columns + * + * @return number of columns in the matrix + */ + def numCols: Int + + /** Element wise access function + * + * @param row row index + * @param col column index + * @return matrix entry at (row, col) + */ + def apply(row: Int, col: Int): Double + + /** Element wise update function + * + * @param row row index + * @param col column index + * @param value value to set at (row, col) + */ + def update(row: Int, col: Int, value: Double): Unit + + /** Copies the matrix instance + * + * @return Copy of itself + */ + def copy: Matrix + + def equalsMatrix(matrix: Matrix): Boolean = { + if(numRows == matrix.numRows && numCols == matrix.numCols) { + val coordinates = for(row <- 0 until numRows; col <- 0 until numCols) yield (row, col) + coordinates forall { case(row, col) => this.apply(row, col) == matrix(row, col)} + } else { + false + } + } + +} diff --git a/src/main/scala/org/apache/flink/ml/math/SparseMatrix.scala b/src/main/scala/org/apache/flink/ml/math/SparseMatrix.scala new file mode 100644 index 0000000000000..7d53e85f42cd3 --- /dev/null +++ b/src/main/scala/org/apache/flink/ml/math/SparseMatrix.scala @@ -0,0 +1,267 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.ml.math + +import scala.util.Sorting + +/** Sparse matrix using the compressed sparse column (CSC) representation. + * + * More details concerning the compressed sparse column (CSC) representation can be found + * [http://en.wikipedia.org/wiki/Sparse_matrix#Compressed_sparse_column_.28CSC_or_CCS.29]. 
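+ *
+ * For example, the 2 x 3 matrix with the non-zero entries (0,0) -> 1.0, (1,0) -> 2.0 and
+ * (1,2) -> 3.0 is stored as
+ * {{{
+ *   rowIndices = [0, 1, 1]
+ *   colPtrs    = [0, 2, 2, 3]
+ *   data       = [1.0, 2.0, 3.0]
+ * }}}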
+ * + * @param numRows Number of rows + * @param numCols Number of columns + * @param rowIndices Array containing the row indices of non-zero entries + * @param colPtrs Array containing the starting offsets in data for each column + * @param data Array containing the non-zero entries in column-major order + */ +class SparseMatrix( + val numRows: Int, + val numCols: Int, + val rowIndices: Array[Int], + val colPtrs: Array[Int], + val data: Array[Double]) + extends Matrix + with Serializable { + + /** Element wise access function + * + * @param row row index + * @param col column index + * @return matrix entry at (row, col) + */ + override def apply(row: Int, col: Int): Double = { + val index = locate(row, col) + + if(index < 0){ + 0 + } else { + data(index) + } + } + + def toDenseMatrix: DenseMatrix = { + val result = DenseMatrix.zeros(numRows, numCols) + + for(row <- 0 until numRows; col <- 0 until numCols) { + result(row, col) = apply(row, col) + } + + result + } + + /** Element wise update function + * + * @param row row index + * @param col column index + * @param value value to set at (row, col) + */ + override def update(row: Int, col: Int, value: Double): Unit = { + val index = locate(row, col) + + if(index < 0) { + throw new IllegalArgumentException("Cannot update zero value of sparse matrix at index " + + s"($row, $col)") + } else { + data(index) = value + } + } + + override def toString: String = { + val result = StringBuilder.newBuilder + + result.append(s"SparseMatrix($numRows, $numCols)\n") + + var columnIndex = 0 + + val fieldWidth = math.max(numRows, numCols).toString.length + val valueFieldWidth = data.map(_.toString.length).max + 2 + + for(index <- 0 until colPtrs.last) { + while(colPtrs(columnIndex + 1) <= index){ + columnIndex += 1 + } + + val rowStr = rowIndices(index).toString + val columnStr = columnIndex.toString + val valueStr = data(index).toString + + result.append("(" + " " * (fieldWidth - rowStr.length) + rowStr + "," + + " " * (fieldWidth - columnStr.length) + columnStr + ")") + result.append(" " * (valueFieldWidth - valueStr.length) + valueStr) + result.append("\n") + } + + result.toString + } + + override def equals(obj: Any): Boolean = { + obj match { + case sm: SparseMatrix if numRows == sm.numRows && numCols == sm.numCols => + rowIndices.sameElements(sm.rowIndices) && colPtrs.sameElements(sm.colPtrs) && + data.sameElements(sm.data) + case _ => false + } + } + + override def hashCode: Int = { + val hashCodes = List(numRows.hashCode(), numCols.hashCode(), + java.util.Arrays.hashCode(rowIndices), java.util.Arrays.hashCode(colPtrs), + java.util.Arrays.hashCode(data)) + + hashCodes.foldLeft(5){(left, right) => left * 41 + right} + } + + private def locate(row: Int, col: Int): Int = { + require(0 <= row && row < numRows && 0 <= col && col < numCols, + (row, col) + " not in [0, " + numRows + ") x [0, " + numCols + ")") + + val startIndex = colPtrs(col) + val endIndex = colPtrs(col + 1) + + java.util.Arrays.binarySearch(rowIndices, startIndex, endIndex, row) + } + + /** Copies the matrix instance + * + * @return Copy of itself + */ + override def copy: SparseMatrix = { + new SparseMatrix(numRows, numCols, rowIndices.clone, colPtrs.clone(), data.clone) + } +} + +object SparseMatrix{ + + /** Constructs a sparse matrix from a coordinate list (COO) representation where each entry + * is stored as a tuple of (rowIndex, columnIndex, value). 
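+ *
+ * E.g. the matrix used as an example in the class documentation above can be built with
+ * {{{
+ *   val m = SparseMatrix.fromCOO(2, 3, (0, 0, 1.0), (1, 0, 2.0), (1, 2, 3.0))
+ * }}}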
+ * + * @param numRows Number of rows + * @param numCols Number of columns + * @param entries Data entries in the matrix + * @return Newly constructed sparse matrix + */ + def fromCOO(numRows: Int, numCols: Int, entries: (Int, Int, Double)*): SparseMatrix = { + fromCOO(numRows, numCols, entries) + } + + /** Constructs a sparse matrix from a coordinate list (COO) representation where each entry + * is stored as a tuple of (rowIndex, columnIndex, value). + * + * @param numRows Number of rows + * @param numCols Number of columns + * @param entries Data entries in the matrix + * @return Newly constructed sparse matrix + */ + def fromCOO(numRows: Int, numCols: Int, entries: Iterable[(Int, Int, Double)]): SparseMatrix = { + val entryArray = entries.toArray + + entryArray.foreach{ case (row, col, _) => + require(0 <= row && row < numRows && 0 <= col && col <= numCols, + (row, col) + " not in [0, " + numRows + ") x [0, " + numCols + ")") + } + + val COOOrdering = new Ordering[(Int, Int, Double)] { + override def compare(x: (Int, Int, Double), y: (Int, Int, Double)): Int = { + if(x._2 < y._2) { + -1 + } else if(x._2 > y._2) { + 1 + } else { + x._1 - y._1 + } + } + } + + Sorting.quickSort(entryArray)(COOOrdering) + + val nnz = entryArray.length + + val data = new Array[Double](nnz) + val rowIndices = new Array[Int](nnz) + val colPtrs = new Array[Int](numCols + 1) + + var (lastRow, lastCol, lastValue) = entryArray(0) + + rowIndices(0) = lastRow + data(0) = lastValue + + var i = 1 + var lastDataIndex = 0 + + while(i < nnz) { + val (curRow, curCol, curValue) = entryArray(i) + + if(lastRow == curRow && lastCol == curCol) { + // add values with identical coordinates + data(lastDataIndex) += curValue + } else { + lastDataIndex += 1 + data(lastDataIndex) = curValue + rowIndices(lastDataIndex) = curRow + lastRow = curRow + } + + while(lastCol < curCol) { + lastCol += 1 + colPtrs(lastCol) = lastDataIndex + } + + i += 1 + } + + lastDataIndex += 1 + while(lastCol < numCols) { + colPtrs(lastCol + 1) = lastDataIndex + lastCol += 1 + } + + val prunedRowIndices = if(lastDataIndex < nnz) { + val prunedArray = new Array[Int](lastDataIndex) + rowIndices.copyToArray(prunedArray) + prunedArray + } else { + rowIndices + } + + val prunedData = if(lastDataIndex < nnz) { + val prunedArray = new Array[Double](lastDataIndex) + data.copyToArray(prunedArray) + prunedArray + } else { + data + } + + new SparseMatrix(numRows, numCols, prunedRowIndices, colPtrs, prunedData) + } + + /** Convenience method to convert a single tuple with an integer value into a SparseMatrix. + * The problem is that providing a single tuple to the fromCOO method, the Scala type inference + * cannot infer that the tuple has to be of type (Int, Int, Double) because of the overloading + * with the Iterable type. + * + * @param numRows Number of rows + * @param numCols Number of columns + * @param entry Data entries in the matrix + * @return Newly constructed sparse matrix + */ + def fromCOO(numRows: Int, numCols: Int, entry: (Int, Int, Int)): SparseMatrix = { + fromCOO(numRows, numCols, (entry._1, entry._2, entry._3.toDouble)) + } +} diff --git a/src/main/scala/org/apache/flink/ml/math/SparseVector.scala b/src/main/scala/org/apache/flink/ml/math/SparseVector.scala new file mode 100644 index 0000000000000..7350a45c46144 --- /dev/null +++ b/src/main/scala/org/apache/flink/ml/math/SparseVector.scala @@ -0,0 +1,283 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.ml.math + +import breeze.linalg.{SparseVector => BreezeSparseVector, DenseVector => BreezeDenseVector, Vector => BreezeVector} + +import scala.util.Sorting + +/** Sparse vector implementation storing the data in two arrays. One index contains the sorted + * indices of the non-zero vector entries and the other the corresponding vector entries + */ +case class SparseVector(size: Int, indices: Array[Int], data: Array[Double]) + extends Vector with Serializable { + + /** Updates the element at the given index with the provided value + * + * @param index Index whose value is updated. + * @param value The value used to update the index. + */ + override def update(index: Int, value: Double): Unit = { + val resolvedIndex = locate(index) + + if (resolvedIndex < 0) { + throw new IllegalArgumentException("Cannot update zero value of sparse vector at " + + s"index $index") + } else { + data(resolvedIndex) = value + } + } + + /** Copies the vector instance + * + * @return Copy of the [[SparseVector]] instance + */ + override def copy: SparseVector = { + new SparseVector(size, indices.clone, data.clone) + } + + /** Returns the dot product of the recipient and the argument + * + * @param other a Vector + * @return a scalar double of dot product + */ + override def dot(other: Vector): Double = { + require(size == other.size, "The size of vector must be equal.") + other match { + case DenseVector(otherData) => + indices.zipWithIndex.map { case (sparseIdx, idx) => data(idx) * otherData(sparseIdx) }.sum + case SparseVector(_, otherIndices, otherData) => + var left = 0 + var right = 0 + var result = 0.0 + + while (left < indices.length && right < otherIndices.length) { + if (indices(left) < otherIndices(right)) { + left += 1 + } else if (otherIndices(right) < indices(left)) { + right += 1 + } else { + result += data(left) * otherData(right) + left += 1 + right += 1 + } + } + result + } + } + + /** Returns the outer product (a.k.a. Kronecker product) of `this` with `other`. The result is + * given in [[SparseMatrix]] representation. 
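+ *
+ * Only index pairs where both operands have non-zero entries contribute, so the result stays
+ * sparse, e.g.:
+ * {{{
+ *   val a = SparseVector.fromCOO(3, (0, 2.0))
+ *   val b = SparseVector.fromCOO(4, (2, 3.0))
+ *   a outer b   // 3 x 4 SparseMatrix with the single entry (0, 2) -> 6.0
+ * }}}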
+ * + * @param other a [[Vector]] + * @return the [[SparseMatrix]] which equals the outer product of `this` with `other.` + */ + override def outer(other: Vector): SparseMatrix = { + val numRows = size + val numCols = other.size + + val entries = other match { + case sv: SparseVector => + for { + (i, k) <- indices.zipWithIndex + (j, l) <- sv.indices.zipWithIndex + value = data(k) * sv.data(l) + if value != 0 + } yield (i, j, value) + case _ => + for { + (i, k) <- indices.zipWithIndex + j <- 0 until numCols + value = data(k) * other(j) + if value != 0 + } yield (i, j, value) + } + + SparseMatrix.fromCOO(numRows, numCols, entries) + } + + + /** Magnitude of a vector + * + * @return The length of the vector + */ + override def magnitude: Double = math.sqrt(data.map(x => x * x).sum) + + /** Element wise access function + * + * * @param index index of the accessed element + * @return element with index + */ + override def apply(index: Int): Double = { + val resolvedIndex = locate(index) + + if(resolvedIndex < 0) { + 0 + } else { + data(resolvedIndex) + } + } + + /** Converts the [[SparseVector]] to a [[DenseVector]] + * + * @return The DenseVector out of the SparseVector + */ + def toDenseVector: DenseVector = { + val denseVector = DenseVector.zeros(size) + + for(index <- 0 until size) { + denseVector(index) = this(index) + } + + denseVector + } + + override def equals(obj: Any): Boolean = { + obj match { + case sv: SparseVector if size == sv.size => + indices.sameElements(sv.indices) && data.sameElements(sv.data) + case _ => false + } + } + + override def hashCode: Int = { + val hashCodes = List(size.hashCode, java.util.Arrays.hashCode(indices), + java.util.Arrays.hashCode(data)) + + hashCodes.foldLeft(3){ (left, right) => left * 41 + right} + } + + override def toString: String = { + val entries = indices.zip(data).mkString(", ") + "SparseVector(" + entries + ")" + } + + private def locate(index: Int): Int = { + require(0 <= index && index < size, index + " not in [0, " + size + ")") + + java.util.Arrays.binarySearch(indices, 0, indices.length, index) + } +} + +object SparseVector { + + /** Constructs a sparse vector from a coordinate list (COO) representation where each entry + * is stored as a tuple of (index, value). + * + * @param size The number of elements in the vector + * @param entries The values in the vector + * @return a new [[SparseVector]] + */ + def fromCOO(size: Int, entries: (Int, Double)*): SparseVector = { + fromCOO(size, entries) + } + + /** Constructs a sparse vector from a coordinate list (COO) representation where each entry + * is stored as a tuple of (index, value). 
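A quick sanity check for the sparse outer product and magnitude defined above (illustrative values only):

import org.apache.flink.ml.math.SparseVector

val u = SparseVector.fromCOO(2, (0, 2.0))
val w = SparseVector.fromCOO(3, (1, 3.0), (2, 1.0))

val op  = u.outer(w)   // 2 x 3 SparseMatrix with entries (0, 1) -> 6.0 and (0, 2) -> 2.0
val len = w.magnitude  // math.sqrt(3.0 * 3.0 + 1.0 * 1.0) = sqrt(10)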
+ * + * @param size The number of elements in the vector + * @param entries An iterator supplying the values in the vector + * @return a new [[SparseVector]] + */ + def fromCOO(size: Int, entries: Iterable[(Int, Double)]): SparseVector = { + val entryArray = entries.toArray + + entryArray.foreach { case (index, _) => + require(0 <= index && index < size, index + " not in [0, " + size + ")") + } + + val COOOrdering = new Ordering[(Int, Double)] { + override def compare(x: (Int, Double), y: (Int, Double)): Int = { + x._1 - y._1 + } + } + + Sorting.quickSort(entryArray)(COOOrdering) + + // calculate size of the array + val arraySize = entryArray.foldLeft((-1, 0)){ case ((lastIndex, numRows), (index, _)) => + if(lastIndex == index) { + (lastIndex, numRows) + } else { + (index, numRows + 1) + } + }._2 + + val indices = new Array[Int](arraySize) + val data = new Array[Double](arraySize) + + val (index, value) = entryArray(0) + + indices(0) = index + data(0) = value + + var i = 1 + var lastIndex = indices(0) + var lastDataIndex = 0 + + while(i < entryArray.length) { + val (curIndex, curValue) = entryArray(i) + + if(curIndex == lastIndex) { + data(lastDataIndex) += curValue + } else { + lastDataIndex += 1 + data(lastDataIndex) = curValue + indices(lastDataIndex) = curIndex + lastIndex = curIndex + } + + i += 1 + } + + new SparseVector(size, indices, data) + } + + /** Convenience method to be able to instantiate a SparseVector with a single element. The Scala + * type inference mechanism cannot infer that the second tuple value has to be of type Double + * if only a single tuple is provided. + * + * @param size The number of elements in the vector + * @param entry The value in the vector + * @return a new [[SparseVector]] + */ + def fromCOO(size: Int, entry: (Int, Int)): SparseVector = { + fromCOO(size, (entry._1, entry._2.toDouble)) + } + + /** BreezeVectorConverter implementation for [[org.apache.flink.ml.math.SparseVector]] + * + * This allows to convert Breeze vectors into [[SparseVector]] + */ + implicit val sparseVectorConverter = new BreezeVectorConverter[SparseVector] { + override def convert(vector: BreezeVector[Double]): SparseVector = { + vector match { + case dense: BreezeDenseVector[Double] => + SparseVector.fromCOO( + dense.length, + dense.iterator.toIterable) + case sparse: BreezeSparseVector[Double] => + new SparseVector( + sparse.length, + sparse.index.take(sparse.used), + sparse.data.take(sparse.used)) + } + } + } +} diff --git a/src/main/scala/org/apache/flink/ml/math/Vector.scala b/src/main/scala/org/apache/flink/ml/math/Vector.scala new file mode 100644 index 0000000000000..8d26c28e9d5ab --- /dev/null +++ b/src/main/scala/org/apache/flink/ml/math/Vector.scala @@ -0,0 +1,103 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
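The duplicate handling in the construction loop above can be seen with a tiny example (a sketch):

import org.apache.flink.ml.math.SparseVector

// entries hitting the same index are accumulated; untouched indices remain implicit zeros
val v = SparseVector.fromCOO(4, (0, 1.0), (0, 2.5), (2, 4.0))
// v(0) == 3.5, v(1) == 0.0, v(2) == 4.0, v(3) == 0.0

// a single integer-valued entry resolves via the (Int, Int) convenience overload
val one = SparseVector.fromCOO(3, (1, 7))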
+ */ + +package org.apache.flink.ml.math + +import breeze.linalg.{SparseVector => BreezeSparseVector, DenseVector => BreezeDenseVector, Vector => BreezeVector} + +/** Base trait for Vectors + * + */ +trait Vector extends Serializable { + + /** Number of elements in a vector + * + * @return The number of elements of the vector + */ + def size: Int + + /** Element wise access function + * + * @param index index of the accessed element + * @return value of the associated with the index + */ + def apply(index: Int): Double + + /** Updates the element at the given index with the provided value + * + * @param index The index of the element to be updated + * @param value The new value + */ + def update(index: Int, value: Double): Unit + + /** Copies the vector instance + * + * @return Copy of the vector instance + */ + def copy: Vector + + /** Returns the dot product of the recipient and the argument + * + * @param other a Vector + * @return a scalar double of dot product + */ + def dot(other: Vector): Double + + /** Returns the outer product of the recipient and the argument + * + * @param other a Vector + * @return a matrix + */ + def outer(other: Vector): Matrix + + /** Magnitude of a vector + * + * @return The length of the vector + */ + def magnitude: Double + + def equalsVector(vector: Vector): Boolean = { + if(size == vector.size) { + (0 until size) forall { idx => + this(idx) == vector(idx) + } + } else { + false + } + } +} + +object Vector{ + /** BreezeVectorConverter implementation for [[Vector]] + * + * This allows to convert Breeze vectors into [[Vector]]. + */ + implicit val vectorConverter = new BreezeVectorConverter[Vector] { + override def convert(vector: BreezeVector[Double]): Vector = { + vector match { + case dense: BreezeDenseVector[Double] => new DenseVector(dense.data) + + case sparse: BreezeSparseVector[Double] => + new SparseVector( + sparse.length, + sparse.index.take(sparse.used), + sparse.data.take(sparse.used)) + } + } + } +} diff --git a/src/main/scala/org/apache/flink/ml/math/VectorBuilder.scala b/src/main/scala/org/apache/flink/ml/math/VectorBuilder.scala new file mode 100644 index 0000000000000..19f8fc2723354 --- /dev/null +++ b/src/main/scala/org/apache/flink/ml/math/VectorBuilder.scala @@ -0,0 +1,57 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
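equalsVector above compares element-wise across vector implementations, unlike the structural equals of the concrete case classes; a small sketch:

import org.apache.flink.ml.math.{DenseVector, SparseVector}

val sparse = SparseVector.fromCOO(3, (1, 5.0))
val dense  = DenseVector(0.0, 5.0, 0.0)

sparse.equalsVector(dense) // true: same size and the same value at every index
sparse.equals(dense)       // false: SparseVector.equals only matches other SparseVectors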
+ */ + +package org.apache.flink.ml.math + +/** Type class to allow the vector construction from different data types + * + * @tparam T Subtype of [[Vector]] + */ +trait VectorBuilder[T <: Vector] extends Serializable { + /** Builds a [[Vector]] of type T from a List[Double] + * + * @param data Input data where the index denotes the resulting index of the vector + * @return A vector of type T + */ + def build(data: List[Double]): T +} + +object VectorBuilder{ + + /** Type class implementation for [[org.apache.flink.ml.math.DenseVector]] */ + implicit val denseVectorBuilder = new VectorBuilder[DenseVector] { + override def build(data: List[Double]): DenseVector = { + new DenseVector(data.toArray) + } + } + + /** Type class implementation for [[org.apache.flink.ml.math.SparseVector]] */ + implicit val sparseVectorBuilder = new VectorBuilder[SparseVector] { + override def build(data: List[Double]): SparseVector = { + // Enrich elements with explicit indices and filter out zero entries + SparseVector.fromCOO(data.length, data.indices.zip(data).filter(_._2 != 0.0)) + } + } + + /** Type class implementation for [[Vector]] */ + implicit val vectorBuilder = new VectorBuilder[Vector] { + override def build(data: List[Double]): Vector = { + new DenseVector(data.toArray) + } + } +} diff --git a/src/main/scala/org/apache/flink/ml/math/distributed/DistributedMatrix.scala b/src/main/scala/org/apache/flink/ml/math/distributed/DistributedMatrix.scala new file mode 100644 index 0000000000000..d7dab8f89ffb4 --- /dev/null +++ b/src/main/scala/org/apache/flink/ml/math/distributed/DistributedMatrix.scala @@ -0,0 +1,39 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.ml.math.distributed + +/** Common trait used by distributed data structures representing a matrix. */ +trait DistributedMatrix { + /** Returns number of rows in matrix. + * + * @return Number of rows + */ + def numRows: Int + + /** Returns number of columns in matrix. + * + * @return Number of columns + */ + def numCols: Int +} + +object DistributedMatrix { + type MatrixColIndex = Int + type MatrixRowIndex = Int +} diff --git a/src/main/scala/org/apache/flink/ml/math/distributed/DistributedRowMatrix.scala b/src/main/scala/org/apache/flink/ml/math/distributed/DistributedRowMatrix.scala new file mode 100644 index 0000000000000..342191a30207a --- /dev/null +++ b/src/main/scala/org/apache/flink/ml/math/distributed/DistributedRowMatrix.scala @@ -0,0 +1,172 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
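The VectorBuilder type class above is typically consumed through a context bound. A minimal sketch, where the generic helper buildVector is hypothetical and only shows how the implicit builder is resolved:

import org.apache.flink.ml.math.{DenseVector, SparseVector, Vector, VectorBuilder}

// hypothetical helper: picks the builder for the requested vector type at compile time
def buildVector[T <: Vector: VectorBuilder](data: List[Double]): T =
  implicitly[VectorBuilder[T]].build(data)

val dense  = buildVector[DenseVector](List(1.0, 0.0, 2.0))
val sparse = buildVector[SparseVector](List(1.0, 0.0, 2.0)) // zero entries are dropped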
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.ml.math.distributed + +import org.apache.flink.api.scala._ +import org.apache.flink.ml.math._ +import org.apache.flink.ml.math.Breeze._ +import org.apache.flink.ml.math.distributed.DistributedMatrix._ + +/** Represents distributed row-major matrix. + * + * @param data [[DataSet]] which contains [[IndexedRow]]s + * @param numRows Number of rows + * @param numCols Number of columns + */ +class DistributedRowMatrix( + val data: DataSet[IndexedRow], + val numRows: Int, + val numCols: Int +) extends DistributedMatrix { + + /** Collects the data in the form of a sequence of coordinates associated with their values. + * This operation immediately triggers program execution. + * + * @return Returns the matrix in the sparse coordinate format + */ + def toCOO: Seq[(MatrixRowIndex, MatrixColIndex, Double)] = { + val localRows = data.collect() + + for { + IndexedRow(rowIndex, vector) <- localRows + (columnIndex, value) <- vector + } yield (rowIndex, columnIndex, value) + } + + /** Collects the data in the form of a SparseMatrix. This operation immediately triggers program + * execution. + * + * @return Returns the matrix as a local [[SparseMatrix]] + */ + def toLocalSparseMatrix: SparseMatrix = { + val localMatrix = SparseMatrix.fromCOO(this.numRows, this.numCols, this.toCOO) + require(localMatrix.numRows == this.numRows) + require(localMatrix.numCols == this.numCols) + + localMatrix + } + + // TODO: convert to dense representation on the distributed matrix and collect it afterward + /** Collects the data in the form of a DenseMatrix. This operation immediately triggers program + * execution. + * + * @return Returns the matrix as a [[DenseMatrix]] + */ + def toLocalDenseMatrix: DenseMatrix = this.toLocalSparseMatrix.toDenseMatrix + + /** Applies a high-order function to couple of rows. + * + * @param func a function to be applied + * @param other a [[DistributedRowMatrix]] to apply the function together + * @return Applies the function and returns a new [[DistributedRowMatrix]] + */ + def byRowOperation( + func: (Vector, Vector) => Vector, + other: DistributedRowMatrix + ): DistributedRowMatrix = { + val otherData = other.data + require(this.numCols == other.numCols) + require(this.numRows == other.numRows) + + val result = this.data + .fullOuterJoin(otherData) + .where("rowIndex") + .equalTo("rowIndex")( + (left: IndexedRow, right: IndexedRow) => { + val row1 = Option(left) match { + case Some(row: IndexedRow) => row + case None => + IndexedRow(right.rowIndex, SparseVector.fromCOO(right.values.size, List((0, 0.0)))) + } + val row2 = Option(right) match { + case Some(row: IndexedRow) => row + case None => + IndexedRow(left.rowIndex, SparseVector.fromCOO(left.values.size, List((0, 0.0)))) + } + IndexedRow(row1.rowIndex, func(row1.values, row2.values)) + } + ) + new DistributedRowMatrix(result, numRows, numCols) + } + + /** Adds this matrix to another matrix. 
+ * + * @param other a [[DistributedRowMatrix]] to be added + * @return [[DistributedRowMatrix]] representing the two matrices added. + */ + def add(other: DistributedRowMatrix): DistributedRowMatrix = { + val addFunction = (x: Vector, y: Vector) => (x.asBreeze + y.asBreeze).fromBreeze + this.byRowOperation(addFunction, other) + } + + /** Subtracts another matrix from this matrix. + * + * @param other a [[DistributedRowMatrix]] to be subtracted from this matrix + * @return [[DistributedRowMatrix]] representing the original matrix subtracted by the supplied + * matrix. + */ + def subtract(other: DistributedRowMatrix): DistributedRowMatrix = { + val subFunction = (x: Vector, y: Vector) => (x.asBreeze - y.asBreeze).fromBreeze + this.byRowOperation(subFunction, other) + } +} + +object DistributedRowMatrix { + + /** Builds a [[DistributedRowMatrix]] from a [[DataSet]] in COO. + * + * @param data [[DataSet]] which contains matrix elements in the form of + * (row index, column index, value) + * @param numRows Number of rows + * @param numCols Number of columns + * @param isSorted If false, sorts the row to properly build the matrix representation. + * If already sorted, set this parameter to true to skip sorting. + * @return the [[DistributedRowMatrix]] build from the original coordinate matrix + */ + def fromCOO(data: DataSet[(MatrixRowIndex, MatrixColIndex, Double)], + numRows: Int, + numCols: Int, + isSorted: Boolean = false + ): DistributedRowMatrix = { + val vectorData: DataSet[(MatrixRowIndex, SparseVector)] = data + .groupBy(0) + .reduceGroup(sparseRow => { + require(sparseRow.nonEmpty) + val sortedRow = + if (isSorted) { + sparseRow.toList + } else { + sparseRow.toList.sortBy(row => row._2) + } + val (indices, values) = sortedRow.map(x => (x._2, x._3)).unzip + (sortedRow.head._1, SparseVector(numCols, indices.toArray, values.toArray)) + }) + + val zippedData = vectorData.map(x => IndexedRow(x._1.toInt, x._2)) + + new DistributedRowMatrix(zippedData, numRows, numCols) + } +} + +/** Represents a row in row-major matrix. */ +case class IndexedRow(rowIndex: MatrixRowIndex, values: Vector) extends Ordered[IndexedRow] { + def compare(other: IndexedRow) = this.rowIndex.compare(other.rowIndex) + + override def toString: String = s"($rowIndex, ${values.toString})" +} diff --git a/src/main/scala/org/apache/flink/ml/math/package.scala b/src/main/scala/org/apache/flink/ml/math/package.scala new file mode 100644 index 0000000000000..cb5dee4f48c56 --- /dev/null +++ b/src/main/scala/org/apache/flink/ml/math/package.scala @@ -0,0 +1,110 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.ml + +/** + * Convenience methods to handle Flink's [[org.apache.flink.ml.math.Matrix]] and [[Vector]] + * abstraction. 
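A hedged end-to-end sketch of the DistributedRowMatrix API above, assuming a local ExecutionEnvironment and an in-memory COO collection:

import org.apache.flink.api.scala._
import org.apache.flink.ml.math.distributed.DistributedRowMatrix

val env = ExecutionEnvironment.getExecutionEnvironment

// (rowIndex, columnIndex, value) triples
val coo = env.fromCollection(Seq((0, 0, 1.0), (0, 2, 3.0), (1, 1, 2.0)))
val a = DistributedRowMatrix.fromCOO(coo, numRows = 2, numCols = 3)

val doubled = a.add(a)                  // element-wise, row by row
val local = doubled.toLocalSparseMatrix // triggers execution and collects the result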
+ */ +package object math { + implicit class RichMatrix(matrix: Matrix) extends Iterable[(Int, Int, Double)] { + + override def iterator: Iterator[(Int, Int, Double)] = { + new Iterator[(Int, Int, Double)] { + var index = 0 + + override def hasNext: Boolean = { + index < matrix.numRows * matrix.numCols + } + + override def next(): (Int, Int, Double) = { + val row = index % matrix.numRows + val column = index / matrix.numRows + + index += 1 + + (row, column, matrix(row, column)) + } + } + } + + def valueIterator: Iterator[Double] = { + val it = iterator + + new Iterator[Double] { + override def hasNext: Boolean = it.hasNext + + override def next(): Double = it.next._3 + } + } + + } + + implicit class RichVector(vector: Vector) extends Iterable[(Int, Double)] { + + override def iterator: Iterator[(Int, Double)] = { + new Iterator[(Int, Double)] { + var index = 0 + + override def hasNext: Boolean = { + index < vector.size + } + + override def next(): (Int, Double) = { + val resultIndex = index + + index += 1 + + (resultIndex, vector(resultIndex)) + } + } + } + + def valueIterator: Iterator[Double] = { + val it = iterator + + new Iterator[Double] { + override def hasNext: Boolean = it.hasNext + + override def next(): Double = it.next._2 + } + } + } + + /** Stores the vector values in a dense array + * + * @param vector Subtype of [[Vector]] + * @return Array containing the vector values + */ + def vector2Array(vector: Vector): Array[Double] = { + vector match { + case dense: DenseVector => dense.data.clone + + case sparse: SparseVector => { + val result = new Array[Double](sparse.size) + + for ((index, value) <- sparse) { + result(index) = value + } + + result + } + } + } +} diff --git a/src/main/scala/org/apache/flink/ml/metrics/distances/ChebyshevDistanceMetric.scala b/src/main/scala/org/apache/flink/ml/metrics/distances/ChebyshevDistanceMetric.scala new file mode 100644 index 0000000000000..51d490bc5e01d --- /dev/null +++ b/src/main/scala/org/apache/flink/ml/metrics/distances/ChebyshevDistanceMetric.scala @@ -0,0 +1,37 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.ml.metrics.distances + +import org.apache.flink.ml.math.Vector + +/** This class implements a Chebyshev distance metric. The class calculates the distance between + * the given vectors by finding the maximum difference between each coordinate. 
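The implicit wrappers above make any Matrix or Vector iterable; a short sketch of the vector side:

import org.apache.flink.ml.math._

val vec = DenseVector(1.0, 0.0, 3.0)

val nonZero = vec.iterator.filter(_._2 != 0.0).toList        // List((0, 1.0), (2, 3.0))
val total   = vec.valueIterator.sum                           // 4.0
val asArray = vector2Array(SparseVector.fromCOO(3, (2, 3.0))) // Array(0.0, 0.0, 3.0)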
+ * + * @see http://en.wikipedia.org/wiki/Chebyshev_distance + */ +class ChebyshevDistanceMetric extends DistanceMetric { + override def distance(a: Vector, b: Vector): Double = { + checkValidArguments(a, b) + (0 until a.size).map(i => math.abs(a(i) - b(i))).max + } +} + +object ChebyshevDistanceMetric { + def apply() = new ChebyshevDistanceMetric() +} diff --git a/src/main/scala/org/apache/flink/ml/metrics/distances/CosineDistanceMetric.scala b/src/main/scala/org/apache/flink/ml/metrics/distances/CosineDistanceMetric.scala new file mode 100644 index 0000000000000..e7ad8a1aa72a3 --- /dev/null +++ b/src/main/scala/org/apache/flink/ml/metrics/distances/CosineDistanceMetric.scala @@ -0,0 +1,45 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.ml.metrics.distances + +import org.apache.flink.ml.math.Vector + +/** This class implements a cosine distance metric. The class calculates the distance between + * the given vectors by dividing the dot product of two vectors by the product of their lengths. + * We convert the result of division to a usable distance. So, 1 - cos(angle) is actually returned. + * + * @see http://en.wikipedia.org/wiki/Cosine_similarity + */ +class CosineDistanceMetric extends DistanceMetric { + override def distance(a: Vector, b: Vector): Double = { + checkValidArguments(a, b) + + val dotProd: Double = a.dot(b) + val denominator: Double = a.magnitude * b.magnitude + if (dotProd == 0 && denominator == 0) { + 0 + } else { + 1 - dotProd / denominator + } + } +} + +object CosineDistanceMetric { + def apply() = new CosineDistanceMetric() +} diff --git a/src/main/scala/org/apache/flink/ml/metrics/distances/DistanceMetric.scala b/src/main/scala/org/apache/flink/ml/metrics/distances/DistanceMetric.scala new file mode 100644 index 0000000000000..b421976ae9f72 --- /dev/null +++ b/src/main/scala/org/apache/flink/ml/metrics/distances/DistanceMetric.scala @@ -0,0 +1,37 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.flink.ml.metrics.distances + +import org.apache.flink.ml.math.Vector + +/** DistanceMeasure interface is used for object which determines distance between two points. + */ +trait DistanceMetric extends Serializable { + /** Returns the distance between the arguments. + * + * @param a a Vector defining a multi-dimensional point in some space + * @param b a Vector defining a multi-dimensional point in some space + * @return a scalar double of the distance + */ + def distance(a: Vector, b: Vector): Double + + protected def checkValidArguments(a: Vector, b: Vector) = { + require(a.size == b.size, "The each size of vectors must be same to calculate distance.") + } +} diff --git a/src/main/scala/org/apache/flink/ml/metrics/distances/EuclideanDistanceMetric.scala b/src/main/scala/org/apache/flink/ml/metrics/distances/EuclideanDistanceMetric.scala new file mode 100644 index 0000000000000..838f6d1e75a16 --- /dev/null +++ b/src/main/scala/org/apache/flink/ml/metrics/distances/EuclideanDistanceMetric.scala @@ -0,0 +1,41 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.ml.metrics.distances + +import org.apache.flink.ml.math.Vector + +/** This class implements a Euclidean distance metric. The metric calculates the distance between + * the given two vectors by summing the square root of the squared differences between + * each coordinate. + * + * http://en.wikipedia.org/wiki/Euclidean_distance + * + * If you don't care about the true distance and only need for comparison, + * [[SquaredEuclideanDistanceMetric]] will be faster because it doesn't calculate the actual + * square root of the distances. + * + * @see http://en.wikipedia.org/wiki/Euclidean_distance + */ +class EuclideanDistanceMetric extends SquaredEuclideanDistanceMetric { + override def distance(a: Vector, b: Vector): Double = math.sqrt(super.distance(a, b)) +} + +object EuclideanDistanceMetric { + def apply() = new EuclideanDistanceMetric() +} diff --git a/src/main/scala/org/apache/flink/ml/metrics/distances/ManhattanDistanceMetric.scala b/src/main/scala/org/apache/flink/ml/metrics/distances/ManhattanDistanceMetric.scala new file mode 100644 index 0000000000000..0228448aa80cd --- /dev/null +++ b/src/main/scala/org/apache/flink/ml/metrics/distances/ManhattanDistanceMetric.scala @@ -0,0 +1,37 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
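Implementing the trait above only requires distance (plus the shared argument check). As an illustration, a Canberra distance metric, which is not defined in this patch, could look like this:

import org.apache.flink.ml.math.Vector

class CanberraDistanceMetric extends DistanceMetric {
  override def distance(a: Vector, b: Vector): Double = {
    checkValidArguments(a, b)
    (0 until a.size).map { i =>
      val denom = math.abs(a(i)) + math.abs(b(i))
      if (denom == 0.0) 0.0 else math.abs(a(i) - b(i)) / denom
    }.sum
  }
}

object CanberraDistanceMetric {
  def apply() = new CanberraDistanceMetric()
}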
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.ml.metrics.distances + +import org.apache.flink.ml.math.Vector + +/** This class implements a Manhattan distance metric. The class calculates the distance between + * the given vectors by summing the differences between each coordinate. + * + * @see http://en.wikipedia.org/wiki/Taxicab_geometry + */ +class ManhattanDistanceMetric extends DistanceMetric{ + override def distance(a: Vector, b: Vector): Double = { + checkValidArguments(a, b) + (0 until a.size).map(i => math.abs(a(i) - b(i))).sum + } +} + +object ManhattanDistanceMetric { + def apply() = new ManhattanDistanceMetric() +} diff --git a/src/main/scala/org/apache/flink/ml/metrics/distances/MinkowskiDistanceMetric.scala b/src/main/scala/org/apache/flink/ml/metrics/distances/MinkowskiDistanceMetric.scala new file mode 100644 index 0000000000000..98912951dd0f8 --- /dev/null +++ b/src/main/scala/org/apache/flink/ml/metrics/distances/MinkowskiDistanceMetric.scala @@ -0,0 +1,41 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.ml.metrics.distances + +import org.apache.flink.ml.math.Vector + +/** This class implements a Minkowski distance metric. The metric is a generalization of + * L(p) distances: Euclidean distance and Manhattan distance. If you need for a special case of + * p = 1 or p = 2, use [[ManhattanDistanceMetric]], [[EuclideanDistanceMetric]]. This class is + * useful for high exponents. 
+ * + * @param p the norm exponent of space + * + * @see http://en.wikipedia.org/wiki/Minkowski_distance + */ +class MinkowskiDistanceMetric(val p: Double) extends DistanceMetric { + override def distance(a: Vector, b: Vector): Double = { + checkValidArguments(a, b) + math.pow((0 until a.size).map(i => math.pow(math.abs(a(i) - b(i)), p)).sum, 1 / p) + } +} + +object MinkowskiDistanceMetric { + def apply(p: Double) = new MinkowskiDistanceMetric(p) +} diff --git a/src/main/scala/org/apache/flink/ml/metrics/distances/SquaredEuclideanDistanceMetric.scala b/src/main/scala/org/apache/flink/ml/metrics/distances/SquaredEuclideanDistanceMetric.scala new file mode 100644 index 0000000000000..0a468b8cfa3b9 --- /dev/null +++ b/src/main/scala/org/apache/flink/ml/metrics/distances/SquaredEuclideanDistanceMetric.scala @@ -0,0 +1,37 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.ml.metrics.distances + +import org.apache.flink.ml.math.Vector + +/** This class is like [[EuclideanDistanceMetric]] but it does not take the square root. + * + * The value calculated by this class is not exact Euclidean distance, but it saves on computation + * when you need the value for only comparison. + */ +class SquaredEuclideanDistanceMetric extends DistanceMetric { + override def distance(a: Vector, b: Vector): Double = { + checkValidArguments(a, b) + (0 until a.size).map(i => math.pow(a(i) - b(i), 2)).sum + } +} + +object SquaredEuclideanDistanceMetric { + def apply() = new SquaredEuclideanDistanceMetric() +} diff --git a/src/main/scala/org/apache/flink/ml/metrics/distances/TanimotoDistanceMetric.scala b/src/main/scala/org/apache/flink/ml/metrics/distances/TanimotoDistanceMetric.scala new file mode 100644 index 0000000000000..8db9612020807 --- /dev/null +++ b/src/main/scala/org/apache/flink/ml/metrics/distances/TanimotoDistanceMetric.scala @@ -0,0 +1,40 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
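A few hand-checkable values for the metrics introduced so far (illustrative only):

import org.apache.flink.ml.math.DenseVector

val x = DenseVector(0.0, 0.0)
val y = DenseVector(3.0, 4.0)

EuclideanDistanceMetric().distance(x, y)        // 5.0
SquaredEuclideanDistanceMetric().distance(x, y) // 25.0
ManhattanDistanceMetric().distance(x, y)        // 7.0
ChebyshevDistanceMetric().distance(x, y)        // 4.0
MinkowskiDistanceMetric(3.0).distance(x, y)     // (27 + 64) ^ (1/3) ≈ 4.498

CosineDistanceMetric().distance(DenseVector(1.0, 0.0), DenseVector(0.0, 1.0)) // 1.0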
+ */ + +package org.apache.flink.ml.metrics.distances + +import org.apache.flink.ml.math.Vector + +/** This class implements a Tanimoto distance metric. The class calculates the distance between + * the given vectors. The vectors are assumed as bit-wise vectors. We convert the result of + * division to a usable distance. So, 1 - similarity is actually returned. + * + * @see http://en.wikipedia.org/wiki/Jaccard_index + */ +class TanimotoDistanceMetric extends DistanceMetric { + override def distance(a: Vector, b: Vector): Double = { + checkValidArguments(a, b) + + val dotProd: Double = a.dot(b) + 1 - dotProd / (a.magnitude * a.magnitude + b.magnitude * b.magnitude - dotProd) + } +} + +object TanimotoDistanceMetric { + def apply() = new TanimotoDistanceMetric() +} diff --git a/src/main/scala/org/apache/flink/ml/nn/KNN.scala b/src/main/scala/org/apache/flink/ml/nn/KNN.scala new file mode 100644 index 0000000000000..f5c9c9c8cb90b --- /dev/null +++ b/src/main/scala/org/apache/flink/ml/nn/KNN.scala @@ -0,0 +1,359 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.ml.nn + +import org.apache.flink.api.common.operators.Order +import org.apache.flink.api.common.operators.base.CrossOperatorBase.CrossHint +import org.apache.flink.api.common.typeinfo.TypeInformation +import org.apache.flink.api.scala._ +import org.apache.flink.api.scala.utils._ +import org.apache.flink.ml.common._ +import org.apache.flink.ml.math.{DenseVector, Vector => FlinkVector} +import org.apache.flink.ml.metrics.distances._ +import org.apache.flink.ml.pipeline.{FitOperation, PredictDataSetOperation, Predictor} +import org.apache.flink.util.Collector + +import scala.collection.immutable.Vector +import scala.collection.mutable +import scala.collection.mutable.ArrayBuffer +import scala.reflect.ClassTag + +/** Implements a `k`-nearest neighbor join. + * + * Calculates the `k`-nearest neighbor points in the training set for each point in the test set. + * + * @example + * {{{ + * val trainingDS: DataSet[Vector] = ... + * val testingDS: DataSet[Vector] = ... + * + * val knn = KNN() + * .setK(10) + * .setBlocks(5) + * .setDistanceMetric(EuclideanDistanceMetric()) + * + * knn.fit(trainingDS) + * + * val predictionDS: DataSet[(Vector, Array[Vector])] = knn.predict(testingDS) + * }}} + * + * =Parameters= + * + * - [[org.apache.flink.ml.nn.KNN.K]] + * Sets the K which is the number of selected points as neighbors. (Default value: '''5''') + * + * - [[org.apache.flink.ml.nn.KNN.DistanceMetric]] + * Sets the distance metric we use to calculate the distance between two points. If no metric is + * specified, then [[org.apache.flink.ml.metrics.distances.EuclideanDistanceMetric]] is used. 
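A worked value for the Tanimoto metric above, treating the vectors as bit sets (a sketch):

import org.apache.flink.ml.math.DenseVector
import org.apache.flink.ml.metrics.distances.TanimotoDistanceMetric

val p = DenseVector(1.0, 1.0, 0.0)
val q = DenseVector(1.0, 0.0, 1.0)

// dot = 1, |p|^2 = 2, |q|^2 = 2  =>  1 - 1 / (2 + 2 - 1) = 2/3
TanimotoDistanceMetric().distance(p, q) // ≈ 0.667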
+ * (Default value: '''EuclideanDistanceMetric()''') + * + * - [[org.apache.flink.ml.nn.KNN.Blocks]] + * Sets the number of blocks into which the input data will be split. This number should be set + * at least to the degree of parallelism. If no value is specified, then the parallelism of the + * input [[DataSet]] is used as the number of blocks. (Default value: '''None''') + * + * - [[org.apache.flink.ml.nn.KNN.UseQuadTree]] + * A boolean variable that whether or not to use a quadtree to partition the training set + * to potentially simplify the KNN search. If no value is specified, the code will + * automatically decide whether or not to use a quadtree. Use of a quadtree scales well + * with the number of training and testing points, though poorly with the dimension. + * (Default value: '''None''') + * + * - [[org.apache.flink.ml.nn.KNN.SizeHint]] + * Specifies whether the training set or test set is small to optimize the cross + * product operation needed for the KNN search. If the training set is small + * this should be `CrossHint.FIRST_IS_SMALL` and set to `CrossHint.SECOND_IS_SMALL` + * if the test set is small. + * (Default value: '''None''') + * + */ + +class KNN extends Predictor[KNN] { + + import KNN._ + + var trainingSet: Option[DataSet[Block[FlinkVector]]] = None + + /** Sets K + * + * @param k the number of selected points as neighbors + */ + def setK(k: Int): KNN = { + require(k > 0, "K must be positive.") + parameters.add(K, k) + this + } + + /** Sets the distance metric + * + * @param metric the distance metric to calculate distance between two points + */ + def setDistanceMetric(metric: DistanceMetric): KNN = { + parameters.add(DistanceMetric, metric) + this + } + + /** Sets the number of data blocks/partitions + * + * @param n the number of data blocks + */ + def setBlocks(n: Int): KNN = { + require(n > 0, "Number of blocks must be positive.") + parameters.add(Blocks, n) + this + } + + /** Sets the Boolean variable that decides whether to use the QuadTree or not */ + def setUseQuadTree(useQuadTree: Boolean): KNN = { + if (useQuadTree) { + require(parameters(DistanceMetric).isInstanceOf[SquaredEuclideanDistanceMetric] || + parameters(DistanceMetric).isInstanceOf[EuclideanDistanceMetric]) + } + parameters.add(UseQuadTree, useQuadTree) + this + } + + /** Parameter a user can specify if one of the training or test sets are small + * + * @param sizeHint cross hint tells the system which sizes to expect from the data sets + */ + def setSizeHint(sizeHint: CrossHint): KNN = { + parameters.add(SizeHint, sizeHint) + this + } + +} + +object KNN { + + case object K extends Parameter[Int] { + val defaultValue: Option[Int] = Some(5) + } + + case object DistanceMetric extends Parameter[DistanceMetric] { + val defaultValue: Option[DistanceMetric] = Some(EuclideanDistanceMetric()) + } + + case object Blocks extends Parameter[Int] { + val defaultValue: Option[Int] = None + } + + case object UseQuadTree extends Parameter[Boolean] { + val defaultValue: Option[Boolean] = None + } + + case object SizeHint extends Parameter[CrossHint] { + val defaultValue: Option[CrossHint] = None + } + + def apply(): KNN = { + new KNN() + } + + /** [[FitOperation]] which trains a KNN based on the given training data set. 
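A hedged configuration sketch for the predictor above; trainingDS and testingDS stand in for user-provided data sets, as in the class-level example:

import org.apache.flink.api.common.operators.base.CrossOperatorBase.CrossHint
import org.apache.flink.ml.metrics.distances.SquaredEuclideanDistanceMetric

val knn = KNN()
  .setK(3)
  .setBlocks(4)
  .setDistanceMetric(SquaredEuclideanDistanceMetric())
  .setUseQuadTree(true)
  .setSizeHint(CrossHint.FIRST_IS_SMALL)

knn.fit(trainingDS)
val neighbors: DataSet[(Vector, Array[Vector])] = knn.predict(testingDS)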
+ * + * @tparam T Subtype of [[org.apache.flink.ml.math.Vector]] + */ + implicit def fitKNN[T <: FlinkVector : TypeInformation] = new FitOperation[KNN, T] { + override def fit( + instance: KNN, + fitParameters: ParameterMap, + input: DataSet[T] + ): Unit = { + val resultParameters = instance.parameters ++ fitParameters + + require(resultParameters.get(K).isDefined, "K is needed for calculation") + + val blocks = resultParameters.get(Blocks).getOrElse(input.getParallelism) + val partitioner = FlinkMLTools.ModuloKeyPartitioner + val inputAsVector = input.asInstanceOf[DataSet[FlinkVector]] + + instance.trainingSet = Some(FlinkMLTools.block(inputAsVector, blocks, Some(partitioner))) + } + } + + /** [[PredictDataSetOperation]] which calculates k-nearest neighbors of the given testing data + * set. + * + * @tparam T Subtype of [[Vector]] + * @return The given testing data set with k-nearest neighbors + */ + implicit def predictValues[T <: FlinkVector : ClassTag : TypeInformation] = { + new PredictDataSetOperation[KNN, T, (FlinkVector, Array[FlinkVector])] { + override def predictDataSet( + instance: KNN, + predictParameters: ParameterMap, + input: DataSet[T] + ): DataSet[(FlinkVector, Array[FlinkVector])] = { + val resultParameters = instance.parameters ++ predictParameters + + instance.trainingSet match { + case Some(trainingSet) => + val k = resultParameters.get(K).get + val blocks = resultParameters.get(Blocks).getOrElse(input.getParallelism) + val metric = resultParameters.get(DistanceMetric).get + val partitioner = FlinkMLTools.ModuloKeyPartitioner + + // attach unique id for each data + val inputWithId: DataSet[(Long, T)] = input.zipWithUniqueId + + // split data into multiple blocks + val inputSplit = FlinkMLTools.block(inputWithId, blocks, Some(partitioner)) + + val sizeHint = resultParameters.get(SizeHint) + val crossTuned = sizeHint match { + case Some(hint) if hint == CrossHint.FIRST_IS_SMALL => + trainingSet.crossWithHuge(inputSplit) + case Some(hint) if hint == CrossHint.SECOND_IS_SMALL => + trainingSet.crossWithTiny(inputSplit) + case _ => trainingSet.cross(inputSplit) + } + + // join input and training set + val crossed = crossTuned.mapPartition { + (iter, out: Collector[(FlinkVector, FlinkVector, Long, Double)]) => { + for ((training, testing) <- iter) { + // use a quadtree if (4 ^ dim) * Ntest * log(Ntrain) + // < Ntest * Ntrain, and distance is Euclidean + val checkSize = math.log(4.0) * training.values.head.size + + math.log(math.log(training.values.length)) < math.log(training.values.length) + val checkMetric = metric match { + case _: EuclideanDistanceMetric => true + case _: SquaredEuclideanDistanceMetric => true + case _ => false + } + val useQuadTree = resultParameters.get(UseQuadTree) + .getOrElse(checkSize && checkMetric) + + if (useQuadTree) { + knnQueryWithQuadTree(training.values, testing.values, k, metric, out) + } else { + knnQueryBasic(training.values, testing.values, k, metric, out) + } + } + } + } + + // group by input vector id and pick k nearest neighbor for each group + val result = crossed.groupBy(2).sortGroup(3, Order.ASCENDING).reduceGroup { + (iter, out: Collector[(FlinkVector, Array[FlinkVector])]) => { + if (iter.hasNext) { + val head = iter.next() + val key = head._2 + val neighbors: ArrayBuffer[FlinkVector] = ArrayBuffer(head._1) + + for ((vector, _, _, _) <- iter.take(k - 1)) { + // we already took a first element + neighbors += vector + } + + out.collect(key, neighbors.toArray) + } + } + } + + result + case None => throw new RuntimeException("The 
KNN model has not been trained." + + "Call first fit before calling the predict operation.") + } + } + } + } + + private def knnQueryWithQuadTree[T <: FlinkVector]( + training: Vector[T], + testing: Vector[(Long, T)], + k: Int, + metric: DistanceMetric, + out: Collector[(FlinkVector, FlinkVector, Long, Double)] + ): Unit = { + // find a bounding box + val MinArr = Array.tabulate(training.head.size)(x => x) + val MaxArr = Array.tabulate(training.head.size)(x => x) + + val minVecTrain = MinArr.map(i => training.map(x => x(i)).min - 0.01) + val minVecTest = MinArr.map(i => testing.map(x => x._2(i)).min - 0.01) + val maxVecTrain = MaxArr.map(i => training.map(x => x(i)).max + 0.01) + val maxVecTest = MaxArr.map(i => testing.map(x => x._2(i)).max + 0.01) + + val MinVec = DenseVector(MinArr.map(i => math.min(minVecTrain(i), minVecTest(i)))) + val MaxVec = DenseVector(MinArr.map(i => math.max(maxVecTrain(i), maxVecTest(i)))) + + // default value of max elements/box is set to max(20,k) + val maxPerBox = math.max(k, 20) + val trainingQuadTree = new QuadTree(MinVec, MaxVec, metric, maxPerBox) + + val queue = mutable.PriorityQueue[(FlinkVector, FlinkVector, Long, Double)]()( + Ordering.by(_._4)) + + for (v <- training) { + trainingQuadTree.insert(v) + } + + for ((id, vector) <- testing) { + // Find siblings' objects and do local kNN there + val siblingObjects = trainingQuadTree.searchNeighborsSiblingQueue(vector) + + // do KNN query on siblingObjects and get max distance of kNN then rad is good choice + // for a neighborhood to do a refined local kNN search + val knnSiblings = siblingObjects.map(v => metric.distance(vector, v)).sortWith(_ < _).take(k) + + val rad = knnSiblings.last + val trainingFiltered = trainingQuadTree.searchNeighbors(vector, rad) + + for (b <- trainingFiltered) { + // (training vector, input vector, input key, distance) + queue.enqueue((b, vector, id, metric.distance(b, vector))) + if (queue.size > k) { + queue.dequeue() + } + } + + for (v <- queue) { + out.collect(v) + } + } + } + + private def knnQueryBasic[T <: FlinkVector]( + training: Vector[T], + testing: Vector[(Long, T)], + k: Int, + metric: DistanceMetric, + out: Collector[(FlinkVector, FlinkVector, Long, Double)] + ): Unit = { + val queue = mutable.PriorityQueue[(FlinkVector, FlinkVector, Long, Double)]()( + Ordering.by(_._4)) + + for ((id, vector) <- testing) { + for (b <- training) { + // (training vector, input vector, input key, distance) + queue.enqueue((b, vector, id, metric.distance(b, vector))) + if (queue.size > k) { + queue.dequeue() + } + } + + for (v <- queue) { + out.collect(v) + } + } + } + +} diff --git a/src/main/scala/org/apache/flink/ml/nn/QuadTree.scala b/src/main/scala/org/apache/flink/ml/nn/QuadTree.scala new file mode 100644 index 0000000000000..c44fe0796aee5 --- /dev/null +++ b/src/main/scala/org/apache/flink/ml/nn/QuadTree.scala @@ -0,0 +1,323 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
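The brute-force query above relies on a bounded max-heap: hold at most k + 1 candidates and always evict the farthest one. A standalone sketch of that pattern on hypothetical data:

import scala.collection.mutable

import org.apache.flink.ml.math.DenseVector
import org.apache.flink.ml.metrics.distances.EuclideanDistanceMetric

val metric = EuclideanDistanceMetric()
val query = DenseVector(0.0, 0.0)
val training = Seq(DenseVector(1.0, 0.0), DenseVector(5.0, 5.0), DenseVector(0.0, 2.0))
val k = 2

// the largest distance sits at the head, so dequeue() drops the current farthest candidate
val queue = mutable.PriorityQueue[(DenseVector, Double)]()(Ordering.by(_._2))
for (candidate <- training) {
  queue.enqueue((candidate, metric.distance(query, candidate)))
  if (queue.size > k) queue.dequeue()
}
// queue now holds (1.0, 0.0) and (0.0, 2.0), the two nearest points to the query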
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.ml.nn + +import org.apache.flink.ml.math.Breeze._ +import org.apache.flink.ml.math.Vector +import org.apache.flink.ml.metrics.distances._ + +import scala.annotation.tailrec +import scala.collection.mutable + +/** n-dimensional QuadTree data structure; partitions + * spatial data for faster queries (e.g. KNN query) + * The skeleton of the data structure was initially + * based off of the 2D Quadtree found here: + * http://www.cs.trinity.edu/~mlewis/CSCI1321-F11/Code/src/util/Quadtree.scala + * + * Many additional methods were added to the class both for + * efficient KNN queries and generalizing to n-dim. + * + * @param minVec vector of the corner of the bounding box with smallest coordinates + * @param maxVec vector of the corner of the bounding box with smallest coordinates + * @param distMetric metric, must be Euclidean or squareEuclidean + * @param maxPerBox threshold for number of points in each box before slitting a box + */ +class QuadTree( + minVec: Vector, + maxVec: Vector, + distMetric: DistanceMetric, + maxPerBox: Int) { + + class Node( + center: Vector, + width: Vector, + var children: Seq[Node]) { + + val nodeElements = new mutable.ListBuffer[Vector] + + /** for testing purposes only; used in QuadTreeSuite.scala + * + * @return center and width of the box + */ + def getCenterWidth(): (Vector, Vector) = (center, width) + + /** Tests whether the queryPoint is in the node, or a child of that node + * + * @param queryPoint a point to test + * @return whether the given point is in the node, or a child of this node + */ + def contains(queryPoint: Vector): Boolean = overlap(queryPoint, 0.0) + + /** Tests if queryPoint is within a radius of the node + * + * @param queryPoint a point to test + * @param radius radius of test area + * @return whether the given point is in the area + */ + def overlap(queryPoint: Vector, radius: Double): Boolean = { + (0 until queryPoint.size).forall { i => + (queryPoint(i) - radius < center(i) + width(i) / 2) && + (queryPoint(i) + radius > center(i) - width(i) / 2) + } + } + + /** Tests if queryPoint is near a node + * + * @param queryPoint a point to test + * @param radius radius of covered area + */ + def isNear(queryPoint: Vector, radius: Double): Boolean = minDist(queryPoint) < radius + + /** minDist is defined so that every point in the box has distance to queryPoint greater + * than minDist (minDist adopted from "Nearest Neighbors Queries" by N. Roussopoulos et al.) 
+ * + * @param queryPoint + */ + def minDist(queryPoint: Vector): Double = { + val minDist = (0 until queryPoint.size).map { i => + if (queryPoint(i) < center(i) - width(i) / 2) { + math.pow(queryPoint(i) - center(i) + width(i) / 2, 2) + } else if (queryPoint(i) > center(i) + width(i) / 2) { + math.pow(queryPoint(i) - center(i) - width(i) / 2, 2) + } else { + 0 + } + }.sum + + distMetric match { + case _: EuclideanDistanceMetric => math.sqrt(minDist) + case _: SquaredEuclideanDistanceMetric => minDist + case _ => throw new IllegalArgumentException(s" Error: metric must be" + + s" Euclidean or SquaredEuclidean!") + } + } + + /** Finds which child queryPoint lies in. node.children is a Seq[Node], and + * [[whichChild]] finds the appropriate index of that Seq. + * + * @param queryPoint + * @return + */ + def whichChild(queryPoint: Vector): Int = { + (0 until queryPoint.size).map { i => + if (queryPoint(i) > center(i)) { + scala.math.pow(2, queryPoint.size - 1 - i).toInt + } else { + 0 + } + }.sum + } + + /** Makes children nodes by partitioning the box into equal sub-boxes + * and adding a node for each sub-box + */ + def makeChildren() { + val centerClone = center.copy + val cPart = partitionBox(centerClone, width) + val mappedWidth = 0.5 * width.asBreeze + children = cPart.map(p => new Node(p, mappedWidth.fromBreeze, null)) + } + + /** Recursive function that partitions a n-dim box by taking the (n-1) dimensional + * plane through the center of the box keeping the n-th coordinate fixed, + * then shifting it in the n-th direction up and down + * and recursively applying partitionBox to the two shifted (n-1) dimensional planes. + * + * @param center the center of the box + * @param width a vector of lengths of each dimension of the box + * @return + */ + def partitionBox(center: Vector, width: Vector): Seq[Vector] = { + @tailrec + def partitionHelper(box: Seq[Vector], dim: Int): Seq[Vector] = { + if (dim >= width.size) { + box + } else { + val newBox = box.flatMap { vector => + val (up, down) = (vector.copy, vector) + up.update(dim, up(dim) - width(dim) / 4) + down.update(dim, down(dim) + width(dim) / 4) + + Seq(up, down) + } + partitionHelper(newBox, dim + 1) + } + } + partitionHelper(Seq(center), 0) + } + } + + + val root = new Node(((minVec.asBreeze + maxVec.asBreeze) * 0.5).fromBreeze, + (maxVec.asBreeze - minVec.asBreeze).fromBreeze, null) + + /** Prints tree for testing/debugging */ + def printTree(): Unit = { + def printTreeRecur(node: Node) { + if (node.children != null) { + for (c <- node.children) { + printTreeRecur(c) + } + } else { + println("printing tree: n.nodeElements " + node.nodeElements) + } + } + + printTreeRecur(root) + } + + /** Recursively adds an object to the tree + * + * @param queryPoint an object which is added + */ + def insert(queryPoint: Vector) = { + def insertRecur(queryPoint: Vector, node: Node): Unit = { + if (node.children == null) { + if (node.nodeElements.length < maxPerBox) { + node.nodeElements += queryPoint + } else { + node.makeChildren() + for (o <- node.nodeElements) { + insertRecur(o, node.children(node.whichChild(o))) + } + node.nodeElements.clear() + insertRecur(queryPoint, node.children(node.whichChild(queryPoint))) + } + } else { + insertRecur(queryPoint, node.children(node.whichChild(queryPoint))) + } + } + + insertRecur(queryPoint, root) + } + + /** Used to zoom in on a region near a test point for a fast KNN query. 
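A hedged usage sketch for the structure above: with maxPerBox = 2, inserting a third point triggers makeChildren and the points are pushed down into the quadrant leaves; the radius query at the end uses the searchNeighbors method defined further below.

import org.apache.flink.ml.math.DenseVector
import org.apache.flink.ml.metrics.distances.EuclideanDistanceMetric

val tree = new QuadTree(
  minVec = DenseVector(0.0, 0.0),
  maxVec = DenseVector(10.0, 10.0),
  distMetric = EuclideanDistanceMetric(),
  maxPerBox = 2)

Seq(DenseVector(1.0, 1.0), DenseVector(2.0, 2.0), DenseVector(9.0, 9.0)).foreach(tree.insert)

// only the bottom-left leaf is within radius 2.0 of (1.5, 1.5), so the result
// contains (1.0, 1.0) and (2.0, 2.0) but not (9.0, 9.0)
val near = tree.searchNeighbors(DenseVector(1.5, 1.5), 2.0)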
+ * This capability is used in the KNN query to find k "near" neighbors n_1,...,n_k, from + * which one computes the max distance D_s to queryPoint. D_s is then used during the + * kNN query to find all points within a radius D_s of queryPoint using searchNeighbors. + * To find the "near" neighbors, a min-heap is defined on the leaf nodes of the leaf + * nodes of the minimal bounding box of the queryPoint. The priority of a leaf node + * is an appropriate notion of the distance between the test point and the node, + * which is defined by minDist(queryPoint), + * + * @param queryPoint a test point for which the method finds the minimal bounding + * box that queryPoint lies in and returns elements in that boxes + * siblings' leaf nodes + */ + def searchNeighborsSiblingQueue(queryPoint: Vector): mutable.ListBuffer[Vector] = { + val ret = new mutable.ListBuffer[Vector] + // edge case when the main box has not been partitioned at all + if (root.children == null) { + root.nodeElements.clone() + } else { + val nodeQueue = new mutable.PriorityQueue[(Double, Node)]()(Ordering.by(x => x._1)) + searchRecurSiblingQueue(queryPoint, root, nodeQueue) + + var count = 0 + while (count < maxPerBox) { + val dq = nodeQueue.dequeue() + if (dq._2.nodeElements.nonEmpty) { + ret ++= dq._2.nodeElements + count += dq._2.nodeElements.length + } + } + ret + } + } + + /** + * + * @param queryPoint point under consideration + * @param node node that queryPoint lies in + * @param nodeQueue defined in searchSiblingQueue, this stores nodes based on their + * distance to node as defined by minDist + */ + private def searchRecurSiblingQueue( + queryPoint: Vector, + node: Node, + nodeQueue: mutable.PriorityQueue[(Double, Node)] + ): Unit = { + if (node.children != null) { + for (child <- node.children; if child.contains(queryPoint)) { + if (child.children == null) { + for (c <- node.children) { + minNodes(queryPoint, c, nodeQueue) + } + } else { + searchRecurSiblingQueue(queryPoint, child, nodeQueue) + } + } + } + } + + /** Goes down to minimal bounding box of queryPoint, and add elements to nodeQueue + * + * @param queryPoint point under consideration + * @param node node that queryPoint lies in + * @param nodeQueue [[mutable.PriorityQueue]] that stores all points in minimal bounding box + * of queryPoint + */ + private def minNodes( + queryPoint: Vector, + node: Node, + nodeQueue: mutable.PriorityQueue[(Double, Node)] + ): Unit = { + if (node.children == null) { + nodeQueue += ((-node.minDist(queryPoint), node)) + } else { + for (c <- node.children) { + minNodes(queryPoint, c, nodeQueue) + } + } + } + + /** Finds all objects within a neighborhood of queryPoint of a specified radius + * scope is modified from original 2D version in: + * http://www.cs.trinity.edu/~mlewis/CSCI1321-F11/Code/src/util/Quadtree.scala + * + * original version only looks in minimal box; for the KNN Query, we look at + * all nearby boxes. 
The radius is determined from searchNeighborsSiblingQueue
+    * by defining a min-heap on the leaf nodes
+    *
+    * @param queryPoint a point which is the center of the neighborhood
+    * @param radius radius of the neighborhood
+    * @return all points within the given radius of queryPoint
+    */
+  def searchNeighbors(queryPoint: Vector, radius: Double): mutable.ListBuffer[Vector] = {
+    def searchRecur(
+        queryPoint: Vector,
+        radius: Double,
+        node: Node,
+        ret: mutable.ListBuffer[Vector]
+      ): Unit = {
+      if (node.children == null) {
+        ret ++= node.nodeElements
+      } else {
+        for (child <- node.children; if child.isNear(queryPoint, radius)) {
+          searchRecur(queryPoint, radius, child, ret)
+        }
+      }
+    }
+
+    val ret = new mutable.ListBuffer[Vector]
+    searchRecur(queryPoint, radius, root, ret)
+    ret
+  }
+}
diff --git a/src/main/scala/org/apache/flink/ml/optimization/GradientDescent.scala b/src/main/scala/org/apache/flink/ml/optimization/GradientDescent.scala
new file mode 100644
index 0000000000000..7cbab585ff319
--- /dev/null
+++ b/src/main/scala/org/apache/flink/ml/optimization/GradientDescent.scala
@@ -0,0 +1,297 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.flink.ml.optimization
+
+import org.apache.flink.api.scala._
+import org.apache.flink.ml.common._
+import org.apache.flink.ml.math._
+import org.apache.flink.ml.optimization.IterativeSolver._
+import org.apache.flink.ml.optimization.LearningRateMethod.LearningRateMethodTrait
+import org.apache.flink.ml.optimization.Solver._
+import org.apache.flink.ml._
+
+/** Base class which performs Stochastic Gradient Descent optimization using mini batches.
+  *
+  * For each labeled vector in a mini batch the gradient is computed and added to a partial
+  * gradient. The partial gradients are then summed and divided by the size of the batches. The
+  * average gradient is then used to update the weight values, including regularization.
+  *
+  * At the moment, the whole partition is used for SGD, making it effectively a batch gradient
+  * descent. Once a sampling operator has been introduced, the algorithm can be optimized.
+  *
+  * The parameters to tune the algorithm are:
+  * [[Solver.LossFunction]] for the loss function to be used,
+  * [[Solver.RegularizationPenaltyValue]] for the regularization penalty,
+  * [[Solver.RegularizationConstant]] for the regularization parameter,
+  * [[IterativeSolver.Iterations]] for the maximum number of iterations,
+  * [[IterativeSolver.LearningRate]] for the learning rate used,
+  * [[IterativeSolver.ConvergenceThreshold]] when provided the algorithm will
+  * stop the iterations if the relative change in the value of the objective
+  * function between successive iterations is smaller than this value.
+ * [[IterativeSolver.LearningRateMethodValue]] determines functional form of + * effective learning rate. + */ +class GradientDescent extends IterativeSolver { + + /** Provides a solution for the given optimization problem + * + * @param data A Dataset of LabeledVector (label, features) pairs + * @param initialWeights The initial weights that will be optimized + * @return The weights, optimized for the provided data. + */ + override def optimize( + data: DataSet[LabeledVector], + initialWeights: Option[DataSet[WeightVector]]): DataSet[WeightVector] = { + + val numberOfIterations: Int = parameters(Iterations) + val convergenceThresholdOption: Option[Double] = parameters.get(ConvergenceThreshold) + val lossFunction = parameters(LossFunction) + val learningRate = parameters(LearningRate) + val regularizationPenalty = parameters(RegularizationPenaltyValue) + val regularizationConstant = parameters(RegularizationConstant) + val learningRateMethod = parameters(LearningRateMethodValue) + + // Initialize weights + val initialWeightsDS: DataSet[WeightVector] = createInitialWeightsDS(initialWeights, data) + + // Perform the iterations + convergenceThresholdOption match { + // No convergence criterion + case None => + optimizeWithoutConvergenceCriterion( + data, + initialWeightsDS, + numberOfIterations, + regularizationPenalty, + regularizationConstant, + learningRate, + lossFunction, + learningRateMethod) + case Some(convergence) => + optimizeWithConvergenceCriterion( + data, + initialWeightsDS, + numberOfIterations, + regularizationPenalty, + regularizationConstant, + learningRate, + convergence, + lossFunction, + learningRateMethod) + } + } + + def optimizeWithConvergenceCriterion( + dataPoints: DataSet[LabeledVector], + initialWeightsDS: DataSet[WeightVector], + numberOfIterations: Int, + regularizationPenalty: RegularizationPenalty, + regularizationConstant: Double, + learningRate: Double, + convergenceThreshold: Double, + lossFunction: LossFunction, + learningRateMethod: LearningRateMethodTrait) + : DataSet[WeightVector] = { + // We have to calculate for each weight vector the sum of squared residuals, + // and then sum them and apply regularization + val initialLossSumDS = calculateLoss(dataPoints, initialWeightsDS, lossFunction) + + // Combine weight vector with the current loss + val initialWeightsWithLossSum = initialWeightsDS.mapWithBcVariable(initialLossSumDS){ + (weights, loss) => (weights, loss) + } + + val resultWithLoss = initialWeightsWithLossSum.iterateWithTermination(numberOfIterations) { + weightsWithPreviousLossSum => + + // Extract weight vector and loss + val previousWeightsDS = weightsWithPreviousLossSum.map{_._1} + val previousLossSumDS = weightsWithPreviousLossSum.map{_._2} + + val currentWeightsDS = SGDStep( + dataPoints, + previousWeightsDS, + lossFunction, + regularizationPenalty, + regularizationConstant, + learningRate, + learningRateMethod) + + val currentLossSumDS = calculateLoss(dataPoints, currentWeightsDS, lossFunction) + + // Check if the relative change in the loss is smaller than the + // convergence threshold. If yes, then terminate i.e. 
return empty termination data set + val termination = previousLossSumDS.filterWithBcVariable(currentLossSumDS){ + (previousLoss, currentLoss) => { + if (previousLoss <= 0) { + false + } else { + scala.math.abs((previousLoss - currentLoss)/previousLoss) >= convergenceThreshold + } + } + } + + // Result for new iteration + (currentWeightsDS.mapWithBcVariable(currentLossSumDS)((w, l) => (w, l)), termination) + } + // Return just the weights + resultWithLoss.map{_._1} + } + + def optimizeWithoutConvergenceCriterion( + data: DataSet[LabeledVector], + initialWeightsDS: DataSet[WeightVector], + numberOfIterations: Int, + regularizationPenalty: RegularizationPenalty, + regularizationConstant: Double, + learningRate: Double, + lossFunction: LossFunction, + optimizationMethod: LearningRateMethodTrait) + : DataSet[WeightVector] = { + initialWeightsDS.iterate(numberOfIterations) { + weightVectorDS => { + SGDStep(data, + weightVectorDS, + lossFunction, + regularizationPenalty, + regularizationConstant, + learningRate, + optimizationMethod) + } + } + } + + /** Performs one iteration of Stochastic Gradient Descent using mini batches + * + * @param data A Dataset of LabeledVector (label, features) pairs + * @param currentWeights A Dataset with the current weights to be optimized as its only element + * @param lossFunction The loss function to be used + * @param regularizationPenalty The regularization penalty to be used + * @param regularizationConstant The regularization parameter + * @param learningRate The effective step size for this iteration + * @param learningRateMethod The learning rate used + * + * @return A Dataset containing the weights after one stochastic gradient descent step + */ + private def SGDStep( + data: DataSet[(LabeledVector)], + currentWeights: DataSet[WeightVector], + lossFunction: LossFunction, + regularizationPenalty: RegularizationPenalty, + regularizationConstant: Double, + learningRate: Double, + learningRateMethod: LearningRateMethodTrait) + : DataSet[WeightVector] = { + + data.mapWithBcVariable(currentWeights){ + (data, weightVector) => (lossFunction.gradient(data, weightVector), 1) + }.reduce{ + (left, right) => + val (leftGradVector, leftCount) = left + val (rightGradVector, rightCount) = right + + // make the left gradient dense so that the following reduce operations (left fold) reuse + // it. 
This strongly depends on the underlying implementation of the ReduceDriver which + // always passes the new input element as the second parameter + val result = leftGradVector.weights match { + case d: DenseVector => d + case s: SparseVector => s.toDenseVector + } + + // Add the right gradient to the result + BLAS.axpy(1.0, rightGradVector.weights, result) + val gradients = WeightVector( + result, leftGradVector.intercept + rightGradVector.intercept) + + (gradients , leftCount + rightCount) + }.mapWithBcVariableIteration(currentWeights){ + (gradientCount, weightVector, iteration) => { + val (WeightVector(weights, intercept), count) = gradientCount + + BLAS.scal(1.0/count, weights) + + val gradient = WeightVector(weights, intercept/count) + val effectiveLearningRate = learningRateMethod.calculateLearningRate( + learningRate, + iteration, + regularizationConstant) + + val newWeights = takeStep( + weightVector.weights, + gradient.weights, + regularizationPenalty, + regularizationConstant, + effectiveLearningRate) + + WeightVector( + newWeights, + weightVector.intercept - effectiveLearningRate * gradient.intercept) + } + } + } + + /** Calculates the new weights based on the gradient + * + * @param weightVector The weights to be updated + * @param gradient The gradient according to which we will update the weights + * @param regularizationPenalty The regularization penalty to apply + * @param regularizationConstant The regularization parameter + * @param learningRate The effective step size for this iteration + * @return Updated weights + */ + def takeStep( + weightVector: Vector, + gradient: Vector, + regularizationPenalty: RegularizationPenalty, + regularizationConstant: Double, + learningRate: Double + ): Vector = { + regularizationPenalty.takeStep(weightVector, gradient, regularizationConstant, learningRate) + } + + /** Calculates the regularized loss, from the data and given weights. + * + * @param data A Dataset of LabeledVector (label, features) pairs + * @param weightDS A Dataset with the current weights to be optimized as its only element + * @param lossFunction The loss function to be used + * @return A Dataset with the regularized loss as its only element + */ + private def calculateLoss( + data: DataSet[LabeledVector], + weightDS: DataSet[WeightVector], + lossFunction: LossFunction) + : DataSet[Double] = { + data.mapWithBcVariable(weightDS){ + (data, weightVector) => (lossFunction.loss(data, weightVector), 1) + }.reduce{ + (left, right) => (left._1 + right._1, left._2 + right._2) + }.map { + lossCount => lossCount._1 / lossCount._2 + } + } +} + + +/** Implementation of a Gradient Descent solver. + * + */ +object GradientDescent { + def apply() = new GradientDescent +} diff --git a/src/main/scala/org/apache/flink/ml/optimization/LossFunction.scala b/src/main/scala/org/apache/flink/ml/optimization/LossFunction.scala new file mode 100644 index 0000000000000..30d4addcca3f2 --- /dev/null +++ b/src/main/scala/org/apache/flink/ml/optimization/LossFunction.scala @@ -0,0 +1,96 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.ml.optimization + +import org.apache.flink.ml.common.{WeightVector, LabeledVector} +import org.apache.flink.ml.math.BLAS + +/** Abstract class that implements some of the functionality for common loss functions + * + * A loss function determines the loss term `L(w)` of the objective function `f(w) = L(w) + + * lambda*R(w)` for prediction tasks, the other being regularization, `R(w)`. + * + * The regularization is specific to the used optimization algorithm and, thus, implemented there. + * + * We currently only support differentiable loss functions, in the future this class + * could be changed to DiffLossFunction in order to support other types, such as absolute loss. + */ +trait LossFunction extends Serializable { + + /** Calculates the loss given the prediction and label value + * + * @param dataPoint + * @param weightVector + * @return + */ + def loss(dataPoint: LabeledVector, weightVector: WeightVector): Double = { + lossGradient(dataPoint, weightVector)._1 + } + + /** Calculates the gradient of the loss function given a data point and weight vector + * + * @param dataPoint + * @param weightVector + * @return + */ + def gradient(dataPoint: LabeledVector, weightVector: WeightVector): WeightVector = { + lossGradient(dataPoint, weightVector)._2 + } + + /** Calculates the gradient as well as the loss given a data point and the weight vector + * + * @param dataPoint + * @param weightVector + * @return + */ + def lossGradient(dataPoint: LabeledVector, weightVector: WeightVector): (Double, WeightVector) +} + +/** Generic loss function which lets you build a loss function out of the [[PartialLossFunction]] + * and the [[PredictionFunction]]. + * + * @param partialLossFunction + * @param predictionFunction + */ +case class GenericLossFunction( + partialLossFunction: PartialLossFunction, + predictionFunction: PredictionFunction) + extends LossFunction { + + /** Calculates the gradient as well as the loss given a data point and the weight vector + * + * @param dataPoint + * @param weightVector + * @return + */ + def lossGradient(dataPoint: LabeledVector, weightVector: WeightVector): (Double, WeightVector) = { + val prediction = predictionFunction.predict(dataPoint.vector, weightVector) + + val loss = partialLossFunction.loss(prediction, dataPoint.label) + + val lossDerivative = partialLossFunction.derivative(prediction, dataPoint.label) + + val WeightVector(weightGradient, interceptGradient) = + predictionFunction.gradient(dataPoint.vector, weightVector) + + BLAS.scal(lossDerivative, weightGradient) + + (loss, WeightVector(weightGradient, lossDerivative * interceptGradient)) + } +} diff --git a/src/main/scala/org/apache/flink/ml/optimization/PartialLossFunction.scala b/src/main/scala/org/apache/flink/ml/optimization/PartialLossFunction.scala new file mode 100644 index 0000000000000..36db1d7d73fca --- /dev/null +++ b/src/main/scala/org/apache/flink/ml/optimization/PartialLossFunction.scala @@ -0,0 +1,154 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.flink.ml.optimization
+
+/** Represents loss functions which can be used with the [[GenericLossFunction]].
+  *
+  */
+trait PartialLossFunction extends Serializable {
+  /** Calculates the loss depending on the label and the prediction
+    *
+    * @param prediction The predicted value
+    * @param label The true value
+    * @return The loss
+    */
+  def loss(prediction: Double, label: Double): Double
+
+  /** Calculates the derivative of the [[PartialLossFunction]]
+    *
+    * @param prediction The predicted value
+    * @param label The true value
+    * @return The derivative of the loss function
+    */
+  def derivative(prediction: Double, label: Double): Double
+}
+
+/** Squared loss function which can be used with the [[GenericLossFunction]]
+  *
+  * The [[SquaredLoss]] function implements `1/2 (prediction - label)^2`
+  */
+object SquaredLoss extends PartialLossFunction {
+
+  /** Calculates the loss depending on the label and the prediction
+    *
+    * @param prediction The predicted value
+    * @param label The true value
+    * @return The loss
+    */
+  override def loss(prediction: Double, label: Double): Double = {
+    0.5 * (prediction - label) * (prediction - label)
+  }
+
+  /** Calculates the derivative of the [[PartialLossFunction]]
+    *
+    * @param prediction The predicted value
+    * @param label The true value
+    * @return The derivative of the loss function
+    */
+  override def derivative(prediction: Double, label: Double): Double = {
+    prediction - label
+  }
+}
+
+/** Logistic loss function which can be used with the [[GenericLossFunction]]
+  *
+  * The [[LogisticLoss]] function implements `log(1 + exp(-prediction*label))`
+  * for binary classification with label in {-1, 1}
+  */
+object LogisticLoss extends PartialLossFunction {
+
+  /** Calculates the loss depending on the label and the prediction
+    *
+    * @param prediction The predicted value
+    * @param label The true value
+    * @return The loss
+    */
+  override def loss(prediction: Double, label: Double): Double = {
+    val z = prediction * label
+
+    // based on implementation in scikit-learn
+    // approximately equal and saves the computation of the log
+    if (z > 18) {
+      math.exp(-z)
+    }
+    else if (z < -18) {
+      -z
+    }
+    else {
+      math.log(1 + math.exp(-z))
+    }
+  }
+
+  /** Calculates the derivative of the loss function with respect to the prediction
+    *
+    * @param prediction The predicted value
+    * @param label The true value
+    * @return The derivative of the loss function
+    */
+  override def derivative(prediction: Double, label: Double): Double = {
+    val z = prediction * label
+
+    // based on implementation in scikit-learn
+    // approximately equal and saves the computation of the log
+    if (z > 18) {
+      label * math.exp(-z)
+    }
+    else if (z < -18) {
+      -label
+    }
+    else {
+      -label/(math.exp(z) + 1)
+    }
+  }
+}
+
+/** Hinge loss function which can be used with 
the [[GenericLossFunction]] + * + * The [[HingeLoss]] function implements `max(0, 1 - prediction*label)` + * for binary classification with label in {-1, 1} + */ +object HingeLoss extends PartialLossFunction { + /** Calculates the loss for a given prediction/truth pair + * + * @param prediction The predicted value + * @param label The true value + * @return The loss + */ + override def loss(prediction: Double, label: Double): Double = { + val z = prediction * label + math.max(0, 1 - z) + } + + /** Calculates the derivative of the loss function with respect to the prediction + * + * @param prediction The predicted value + * @param label The true value + * @return The derivative of the loss function + */ + override def derivative(prediction: Double, label: Double): Double = { + val z = prediction * label + if (z <= 1) { + -label + } + else { + 0 + } + } +} diff --git a/src/main/scala/org/apache/flink/ml/optimization/PredictionFunction.scala b/src/main/scala/org/apache/flink/ml/optimization/PredictionFunction.scala new file mode 100644 index 0000000000000..d9d27c9386fc9 --- /dev/null +++ b/src/main/scala/org/apache/flink/ml/optimization/PredictionFunction.scala @@ -0,0 +1,40 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.ml.optimization + +import org.apache.flink.ml.common.WeightVector +import org.apache.flink.ml.math.{Vector => FlinkVector, BLAS} + +/** An abstract class for prediction functions to be used in optimization **/ +abstract class PredictionFunction extends Serializable { + def predict(features: FlinkVector, weights: WeightVector): Double + + def gradient(features: FlinkVector, weights: WeightVector): WeightVector +} + +/** A linear prediction function **/ +object LinearPrediction extends PredictionFunction { + override def predict(features: FlinkVector, weightVector: WeightVector): Double = { + BLAS.dot(features, weightVector.weights) + weightVector.intercept + } + + override def gradient(features: FlinkVector, weights: WeightVector): WeightVector = { + WeightVector(features.copy, 1) + } +} diff --git a/src/main/scala/org/apache/flink/ml/optimization/RegularizationPenalty.scala b/src/main/scala/org/apache/flink/ml/optimization/RegularizationPenalty.scala new file mode 100644 index 0000000000000..dbdb3781b0821 --- /dev/null +++ b/src/main/scala/org/apache/flink/ml/optimization/RegularizationPenalty.scala @@ -0,0 +1,219 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.ml.optimization + +import org.apache.flink.ml.math.{Vector, BLAS} +import org.apache.flink.ml.math.Breeze._ +import breeze.linalg.{norm => BreezeNorm} + +/** Represents a type of regularization penalty + * + * Regularization penalties are used to restrict the optimization problem to solutions with + * certain desirable characteristics, such as sparsity for the L1 penalty, or penalizing large + * weights for the L2 penalty. + * + * The regularization term, `R(w)` is added to the objective function, `f(w) = L(w) + lambda*R(w)` + * where lambda is the regularization parameter used to tune the amount of regularization applied. + */ +trait RegularizationPenalty extends Serializable { + + /** Calculates the new weights based on the gradient and regularization penalty + * + * Weights are updated using the gradient descent step `w - learningRate * gradient` + * with `w` being the weight vector. + * + * @param weightVector The weights to be updated + * @param gradient The gradient used to update the weights + * @param regularizationConstant The regularization parameter to be applied + * @param learningRate The effective step size for this iteration + * @return Updated weights + */ + def takeStep( + weightVector: Vector, + gradient: Vector, + regularizationConstant: Double, + learningRate: Double) + : Vector + + /** Adds regularization to the loss value + * + * @param oldLoss The loss to be updated + * @param weightVector The weights used to update the loss + * @param regularizationConstant The regularization parameter to be applied + * @return Updated loss + */ + def regLoss(oldLoss: Double, weightVector: Vector, regularizationConstant: Double): Double + +} + + +/** `L_2` regularization penalty. + * + * The regularization function is the square of the L2 norm `1/2*||w||_2^2` + * with `w` being the weight vector. The function penalizes large weights, + * favoring solutions with more small weights rather than few large ones. + */ +object L2Regularization extends RegularizationPenalty { + + /** Calculates the new weights based on the gradient and L2 regularization penalty + * + * The updated weight is `w - learningRate * (gradient + lambda * w)` where + * `w` is the weight vector, and `lambda` is the regularization parameter. 
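+    * Expanding the parentheses, this is the usual weight-decay form of the update,
+    * `w := (1 - learningRate * lambda) * w - learningRate * gradient`. For example, with
+    * `learningRate = 0.1`, `lambda = 0.01`, a weight of `1.0` and a gradient of `0.5`,
+    * the updated weight is `(1 - 0.1 * 0.01) * 1.0 - 0.1 * 0.5 = 0.949`.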
+ * + * @param weightVector The weights to be updated + * @param gradient The gradient according to which we will update the weights + * @param regularizationConstant The regularization parameter to be applied + * @param learningRate The effective step size for this iteration + * @return Updated weights + */ + override def takeStep( + weightVector: Vector, + gradient: Vector, + regularizationConstant: Double, + learningRate: Double) + : Vector = { + // add the gradient of the L2 regularization + BLAS.axpy(regularizationConstant, weightVector, gradient) + + // update the weights according to the learning rate + BLAS.axpy(-learningRate, gradient, weightVector) + + weightVector + } + + /** Adds regularization to the loss value + * + * The updated loss is `oldLoss + lambda * 1/2*||w||_2^2` where + * `w` is the weight vector, and `lambda` is the regularization parameter + * + * @param oldLoss The loss to be updated + * @param weightVector The weights used to update the loss + * @param regularizationConstant The regularization parameter to be applied + * @return Updated loss + */ + override def regLoss(oldLoss: Double, weightVector: Vector, regularizationConstant: Double) + : Double = { + val squareNorm = BLAS.dot(weightVector, weightVector) + oldLoss + regularizationConstant * 0.5 * squareNorm + } +} + +/** `L_1` regularization penalty. + * + * The regularization function is the `L1` norm `||w||_1` with `w` being the weight vector. + * The `L_1` penalty can be used to drive a number of the solution coefficients to 0, thereby + * producing sparse solutions. + * + */ +object L1Regularization extends RegularizationPenalty { + + /** Calculates the new weights based on the gradient and L1 regularization penalty + * + * Uses the proximal gradient method with L1 regularization to update weights. + * The updated weight `w - learningRate * gradient` is shrunk towards zero + * by applying the proximal operator `signum(w) * max(0.0, abs(w) - shrinkageVal)` + * where `w` is the weight vector, `lambda` is the regularization parameter, + * and `shrinkageVal` is `lambda*learningRate`. + * + * @param weightVector The weights to be updated + * @param gradient The gradient according to which we will update the weights + * @param regularizationConstant The regularization parameter to be applied + * @param learningRate The effective step size for this iteration + * @return Updated weights + */ + override def takeStep( + weightVector: Vector, + gradient: Vector, + regularizationConstant: Double, + learningRate: Double) + : Vector = { + // Update weight vector with gradient. 
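+    // axpy performs weightVector += (-learningRate) * gradient in place, i.e. the plain
+    // gradient step w := w - learningRate * gradient, before the shrinkage below.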
+ BLAS.axpy(-learningRate, gradient, weightVector) + + // Apply proximal operator (soft thresholding) + val shrinkageVal = regularizationConstant * learningRate + var i = 0 + while (i < weightVector.size) { + val wi = weightVector(i) + weightVector(i) = math.signum(wi) * + math.max(0.0, math.abs(wi) - shrinkageVal) + i += 1 + } + + weightVector + } + + /** Adds regularization to the loss value + * + * The updated loss is `oldLoss + lambda * ||w||_1` where + * `w` is the weight vector and `lambda` is the regularization parameter + * + * @param oldLoss The loss to be updated + * @param weightVector The weights used to update the loss + * @param regularizationConstant The regularization parameter to be applied + * @return Updated loss + */ + override def regLoss(oldLoss: Double, weightVector: Vector, regularizationConstant: Double) + : Double = { + val norm = BreezeNorm(weightVector.asBreeze, 1.0) + oldLoss + norm * regularizationConstant + } +} + +/** No regularization penalty. + * + */ +object NoRegularization extends RegularizationPenalty { + + /** Calculates the new weights based on the gradient + * + * The updated weight is `w - learningRate *gradient` where `w` is the weight vector + * + * @param weightVector The weights to be updated + * @param gradient The gradient according to which we will update the weights + * @param regularizationConstant The regularization parameter which is ignored + * @param learningRate The effective step size for this iteration + * @return Updated weights + */ + override def takeStep( + weightVector: Vector, + gradient: Vector, + regularizationConstant: Double, + learningRate: Double) + : Vector = { + // Update the weight vector + BLAS.axpy(-learningRate, gradient, weightVector) + weightVector + } + + /** + * Returns the unmodified loss value + * + * The updated loss is `oldLoss` + * + * @param oldLoss The loss to be updated + * @param weightVector The weights used to update the loss + * @param regularizationParameter The regularization parameter which is ignored + * @return Updated loss + */ + override def regLoss(oldLoss: Double, weightVector: Vector, regularizationParameter: Double) + : Double = { + oldLoss + } +} diff --git a/src/main/scala/org/apache/flink/ml/optimization/Solver.scala b/src/main/scala/org/apache/flink/ml/optimization/Solver.scala new file mode 100644 index 0000000000000..310903b83bb58 --- /dev/null +++ b/src/main/scala/org/apache/flink/ml/optimization/Solver.scala @@ -0,0 +1,233 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.flink.ml.optimization + +import org.apache.flink.api.scala.DataSet +import org.apache.flink.ml.common._ +import org.apache.flink.ml.math.{SparseVector, DenseVector} +import org.apache.flink.api.scala._ +import org.apache.flink.ml.optimization.IterativeSolver._ +import org.apache.flink.ml.optimization.LearningRateMethod.LearningRateMethodTrait + +/** Base class for optimization algorithms + * + */ +abstract class Solver extends Serializable with WithParameters { + import Solver._ + + /** Provides a solution for the given optimization problem + * + * @param data A Dataset of LabeledVector (input, output) pairs + * @param initialWeights The initial weight that will be optimized + * @return A Vector of weights optimized to the given problem + */ + def optimize( + data: DataSet[LabeledVector], + initialWeights: Option[DataSet[WeightVector]]) + : DataSet[WeightVector] + + /** Creates initial weights vector, creating a DataSet with a WeightVector element + * + * @param initialWeights An Option that may contain an initial set of weights + * @param data The data for which we optimize the weights + * @return A DataSet containing a single WeightVector element + */ + def createInitialWeightsDS(initialWeights: Option[DataSet[WeightVector]], + data: DataSet[LabeledVector]): DataSet[WeightVector] = { + // TODO: Faster way to do this? + val dimensionsDS = data.map(_.vector.size).reduce((a, b) => b) + + initialWeights match { + // Ensure provided weight vector is a DenseVector + case Some(wvDS) => + wvDS.map { + wv => { + val denseWeights = wv.weights match { + case dv: DenseVector => dv + case sv: SparseVector => sv.toDenseVector + } + WeightVector(denseWeights, wv.intercept) + } + } + case None => createInitialWeightVector(dimensionsDS) + } + } + + /** Creates a DataSet with one zero vector. The zero vector has dimension d, which is given + * by the dimensionDS. + * + * @param dimensionDS DataSet with one element d, denoting the dimension of the returned zero + * vector + * @return DataSet of a zero vector of dimension d + */ + def createInitialWeightVector(dimensionDS: DataSet[Int]): DataSet[WeightVector] = { + dimensionDS.map { + dimension => + val values = Array.fill(dimension)(0.0) + WeightVector(DenseVector(values), 0.0) + } + } + + //Setters for parameters + // TODO(tvas): Provide an option to fit an intercept or not + def setLossFunction(lossFunction: LossFunction): this.type = { + parameters.add(LossFunction, lossFunction) + this + } + + def setRegularizationConstant(regularizationConstant: Double): this.type = { + parameters.add(RegularizationConstant, regularizationConstant) + this + } + + def setRegularizationPenalty(regularizationPenalty: RegularizationPenalty) : this.type = { + parameters.add(RegularizationPenaltyValue, regularizationPenalty) + this + } +} + +object Solver { + // Define parameters for Solver + case object LossFunction extends Parameter[LossFunction] { + // TODO(tvas): Should depend on problem, here is where differentiating between classification + // and regression could become useful + val defaultValue = None + } + + case object RegularizationConstant extends Parameter[Double] { + val defaultValue = Some(0.0001) // TODO(tvas): Properly initialize this, ensure Parameter > 0! 
+ } + + case object RegularizationPenaltyValue extends Parameter[RegularizationPenalty] { + val defaultValue = Some(NoRegularization) + } +} + +/** An abstract class for iterative optimization algorithms + * + * See [[https://en.wikipedia.org/wiki/Iterative_method Iterative Methods on Wikipedia]] for more + * info + */ +abstract class IterativeSolver() extends Solver { + + //Setters for parameters + def setIterations(iterations: Int): this.type = { + parameters.add(Iterations, iterations) + this + } + + def setStepsize(stepsize: Double): this.type = { + parameters.add(LearningRate, stepsize) + this + } + + def setConvergenceThreshold(convergenceThreshold: Double): this.type = { + parameters.add(ConvergenceThreshold, convergenceThreshold) + this + } + + def setLearningRateMethod(learningRateMethod: LearningRateMethodTrait): this.type = { + parameters.add(LearningRateMethodValue, learningRateMethod) + this + } +} + +object IterativeSolver { + + val MAX_DLOSS: Double = 1e12 + + // Define parameters for IterativeSolver + case object LearningRate extends Parameter[Double] { + val defaultValue = Some(0.1) + } + + case object Iterations extends Parameter[Int] { + val defaultValue = Some(10) + } + + case object ConvergenceThreshold extends Parameter[Double] { + val defaultValue = None + } + + case object LearningRateMethodValue extends Parameter[LearningRateMethodTrait] { + val defaultValue = Some(LearningRateMethod.Default) + } +} + +object LearningRateMethod { + + sealed trait LearningRateMethodTrait extends Serializable { + def calculateLearningRate( + initialLearningRate: Double, + iteration: Int, + regularizationConstant: Double) + : Double + } + + object Default extends LearningRateMethodTrait { + override def calculateLearningRate( + initialLearningRate: Double, + iteration: Int, + regularizationConstant: Double) + : Double = { + initialLearningRate / Math.sqrt(iteration) + } + } + + object Constant extends LearningRateMethodTrait { + override def calculateLearningRate( + initialLearningRate: Double, + iteration: Int, + regularizationConstant: Double) + : Double = { + initialLearningRate + } + } + + case class Bottou(optimalInit: Double) extends LearningRateMethodTrait { + override def calculateLearningRate( + initialLearningRate: Double, + iteration: Int, + regularizationConstant: Double) + : Double = { + 1 / (regularizationConstant * (optimalInit + iteration - 1)) + } + } + + case class InvScaling(decay: Double) extends LearningRateMethodTrait { + override def calculateLearningRate( + initialLearningRate: Double, + iteration: Int, + regularizationConstant: Double) + : Double = { + initialLearningRate / Math.pow(iteration, decay) + } + } + + case class Xu(decay: Double) extends LearningRateMethodTrait { + override def calculateLearningRate( + initialLearningRate: Double, + iteration: Int, + regularizationConstant: Double) + : Double = { + initialLearningRate * + Math.pow(1 + regularizationConstant * initialLearningRate * iteration, -decay) + } + } +} diff --git a/src/main/scala/org/apache/flink/ml/outlier/StochasticOutlierSelection.scala b/src/main/scala/org/apache/flink/ml/outlier/StochasticOutlierSelection.scala new file mode 100644 index 0000000000000..9c158fc111376 --- /dev/null +++ b/src/main/scala/org/apache/flink/ml/outlier/StochasticOutlierSelection.scala @@ -0,0 +1,383 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.ml.outlier + +/** An implementation of the Stochastic Outlier Selection algorithm by Jeroen Jansen + * + * For more information about SOS, see https://github.com/jeroenjanssens/sos + * J.H.M. Janssens, F. Huszar, E.O. Postma, and H.J. van den Herik. Stochastic + * Outlier Selection. Technical Report TiCC TR 2012-001, Tilburg University, + * Tilburg, the Netherlands, 2012. + * + * @example + * {{{ + * val data = env.fromCollection(List( + * LabeledVector(0.0, DenseVector(1.0, 1.0)), + * LabeledVector(1.0, DenseVector(2.0, 1.0)), + * LabeledVector(2.0, DenseVector(1.0, 2.0)), + * LabeledVector(3.0, DenseVector(2.0, 2.0)), + * LabeledVector(4.0, DenseVector(5.0, 8.0)) // The outlier! + * )) + * + * val sos = new StochasticOutlierSelection().setPerplexity(3) + * + * val outputVector = sos + * .transform(data) + * .collect() + * + * val expectedOutputVector = Map( + * 0 -> 0.2790094479202896, + * 1 -> 0.25775014551682535, + * 2 -> 0.22136130977995766, + * 3 -> 0.12707053787018444, + * 4 -> 0.9922779902453757 // The outlier! + * ) + * + * outputVector.foreach(output => expectedOutputVector(output._1) should be(output._2)) + * }}} + * + * =Parameters= + * + * - [[org.apache.flink.ml.outlier.StochasticOutlierSelection.Perplexity]]: + * Perplexity can be interpreted as the k in k-nearest neighbor algorithms. The difference is + * in SOS being a neighbor is not a binary property, but a probabilistic one, and therefore it + * a real number. Must be between 1 and n-1, where n is the number of points. + * (Default value: '''30''') + * + * - [[org.apache.flink.ml.outlier.StochasticOutlierSelection.ErrorTolerance]]: + * The accepted error tolerance when computing the perplexity. When increasing this number, it + * will sacrifice accuracy in return for reduced computational time. + * (Default value: '''1e-20''') + * + * - [[org.apache.flink.ml.outlier.StochasticOutlierSelection.MaxIterations]]: + * The maximum number of iterations to perform to constrain the computational time. 
+ * (Default value: '''5000''') + */ + +import breeze.linalg.functions.euclideanDistance +import breeze.linalg.{sum, DenseVector => BreezeDenseVector, Vector => BreezeVector} +import org.apache.flink.api.common.operators.Order +import org.apache.flink.api.common.typeinfo.TypeInformation +import org.apache.flink.api.scala._ +import org.apache.flink.api.scala.utils._ +import org.apache.flink.ml.common.{LabeledVector, Parameter, ParameterMap, WithParameters} +import org.apache.flink.ml.math.Breeze._ +import org.apache.flink.ml.math.{BreezeVectorConverter, Vector} +import org.apache.flink.ml.pipeline.{TransformDataSetOperation, Transformer} + +import scala.language.implicitConversions +import scala.reflect.ClassTag + +class StochasticOutlierSelection extends Transformer[StochasticOutlierSelection] { + + import StochasticOutlierSelection._ + + + /** Sets the perplexity of the outlier selection algorithm, can be seen as the k of kNN + * For more information, please read the Stochastic Outlier Selection algorithm technical paper. + * + * @param perplexity the perplexity of the affinity fit + * @return + */ + def setPerplexity(perplexity: Double): StochasticOutlierSelection = { + require(perplexity >= 1, "Perplexity must be at least one.") + parameters.add(Perplexity, perplexity) + this + } + + /** The accepted error tolerance to reduce computational time when approximating the affinity. + * + * @param errorTolerance the accepted error tolerance with respect to the affinity + * @return + */ + def setErrorTolerance(errorTolerance: Double): StochasticOutlierSelection = { + require(errorTolerance >= 0, "Error tolerance cannot be negative.") + parameters.add(ErrorTolerance, errorTolerance) + this + } + + /** The maximum number of iterations to approximate the affinity of the algorithm. + * + * @param maxIterations the maximum number of iterations. + * @return + */ + def setMaxIterations(maxIterations: Int): StochasticOutlierSelection = { + require(maxIterations > 0, "Maximum iterations must be positive.") + parameters.add(MaxIterations, maxIterations) + this + } + +} + +object StochasticOutlierSelection extends WithParameters { + + // ========================================= Parameters ========================================== + case object Perplexity extends Parameter[Double] { + val defaultValue: Option[Double] = Some(30) + } + + case object ErrorTolerance extends Parameter[Double] { + val defaultValue: Option[Double] = Some(1e-20) + } + + case object MaxIterations extends Parameter[Int] { + val defaultValue: Option[Int] = Some(5000) + } + + // ==================================== Factory methods ========================================== + + def apply(): StochasticOutlierSelection = { + new StochasticOutlierSelection() + } + + // ===================================== Operations ============================================== + case class BreezeLabeledVector(idx: Int, data: BreezeVector[Double]) + + implicit val transformLabeledVectors = { + + new TransformDataSetOperation[StochasticOutlierSelection, LabeledVector, (Int, Double)] { + + + /** Overrides the method of the parent class and applies the sochastic outlier selection + * algorithm. + * + * @param instance Instance of the class + * @param transformParameters The user defined parameters of the algorithm + * @param input A data set which consists of all the LabeledVectors, which should have an + * index or unique integer label as vector. 
+ * @return The outlierness of the vectors compared to each other + */ + override def transformDataSet(instance: StochasticOutlierSelection, + transformParameters: ParameterMap, + input: DataSet[LabeledVector]): DataSet[(Int, Double)] = { + + val resultingParameters = instance.parameters ++ transformParameters + + val vectorsWithIndex = input.map(labeledVector => { + BreezeLabeledVector(labeledVector.label.toInt, labeledVector.vector.asBreeze) + }) + + // Don't map back to a labeled-vector since the output of the algorithm is + // a single double instead of vector + outlierSelection(vectorsWithIndex, resultingParameters) + } + } + } + + /** [[TransformDataSetOperation]] applies the stochastic outlier selection algorithm on a + * [[Vector]] which will transform the high-dimensionaly input to a single Double output. + * + * @tparam T Type of the input and output data which has to be a subtype of [[Vector]] + * @return [[TransformDataSetOperation]] a single double which represents the oulierness of + * the input vectors, where the output is in [0, 1] + */ + implicit def transformVectors[T <: Vector : BreezeVectorConverter : TypeInformation : ClassTag] + = { + new TransformDataSetOperation[StochasticOutlierSelection, T, Double] { + override def transformDataSet(instance: StochasticOutlierSelection, + transformParameters: ParameterMap, + input: DataSet[T]): DataSet[Double] = { + + val resultingParameters = instance.parameters ++ transformParameters + + // Map to the right format + val vectorsWithIndex = input.zipWithUniqueId.map(vector => { + BreezeLabeledVector(vector._1.toInt, vector._2.asBreeze) + }) + + outlierSelection(vectorsWithIndex, resultingParameters).map(_._2) + } + } + } + + /** Internal entry point which will execute the different stages of the algorithm using a single + * interface + * + * @param inputVectors Input vectors on which the stochastic outlier selection algorithm + * will be applied which should be the index or a unique integer value + * @param transformParameters The user defined parameters of the algorithm + * @return The outlierness of the vectors compared to each other + */ + private def outlierSelection(inputVectors: DataSet[BreezeLabeledVector], + transformParameters: ParameterMap): DataSet[(Int, Double)] = { + val dissimilarityVectors = computeDissimilarityVectors(inputVectors) + val affinityVectors = computeAffinity(dissimilarityVectors, transformParameters) + val bindingProbabilityVectors = computeBindingProbabilities(affinityVectors) + val outlierProbability = computeOutlierProbability(bindingProbabilityVectors) + + outlierProbability + } + + /** Compute pair-wise distance from each vector, to all other vectors. + * + * @param inputVectors The input vectors, will compare the vector to all other vectors based + * on an distance method. + * @return Returns new set of [[BreezeLabeledVector]] with dissimilarity vector + */ + def computeDissimilarityVectors(inputVectors: DataSet[BreezeLabeledVector]): + DataSet[BreezeLabeledVector] = + inputVectors.cross(inputVectors) { + (a, b) => (a.idx, b.idx, euclideanDistance(a.data, b.data)) + }.filter(dist => dist._1 != dist._2) // Filter out the diagonal, this contains no information. 
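+      // The remaining (source index, target index, distance) triples are grouped per source
+      // point and sorted by target index, so that each reduce group yields that point's
+      // dissimilarity vector in a deterministic order.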
+ .groupBy(0) + .sortGroup(1, Order.ASCENDING) + .reduceGroup { + distancesIterator => { + val distances = distancesIterator.toList + val distanceVector = distances.map(_._3).toArray + + BreezeLabeledVector(distances.head._1, BreezeDenseVector(distanceVector)) + } + } + + /** Approximate the affinity by fitting a Gaussian-like function + * + * @param dissimilarityVectors The dissimilarity vectors which represents the distance to the + * other vectors in the data set. + * @param resultingParameters The user defined parameters of the algorithm + * @return Returns new set of [[BreezeLabeledVector]] with dissimilarity vector + */ + def computeAffinity(dissimilarityVectors: DataSet[BreezeLabeledVector], + resultingParameters: ParameterMap): DataSet[BreezeLabeledVector] = { + val logPerplexity = Math.log(resultingParameters(Perplexity)) + val maxIterations = resultingParameters(MaxIterations) + val errorTolerance = resultingParameters(ErrorTolerance) + + dissimilarityVectors.map(vec => { + val breezeVec = binarySearch(vec.data, logPerplexity, maxIterations, errorTolerance) + BreezeLabeledVector(vec.idx, breezeVec) + }) + } + + /** Normalizes the input vectors so each row sums up to one. + * + * @param affinityVectors The affinity vectors which is the quantification of the relationship + * between the original vectors. + * @return Returns new set of [[BreezeLabeledVector]] with represents the binding + * probabilities, which is in fact the affinity where each row sums up to one. + */ + def computeBindingProbabilities(affinityVectors: DataSet[BreezeLabeledVector]): + DataSet[BreezeLabeledVector] = + affinityVectors.map(vec => BreezeLabeledVector(vec.idx, vec.data :/ sum(vec.data))) + + /** Compute the final outlier probability by taking the product of the column. + * + * @param bindingProbabilityVectors The binding probability vectors where the binding + * probability is based on the affinity and represents the + * probability of a vector binding with another vector. + * @return Returns a single double which represents the final outlierness of the input vector. + */ + def computeOutlierProbability(bindingProbabilityVectors: DataSet[BreezeLabeledVector]): + DataSet[(Int, Double)] = bindingProbabilityVectors + .flatMap(vec => vec.data.toArray.zipWithIndex.map(pair => { + + // The DistanceMatrix removed the diagonal, but we need to compute the product + // of the column, so we need to correct the offset. + val columnIndex = if (pair._2 >= vec.idx) { + 1 + } else { + 0 + } + + (columnIndex + pair._2, pair._1) + })).groupBy(0).reduceGroup { + probabilities => { + var rowNumber = -1 + var outlierProbability = 1.0 + for (probability <- probabilities) { + rowNumber = probability._1 + outlierProbability = outlierProbability * (1.0 - probability._2) + } + + (rowNumber, outlierProbability) + } + } + + /** Performs a binary search to get affinities in such a way that each conditional Gaussian has + * the same perplexity. + * + * @param dissimilarityVector The input dissimilarity vector which represents the current + * vector distance to the other vectors in the data set + * @param logPerplexity The log of the perplexity, which represents the probability of having + * affinity with another vector. + * @param maxIterations The maximum iterations to limit the computational time. + * @param tolerance The allowed tolerance to sacrifice precision for decreased computational + * time. 
+ * @param beta: The current beta + * @param betaMin The lower bound of beta + * @param betaMax The upper bound of beta + * @param iteration The current iteration + * @return Returns the affinity vector of the input vector. + */ + def binarySearch( + dissimilarityVector: BreezeVector[Double], + logPerplexity: Double, + maxIterations: Int, + tolerance: Double, + beta: Double = 1.0, + betaMin: Double = Double.NegativeInfinity, + betaMax: Double = Double.PositiveInfinity, + iteration: Int = 0) + : BreezeVector[Double] = { + + val newAffinity = dissimilarityVector.map(d => Math.exp(-d * beta)) + val sumA = sum(newAffinity) + val hCurr = Math.log(sumA) + beta * sum(dissimilarityVector :* newAffinity) / sumA + val hDiff = hCurr - logPerplexity + + if (iteration < maxIterations && Math.abs(hDiff) > tolerance) { + // Compute the Gaussian kernel and entropy for the current precision + val (newBeta, newBetaMin, newBetaMax) = if (hDiff.isNaN) { + (beta / 10.0, betaMin, betaMax) // Reduce beta to get it in range + } else { + if (hDiff > 0) { + val newBeta = + if (betaMax == Double.PositiveInfinity || betaMax == Double.NegativeInfinity) { + beta * 2.0 + } else { + (beta + betaMax) / 2.0 + } + + (newBeta, beta, betaMax) + } else { + val newBeta = + if (betaMin == Double.PositiveInfinity || betaMin == Double.NegativeInfinity) { + beta / 2.0 + } else { + (beta + betaMin) / 2.0 + } + + (newBeta, betaMin, beta) + } + } + + binarySearch(dissimilarityVector, + logPerplexity, + maxIterations, + tolerance, + newBeta, + newBetaMin, + newBetaMax, + iteration + 1) + } + else { + newAffinity + } + } +} diff --git a/src/main/scala/org/apache/flink/ml/package.scala b/src/main/scala/org/apache/flink/ml/package.scala new file mode 100644 index 0000000000000..82b13e59e4925 --- /dev/null +++ b/src/main/scala/org/apache/flink/ml/package.scala @@ -0,0 +1,119 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.flink + +import org.apache.flink.api.common.functions.{RichFilterFunction, RichMapFunction} +import org.apache.flink.api.common.typeinfo.TypeInformation +import org.apache.flink.api.java.operators.DataSink +import org.apache.flink.api.scala.{DataSet, ExecutionEnvironment} +import org.apache.flink.configuration.Configuration +import org.apache.flink.ml.common.LabeledVector + +import scala.reflect.ClassTag + +package object ml { + + /** Pimp my [[ExecutionEnvironment]] to directly support `readLibSVM` + * + * @param executionEnvironment + */ + implicit class RichExecutionEnvironment(executionEnvironment: ExecutionEnvironment) { + def readLibSVM(path: String): DataSet[LabeledVector] = { + MLUtils.readLibSVM(executionEnvironment, path) + } + } + + /** Pimp my [[DataSet]] to directly support `writeAsLibSVM` + * + * @param dataSet + */ + implicit class RichLabeledDataSet(dataSet: DataSet[LabeledVector]) { + def writeAsLibSVM(path: String): DataSink[String] = { + MLUtils.writeLibSVM(path, dataSet) + } + } + + implicit class RichDataSet[T](dataSet: DataSet[T]) { + def mapWithBcVariable[B, O: TypeInformation: ClassTag]( + broadcastVariable: DataSet[B])( + fun: (T, B) => O) + : DataSet[O] = { + dataSet.map(new BroadcastSingleElementMapper[T, B, O](dataSet.clean(fun))) + .withBroadcastSet(broadcastVariable, "broadcastVariable") + } + + def filterWithBcVariable[B, O](broadcastVariable: DataSet[B])(fun: (T, B) => Boolean) + : DataSet[T] = { + dataSet.filter(new BroadcastSingleElementFilter[T, B](dataSet.clean(fun))) + .withBroadcastSet(broadcastVariable, "broadcastVariable") + } + + def mapWithBcVariableIteration[B, O: TypeInformation: ClassTag]( + broadcastVariable: DataSet[B])(fun: (T, B, Int) => O) + : DataSet[O] = { + dataSet.map(new BroadcastSingleElementMapperWithIteration[T, B, O](dataSet.clean(fun))) + .withBroadcastSet(broadcastVariable, "broadcastVariable") + } + } + + private class BroadcastSingleElementMapper[T, B, O]( + fun: (T, B) => O) + extends RichMapFunction[T, O] { + var broadcastVariable: B = _ + + @throws(classOf[Exception]) + override def open(configuration: Configuration): Unit = { + broadcastVariable = getRuntimeContext.getBroadcastVariable[B]("broadcastVariable").get(0) + } + + override def map(value: T): O = { + fun(value, broadcastVariable) + } + } + + private class BroadcastSingleElementMapperWithIteration[T, B, O]( + fun: (T, B, Int) => O) + extends RichMapFunction[T, O] { + var broadcastVariable: B = _ + + @throws(classOf[Exception]) + override def open(configuration: Configuration): Unit = { + broadcastVariable = getRuntimeContext.getBroadcastVariable[B]("broadcastVariable").get(0) + } + + override def map(value: T): O = { + fun(value, broadcastVariable, getIterationRuntimeContext.getSuperstepNumber) + } + } + + private class BroadcastSingleElementFilter[T, B]( + fun: (T, B) => Boolean) + extends RichFilterFunction[T] { + var broadcastVariable: B = _ + + @throws(classOf[Exception]) + override def open(configuration: Configuration): Unit = { + broadcastVariable = getRuntimeContext.getBroadcastVariable[B]("broadcastVariable").get(0) + } + + override def filter(value: T): Boolean = { + fun(value, broadcastVariable) + } + } +} diff --git a/src/main/scala/org/apache/flink/ml/pipeline/ChainedPredictor.scala b/src/main/scala/org/apache/flink/ml/pipeline/ChainedPredictor.scala new file mode 100644 index 0000000000000..1fc1e0f939fa0 --- /dev/null +++ b/src/main/scala/org/apache/flink/ml/pipeline/ChainedPredictor.scala @@ -0,0 +1,139 @@ +/* + * Licensed 
to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.ml.pipeline + +import org.apache.flink.api.common.typeinfo.TypeInformation +import org.apache.flink.api.scala.DataSet +import org.apache.flink.ml.common.ParameterMap + +/** [[Predictor]] which represents a pipeline of possibly multiple [[Transformer]] and a trailing + * [[Predictor]]. + * + * The [[ChainedPredictor]] can be used as a regular [[Predictor]]. Upon calling the fit method, + * the input data is piped through all preceding [[Transformer]] in the pipeline and the resulting + * data is given to the trailing [[Predictor]]. The same holds true for the predict operation. + * + * The pipeline mechanism has been inspired by scikit-learn + * + * @param transformer Preceding [[Transformer]] of the pipeline + * @param predictor Trailing [[Predictor]] of the pipeline + * @tparam T Type of the preceding [[Transformer]] + * @tparam P Type of the trailing [[Predictor]] + */ +case class ChainedPredictor[T <: Transformer[T], P <: Predictor[P]](transformer: T, predictor: P) + extends Predictor[ChainedPredictor[T, P]]{} + +object ChainedPredictor{ + + /** [[PredictDataSetOperation]] for the [[ChainedPredictor]]. + * + * The [[PredictDataSetOperation]] requires the [[TransformDataSetOperation]] of the preceding + * [[Transformer]] and the [[PredictDataSetOperation]] of the trailing [[Predictor]]. Upon + * calling predict, the testing data is first transformed by the preceding [[Transformer]] and + * the result is then used to calculate the prediction via the trailing [[Predictor]]. 
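+   * A minimal usage sketch; the concrete transformer and predictor are assumptions for
+   * illustration and not required by this operation:
+   * {{{
+   *   val pipeline = StandardScaler().chainPredictor(MultipleLinearRegression())
+   *   pipeline.fit(trainingDS)
+   *   val predictions = pipeline.predict(testingDS)
+   * }}}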
+ * + * @param transformOperation [[TransformDataSetOperation]] for the preceding [[Transformer]] + * @param predictOperation [[PredictDataSetOperation]] for the trailing [[Predictor]] + * @tparam T Type of the preceding [[Transformer]] + * @tparam P Type of the trailing [[Predictor]] + * @tparam Testing Type of the testing data + * @tparam Intermediate Type of the intermediate data produced by the preceding [[Transformer]] + * @tparam Prediction Type of the predicted data generated by the trailing [[Predictor]] + * @return + */ + implicit def chainedPredictOperation[ + T <: Transformer[T], + P <: Predictor[P], + Testing, + Intermediate, + Prediction]( + implicit transformOperation: TransformDataSetOperation[T, Testing, Intermediate], + predictOperation: PredictDataSetOperation[P, Intermediate, Prediction]) + : PredictDataSetOperation[ChainedPredictor[T, P], Testing, Prediction] = { + + new PredictDataSetOperation[ChainedPredictor[T, P], Testing, Prediction] { + override def predictDataSet( + instance: ChainedPredictor[T, P], + predictParameters: ParameterMap, + input: DataSet[Testing]) + : DataSet[Prediction] = { + + val testing = instance.transformer.transform(input, predictParameters) + instance.predictor.predict(testing, predictParameters) + } + } + } + + /** [[FitOperation]] for the [[ChainedPredictor]]. + * + * The [[FitOperation]] requires the [[FitOperation]] and the [[TransformDataSetOperation]] of + * the preceding [[Transformer]] as well as the [[FitOperation]] of the trailing [[Predictor]]. + * Upon calling fit, the preceding [[Transformer]] is first fitted to the training data. + * The training data is then transformed by the fitted [[Transformer]]. The transformed data + * is then used to fit the [[Predictor]]. + * + * @param fitOperation [[FitOperation]] of the preceding [[Transformer]] + * @param transformOperation [[TransformDataSetOperation]] of the preceding [[Transformer]] + * @param predictorFitOperation [[PredictDataSetOperation]] of the trailing [[Predictor]] + * @tparam L Type of the preceding [[Transformer]] + * @tparam R Type of the trailing [[Predictor]] + * @tparam I Type of the training data + * @tparam T Type of the intermediate data + * @return + */ + implicit def chainedFitOperation[L <: Transformer[L], R <: Predictor[R], I, T](implicit + fitOperation: FitOperation[L, I], + transformOperation: TransformDataSetOperation[L, I, T], + predictorFitOperation: FitOperation[R, T]): FitOperation[ChainedPredictor[L, R], I] = { + new FitOperation[ChainedPredictor[L, R], I] { + override def fit( + instance: ChainedPredictor[L, R], + fitParameters: ParameterMap, + input: DataSet[I]) + : Unit = { + instance.transformer.fit(input, fitParameters) + val intermediateResult = instance.transformer.transform(input, fitParameters) + instance.predictor.fit(intermediateResult, fitParameters) + } + } + } + + implicit def chainedEvaluationOperation[ + T <: Transformer[T], + P <: Predictor[P], + Testing, + Intermediate, + PredictionValue]( + implicit transformOperation: TransformDataSetOperation[T, Testing, Intermediate], + evaluateOperation: EvaluateDataSetOperation[P, Intermediate, PredictionValue], + testingTypeInformation: TypeInformation[Testing], + predictionValueTypeInformation: TypeInformation[PredictionValue]) + : EvaluateDataSetOperation[ChainedPredictor[T, P], Testing, PredictionValue] = { + new EvaluateDataSetOperation[ChainedPredictor[T, P], Testing, PredictionValue] { + override def evaluateDataSet( + instance: ChainedPredictor[T, P], + evaluateParameters: ParameterMap, 
+ testing: DataSet[Testing]) + : DataSet[(PredictionValue, PredictionValue)] = { + val intermediate = instance.transformer.transform(testing, evaluateParameters) + instance.predictor.evaluate(intermediate, evaluateParameters) + } + } + } +} diff --git a/src/main/scala/org/apache/flink/ml/pipeline/ChainedTransformer.scala b/src/main/scala/org/apache/flink/ml/pipeline/ChainedTransformer.scala new file mode 100644 index 0000000000000..4d001d1569204 --- /dev/null +++ b/src/main/scala/org/apache/flink/ml/pipeline/ChainedTransformer.scala @@ -0,0 +1,110 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.ml.pipeline + +import org.apache.flink.api.scala.DataSet +import org.apache.flink.ml.common.ParameterMap + +/** [[Transformer]] which represents the chaining of two [[Transformer]]. + * + * A [[ChainedTransformer]] can be treated as regular [[Transformer]]. Upon calling the fit or + * transform operation, the data is piped through all [[Transformer]] of the pipeline. + * + * The pipeline mechanism has been inspired by scikit-learn + * + * @param left Left [[Transformer]] of the pipeline + * @param right Right [[Transformer]] of the pipeline + * @tparam L Type of the left [[Transformer]] + * @tparam R Type of the right [[Transformer]] + */ +case class ChainedTransformer[L <: Transformer[L], R <: Transformer[R]](left: L, right: R) + extends Transformer[ChainedTransformer[L, R]] { +} + +object ChainedTransformer{ + + /** [[TransformDataSetOperation]] implementation for [[ChainedTransformer]]. + * + * First the transform operation of the left [[Transformer]] is called with the input data. This + * generates intermediate data which is fed to the right [[Transformer]]'s transform operation. 
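+   * A minimal sketch chaining two of the transformers added here (trainingDS is an assumed
+   * DataSet of a supported vector type):
+   * {{{
+   *   val chained = MinMaxScaler().chainTransformer(StandardScaler())
+   *   chained.fit(trainingDS)
+   *   val scaledDS = chained.transform(trainingDS)
+   * }}}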
+ * + * @param transformOpLeft [[TransformDataSetOperation]] for the left [[Transformer]] + * @param transformOpRight [[TransformDataSetOperation]] for the right [[Transformer]] + * @tparam L Type of the left [[Transformer]] + * @tparam R Type of the right [[Transformer]] + * @tparam I Type of the input data + * @tparam T Type of the intermediate output data + * @tparam O Type of the output data + * @return + */ + implicit def chainedTransformOperation[ + L <: Transformer[L], + R <: Transformer[R], + I, + T, + O](implicit + transformOpLeft: TransformDataSetOperation[L, I, T], + transformOpRight: TransformDataSetOperation[R, T, O]) + : TransformDataSetOperation[ChainedTransformer[L,R], I, O] = { + + new TransformDataSetOperation[ChainedTransformer[L, R], I, O] { + override def transformDataSet( + chain: ChainedTransformer[L, R], + transformParameters: ParameterMap, + input: DataSet[I]): DataSet[O] = { + val intermediateResult = transformOpLeft.transformDataSet( + chain.left, + transformParameters, + input) + transformOpRight.transformDataSet(chain.right, transformParameters, intermediateResult) + } + } + } + + /** [[FitOperation]] implementation for [[ChainedTransformer]]. + * + * First the fit operation of the left [[Transformer]] is called with the input data. Then + * the data is transformed by this [[Transformer]] and the given to the fit operation of the + * right [[Transformer]]. + * + * @param leftFitOperation [[FitOperation]] for the left [[Transformer]] + * @param leftTransformOperation [[TransformDataSetOperation]] for the left [[Transformer]] + * @param rightFitOperation [[FitOperation]] for the right [[Transformer]] + * @tparam L Type of the left [[Transformer]] + * @tparam R Type of the right [[Transformer]] + * @tparam I Type of the input data + * @tparam T Type of the intermediate output data + * @return + */ + implicit def chainedFitOperation[L <: Transformer[L], R <: Transformer[R], I, T](implicit + leftFitOperation: FitOperation[L, I], + leftTransformOperation: TransformDataSetOperation[L, I, T], + rightFitOperation: FitOperation[R, T]): FitOperation[ChainedTransformer[L, R], I] = { + new FitOperation[ChainedTransformer[L, R], I] { + override def fit( + instance: ChainedTransformer[L, R], + fitParameters: ParameterMap, + input: DataSet[I]): Unit = { + instance.left.fit(input, fitParameters) + val intermediateResult = instance.left.transform(input, fitParameters) + instance.right.fit(intermediateResult, fitParameters) + } + } + } +} diff --git a/src/main/scala/org/apache/flink/ml/pipeline/Estimator.scala b/src/main/scala/org/apache/flink/ml/pipeline/Estimator.scala new file mode 100644 index 0000000000000..e4f8e65ea22b0 --- /dev/null +++ b/src/main/scala/org/apache/flink/ml/pipeline/Estimator.scala @@ -0,0 +1,181 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.ml.pipeline + +import scala.reflect.ClassTag +import scala.reflect.runtime.universe._ + +import org.apache.flink.api.scala.DataSet +import org.apache.flink.ml.common.{FlinkMLTools, ParameterMap, WithParameters} + +/** Base trait for Flink's pipeline operators. + * + * An estimator can be fitted to input data. In order to do that the implementing class has + * to provide an implementation of a [[FitOperation]] with the correct input type. In order to make + * the [[FitOperation]] retrievable by the Scala compiler, the implementation should be placed + * in the companion object of the implementing class. + * + * The pipeline mechanism has been inspired by scikit-learn + * + * @tparam Self + */ +trait Estimator[Self] extends WithParameters { + that: Self => + + /** Fits the estimator to the given input data. The fitting logic is contained in the + * [[FitOperation]]. The computed state will be stored in the implementing class. + * + * @param training Training data + * @param fitParameters Additional parameters for the [[FitOperation]] + * @param fitOperation [[FitOperation]] which encapsulates the algorithm logic + * @tparam Training Type of the training data + * @return + */ + def fit[Training]( + training: DataSet[Training], + fitParameters: ParameterMap = ParameterMap.Empty)(implicit + fitOperation: FitOperation[Self, Training]): Unit = { + FlinkMLTools.registerFlinkMLTypes(training.getExecutionEnvironment) + fitOperation.fit(this, fitParameters, training) + } +} + +object Estimator{ + + /** Fallback [[FitOperation]] type class implementation which is used if no other + * [[FitOperation]] with the right input types could be found in the scope of the implementing + * class. The fallback [[FitOperation]] makes the system fail in the pre-flight phase by + * throwing a [[RuntimeException]] which states the reason for the failure. Usually the error + * is a missing [[FitOperation]] implementation for the input types or the wrong chaining + * of pipeline operators which have incompatible input/output types. + * + * @tparam Self Type of the pipeline operator + * @tparam Training Type of training data + * @return + */ + implicit def fallbackFitOperation[ + Self: TypeTag, + Training: TypeTag] + : FitOperation[Self, Training] = { + new FitOperation[Self, Training]{ + override def fit( + instance: Self, + fitParameters: ParameterMap, + input: DataSet[Training]) + : Unit = { + val self = typeOf[Self] + val training = typeOf[Training] + + throw new RuntimeException("There is no FitOperation defined for " + self + + " which trains on a DataSet[" + training + "]") + } + } + } + + /** Fallback [[PredictDataSetOperation]] if a [[Predictor]] is called with a not supported input + * data type. The fallback [[PredictDataSetOperation]] lets the system fail with a + * [[RuntimeException]] stating which input and output data types were inferred but for which no + * [[PredictDataSetOperation]] could be found. 
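+   *
+   * For example, calling predict with an unsupported input type fails before execution
+   * (svm and stringDS are assumed names, purely for illustration):
+   * {{{
+   *   // throws RuntimeException: "There is no PredictOperation defined for ...
+   *   // which takes a DataSet[String] as input."
+   *   svm.predict(stringDS)
+   * }}}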
+ * + * @tparam Self Type of the [[Predictor]] + * @tparam Testing Type of the testing data + * @return + */ + implicit def fallbackPredictOperation[ + Self: TypeTag, + Testing: TypeTag] + : PredictDataSetOperation[Self, Testing, Any] = { + new PredictDataSetOperation[Self, Testing, Any] { + override def predictDataSet( + instance: Self, + predictParameters: ParameterMap, + input: DataSet[Testing]) + : DataSet[Any] = { + val self = typeOf[Self] + val testing = typeOf[Testing] + + throw new RuntimeException("There is no PredictOperation defined for " + self + + " which takes a DataSet[" + testing + "] as input.") + } + } + } + + /** Fallback [[TransformDataSetOperation]] for [[Transformer]] which do not support the input or + * output type with which they are called. This is usualy the case if pipeline operators are + * chained which have incompatible input/output types. In order to detect these failures, the + * fallback [[TransformDataSetOperation]] throws a [[RuntimeException]] with the corresponding + * input/output types. Consequently, a wrong pipeline will be detected at pre-flight phase of + * Flink and thus prior to execution time. + * + * @tparam Self Type of the [[Transformer]] for which the [[TransformDataSetOperation]] is + * defined + * @tparam IN Input data type of the [[TransformDataSetOperation]] + * @return + */ + implicit def fallbackTransformOperation[ + Self: TypeTag, + IN: TypeTag] + : TransformDataSetOperation[Self, IN, Any] = { + new TransformDataSetOperation[Self, IN, Any] { + override def transformDataSet( + instance: Self, + transformParameters: ParameterMap, + input: DataSet[IN]) + : DataSet[Any] = { + val self = typeOf[Self] + val in = typeOf[IN] + + throw new RuntimeException("There is no TransformOperation defined for " + + self + " which takes a DataSet[" + in + + "] as input.") + } + } + } + + implicit def fallbackEvaluateOperation[ + Self: TypeTag, + Testing: TypeTag] + : EvaluateDataSetOperation[Self, Testing, Any] = { + new EvaluateDataSetOperation[Self, Testing, Any] { + override def evaluateDataSet( + instance: Self, + predictParameters: ParameterMap, + input: DataSet[Testing]) + : DataSet[(Any, Any)] = { + val self = typeOf[Self] + val testing = typeOf[Testing] + + throw new RuntimeException("There is no PredictOperation defined for " + self + + " which takes a DataSet[" + testing + "] as input.") + } + } + } +} + +/** Type class for the fit operation of an [[Estimator]]. + * + * The [[FitOperation]] contains a self type parameter so that the Scala compiler looks into + * the companion object of this class to find implicit values. + * + * @tparam Self Type of the [[Estimator]] subclass for which the [[FitOperation]] is defined + * @tparam Training Type of the training data + */ +trait FitOperation[Self, Training]{ + def fit(instance: Self, fitParameters: ParameterMap, input: DataSet[Training]): Unit +} diff --git a/src/main/scala/org/apache/flink/ml/pipeline/Predictor.scala b/src/main/scala/org/apache/flink/ml/pipeline/Predictor.scala new file mode 100644 index 0000000000000..0c06316399a82 --- /dev/null +++ b/src/main/scala/org/apache/flink/ml/pipeline/Predictor.scala @@ -0,0 +1,258 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.ml.pipeline + +import org.apache.flink.api.common.typeinfo.TypeInformation + +import org.apache.flink.api.scala._ +import org.apache.flink.ml._ +import org.apache.flink.ml.common.{FlinkMLTools, ParameterMap, WithParameters} + +/** Predictor trait for Flink's pipeline operators. + * + * A [[Predictor]] calculates predictions for testing data based on the model it learned during + * the fit operation (training phase). In order to do that, the implementing class has to provide + * a [[FitOperation]] and a [[PredictDataSetOperation]] implementation for the correct types. The + * implicit values should be put into the scope of the companion object of the implementing class + * to make them retrievable for the Scala compiler. + * + * The pipeline mechanism has been inspired by scikit-learn + * + * @tparam Self Type of the implementing class + */ +trait Predictor[Self] extends Estimator[Self] with WithParameters { + that: Self => + + /** Predict testing data according the learned model. The implementing class has to provide + * a corresponding implementation of [[PredictDataSetOperation]] which contains the prediction + * logic. + * + * @param testing Testing data which shall be predicted + * @param predictParameters Additional parameters for the prediction + * @param predictor [[PredictDataSetOperation]] which encapsulates the prediction logic + * @tparam Testing Type of the testing data + * @tparam Prediction Type of the prediction data + * @return + */ + def predict[Testing, Prediction]( + testing: DataSet[Testing], + predictParameters: ParameterMap = ParameterMap.Empty)(implicit + predictor: PredictDataSetOperation[Self, Testing, Prediction]) + : DataSet[Prediction] = { + FlinkMLTools.registerFlinkMLTypes(testing.getExecutionEnvironment) + predictor.predictDataSet(this, predictParameters, testing) + } + + /** Evaluates the testing data by computing the prediction value and returning a pair of true + * label value and prediction value. It is important that the implementation chooses a Testing + * type from which it can extract the true label value. + * + * @param testing + * @param evaluateParameters + * @param evaluator + * @tparam Testing + * @tparam PredictionValue + * @return + */ + def evaluate[Testing, PredictionValue]( + testing: DataSet[Testing], + evaluateParameters: ParameterMap = ParameterMap.Empty)(implicit + evaluator: EvaluateDataSetOperation[Self, Testing, PredictionValue]) + : DataSet[(PredictionValue, PredictionValue)] = { + FlinkMLTools.registerFlinkMLTypes(testing.getExecutionEnvironment) + evaluator.evaluateDataSet(this, evaluateParameters, testing) + } +} + +object Predictor { + + /** Default [[PredictDataSetOperation]] which takes a [[PredictOperation]] to calculate a tuple + * of testing element and its prediction value. 
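+    *
+    * As a consequence, a predictor whose [[PredictOperation]] maps a Vector to a Double yields
+    * pairs of input element and prediction (a sketch with assumed names):
+    * {{{
+    *   val predictions: DataSet[(Vector, Double)] = predictor.predict(testingDS)
+    * }}}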
+ * + * Note: We have to put the TypeInformation implicit values for Testing and PredictionValue after + * the PredictOperation implicit parameter. Otherwise, if it's defined as a context bound, then + * the Scala compiler does not find the implicit [[PredictOperation]] value. + * + * @param predictOperation + * @param testingTypeInformation + * @param predictionValueTypeInformation + * @tparam Instance + * @tparam Model + * @tparam Testing + * @tparam PredictionValue + * @return + */ + implicit def defaultPredictDataSetOperation[ + Instance <: Estimator[Instance], + Model, + Testing, + PredictionValue]( + implicit predictOperation: PredictOperation[Instance, Model, Testing, PredictionValue], + testingTypeInformation: TypeInformation[Testing], + predictionValueTypeInformation: TypeInformation[PredictionValue]) + : PredictDataSetOperation[Instance, Testing, (Testing, PredictionValue)] = { + new PredictDataSetOperation[Instance, Testing, (Testing, PredictionValue)] { + override def predictDataSet( + instance: Instance, + predictParameters: ParameterMap, + input: DataSet[Testing]) + : DataSet[(Testing, PredictionValue)] = { + val resultingParameters = instance.parameters ++ predictParameters + + val model = predictOperation.getModel(instance, resultingParameters) + + implicit val resultTypeInformation = createTypeInformation[(Testing, PredictionValue)] + + input.mapWithBcVariable(model){ + (element, model) => { + (element, predictOperation.predict(element, model)) + } + } + } + } + } + + /** Default [[EvaluateDataSetOperation]] which takes a [[PredictOperation]] to calculate a tuple + * of true label value and predicted label value. + * + * Note: We have to put the TypeInformation implicit values for Testing and PredictionValue after + * the PredictOperation implicit parameter. Otherwise, if it's defined as a context bound, then + * the Scala compiler does not find the implicit [[PredictOperation]] value. + * + * @param predictOperation + * @param testingTypeInformation + * @param predictionValueTypeInformation + * @tparam Instance + * @tparam Model + * @tparam Testing + * @tparam PredictionValue + * @return + */ + implicit def defaultEvaluateDataSetOperation[ + Instance <: Estimator[Instance], + Model, + Testing, + PredictionValue]( + implicit predictOperation: PredictOperation[Instance, Model, Testing, PredictionValue], + testingTypeInformation: TypeInformation[Testing], + predictionValueTypeInformation: TypeInformation[PredictionValue]) + : EvaluateDataSetOperation[Instance, (Testing, PredictionValue), PredictionValue] = { + new EvaluateDataSetOperation[Instance, (Testing, PredictionValue), PredictionValue] { + override def evaluateDataSet( + instance: Instance, + evaluateParameters: ParameterMap, + testing: DataSet[(Testing, PredictionValue)]) + : DataSet[(PredictionValue, PredictionValue)] = { + val resultingParameters = instance.parameters ++ evaluateParameters + val model = predictOperation.getModel(instance, resultingParameters) + + implicit val resultTypeInformation = createTypeInformation[(Testing, PredictionValue)] + + testing.mapWithBcVariable(model){ + (element, model) => { + (element._2, predictOperation.predict(element._1, model)) + } + } + } + } + } +} + +/** Type class for the predict operation of [[Predictor]]. This predict operation works on DataSets. + * + * [[Predictor]]s either have to implement this trait or the [[PredictOperation]] trait. The + * implementation has to be made available as an implicit value or function in the scope of + * their companion objects. 
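+  *
+  * A minimal sketch of how such an implementation is usually exposed; MyPredictor and the
+  * constant prediction are assumptions for illustration only:
+  * {{{
+  *   object MyPredictor {
+  *     implicit val myPredictOp =
+  *       new PredictDataSetOperation[MyPredictor, Vector, (Vector, Double)] {
+  *         override def predictDataSet(
+  *             instance: MyPredictor,
+  *             predictParameters: ParameterMap,
+  *             input: DataSet[Vector]): DataSet[(Vector, Double)] = {
+  *           input.map(v => (v, 0.0)) // placeholder prediction logic
+  *         }
+  *       }
+  *   }
+  * }}}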
+ * + * The first type parameter is the type of the implementing [[Predictor]] class so that the Scala + * compiler includes the companion object of this class in the search scope for the implicit + * values. + * + * @tparam Self Type of [[Predictor]] implementing class + * @tparam Testing Type of testing data + * @tparam Prediction Type of predicted data + */ +trait PredictDataSetOperation[Self, Testing, Prediction] extends Serializable{ + + /** Calculates the predictions for all elements in the [[DataSet]] input + * + * @param instance The Predictor instance that we will use to make the predictions + * @param predictParameters The parameters for the prediction + * @param input The DataSet containing the unlabeled examples + * @return + */ + def predictDataSet( + instance: Self, + predictParameters: ParameterMap, + input: DataSet[Testing]) + : DataSet[Prediction] +} + +/** Type class for predict operation. It takes an element and the model and then computes the + * prediction value for this element. + * + * It is sufficient for a [[Predictor]] to only implement this trait to support the evaluate and + * predict method. + * + * @tparam Instance The concrete type of the [[Predictor]] that we will use for predictions + * @tparam Model The representation of the predictive model for the algorithm, for example a + * Vector of weights + * @tparam Testing The type of the example that we will use to make the predictions (input) + * @tparam Prediction The type of the label that the prediction operation will produce (output) + * + */ +trait PredictOperation[Instance, Model, Testing, Prediction] extends Serializable{ + + /** Defines how to retrieve the model of the type for which this operation was defined + * + * @param instance The Predictor instance that we will use to make the predictions + * @param predictParameters The parameters for the prediction + * @return A DataSet with the model representation as its only element + */ + def getModel(instance: Instance, predictParameters: ParameterMap): DataSet[Model] + + /** Calculates the prediction for a single element given the model of the [[Predictor]]. + * + * @param value The unlabeled example on which we make the prediction + * @param model The model representation of the prediciton algorithm + * @return A label for the provided example of type [[Prediction]] + */ + def predict(value: Testing, model: Model): + Prediction +} + +/** Type class for the evaluate operation of [[Predictor]]. This evaluate operation works on + * DataSets. + * + * It takes a [[DataSet]] of some type. For each element of this [[DataSet]] the evaluate method + * computes the prediction value and returns a tuple of true label value and prediction value. 
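+  *
+  * For instance, evaluating a fitted predictor on labelled test pairs yields tuples of
+  * (trueLabel, predictedLabel); a sketch where testingDS is an assumed DataSet[(Vector, Double)]:
+  * {{{
+  *   val evaluation: DataSet[(Double, Double)] = predictor.evaluate(testingDS)
+  * }}}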
+ * + * @tparam Instance The concrete type of the Predictor instance that we will use to make the + * predictions + * @tparam Testing The type of the example that we will use to make the predictions (input) + * @tparam Prediction The type of the label that the prediction operation will produce (output) + * + */ +trait EvaluateDataSetOperation[Instance, Testing, Prediction] extends Serializable{ + def evaluateDataSet( + instance: Instance, + evaluateParameters: ParameterMap, + testing: DataSet[Testing]) + : DataSet[(Prediction, Prediction)] +} diff --git a/src/main/scala/org/apache/flink/ml/pipeline/Transformer.scala b/src/main/scala/org/apache/flink/ml/pipeline/Transformer.scala new file mode 100644 index 0000000000000..892179b1b331b --- /dev/null +++ b/src/main/scala/org/apache/flink/ml/pipeline/Transformer.scala @@ -0,0 +1,164 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.ml.pipeline + +import org.apache.flink.api.common.typeinfo.TypeInformation +import org.apache.flink.api.scala.DataSet +import org.apache.flink.ml._ +import org.apache.flink.ml.common.{FlinkMLTools, ParameterMap, WithParameters} + +import scala.reflect.ClassTag + +/** Transformer trait for Flink's pipeline operators. + * + * A Transformer transforms a [[DataSet]] of an input type into a [[DataSet]] of an output type. + * Furthermore, a [[Transformer]] is also an [[Estimator]], because some transformations depend + * on the training data. In order to do that the implementing class has to provide a + * [[TransformDataSetOperation]] and [[FitOperation]] implementation. The Scala compiler finds + * these implicit values if it is put in the scope of the companion object of the implementing + * class. + * + * [[Transformer]] can be chained with other [[Transformer]] and [[Predictor]] to create + * pipelines. These pipelines can consist of an arbitrary number of [[Transformer]] and at most + * one trailing [[Predictor]]. + * + * The pipeline mechanism has been inspired by scikit-learn + * + * @tparam Self + */ +trait Transformer[Self <: Transformer[Self]] + extends Estimator[Self] + with WithParameters + with Serializable { + that: Self => + + /** Transform operation which transforms an input [[DataSet]] of type I into an ouptut [[DataSet]] + * of type O. The actual transform operation is implemented within the + * [[TransformDataSetOperation]]. 
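+    *
+    * Typical usage with a transformer added in this change (trainingDS is an assumed
+    * DataSet[Vector]):
+    * {{{
+    *   val polyFeatures = PolynomialFeatures().setDegree(2)
+    *   polyFeatures.fit(trainingDS)   // a no-op fit for this transformer
+    *   val expandedDS = polyFeatures.transform(trainingDS)
+    * }}}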
+ * + * @param input Input [[DataSet]] of type I + * @param transformParameters Additional parameters for the [[TransformDataSetOperation]] + * @param transformOperation [[TransformDataSetOperation]] which encapsulates the algorithm's + * logic + * @tparam Input Input data type + * @tparam Output Ouptut data type + * @return + */ + def transform[Input, Output]( + input: DataSet[Input], + transformParameters: ParameterMap = ParameterMap.Empty) + (implicit transformOperation: TransformDataSetOperation[Self, Input, Output]) + : DataSet[Output] = { + FlinkMLTools.registerFlinkMLTypes(input.getExecutionEnvironment) + transformOperation.transformDataSet(that, transformParameters, input) + } + + /** Chains two [[Transformer]] to form a [[ChainedTransformer]]. + * + * @param transformer Right side transformer of the resulting pipeline + * @tparam T Type of the [[Transformer]] + * @return + */ + def chainTransformer[T <: Transformer[T]](transformer: T): ChainedTransformer[Self, T] = { + ChainedTransformer(this, transformer) + } + + /** Chains a [[Transformer]] with a [[Predictor]] to form a [[ChainedPredictor]]. + * + * @param predictor Trailing [[Predictor]] of the resulting pipeline + * @tparam P Type of the [[Predictor]] + * @return + */ + def chainPredictor[P <: Predictor[P]](predictor: P): ChainedPredictor[Self, P] = { + ChainedPredictor(this, predictor) + } +} + +object Transformer{ + implicit def defaultTransformDataSetOperation[ + Instance <: Estimator[Instance], + Model, + Input, + Output]( + implicit transformOperation: TransformOperation[Instance, Model, Input, Output], + outputTypeInformation: TypeInformation[Output], + outputClassTag: ClassTag[Output]) + : TransformDataSetOperation[Instance, Input, Output] = { + new TransformDataSetOperation[Instance, Input, Output] { + override def transformDataSet( + instance: Instance, + transformParameters: ParameterMap, + input: DataSet[Input]) + : DataSet[Output] = { + val resultingParameters = instance.parameters ++ transformParameters + val model = transformOperation.getModel(instance, resultingParameters) + + input.mapWithBcVariable(model){ + (element, model) => transformOperation.transform(element, model) + } + } + } + } +} + +/** Type class for a transform operation of [[Transformer]]. This works on [[DataSet]] of elements. + * + * The [[TransformDataSetOperation]] contains a self type parameter so that the Scala compiler + * looks into the companion object of this class to find implicit values. + * + * @tparam Instance Type of the [[Transformer]] for which the [[TransformDataSetOperation]] is + * defined + * @tparam Input Input data type + * @tparam Output Ouptut data type + */ +trait TransformDataSetOperation[Instance, Input, Output] extends Serializable{ + def transformDataSet( + instance: Instance, + transformParameters: ParameterMap, + input: DataSet[Input]) + : DataSet[Output] +} + +/** Type class for a transform operation which works on a single element and the corresponding model + * of the [[Transformer]]. + * + * @tparam Instance + * @tparam Model + * @tparam Input + * @tparam Output + */ +trait TransformOperation[Instance, Model, Input, Output] extends Serializable{ + + /** Retrieves the model of the [[Transformer]] for which this operation has been defined. 
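+    *
+    * A minimal sketch; MyCenteringScaler and its meanOption are assumptions, and the returned
+    * single-element [[DataSet]] is broadcast to each transform call:
+    * {{{
+    *   override def getModel(instance: MyCenteringScaler, parameters: ParameterMap)
+    *     : DataSet[Double] = {
+    *     instance.meanOption.getOrElse(
+    *       throw new RuntimeException("MyCenteringScaler has not been fitted to data."))
+    *   }
+    * }}}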
+ * + * @param instance + * @param transformParemters + * @return + */ + def getModel(instance: Instance, transformParemters: ParameterMap): DataSet[Model] + + /** Transforms a single element with respect to the model associated with the respective + * [[Transformer]] + * + * @param element + * @param model + * @return + */ + def transform(element: Input, model: Model): Output +} diff --git a/src/main/scala/org/apache/flink/ml/preprocessing/MinMaxScaler.scala b/src/main/scala/org/apache/flink/ml/preprocessing/MinMaxScaler.scala new file mode 100644 index 0000000000000..049af244d735b --- /dev/null +++ b/src/main/scala/org/apache/flink/ml/preprocessing/MinMaxScaler.scala @@ -0,0 +1,265 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.flink.ml.preprocessing + +import breeze.linalg +import breeze.linalg.{max, min} +import org.apache.flink.api.common.typeinfo.TypeInformation +import org.apache.flink.api.scala._ +import org.apache.flink.ml._ +import org.apache.flink.ml.common.{LabeledVector, Parameter, ParameterMap} +import org.apache.flink.ml.math.Breeze._ +import org.apache.flink.ml.math.{BreezeVectorConverter, Vector} +import org.apache.flink.ml.pipeline.{TransformDataSetOperation, FitOperation, +Transformer} +import org.apache.flink.ml.preprocessing.MinMaxScaler.{Max, Min} + +import scala.reflect.ClassTag + +/** Scales observations, so that all features are in a user-specified range. + * By default for [[MinMaxScaler]] transformer range = [0,1]. + * + * This transformer takes a subtype of [[Vector]] of values and maps it to a + * scaled subtype of [[Vector]] such that each feature lies between a user-specified range. + * + * This transformer can be prepended to all [[Transformer]] and + * [[org.apache.flink.ml.pipeline.Predictor]] implementations which expect as input a subtype + * of [[Vector]] or a [[LabeledVector]]. + * + * @example + * {{{ + * val trainingDS: DataSet[Vector] = env.fromCollection(data) + * val transformer = MinMaxScaler().setMin(-1.0) + * + * transformer.fit(trainingDS) + * val transformedDS = transformer.transform(trainingDS) + * }}} + * + * =Parameters= + * + * - [[Min]]: The minimum value of the range of the transformed data set; by default equal to 0 + * - [[Max]]: The maximum value of the range of the transformed data set; by default + * equal to 1 + */ +class MinMaxScaler extends Transformer[MinMaxScaler] { + + private [preprocessing] var metricsOption: Option[ + DataSet[(linalg.Vector[Double], linalg.Vector[Double])] + ] = None + + /** Sets the minimum for the range of the transformed data + * + * @param min the user-specified minimum value. + * @return the MinMaxScaler instance with its minimum value set to the user-specified value. 
+ */ + def setMin(min: Double): MinMaxScaler = { + parameters.add(Min, min) + this + } + + /** Sets the maximum for the range of the transformed data + * + * @param max the user-specified maximum value. + * @return the MinMaxScaler instance with its minimum value set to the user-specified value. + */ + def setMax(max: Double): MinMaxScaler = { + parameters.add(Max, max) + this + } +} + +object MinMaxScaler { + + // ====================================== Parameters ============================================= + + case object Min extends Parameter[Double] { + override val defaultValue: Option[Double] = Some(0.0) + } + + case object Max extends Parameter[Double] { + override val defaultValue: Option[Double] = Some(1.0) + } + + // ==================================== Factory methods ========================================== + + def apply(): MinMaxScaler = { + new MinMaxScaler() + } + + // ====================================== Operations ============================================= + + /** Trains the [[MinMaxScaler]] by learning the minimum and maximum of each feature of the + * training data. These values are used in the transform step to transform the given input data. + * + * @tparam T Input data type which is a subtype of [[Vector]] + * @return [[FitOperation]] training the [[MinMaxScaler]] on subtypes of [[Vector]] + */ + implicit def fitVectorMinMaxScaler[T <: Vector] = new FitOperation[MinMaxScaler, T] { + override def fit(instance: MinMaxScaler, fitParameters: ParameterMap, input: DataSet[T]) + : Unit = { + val metrics = extractFeatureMinMaxVectors(input) + + instance.metricsOption = Some(metrics) + } + } + + /** Trains the [[MinMaxScaler]] by learning the minimum and maximum of the features of the + * training data which is of type [[LabeledVector]]. The minimum and maximum are used to + * transform the given input data. + * + */ + implicit val fitLabeledVectorMinMaxScaler = { + new FitOperation[MinMaxScaler, LabeledVector] { + override def fit( + instance: MinMaxScaler, + fitParameters: ParameterMap, + input: DataSet[LabeledVector]) + : Unit = { + val vectorDS = input.map(_.vector) + val metrics = extractFeatureMinMaxVectors(vectorDS) + + instance.metricsOption = Some(metrics) + } + } + } + + /** Calculates in one pass over the data the features' minimum and maximum values. + * + * @param dataSet The data set for which we want to calculate the minimum and maximum values. + * @return DataSet containing a single tuple of two vectors (minVector, maxVector). + * The first vector represents the minimum values vector and the second is the maximum + * values vector. + */ + private def extractFeatureMinMaxVectors[T <: Vector](dataSet: DataSet[T]) + : DataSet[(linalg.Vector[Double], linalg.Vector[Double])] = { + + val minMax = dataSet.map { + v => (v.asBreeze, v.asBreeze) + }.reduce { + (minMax1, minMax2) => { + + val tempMinimum = min(minMax1._1, minMax2._1) + val tempMaximum = max(minMax1._2, minMax2._2) + + (tempMinimum, tempMaximum) + } + } + minMax + } + + /** [[TransformDataSetOperation]] which scales input data of subtype of [[Vector]] with respect to + * the calculated minimum and maximum of the training data. The minimum and maximum + * values of the resulting data is configurable. 
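+    *
+    * In effect (restating the scaleVector helper below), each feature value x is rescaled as
+    * {{{
+    *   xScaled = (x - xMin) / (xMax - xMin) * (Max - Min) + Min
+    * }}}
+    * where xMin and xMax are the per-feature extrema learned during fit and [Min, Max] is the
+    * configured target range; features that are constant in the training data are mapped to Min.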
+ * + * @tparam T Type of the input and output data which has to be a subtype of [[Vector]] + * @return [[TransformDataSetOperation]] scaling subtypes of [[Vector]] such that the feature + * values are in the configured range + */ + implicit def transformVectors[T <: Vector : BreezeVectorConverter : TypeInformation : ClassTag] + = { + new TransformDataSetOperation[MinMaxScaler, T, T] { + override def transformDataSet( + instance: MinMaxScaler, + transformParameters: ParameterMap, + input: DataSet[T]) + : DataSet[T] = { + + val resultingParameters = instance.parameters ++ transformParameters + val min = resultingParameters(Min) + val max = resultingParameters(Max) + + instance.metricsOption match { + case Some(metrics) => { + input.mapWithBcVariable(metrics) { + (vector, metrics) => { + val (broadcastMin, broadcastMax) = metrics + scaleVector(vector, broadcastMin, broadcastMax, min, max) + } + } + } + + case None => + throw new RuntimeException("The MinMaxScaler has not been fitted to the data. " + + "This is necessary to estimate the minimum and maximum of the data.") + } + } + } + } + + implicit val transformLabeledVectors = { + new TransformDataSetOperation[MinMaxScaler, LabeledVector, LabeledVector] { + override def transformDataSet(instance: MinMaxScaler, + transformParameters: ParameterMap, + input: DataSet[LabeledVector]): DataSet[LabeledVector] = { + val resultingParameters = instance.parameters ++ transformParameters + val min = resultingParameters(Min) + val max = resultingParameters(Max) + + instance.metricsOption match { + case Some(metrics) => { + input.mapWithBcVariable(metrics) { + (labeledVector, metrics) => { + val (broadcastMin, broadcastMax) = metrics + val LabeledVector(label, vector) = labeledVector + + LabeledVector(label, scaleVector(vector, broadcastMin, broadcastMax, min, max)) + } + } + } + + case None => + throw new RuntimeException("The MinMaxScaler has not been fitted to the data. " + + "This is necessary to estimate the minimum and maximum of the data.") + } + } + } + } + + /** Scales a vector such that it's features lie in the range [min, max] + * + * @param vector Vector to scale + * @param broadcastMin Vector containing for each feature the minimal value in the training set + * @param broadcastMax Vector containing for each feature the maximal value in the training set + * @param min Minimal value of range + * @param max Maximal value of range + * @tparam T Type of [[Vector]] + * @return Scaled feature vector + */ + private def scaleVector[T <: Vector: BreezeVectorConverter]( + vector: T, + broadcastMin: linalg.Vector[Double], + broadcastMax: linalg.Vector[Double], + min: Double, + max: Double) + : T = { + var myVector = vector.asBreeze + + //handle the case where a feature takes only one value + val rangePerFeature = (broadcastMax - broadcastMin) + for (i <- 0 until rangePerFeature.size) { + if (rangePerFeature(i) == 0.0) { + rangePerFeature(i)= 1.0 + } + } + + myVector -= broadcastMin + myVector :/= rangePerFeature + myVector = (myVector :* (max - min)) + min + myVector.fromBreeze + } +} diff --git a/src/main/scala/org/apache/flink/ml/preprocessing/PolynomialFeatures.scala b/src/main/scala/org/apache/flink/ml/preprocessing/PolynomialFeatures.scala new file mode 100644 index 0000000000000..24ef77bb74e05 --- /dev/null +++ b/src/main/scala/org/apache/flink/ml/preprocessing/PolynomialFeatures.scala @@ -0,0 +1,209 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.ml.preprocessing + +import org.apache.flink.api.common.typeinfo.TypeInformation +import org.apache.flink.api.scala.{DataSet, _} +import org.apache.flink.ml.common.{LabeledVector, Parameter, ParameterMap} +import org.apache.flink.ml.math.{Vector, VectorBuilder} +import org.apache.flink.ml.pipeline.{FitOperation, TransformDataSetOperation, Transformer} +import org.apache.flink.ml.preprocessing.PolynomialFeatures.Degree + +import scala.reflect.ClassTag + +/** Maps a vector into the polynomial feature space. + * + * This transformer takes a a vector of values `(x, y, z, ...)` and maps it into the + * polynomial feature space of degree `d`. That is to say, it calculates the following + * representation: + * + * `(x, y, z, x^2, xy, y^2, yz, z^2, x^3, x^2y, x^2z, xyz, ...)^T` + * + * This transformer can be prepended to all [[org.apache.flink.ml.pipeline.Transformer]] and + * [[org.apache.flink.ml.pipeline.Predictor]] implementations which expect an input of + * [[LabeledVector]]. + * + * @example + * {{{ + * val trainingDS: DataSet[LabeledVector] = ... + * + * val polyFeatures = PolynomialFeatures() + * .setDegree(3) + * + * val mlr = MultipleLinearRegression() + * + * val pipeline = polyFeatures.chainPredictor(mlr) + * + * pipeline.fit(trainingDS) + * }}} + * + * =Parameters= + * + * - [[org.apache.flink.ml.preprocessing.PolynomialFeatures.Degree]]: Maximum polynomial degree + */ +class PolynomialFeatures extends Transformer[PolynomialFeatures] { + + def setDegree(degree: Int): PolynomialFeatures = { + parameters.add(Degree, degree) + this + } +} + +object PolynomialFeatures{ + + // ====================================== Parameters ============================================= + + case object Degree extends Parameter[Int] { + override val defaultValue: Option[Int] = Some(1) + } + + // =================================== Factory methods =========================================== + + def apply(): PolynomialFeatures = { + new PolynomialFeatures() + } + + // ====================================== Operations ============================================= + + /** The [[PolynomialFeatures]] transformer does not need a fitting phase. + * + * @tparam T The fitting works with arbitrary input types + * @return + */ + implicit def fitNoOp[T] = { + new FitOperation[PolynomialFeatures, T]{ + override def fit( + instance: PolynomialFeatures, + fitParameters: ParameterMap, + input: DataSet[T]) + : Unit = {} + } + } + + /** [[org.apache.flink.ml.pipeline.TransformDataSetOperation]] to map a [[Vector]] into the + * polynomial feature space. 
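+    *
+    * For example, with degree 2 an input vector (x, y) is mapped to the monomials
+    * {{{
+    *   (x^2, x * y, y^2, x, y)
+    * }}}
+    * that is, one entry per exponent combination produced by calculateCombinations, with the
+    * highest-degree terms first.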
+ * + * @tparam T Subclass of [[Vector]] + * @return + */ + implicit def transformVectorIntoPolynomialBase[ + T <: Vector : VectorBuilder: TypeInformation: ClassTag + ] = { + new TransformDataSetOperation[PolynomialFeatures, T, T] { + override def transformDataSet( + instance: PolynomialFeatures, + transformParameters: ParameterMap, + input: DataSet[T]) + : DataSet[T] = { + val resultingParameters = instance.parameters ++ transformParameters + + val degree = resultingParameters(Degree) + + input.map { + vector => { + calculatePolynomial(degree, vector) + } + } + } + } + } + + /** [[org.apache.flink.ml.pipeline.TransformDataSetOperation]] to map a [[LabeledVector]] into the + * polynomial feature space + */ + implicit val transformLabeledVectorIntoPolynomialBase = + new TransformDataSetOperation[PolynomialFeatures, LabeledVector, LabeledVector] { + + override def transformDataSet( + instance: PolynomialFeatures, + transformParameters: ParameterMap, + input: DataSet[LabeledVector]) + : DataSet[LabeledVector] = { + val resultingParameters = instance.parameters ++ transformParameters + + val degree = resultingParameters(Degree) + + input.map { + labeledVector => { + val vector = labeledVector.vector + val label = labeledVector.label + + val transformedVector = calculatePolynomial(degree, vector) + + LabeledVector(label, transformedVector) + } + } + } + } + + + private def calculatePolynomial[T <: Vector: VectorBuilder](degree: Int, vector: T): T = { + val builder = implicitly[VectorBuilder[T]] + builder.build(calculateCombinedCombinations(degree, vector)) + } + + /** Calculates for a given vector its representation in the polynomial feature space. + * + * @param degree Maximum degree of polynomial + * @param vector Values of the polynomial variables + * @return List of polynomial values + */ + private def calculateCombinedCombinations(degree: Int, vector: Vector): List[Double] = { + if(degree == 0) { + List() + } else { + val partialResult = calculateCombinedCombinations(degree - 1, vector) + + val combinations = calculateCombinations(vector.size, degree) + + val result = combinations map { + combination => + combination.zipWithIndex.map{ + case (exp, idx) => math.pow(vector(idx), exp) + }.fold(1.0)(_ * _) + } + + result ::: partialResult + } + + } + + /** Calculates all possible combinations of a polynom of degree `value`, whereas the polynom + * can consist of up to `length` factors. The return value is the list of the exponents of the + * individual factors + * + * @param length maximum number of factors + * @param value degree of polynomial + * @return List of lists which contain the exponents of the individual factors + */ + private def calculateCombinations(length: Int, value: Int): List[List[Int]] = { + if(length == 0) { + List() + } else if (length == 1) { + List(List(value)) + } else { + value to 0 by -1 flatMap { + v => + calculateCombinations(length - 1, value - v) map { + v::_ + } + } toList + } + } +} diff --git a/src/main/scala/org/apache/flink/ml/preprocessing/Splitter.scala b/src/main/scala/org/apache/flink/ml/preprocessing/Splitter.scala new file mode 100644 index 0000000000000..b216396134321 --- /dev/null +++ b/src/main/scala/org/apache/flink/ml/preprocessing/Splitter.scala @@ -0,0 +1,210 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.ml.preprocessing + +import org.apache.flink.api.common.typeinfo.{TypeInformation, BasicTypeInfo} +import org.apache.flink.api.java.Utils +import org.apache.flink.api.scala._ +import org.apache.flink.api.scala.DataSet +import org.apache.flink.api.scala.utils._ + + +import org.apache.flink.ml.common.{FlinkMLTools, ParameterMap, WithParameters} +import org.apache.flink.util.Collector +import _root_.scala.reflect.ClassTag + +object Splitter { + + case class TrainTestDataSet[T: TypeInformation : ClassTag](training: DataSet[T], + testing: DataSet[T]) + + case class TrainTestHoldoutDataSet[T: TypeInformation : ClassTag](training: DataSet[T], + testing: DataSet[T], + holdout: DataSet[T]) + // -------------------------------------------------------------------------------------------- + // randomSplit + // -------------------------------------------------------------------------------------------- + /** + * Split a DataSet by the probability fraction of each element. + * + * @param input DataSet to be split + * @param fraction Probability that each element is chosen, should be [0,1] This fraction + * refers to the first element in the resulting array. + * @param precise Sampling by default is random and can result in slightly lop-sided + * sample sets. When precise is true, equal sample set size are forced, + * however this is somewhat less efficient. + * @param seed Random number generator seed. + * @return An array of two datasets + */ + + def randomSplit[T: TypeInformation : ClassTag]( + input: DataSet[T], + fraction: Double, + precise: Boolean = false, + seed: Long = Utils.RNG.nextLong()) + : Array[DataSet[T]] = { + import org.apache.flink.api.scala._ + + val indexedInput: DataSet[(Long, T)] = input.zipWithUniqueId + + if ((fraction >= 1) || (fraction <= 0)) { + throw new IllegalArgumentException("sampling fraction outside of (0,1) bounds is nonsensical") + } + + val leftSplit: DataSet[(Long, T)] = precise match { + case false => indexedInput.sample(false, fraction, seed) + case true => { + val count = indexedInput.count() // todo: count only needed for precise and kills perf. + val numOfSamples = math.round(fraction * count).toInt + indexedInput.sampleWithSize(false, numOfSamples, seed) + } + } + + val leftSplitLight = leftSplit.map(o => (o._1, false)) + + val rightSplit: DataSet[T] = indexedInput.leftOuterJoin[(Long, Boolean)](leftSplitLight) + .where(0) + .equalTo(0).apply { + (full: (Long,T) , left: (Long, Boolean), collector: Collector[T]) => + if (left == null) { + collector.collect(full._2) + } + } + + Array(leftSplit.map(o => o._2), rightSplit) + } + + // -------------------------------------------------------------------------------------------- + // multiRandomSplit + // -------------------------------------------------------------------------------------------- + /** + * Split a DataSet by the probability fraction of each element of a vector. 
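+   *
+   * For example, a 60/30/10 split (a sketch; dataDS is an assumed input DataSet and the
+   * fractions are normalized internally):
+   * {{{
+   *   val Array(train, test, holdout) = Splitter.multiRandomSplit(dataDS, Array(0.6, 0.3, 0.1))
+   * }}}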
+ * + * @param input DataSet to be split + * @param fracArray An array of PROPORTIONS for splitting the DataSet. Unlike the + * randomSplit function, number greater than 1 do not lead to over + * sampling. The number of splits is dictated by the length of this array. + * The number are normalized, eg. Array(1.0, 2.0) would yield + * two data sets with a 33/66% split. + * @param seed Random number generator seed. + * @return An array of DataSets whose length is equal to the length of fracArray + */ + def multiRandomSplit[T: TypeInformation : ClassTag]( + input: DataSet[T], + fracArray: Array[Double], + seed: Long = Utils.RNG.nextLong()) + : Array[DataSet[T]] = { + + import org.apache.commons.math3.distribution.EnumeratedIntegerDistribution + + val eid = new EnumeratedIntegerDistribution((0 to fracArray.length - 1).toArray, fracArray) + + eid.reseedRandomGenerator(seed) + + val tempDS: DataSet[(Int,T)] = input.map(o => (eid.sample, o)) + + val splits = fracArray.length + val outputArray = new Array[DataSet[T]]( splits ) + + for (k <- 0 to splits-1){ + outputArray(k) = tempDS.filter(o => o._1 == k) + .map(o => o._2) + } + + outputArray + } + + // -------------------------------------------------------------------------------------------- + // kFoldSplit + // -------------------------------------------------------------------------------------------- + /** + * Split a DataSet into an array of TrainTest DataSets + * + * @param input DataSet to be split + * @param kFolds The number of TrainTest DataSets to be returns. Each 'testing' will be + * 1/k of the dataset, randomly sampled, the training will be the remainder + * of the dataset. The DataSet is split into kFolds first, so that no + * observation will occurin in multiple folds. + * @param seed Random number generator seed. + * @return An array of TrainTestDataSets + */ + def kFoldSplit[T: TypeInformation : ClassTag]( + input: DataSet[T], + kFolds: Int, + seed: Long = Utils.RNG.nextLong()) + : Array[TrainTestDataSet[T]] = { + + val fracs = Array.fill(kFolds)(1.0) + val dataSetArray = multiRandomSplit(input, fracs, seed) + + dataSetArray.map( ds => TrainTestDataSet(dataSetArray.filter(_ != ds) + .reduce(_ union _), + ds)) + + } + + // -------------------------------------------------------------------------------------------- + // trainTestSplit + // -------------------------------------------------------------------------------------------- + /** + * A wrapper for randomSplit that yields a TrainTestDataSet + * + * @param input DataSet to be split + * @param fraction Probability that each element is chosen, should be [0,1]. + * This fraction refers to the training element in TrainTestSplit + * @param precise Sampling by default is random and can result in slightly lop-sided + * sample sets. When precise is true, equal sample set size are forced, + * however this is somewhat less efficient. + * @param seed Random number generator seed. 
+ * @return A TrainTestDataSet + */ + def trainTestSplit[T: TypeInformation : ClassTag]( + input: DataSet[T], + fraction: Double = 0.6, + precise: Boolean = false, + seed: Long = Utils.RNG.nextLong()) + : TrainTestDataSet[T] = { + val dataSetArray = randomSplit(input, fraction, precise, seed) + TrainTestDataSet(dataSetArray(0), dataSetArray(1)) + } + + // -------------------------------------------------------------------------------------------- + // trainTestHoldoutSplit + // -------------------------------------------------------------------------------------------- + /** + * A wrapper for multiRandomSplit that yields a TrainTestHoldoutDataSet + * + * @param input DataSet to be split + * @param fracTuple A tuple of three doubles, where the first element specifies the + * size of the training set, the second element the testing set, and + * the third element is the holdout set. These are proportional and + * will be normalized internally. + * @param seed Random number generator seed. + * @return A TrainTestDataSet + */ + def trainTestHoldoutSplit[T: TypeInformation : ClassTag]( + input: DataSet[T], + fracTuple: Tuple3[Double, Double, Double] = (0.6,0.3,0.1), + seed: Long = Utils.RNG.nextLong()) + : TrainTestHoldoutDataSet[T] = { + val fracArray = Array(fracTuple._1, fracTuple._2, fracTuple._3) + val dataSetArray = multiRandomSplit(input, fracArray, seed) + TrainTestHoldoutDataSet(dataSetArray(0), dataSetArray(1), dataSetArray(2)) + } +} diff --git a/src/main/scala/org/apache/flink/ml/preprocessing/StandardScaler.scala b/src/main/scala/org/apache/flink/ml/preprocessing/StandardScaler.scala new file mode 100644 index 0000000000000..c95160654986d --- /dev/null +++ b/src/main/scala/org/apache/flink/ml/preprocessing/StandardScaler.scala @@ -0,0 +1,302 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.ml.preprocessing + +import breeze.linalg +import breeze.numerics.sqrt +import breeze.numerics.sqrt._ +import org.apache.flink.api.common.typeinfo.TypeInformation +import org.apache.flink.api.scala._ +import org.apache.flink.ml.common.{LabeledVector, Parameter, ParameterMap} +import org.apache.flink.ml.math.Breeze._ +import org.apache.flink.ml.math.{BreezeVectorConverter, Vector} +import org.apache.flink.ml.pipeline.{TransformOperation, FitOperation, +Transformer} +import org.apache.flink.ml.preprocessing.StandardScaler.{Mean, Std} + +import scala.reflect.ClassTag + +/** Scales observations, so that all features have a user-specified mean and standard deviation. + * By default for [[StandardScaler]] transformer mean=0.0 and std=1.0. 
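+ *
+ * Concretely, every feature value x is mapped to ((x - mu) / sigma) * std + mean, where mu and
+ * sigma are the feature's empirical mean and standard deviation learned during fitting
+ * (features with zero standard deviation are scaled with sigma = 1.0).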
+ * + * This transformer takes a subtype of [[Vector]] of values and maps it to a + * scaled subtype of [[Vector]] such that each feature has a user-specified mean and standard + * deviation. + * + * This transformer can be prepended to all [[Transformer]] and + * [[org.apache.flink.ml.pipeline.Predictor]] implementations which expect as input a subtype + * of [[Vector]]. + * + * @example + * {{{ + * val trainingDS: DataSet[Vector] = env.fromCollection(data) + * val transformer = StandardScaler().setMean(10.0).setStd(2.0) + * + * transformer.fit(trainingDS) + * val transformedDS = transformer.transform(trainingDS) + * }}} + * + * =Parameters= + * + * - [[Mean]]: The mean value of transformed data set; by default equal to 0 + * - [[Std]]: The standard deviation of the transformed data set; by default + * equal to 1 + */ +class StandardScaler extends Transformer[StandardScaler] { + + private[preprocessing] var metricsOption: Option[ + DataSet[(linalg.Vector[Double], linalg.Vector[Double])] + ] = None + + /** Sets the target mean of the transformed data + * + * @param mu the user-specified mean value. + * @return the StandardScaler instance with its mean value set to the user-specified value + */ + def setMean(mu: Double): StandardScaler = { + parameters.add(Mean, mu) + this + } + + /** Sets the target standard deviation of the transformed data + * + * @param std the user-specified std value. In case the user gives 0.0 value as input, + * the std is set to the default value: 1.0. + * @return the StandardScaler instance with its std value set to the user-specified value + */ + def setStd(std: Double): StandardScaler = { + if (std == 0.0) { + return this + } + parameters.add(Std, std) + this + } +} + +object StandardScaler { + + // ====================================== Parameters ============================================= + + case object Mean extends Parameter[Double] { + override val defaultValue: Option[Double] = Some(0.0) + } + + case object Std extends Parameter[Double] { + override val defaultValue: Option[Double] = Some(1.0) + } + + // ==================================== Factory methods ========================================== + + def apply(): StandardScaler = { + new StandardScaler() + } + + // ====================================== Operations ============================================= + + /** Trains the [[org.apache.flink.ml.preprocessing.StandardScaler]] by learning the mean and + * standard deviation of the training data. These values are used inthe transform step + * to transform the given input data. + * + * @tparam T Input data type which is a subtype of [[Vector]] + * @return + */ + implicit def fitVectorStandardScaler[T <: Vector] = new FitOperation[StandardScaler, T] { + override def fit(instance: StandardScaler, fitParameters: ParameterMap, input: DataSet[T]) + : Unit = { + val metrics = extractFeatureMetrics(input) + + instance.metricsOption = Some(metrics) + } + } + + /** Trains the [[StandardScaler]] by learning the mean and standard deviation of the training + * data which is of type [[LabeledVector]]. The mean and standard deviation are used to + * transform the given input data. 
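+   *
+   * Only the vector part of each [[LabeledVector]] contributes to the learned metrics; the
+   * labels are ignored during fitting.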
+ * + */ + implicit val fitLabeledVectorStandardScaler = { + new FitOperation[StandardScaler, LabeledVector] { + override def fit( + instance: StandardScaler, + fitParameters: ParameterMap, + input: DataSet[LabeledVector]) + : Unit = { + val vectorDS = input.map(_.vector) + val metrics = extractFeatureMetrics(vectorDS) + + instance.metricsOption = Some(metrics) + } + } + } + + /** Trains the [[StandardScaler]] by learning the mean and standard deviation of the training + * data which is of type ([[Vector]], Double). The mean and standard deviation are used to + * transform the given input data. + * + */ + implicit def fitLabelVectorTupleStandardScaler + [T <: Vector: BreezeVectorConverter: TypeInformation: ClassTag] = { + new FitOperation[StandardScaler, (T, Double)] { + override def fit( + instance: StandardScaler, + fitParameters: ParameterMap, + input: DataSet[(T, Double)]) + : Unit = { + val vectorDS = input.map(_._1) + val metrics = extractFeatureMetrics(vectorDS) + + instance.metricsOption = Some(metrics) + } + } + } + + /** Calculates in one pass over the data the features' mean and standard deviation. + * For the calculation of the Standard deviation with one pass over the data, + * the Youngs & Cramer algorithm was used: + * [[http://www.cs.yale.edu/publications/techreports/tr222.pdf]] + * + * + * @param dataSet The data set for which we want to calculate mean and variance + * @return DataSet containing a single tuple of two vectors (meanVector, stdVector). + * The first vector represents the mean vector and the second is the standard + * deviation vector. + */ + private def extractFeatureMetrics[T <: Vector](dataSet: DataSet[T]) + : DataSet[(linalg.Vector[Double], linalg.Vector[Double])] = { + val metrics = dataSet.map{ + v => (1.0, v.asBreeze, linalg.Vector.zeros[Double](v.size)) + }.reduce{ + (metrics1, metrics2) => { + /* We use formula 1.5b of the cited technical report for the combination of partial + * sum of squares. According to 1.5b: + * val temp1 : m/n(m+n) + * val temp2 : n/m + */ + val temp1 = metrics1._1 / (metrics2._1 * (metrics1._1 + metrics2._1)) + val temp2 = metrics2._1 / metrics1._1 + val tempVector = (metrics1._2 * temp2) - metrics2._2 + val tempS = (metrics1._3 + metrics2._3) + (tempVector :* tempVector) * temp1 + + (metrics1._1 + metrics2._1, metrics1._2 + metrics2._2, tempS) + } + }.map{ + metric => { + val varianceVector = sqrt(metric._3 / metric._1) + + for (i <- 0 until varianceVector.size) { + if (varianceVector(i) == 0.0) { + varianceVector.update(i, 1.0) + } + } + (metric._2 / metric._1, varianceVector) + } + } + metrics + } + + /** Base class for StandardScaler's [[TransformOperation]]. This class has to be extended for + * all types which are supported by [[StandardScaler]]'s transform operation. + * + * @tparam T + */ + abstract class StandardScalerTransformOperation[T: TypeInformation: ClassTag] + extends TransformOperation[ + StandardScaler, + (linalg.Vector[Double], linalg.Vector[Double]), + T, + T] { + + var mean: Double = _ + var std: Double = _ + + override def getModel( + instance: StandardScaler, + transformParameters: ParameterMap) + : DataSet[(linalg.Vector[Double], linalg.Vector[Double])] = { + mean = transformParameters(Mean) + std = transformParameters(Std) + + instance.metricsOption match { + case Some(metrics) => metrics + case None => + throw new RuntimeException("The StandardScaler has not been fitted to the data. 
" + + "This is necessary to estimate the mean and standard deviation of the data.") + } + } + + def scale[V <: Vector: BreezeVectorConverter]( + vector: V, + model: (linalg.Vector[Double], linalg.Vector[Double])) + : V = { + val (broadcastMean, broadcastStd) = model + var myVector = vector.asBreeze + myVector -= broadcastMean + myVector :/= broadcastStd + myVector = (myVector :* std) + mean + myVector.fromBreeze + } + } + + /** [[TransformOperation]] to transform [[Vector]] types + * + * @tparam T + * @return + */ + implicit def transformVectors[T <: Vector: BreezeVectorConverter: TypeInformation: ClassTag] = { + new StandardScalerTransformOperation[T]() { + override def transform( + vector: T, + model: (linalg.Vector[Double], linalg.Vector[Double])) + : T = { + scale(vector, model) + } + } + } + + /** [[TransformOperation]] to transform tuples of type ([[Vector]], [[Double]]). + * + * @tparam T + * @return + */ + implicit def transformTupleVectorDouble[ + T <: Vector: BreezeVectorConverter: TypeInformation: ClassTag] = { + new StandardScalerTransformOperation[(T, Double)] { + override def transform( + element: (T, Double), + model: (linalg.Vector[Double], linalg.Vector[Double])) + : (T, Double) = { + (scale(element._1, model), element._2) + } + } + } + + /** [[TransformOperation]] to transform [[LabeledVector]]. + * + */ + implicit val transformLabeledVector = new StandardScalerTransformOperation[LabeledVector] { + override def transform( + element: LabeledVector, + model: (linalg.Vector[Double], linalg.Vector[Double])) + : LabeledVector = { + val LabeledVector(label, vector) = element + + LabeledVector(label, scale(vector, model)) + } + } +} diff --git a/src/main/scala/org/apache/flink/ml/recommendation/ALS.scala b/src/main/scala/org/apache/flink/ml/recommendation/ALS.scala new file mode 100644 index 0000000000000..a90627f5e389a --- /dev/null +++ b/src/main/scala/org/apache/flink/ml/recommendation/ALS.scala @@ -0,0 +1,1060 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.flink.ml.recommendation + +import java.{util, lang} + +import org.apache.flink.api.common.operators.base.JoinOperatorBase.JoinHint +import org.apache.flink.api.scala._ +import org.apache.flink.api.common.operators.Order +import org.apache.flink.core.memory.{DataOutputView, DataInputView} +import org.apache.flink.ml.common._ +import org.apache.flink.ml.pipeline.{FitOperation, PredictDataSetOperation, Predictor} +import org.apache.flink.types.Value +import org.apache.flink.util.Collector +import org.apache.flink.api.common.functions.{Partitioner => FlinkPartitioner, GroupReduceFunction, CoGroupFunction} + +import com.github.fommil.netlib.BLAS.{ getInstance => blas } +import com.github.fommil.netlib.LAPACK.{ getInstance => lapack } +import org.netlib.util.intW + +import scala.collection.mutable +import scala.collection.mutable.ArrayBuffer +import scala.util.Random + +/** Alternating least squares algorithm to calculate a matrix factorization. + * + * Given a matrix `R`, ALS calculates two matricess `U` and `V` such that `R ~~ U^TV`. The + * unknown row dimension is given by the number of latent factors. Since matrix factorization + * is often used in the context of recommendation, we'll call the first matrix the user and the + * second matrix the item matrix. The `i`th column of the user matrix is `u_i` and the `i`th + * column of the item matrix is `v_i`. The matrix `R` is called the ratings matrix and + * `(R)_{i,j} = r_{i,j}`. + * + * In order to find the user and item matrix, the following problem is solved: + * + * `argmin_{U,V} sum_(i,j\ with\ r_{i,j} != 0) (r_{i,j} - u_{i}^Tv_{j})^2 + + * lambda (sum_(i) n_{u_i} ||u_i||^2 + sum_(j) n_{v_j} ||v_j||^2)` + * + * with `\lambda` being the regularization factor, `n_{u_i}` being the number of items the user `i` + * has rated and `n_{v_j}` being the number of times the item `j` has been rated. This + * regularization scheme to avoid overfitting is called weighted-lambda-regularization. Details + * can be found in the work of [[http://dx.doi.org/10.1007/978-3-540-68880-8_32 Zhou et al.]]. + * + * By fixing one of the matrices `U` or `V` one obtains a quadratic form which can be solved. The + * solution of the modified problem is guaranteed to decrease the overall cost function. By + * applying this step alternately to the matrices `U` and `V`, we can iteratively improve the + * matrix factorization. + * + * The matrix `R` is given in its sparse representation as a tuple of `(i, j, r)` where `i` is the + * row index, `j` is the column index and `r` is the matrix value at position `(i,j)`. + * + * @example + * {{{ + * val inputDS: DataSet[(Int, Int, Double)] = env.readCsvFile[(Int, Int, Double)]( + * pathToTrainingFile) + * + * val als = ALS() + * .setIterations(10) + * .setNumFactors(10) + * + * als.fit(inputDS) + * + * val data2Predict: DataSet[(Int, Int)] = env.readCsvFile[(Int, Int)](pathToData) + * + * als.predict(data2Predict) + * }}} + * + * =Parameters= + * + * - [[org.apache.flink.ml.recommendation.ALS.NumFactors]]: + * The number of latent factors. It is the dimension of the calculated user and item vectors. + * (Default value: '''10''') + * + * - [[org.apache.flink.ml.recommendation.ALS.Lambda]]: + * Regularization factor. Tune this value in order to avoid overfitting/generalization. + * (Default value: '''1''') + * + * - [[org.apache.flink.ml.regression.MultipleLinearRegression.Iterations]]: + * The number of iterations to perform. 
(Default value: '''10''') + * + * - [[org.apache.flink.ml.recommendation.ALS.Blocks]]: + * The number of blocks into which the user and item matrix a grouped. The fewer + * blocks one uses, the less data is sent redundantly. However, bigger blocks entail bigger + * update messages which have to be stored on the Heap. If the algorithm fails because of + * an OutOfMemoryException, then try to increase the number of blocks. (Default value: '''None''') + * + * - [[org.apache.flink.ml.recommendation.ALS.Seed]]: + * Random seed used to generate the initial item matrix for the algorithm. + * (Default value: '''0''') + * + * - [[org.apache.flink.ml.recommendation.ALS.TemporaryPath]]: + * Path to a temporary directory into which intermediate results are stored. If + * this value is set, then the algorithm is split into two preprocessing steps, the ALS iteration + * and a post-processing step which calculates a last ALS half-step. The preprocessing steps + * calculate the [[org.apache.flink.ml.recommendation.ALS.OutBlockInformation]] and + * [[org.apache.flink.ml.recommendation.ALS.InBlockInformation]] for the given rating matrix. + * The result of the individual steps are stored in the specified directory. By splitting the + * algorithm into multiple smaller steps, Flink does not have to split the available memory + * amongst too many operators. This allows the system to process bigger individual messasges and + * improves the overall performance. (Default value: '''None''') + * + * The ALS implementation is based on Spark's MLLib implementation of ALS which you can find + * [[https://github.com/apache/spark/blob/master/mllib/src/main/scala/org/apache/spark/mllib/ + * recommendation/ALS.scala here]]. + */ +class ALS extends Predictor[ALS] { + + import ALS._ + + // Stores the matrix factorization after the fitting phase + var factorsOption: Option[(DataSet[Factors], DataSet[Factors])] = None + + /** Sets the number of latent factors/row dimension of the latent model + * + * @param numFactors + * @return + */ + def setNumFactors(numFactors: Int): ALS = { + parameters.add(NumFactors, numFactors) + this + } + + /** Sets the regularization coefficient lambda + * + * @param lambda + * @return + */ + def setLambda(lambda: Double): ALS = { + parameters.add(Lambda, lambda) + this + } + + /** Sets the number of iterations of the ALS algorithm + * + * @param iterations + * @return + */ + def setIterations(iterations: Int): ALS = { + parameters.add(Iterations, iterations) + this + } + + /** Sets the number of blocks into which the user and item matrix shall be partitioned + * + * @param blocks + * @return + */ + def setBlocks(blocks: Int): ALS = { + parameters.add(Blocks, blocks) + this + } + + /** Sets the random seed for the initial item matrix initialization + * + * @param seed + * @return + */ + def setSeed(seed: Long): ALS = { + parameters.add(Seed, seed) + this + } + + /** Sets the temporary path into which intermediate results are written in order to increase + * performance. + * + * @param temporaryPath + * @return + */ + def setTemporaryPath(temporaryPath: String): ALS = { + parameters.add(TemporaryPath, temporaryPath) + this + } + + /** Empirical risk of the trained model (matrix factorization). 
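+   *
+   * For every labeled entry (i, j, r) the contribution to the risk is
+   * (r - u_i^T v_j)^2 + lambda * (||u_i||^2 + ||v_j||^2); the returned DataSet contains the
+   * sum over all entries.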
+ * + * @param labeledData Reference data + * @param riskParameters Additional parameters for the empirical risk calculation + * @return + */ + def empiricalRisk( + labeledData: DataSet[(Long, Long, Double)], + riskParameters: ParameterMap = ParameterMap.Empty) + : DataSet[Double] = { + val resultingParameters = parameters ++ riskParameters + + val lambda = resultingParameters(Lambda) + + val data = labeledData map { + x => (x._1, x._2) + } + + factorsOption match { + case Some((userFactors, itemFactors)) => { + val predictions = data.join(userFactors, JoinHint.REPARTITION_HASH_SECOND).where(0) + .equalTo(0).join(itemFactors, JoinHint.REPARTITION_HASH_SECOND).where("_1._2") + .equalTo(0).map { + triple => { + val (((uID, iID), uFactors), iFactors) = triple + + val uFactorsVector = uFactors.factors + val iFactorsVector = iFactors.factors + + val squaredUNorm2 = blas.ddot( + uFactorsVector.length, + uFactorsVector, + 1, + uFactorsVector, + 1) + val squaredINorm2 = blas.ddot( + iFactorsVector.length, + iFactorsVector, + 1, + iFactorsVector, + 1) + + val prediction = blas.ddot(uFactorsVector.length, uFactorsVector, 1, iFactorsVector, 1) + + (uID, iID, prediction, squaredUNorm2, squaredINorm2) + } + } + + labeledData.join(predictions).where(0,1).equalTo(0,1) { + (left, right) => { + val (_, _, expected) = left + val (_, _, predicted, squaredUNorm2, squaredINorm2) = right + + val residual = expected - predicted + + residual * residual + lambda * (squaredUNorm2 + squaredINorm2) + } + } reduce { + _ + _ + } + } + + case None => throw new RuntimeException("The ALS model has not been fitted to data. " + + "Prior to predicting values, it has to be trained on data.") + } + } +} + +object ALS { + val USER_FACTORS_FILE = "userFactorsFile" + val ITEM_FACTORS_FILE = "itemFactorsFile" + + // ========================================= Parameters ========================================== + + case object NumFactors extends Parameter[Int] { + val defaultValue: Option[Int] = Some(10) + } + + case object Lambda extends Parameter[Double] { + val defaultValue: Option[Double] = Some(1.0) + } + + case object Iterations extends Parameter[Int] { + val defaultValue: Option[Int] = Some(10) + } + + case object Blocks extends Parameter[Int] { + val defaultValue: Option[Int] = None + } + + case object Seed extends Parameter[Long] { + val defaultValue: Option[Long] = Some(0L) + } + + case object TemporaryPath extends Parameter[String] { + val defaultValue: Option[String] = None + } + + // ==================================== ALS type definitions ===================================== + + /** Representation of a user-item rating + * + * @param user User ID of the rating user + * @param item Item iD of the rated item + * @param rating Rating value + */ + case class Rating(user: Long, item: Long, rating: Double) + + /** Latent factor model vector + * + * @param id + * @param factors + */ + case class Factors(id: Long, factors: Array[Double]) { + override def toString = s"($id, ${factors.mkString(",")})" + } + + case class Factorization(userFactors: DataSet[Factors], itemFactors: DataSet[Factors]) + + case class OutBlockInformation(elementIDs: Array[Long], outLinks: OutLinks) { + override def toString: String = { + s"OutBlockInformation:((${elementIDs.mkString(",")}), ($outLinks))" + } + } + + class OutLinks(var links: Array[scala.collection.mutable.BitSet]) extends Value { + def this() = this(null) + + override def toString: String = { + s"${links.mkString("\n")}" + } + + override def write(out: DataOutputView): Unit = { + 
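+      // Serialization layout: [number of links: Int] followed, per link, by
+      // [bit mask length: Int][bit mask words: Long ...]; read() below mirrors this format.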
out.writeInt(links.length) + links foreach { + link => { + val bitMask = link.toBitMask + out.writeInt(bitMask.length) + for (element <- bitMask) { + out.writeLong(element) + } + } + } + } + + override def read(in: DataInputView): Unit = { + val length = in.readInt() + links = new Array[scala.collection.mutable.BitSet](length) + + for (i <- 0 until length) { + val bitMaskLength = in.readInt() + val bitMask = new Array[Long](bitMaskLength) + for (j <- 0 until bitMaskLength) { + bitMask(j) = in.readLong() + } + links(i) = mutable.BitSet.fromBitMask(bitMask) + } + } + + def apply(idx: Int) = links(idx) + } + + case class InBlockInformation(elementIDs: Array[Long], ratingsForBlock: Array[BlockRating]) { + + override def toString: String = { + s"InBlockInformation:((${elementIDs.mkString(",")}), (${ratingsForBlock.mkString("\n")}))" + } + } + + case class BlockRating(var ratings: Array[(Array[Int], Array[Double])]) { + def apply(idx: Int) = ratings(idx) + + override def toString: String = { + ratings.map { + case (left, right) => s"((${left.mkString(",")}),(${right.mkString(",")}))" + }.mkString(",") + } + } + + case class BlockedFactorization(userFactors: DataSet[(Int, Array[Array[Double]])], + itemFactors: DataSet[(Int, Array[Array[Double]])]) + + class BlockIDPartitioner extends FlinkPartitioner[Int] { + override def partition(blockID: Int, numberOfPartitions: Int): Int = { + blockID % numberOfPartitions + } + } + + class BlockIDGenerator(blocks: Int) extends Serializable { + def apply(id: Long): Int = { + (id % blocks).toInt + } + } + + // ================================= Factory methods ============================================= + + def apply(): ALS = { + new ALS() + } + + // ===================================== Operations ============================================== + + /** Predict operation which calculates the matrix entry for the given indices */ + implicit val predictRating = new PredictDataSetOperation[ + ALS, + (Long, Long), + (Long, Long, Double)] { + override def predictDataSet( + instance: ALS, + predictParameters: ParameterMap, + input: DataSet[(Long, Long)]) + : DataSet[(Long, Long, Double)] = { + + instance.factorsOption match { + case Some((userFactors, itemFactors)) => { + input.join(userFactors, JoinHint.REPARTITION_HASH_SECOND).where(0).equalTo(0) + .join(itemFactors, JoinHint.REPARTITION_HASH_SECOND).where("_1._2").equalTo(0).map { + triple => { + val (((uID, iID), uFactors), iFactors) = triple + + val uFactorsVector = uFactors.factors + val iFactorsVector = iFactors.factors + + val prediction = blas.ddot( + uFactorsVector.length, + uFactorsVector, + 1, + iFactorsVector, + 1) + + (uID, iID, prediction) + } + } + } + + case None => throw new RuntimeException("The ALS model has not been fitted to data. " + + "Prior to predicting values, it has to be trained on data.") + } + } + } + + implicit val predictRatingInt = new PredictDataSetOperation[ALS, (Int, Int), (Int, Int, Double)] { + override def predictDataSet( + instance: ALS, + predictParameters: ParameterMap, + input: DataSet[(Int, Int)]) + : DataSet[(Int, Int, Double)] = { + val longInput = input.map { x => (x._1.toLong, x._2.toLong)} + + val longResult = implicitly[PredictDataSetOperation[ALS, (Long, Long), (Long, Long, Double)]] + .predictDataSet( + instance, + predictParameters, + longInput) + + longResult.map{ x => (x._1.toInt, x._2.toInt, x._3)} + } + } + + /** Calculates the matrix factorization for the given ratings. A rating is defined as + * a tuple of user ID, item ID and the corresponding rating. 
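+   *
+   * The ratings are first partitioned into user and item blocks, the item matrix is initialized
+   * randomly from the configured seed, and the configured number of blocked ALS iterations is
+   * run before a final half-step computes the user matrix.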
+ * + * @return Factorization containing the user and item matrix + */ + implicit val fitALS = new FitOperation[ALS, (Long, Long, Double)] { + override def fit( + instance: ALS, + fitParameters: ParameterMap, + input: DataSet[(Long, Long, Double)]) + : Unit = { + val resultParameters = instance.parameters ++ fitParameters + + val userBlocks = resultParameters.get(Blocks).getOrElse(input.count.toInt) + val itemBlocks = userBlocks + val persistencePath = resultParameters.get(TemporaryPath) + val seed = resultParameters(Seed) + val factors = resultParameters(NumFactors) + val iterations = resultParameters(Iterations) + val lambda = resultParameters(Lambda) + + val ratings = input.map { + entry => { + val (userID, itemID, rating) = entry + Rating(userID, itemID, rating) + } + } + + val blockIDPartitioner = new BlockIDPartitioner() + + val ratingsByUserBlock = ratings.map{ + rating => + val blockID = (rating.user % userBlocks).toInt + (blockID, rating) + } partitionCustom(blockIDPartitioner, 0) + + val ratingsByItemBlock = ratings map { + rating => + val blockID = (rating.item % itemBlocks).toInt + (blockID, new Rating(rating.item, rating.user, rating.rating)) + } partitionCustom(blockIDPartitioner, 0) + + val (uIn, uOut) = createBlockInformation(userBlocks, itemBlocks, ratingsByUserBlock, + blockIDPartitioner) + val (iIn, iOut) = createBlockInformation(itemBlocks, userBlocks, ratingsByItemBlock, + blockIDPartitioner) + + val (userIn, userOut) = persistencePath match { + case Some(path) => FlinkMLTools.persist(uIn, uOut, path + "userIn", path + "userOut") + case None => (uIn, uOut) + } + + val (itemIn, itemOut) = persistencePath match { + case Some(path) => FlinkMLTools.persist(iIn, iOut, path + "itemIn", path + "itemOut") + case None => (iIn, iOut) + } + + val initialItems = itemOut.partitionCustom(blockIDPartitioner, 0).map{ + outInfos => + val blockID = outInfos._1 + val infos = outInfos._2 + + (blockID, infos.elementIDs.map{ + id => + val random = new Random(id ^ seed) + randomFactors(factors, random) + }) + }.withForwardedFields("0") + + // iteration to calculate the item matrix + val items = initialItems.iterate(iterations) { + items => { + val users = updateFactors(userBlocks, items, itemOut, userIn, factors, lambda, + blockIDPartitioner) + updateFactors(itemBlocks, users, userOut, itemIn, factors, lambda, blockIDPartitioner) + } + } + + val pItems = persistencePath match { + case Some(path) => FlinkMLTools.persist(items, path + "items") + case None => items + } + + // perform last half-step to calculate the user matrix + val users = updateFactors(userBlocks, pItems, itemOut, userIn, factors, lambda, + blockIDPartitioner) + + instance.factorsOption = Some(( + unblock(users, userOut, blockIDPartitioner), + unblock(pItems, itemOut, blockIDPartitioner))) + } + } + + implicit val fitALSInt = new FitOperation[ALS, (Int, Int, Double)] { + override def fit( + instance: ALS, + fitParameters: ParameterMap, + input: DataSet[(Int, Int, Double)]) + : Unit = { + + val longInput = input.map { x => (x._1.toLong, x._2.toLong, x._3)} + + implicitly[FitOperation[ALS, (Long, Long, Double)]].fit(instance, fitParameters, longInput) + } + } + + /** Calculates a single half step of the ALS optimization. The result is the new value for + * either the user or item matrix, depending with which matrix the method was called. 
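+   *
+   * For every user (or item, depending on the direction of the half step) the rated factor
+   * vectors are accumulated into the normal equations (X^T X + lambda * n * I) w = X^T y, which
+   * are solved with LAPACK's dposv; n denotes the number of ratings of that user or item.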
+ * + * @param numUserBlocks Number of blocks in the respective dimension + * @param items Fixed matrix value for the half step + * @param itemOut Out information to know where to send the vectors + * @param userIn In information for the cogroup step + * @param factors Number of latent factors + * @param lambda Regularization constant + * @param blockIDPartitioner Custom Flink partitioner + * @return New value for the optimized matrix (either user or item) + */ + def updateFactors(numUserBlocks: Int, + items: DataSet[(Int, Array[Array[Double]])], + itemOut: DataSet[(Int, OutBlockInformation)], + userIn: DataSet[(Int, InBlockInformation)], + factors: Int, + lambda: Double, blockIDPartitioner: FlinkPartitioner[Int]): + DataSet[(Int, Array[Array[Double]])] = { + // send the item vectors to the blocks whose users have rated the items + val partialBlockMsgs = itemOut.join(items).where(0).equalTo(0). + withPartitioner(blockIDPartitioner).apply { + (left, right, col: Collector[(Int, Int, Array[Array[Double]])]) => { + val blockID = left._1 + val outInfo = left._2 + val factors = right._2 + var userBlock = 0 + var itemIdx = 0 + + while(userBlock < numUserBlocks){ + itemIdx = 0 + val buffer = new ArrayBuffer[Array[Double]] + while(itemIdx < outInfo.elementIDs.length){ + if(outInfo.outLinks(userBlock)(itemIdx)){ + buffer += factors(itemIdx) + } + itemIdx += 1 + } + + if(buffer.nonEmpty){ + // send update message to userBlock + col.collect(userBlock, blockID, buffer.toArray) + } + + userBlock += 1 + } + } + } + + // collect the partial update messages and calculate for each user block the new user vectors + partialBlockMsgs.coGroup(userIn).where(0).equalTo(0).sortFirstGroup(1, Order.ASCENDING). + withPartitioner(blockIDPartitioner).apply{ + new CoGroupFunction[(Int, Int, Array[Array[Double]]), (Int, + InBlockInformation), (Int, Array[Array[Double]])](){ + + // in order to save space, store only the upper triangle of the XtX matrix + val triangleSize = (factors*factors - factors)/2 + factors + val matrix = Array.fill(triangleSize)(0.0) + val fullMatrix = Array.fill(factors * factors)(0.0) + val userXtX = new ArrayBuffer[Array[Double]]() + val userXy = new ArrayBuffer[Array[Double]]() + val numRatings = new ArrayBuffer[Int]() + + override def coGroup(left: lang.Iterable[(Int, Int, Array[Array[Double]])], + right: lang.Iterable[(Int, InBlockInformation)], + collector: Collector[(Int, Array[Array[Double]])]): Unit = { + // there is only one InBlockInformation per user block + val inInfo = right.iterator().next()._2 + val updates = left.iterator() + + val numUsers = inInfo.elementIDs.length + var blockID = -1 + + var i = 0 + + // clear old matrices and allocate new ones + val matricesToClear = if (numUsers > userXtX.length) { + val oldLength = userXtX.length + + while(i < (numUsers - oldLength)) { + userXtX += Array.fill(triangleSize)(0.0) + userXy += Array.fill(factors)(0.0) + numRatings.+=(0) + + i += 1 + } + + oldLength + } else { + numUsers + } + + i = 0 + while(i < matricesToClear){ + numRatings(i) = 0 + + util.Arrays.fill(userXtX(i), 0.0) + util.Arrays.fill(userXy(i), 0.0) + + i += 1 + } + + var itemBlock = 0 + + // build XtX matrices and Xy vector + while(updates.hasNext){ + val update = updates.next() + val blockFactors = update._3 + blockID = update._1 + + var p = 0 + while(p < blockFactors.length){ + val vector = blockFactors(p) + + outerProduct(vector, matrix, factors) + + val (users, ratings) = inInfo.ratingsForBlock(itemBlock)(p) + + var i = 0 + while (i < users.length) { + 
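+                    // accumulate the item's outer product into this user's XtX matrix and the
+                    // rating-weighted item vector into the user's Xy right-hand side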
numRatings(users(i)) += 1 + blas.daxpy(matrix.length, 1, matrix, 1, userXtX(users(i)), 1) + blas.daxpy(vector.length, ratings(i), vector, 1, userXy(users(i)), 1) + + i += 1 + } + p += 1 + } + + itemBlock += 1 + } + + val array = new Array[Array[Double]](numUsers) + + i = 0 + while(i < numUsers){ + generateFullMatrix(userXtX(i), fullMatrix, factors) + + var f = 0 + + // add regularization constant + while(f < factors){ + fullMatrix(f*factors + f) += lambda * numRatings(i) + f += 1 + } + + // calculate new user vector + val result = new intW(0) + lapack.dposv("U", factors, 1, fullMatrix, factors , userXy(i), factors, result) + array(i) = userXy(i) + + i += 1 + } + + collector.collect((blockID, array)) + } + } + }.withForwardedFieldsFirst("0").withForwardedFieldsSecond("0") + } + + /** Creates the meta information needed to route the item and user vectors to the respective user + * and item blocks. + * * @param userBlocks + * @param itemBlocks + * @param ratings + * @param blockIDPartitioner + * @return + */ + def createBlockInformation(userBlocks: Int, itemBlocks: Int, ratings: DataSet[(Int, Rating)], + blockIDPartitioner: BlockIDPartitioner): + (DataSet[(Int, InBlockInformation)], DataSet[(Int, OutBlockInformation)]) = { + val blockIDGenerator = new BlockIDGenerator(itemBlocks) + + val usersPerBlock = createUsersPerBlock(ratings) + + val outBlockInfos = createOutBlockInformation(ratings, usersPerBlock, itemBlocks, + blockIDGenerator) + + val inBlockInfos = createInBlockInformation(ratings, usersPerBlock, blockIDGenerator) + + (inBlockInfos, outBlockInfos) + } + + /** Calculates the userIDs in ascending order of each user block + * + * @param ratings + * @return + */ + def createUsersPerBlock(ratings: DataSet[(Int, Rating)]): DataSet[(Int, Array[Long])] = { + ratings.map{ x => (x._1, x._2.user)}.withForwardedFields("0").groupBy(0). + sortGroup(1, Order.ASCENDING).reduceGroup { + users => { + val result = ArrayBuffer[Long]() + var id = -1 + var oldUser = -1L + + while(users.hasNext) { + val user = users.next() + + id = user._1 + + if (user._2 != oldUser) { + result.+=(user._2) + oldUser = user._2 + } + } + + val userIDs = result.toArray + (id, userIDs) + } + }.withForwardedFields("0") + } + + /** Creates the outgoing block information + * + * Creates for every user block the outgoing block information. The out block information + * contains for every item block a [[scala.collection.mutable.BitSet]] which indicates which + * user vector has to be sent to this block. If a vector v has to be sent to a block b, then + * bitsets(b)'s bit v is set to 1, otherwise 0. Additionally the user IDataSet are replaced by + * the user vector's index value. + * + * @param ratings + * @param usersPerBlock + * @param itemBlocks + * @param blockIDGenerator + * @return + */ + def createOutBlockInformation(ratings: DataSet[(Int, Rating)], + usersPerBlock: DataSet[(Int, Array[Long])], + itemBlocks: Int, blockIDGenerator: BlockIDGenerator): + DataSet[(Int, OutBlockInformation)] = { + ratings.coGroup(usersPerBlock).where(0).equalTo(0).apply { + (ratings, users) => + val userIDs = users.next()._2 + val numUsers = userIDs.length + + val userIDToPos = userIDs.zipWithIndex.toMap + + val shouldDataSend = Array.fill(itemBlocks)(new scala.collection.mutable.BitSet(numUsers)) + var blockID = -1 + while (ratings.hasNext) { + val r = ratings.next() + + val pos = + try { + userIDToPos(r._2.user) + }catch{ + case e: NoSuchElementException => + throw new RuntimeException(s"Key ${r._2.user} not found. BlockID $blockID. 
" + + s"Elements in block ${userIDs.take(5).mkString(", ")}. " + + s"UserIDList contains ${userIDs.contains(r._2.user)}.", e) + } + + blockID = r._1 + shouldDataSend(blockIDGenerator(r._2.item))(pos) = true + } + + (blockID, OutBlockInformation(userIDs, new OutLinks(shouldDataSend))) + }.withForwardedFieldsFirst("0").withForwardedFieldsSecond("0") + } + + /** Creates the incoming block information + * + * Creates for every user block the incoming block information. The incoming block information + * contains the userIDs of the users in the respective block and for every item block a + * BlockRating instance. The BlockRating instance describes for every incoming set of item + * vectors of an item block, which user rated these items and what the rating was. For that + * purpose it contains for every incoming item vector a tuple of an id array us and a rating + * array rs. The array us contains the indices of the users having rated the respective + * item vector with the ratings in rs. + * + * @param ratings + * @param usersPerBlock + * @param blockIDGenerator + * @return + */ + def createInBlockInformation(ratings: DataSet[(Int, Rating)], + usersPerBlock: DataSet[(Int, Array[Long])], + blockIDGenerator: BlockIDGenerator): + DataSet[(Int, InBlockInformation)] = { + // Group for every user block the users which have rated the same item and collect their ratings + val partialInInfos = ratings.map { x => (x._1, x._2.item, x._2.user, x._2.rating)} + .withForwardedFields("0").groupBy(0, 1).reduceGroup { + x => + var userBlockID = -1 + var itemID = -1L + val userIDs = ArrayBuffer[Long]() + val ratings = ArrayBuffer[Double]() + + while (x.hasNext) { + val (uBlockID, item, user, rating) = x.next + userBlockID = uBlockID + itemID = item + + userIDs += user + ratings += rating + } + + (userBlockID, blockIDGenerator(itemID), itemID, (userIDs.toArray, ratings.toArray)) + }.withForwardedFields("0") + + // Aggregate all ratings for items belonging to the same item block. Sort ascending with + // respect to the itemID, because later the item vectors of the update message are sorted + // accordingly. + val collectedPartialInfos = partialInInfos.groupBy(0, 1).sortGroup(2, Order.ASCENDING). + reduceGroup { + new GroupReduceFunction[(Int, Int, Long, (Array[Long], Array[Double])), (Int, + Int, Array[(Array[Long], Array[Double])])](){ + val buffer = new ArrayBuffer[(Array[Long], Array[Double])] + + override def reduce(iterable: lang.Iterable[(Int, Int, Long, (Array[Long], + Array[Double]))], collector: Collector[(Int, Int, Array[(Array[Long], + Array[Double])])]): Unit = { + + val infos = iterable.iterator() + var counter = 0 + + var blockID = -1 + var itemBlockID = -1 + + while (infos.hasNext && counter < buffer.length) { + val info = infos.next() + blockID = info._1 + itemBlockID = info._2 + + buffer(counter) = info._4 + + counter += 1 + } + + while (infos.hasNext) { + val info = infos.next() + blockID = info._1 + itemBlockID = info._2 + + buffer += info._4 + + counter += 1 + } + + val array = new Array[(Array[Long], Array[Double])](counter) + + buffer.copyToArray(array) + + collector.collect((blockID, itemBlockID, array)) + } + } + }.withForwardedFields("0", "1") + + // Aggregate all item block ratings with respect to their user block ID. Sort the blocks with + // respect to their itemBlockID, because the block update messages are sorted the same way + collectedPartialInfos.coGroup(usersPerBlock).where(0).equalTo(0). 
+ sortFirstGroup(1, Order.ASCENDING).apply{ + new CoGroupFunction[(Int, Int, Array[(Array[Long], Array[Double])]), + (Int, Array[Long]), (Int, InBlockInformation)] { + val buffer = ArrayBuffer[BlockRating]() + + override def coGroup(partialInfosIterable: + lang.Iterable[(Int, Int, Array[(Array[Long], Array[Double])])], + userIterable: lang.Iterable[(Int, Array[Long])], + collector: Collector[(Int, InBlockInformation)]): Unit = { + + val users = userIterable.iterator() + val partialInfos = partialInfosIterable.iterator() + + val userWrapper = users.next() + val id = userWrapper._1 + val userIDs = userWrapper._2 + val userIDToPos = userIDs.zipWithIndex.toMap + + var counter = 0 + + while (partialInfos.hasNext && counter < buffer.length) { + val partialInfo = partialInfos.next() + // entry contains the ratings and userIDs of a complete item block + val entry = partialInfo._3 + + val blockRelativeIndicesRatings = new Array[(Array[Int], Array[Double])](entry.size) + + // transform userIDs to positional indices + for (row <- 0 until entry.length) { + val rowEntries = entry(row)._1 + val rowIndices = new Array[Int](rowEntries.length) + + for (col <- 0 until rowEntries.length) { + rowIndices(col) = userIDToPos(rowEntries(col)) + } + + blockRelativeIndicesRatings(row) = (rowIndices, entry(row)._2) + } + + buffer(counter).ratings = blockRelativeIndicesRatings + + counter += 1 + } + + while (partialInfos.hasNext) { + val partialInfo = partialInfos.next() + // entry contains the ratings and userIDs of a complete item block + val entry = partialInfo._3 + val blockRelativeIndicesRatings = new Array[(Array[Int], Array[Double])](entry.size) + + // transform userIDs to positional indices + for (row <- 0 until entry.length) { + val rowEntries = entry(row)._1 + val rowIndices = new Array[Int](rowEntries.length) + + for (col <- 0 until rowEntries.length) { + rowIndices(col) = userIDToPos(rowEntries(col)) + } + + blockRelativeIndicesRatings(row) = (rowIndices, entry(row)._2) + } + + buffer += new BlockRating(blockRelativeIndicesRatings) + + counter += 1 + } + + val array = new Array[BlockRating](counter) + + buffer.copyToArray(array) + + collector.collect((id, InBlockInformation(userIDs, array))) + } + } + }.withForwardedFieldsFirst("0").withForwardedFieldsSecond("0") + } + + /** Unblocks the blocked user and item matrix representation so that it is at DataSet of + * column vectors. 
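+   *
+   * Each factor block is joined with its OutBlockInformation to recover the original element
+   * IDs, emitting one Factors(id, factorVector) record per user or item.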
+ * + * @param users + * @param outInfo + * @param blockIDPartitioner + * @return + */ + def unblock(users: DataSet[(Int, Array[Array[Double]])], + outInfo: DataSet[(Int, OutBlockInformation)], + blockIDPartitioner: BlockIDPartitioner): DataSet[Factors] = { + users.join(outInfo).where(0).equalTo(0).withPartitioner(blockIDPartitioner).apply { + (left, right, col: Collector[Factors]) => { + val outInfo = right._2 + val factors = left._2 + + for(i <- 0 until outInfo.elementIDs.length){ + val id = outInfo.elementIDs(i) + val factorVector = factors(i) + col.collect(Factors(id, factorVector)) + } + } + } + } + + // ================================ Math helper functions ======================================== + + def outerProduct(vector: Array[Double], matrix: Array[Double], factors: Int): Unit = { + var row = 0 + var pos = 0 + while(row < factors){ + var col = 0 + while(col <= row){ + matrix(pos) = vector(row) * vector(col) + col += 1 + pos += 1 + } + + row += 1 + } + } + + def generateFullMatrix(triangularMatrix: Array[Double], fullMatrix: Array[Double], + factors: Int): Unit = { + var row = 0 + var pos = 0 + + while(row < factors){ + var col = 0 + while(col < row){ + fullMatrix(row*factors + col) = triangularMatrix(pos) + fullMatrix(col*factors + row) = triangularMatrix(pos) + + pos += 1 + col += 1 + } + + fullMatrix(row*factors + row) = triangularMatrix(pos) + + pos += 1 + row += 1 + } + } + + def generateRandomMatrix(users: DataSet[Int], factors: Int, seed: Long): DataSet[Factors] = { + users map { + id =>{ + val random = new Random(id ^ seed) + Factors(id, randomFactors(factors, random)) + } + } + } + + def randomFactors(factors: Int, random: Random): Array[Double] = { + Array.fill(factors)(random.nextDouble()) + } +} diff --git a/src/main/scala/org/apache/flink/ml/regression/MultipleLinearRegression.scala b/src/main/scala/org/apache/flink/ml/regression/MultipleLinearRegression.scala new file mode 100644 index 0000000000000..1ef2386959c84 --- /dev/null +++ b/src/main/scala/org/apache/flink/ml/regression/MultipleLinearRegression.scala @@ -0,0 +1,234 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.ml.regression + +import org.apache.flink.api.scala.DataSet +import org.apache.flink.ml.math.{Breeze, Vector} +import org.apache.flink.ml.common._ + +import org.apache.flink.api.scala._ +import org.apache.flink.ml.optimization.LearningRateMethod.LearningRateMethodTrait + +import org.apache.flink.ml.optimization._ +import org.apache.flink.ml.pipeline.{PredictOperation, FitOperation, Predictor} + + +/** Multiple linear regression using the ordinary least squares (OLS) estimator. + * + * The linear regression finds a solution to the problem + * + * `y = w_0 + w_1*x_1 + w_2*x_2 ... 
+ w_n*x_n = w_0 + w^T*x` + * + * such that the sum of squared residuals is minimized + * + * `min_{w, w_0} \sum (y - w^T*x - w_0)^2` + * + * The minimization problem is solved by (stochastic) gradient descent. For each labeled vector + * `(x,y)`, the gradient is calculated. The weighted average of all gradients is subtracted from + * the current value `w` which gives the new value of `w_new`. The weight is defined as + * `stepsize/math.sqrt(iteration)`. + * + * The optimization runs at most a maximum number of iterations or, if a convergence threshold has + * been set, until the convergence criterion has been met. As convergence criterion the relative + * change of the sum of squared residuals is used: + * + * `(S_{k-1} - S_k)/S_{k-1} < \rho` + * + * with S_k being the sum of squared residuals in iteration k and `\rho` being the convergence + * threshold. + * + * At the moment, the whole partition is used for SGD, making it effectively a batch gradient + * descent. Once a sampling operator has been introduced, the algorithm can be optimized. + * + * @example + * {{{ + * val mlr = MultipleLinearRegression() + * .setIterations(10) + * .setStepsize(0.5) + * .setConvergenceThreshold(0.001) + * + * val trainingDS: DataSet[LabeledVector] = ... + * val testingDS: DataSet[Vector] = ... + * + * mlr.fit(trainingDS) + * + * val predictions = mlr.predict(testingDS) + * }}} + * + * =Parameters= + * + * - [[org.apache.flink.ml.regression.MultipleLinearRegression.Iterations]]: + * Maximum number of iterations. + * + * - [[org.apache.flink.ml.regression.MultipleLinearRegression.Stepsize]]: + * Initial step size for the gradient descent method. + * This value controls how far the gradient descent method moves in the opposite direction of the + * gradient. Tuning this parameter might be crucial to make it stable and to obtain a better + * performance. + * + * - [[org.apache.flink.ml.regression.MultipleLinearRegression.ConvergenceThreshold]]: + * Threshold for relative change of sum of squared residuals until convergence. + * + * - [[LearningRateMethodTrait]]: + * The method used to calculate the effective learning rate for each iteration step. See + * [[LearningRateMethod]] for all supported methods. + * + */ +class MultipleLinearRegression extends Predictor[MultipleLinearRegression] { + import org.apache.flink.ml._ + import MultipleLinearRegression._ + + // Stores the weights of the linear model after the fitting phase + var weightsOption: Option[DataSet[WeightVector]] = None + + def setIterations(iterations: Int): MultipleLinearRegression = { + parameters.add(Iterations, iterations) + this + } + + def setStepsize(stepsize: Double): MultipleLinearRegression = { + parameters.add(Stepsize, stepsize) + this + } + + def setConvergenceThreshold(convergenceThreshold: Double): MultipleLinearRegression = { + parameters.add(ConvergenceThreshold, convergenceThreshold) + this + } + + def setLearningRateMethod( + learningRateMethod: LearningRateMethodTrait) + : MultipleLinearRegression = { + parameters.add(LearningRateMethodValue, learningRateMethod) + this + } + + def squaredResidualSum(input: DataSet[LabeledVector]): DataSet[Double] = { + weightsOption match { + case Some(weights) => { + input.mapWithBcVariable(weights){ + (dataPoint, weights) => lossFunction.loss(dataPoint, weights) + }.reduce { + _ + _ + } + } + + case None => { + throw new RuntimeException("The MultipleLinearRegression has not been fitted to the " + + "data. 
This is necessary to learn the weight vector of the linear function.") + } + } + + } +} + +object MultipleLinearRegression { + + val WEIGHTVECTOR_BROADCAST = "weights_broadcast" + + val lossFunction = GenericLossFunction(SquaredLoss, LinearPrediction) + + // ====================================== Parameters ============================================= + + case object Stepsize extends Parameter[Double] { + val defaultValue = Some(0.1) + } + + case object Iterations extends Parameter[Int] { + val defaultValue = Some(10) + } + + case object ConvergenceThreshold extends Parameter[Double] { + val defaultValue = None + } + + case object LearningRateMethodValue extends Parameter[LearningRateMethodTrait] { + val defaultValue = None + } + + // ======================================== Factory methods ====================================== + + def apply(): MultipleLinearRegression = { + new MultipleLinearRegression() + } + + // ====================================== Operations ============================================= + + /** Trains the linear model to fit the training data. The resulting weight vector is stored in + * the [[MultipleLinearRegression]] instance. + * + */ + implicit val fitMLR = new FitOperation[MultipleLinearRegression, LabeledVector] { + override def fit( + instance: MultipleLinearRegression, + fitParameters: ParameterMap, + input: DataSet[LabeledVector]) + : Unit = { + val map = instance.parameters ++ fitParameters + + // retrieve parameters of the algorithm + val numberOfIterations = map(Iterations) + val stepsize = map(Stepsize) + val convergenceThreshold = map.get(ConvergenceThreshold) + val learningRateMethod = map.get(LearningRateMethodValue) + + val lossFunction = GenericLossFunction(SquaredLoss, LinearPrediction) + + val optimizer = GradientDescent() + .setIterations(numberOfIterations) + .setStepsize(stepsize) + .setLossFunction(lossFunction) + + convergenceThreshold match { + case Some(threshold) => optimizer.setConvergenceThreshold(threshold) + case None => + } + + learningRateMethod match { + case Some(method) => optimizer.setLearningRateMethod(method) + case None => + } + + instance.weightsOption = Some(optimizer.optimize(input, None)) + } + } + + implicit def predictVectors[T <: Vector] = { + new PredictOperation[MultipleLinearRegression, WeightVector, T, Double]() { + override def getModel(self: MultipleLinearRegression, predictParameters: ParameterMap) + : DataSet[WeightVector] = { + self.weightsOption match { + case Some(weights) => weights + + + case None => { + throw new RuntimeException("The MultipleLinearRegression has not been fitted to the " + + "data. This is necessary to learn the weight vector of the linear function.") + } + } + } + override def predict(value: T, model: WeightVector): Double = { + import Breeze._ + val WeightVector(weights, weight0) = model + val dotProduct = value.asBreeze.dot(weights.asBreeze) + dotProduct + weight0 + } + } + } +} diff --git a/src/test/resources/log4j-test.properties b/src/test/resources/log4j-test.properties new file mode 100644 index 0000000000000..c51bf71c0403b --- /dev/null +++ b/src/test/resources/log4j-test.properties @@ -0,0 +1,38 @@ +################################################################################ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +log4j.rootLogger=OFF, console + +# ----------------------------------------------------------------------------- +# Console (use 'console') +# ----------------------------------------------------------------------------- +log4j.appender.console=org.apache.log4j.ConsoleAppender +log4j.appender.console.layout=org.apache.log4j.PatternLayout +log4j.appender.console.layout.ConversionPattern=%d{HH:mm:ss,SSS} %-5p %-60c %x - %m%n + +# ----------------------------------------------------------------------------- +# File (use 'file') +# ----------------------------------------------------------------------------- +log4j.appender.file=org.apache.log4j.FileAppender +log4j.appender.file.file=${log.dir}/flinkML.log +log4j.appender.file.append=false +log4j.appender.file.layout=org.apache.log4j.PatternLayout +log4j.appender.file.layout.ConversionPattern=%d{HH:mm:ss,SSS} %-5p %-60c %x - %m%n + +# suppress the irrelevant (wrong) warnings from the netty channel handler +log4j.logger.org.jboss.netty.channel.DefaultChannelPipeline=ERROR, console diff --git a/src/test/resources/logback-test.xml b/src/test/resources/logback-test.xml new file mode 100644 index 0000000000000..940fe0fc61c3b --- /dev/null +++ b/src/test/resources/logback-test.xml @@ -0,0 +1,42 @@ + + + + + + %d{HH:mm:ss.SSS} [%thread] [%X{sourceThread} - %X{akkaSource}] %-5level %logger{60} - %msg%n + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/src/test/scala/org/apache/flink/ml/MLUtilsSuite.scala b/src/test/scala/org/apache/flink/ml/MLUtilsSuite.scala new file mode 100644 index 0000000000000..800857842c6d5 --- /dev/null +++ b/src/test/scala/org/apache/flink/ml/MLUtilsSuite.scala @@ -0,0 +1,108 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.flink.ml + +import java.io.File + +import org.apache.flink.api.scala._ +import org.apache.flink.core.testutils.CommonTestUtils +import org.apache.flink.ml.common.LabeledVector +import org.apache.flink.ml.math.SparseVector +import org.apache.flink.ml.util.FlinkTestBase + +import org.scalatest.{FlatSpec, Matchers} + +import scala.io.Source + +class MLUtilsSuite extends FlatSpec with Matchers with FlinkTestBase { + + behavior of "The RichExecutionEnvironment" + + it should "read a libSVM/SVMLight input file" in { + val env = ExecutionEnvironment.getExecutionEnvironment + + val content = + """ + |1 2:10.0 4:4.5 8:4.2 # foo + |-1 1:9.0 4:-4.5 7:2.4 # bar + |0.4 3:1.0 8:-5.6 10:1.0 + |-42.1 2:2.0 4:-6.1 3:5.1 # svm + """.stripMargin + + val expectedLabeledVectors = Set( + LabeledVector(1, SparseVector.fromCOO(10, (1, 10), (3, 4.5), (7, 4.2))), + LabeledVector(-1, SparseVector.fromCOO(10, (0, 9), (3, -4.5), (6, 2.4))), + LabeledVector(0.4, SparseVector.fromCOO(10, (2, 1), (7, -5.6), (9, 1))), + LabeledVector(-42.1, SparseVector.fromCOO(10, (1, 2), (3, -6.1), (2, 5.1))) + ) + + val inputFilePath = CommonTestUtils.createTempFile(content) + + val svmInput = env.readLibSVM(inputFilePath) + + val labeledVectors = svmInput.collect() + + labeledVectors.size should be(expectedLabeledVectors.size) + + for(lVector <- labeledVectors) { + expectedLabeledVectors.contains(lVector) should be(true) + } + + } + + it should "write a libSVM/SVMLight output file" in { + val env = ExecutionEnvironment.getExecutionEnvironment + + val labeledVectors = Seq( + LabeledVector(1.0, SparseVector.fromCOO(10, (1, 10), (3, 4.5), (7, 4.2))), + LabeledVector(-1.0, SparseVector.fromCOO(10, (0, 9), (3, -4.5), (6, 2.4))), + LabeledVector(0.4, SparseVector.fromCOO(10, (2, 1), (7, -5.6), (9, 1))), + LabeledVector(-42.1, SparseVector.fromCOO(10, (1, 2), (3, -6.1), (2, 5.1))) + ) + + val expectedLines = List( + "1.0 2:10.0 4:4.5 8:4.2", + "-1.0 1:9.0 4:-4.5 7:2.4", + "0.4 3:1.0 8:-5.6 10:1.0", + "-42.1 2:2.0 3:5.1 4:-6.1" + ) + + val labeledVectorsDS = env.fromCollection(labeledVectors) + + val tempFile = File.createTempFile("flink_test_", ".tmp") + val outputFilePath = tempFile.getAbsolutePath + + labeledVectorsDS.writeAsLibSVM(outputFilePath) + + env.execute() + + val src = Source.fromFile(tempFile) + + var counter = 0 + + for(l <- src.getLines()) { + expectedLines.exists(_.equals(l)) should be(true) + counter += 1 + } + + counter should be(expectedLines.size) + + tempFile.delete() + } +} diff --git a/src/test/scala/org/apache/flink/ml/classification/Classification.scala b/src/test/scala/org/apache/flink/ml/classification/Classification.scala new file mode 100644 index 0000000000000..edb1dc3155ad0 --- /dev/null +++ b/src/test/scala/org/apache/flink/ml/classification/Classification.scala @@ -0,0 +1,133 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.ml.classification + +import org.apache.flink.ml.common.LabeledVector +import org.apache.flink.ml.math.DenseVector + +object Classification { + + /** Centered data of fisheriris data set + * + */ + val trainingData = Seq[LabeledVector]( + LabeledVector(1.0000, DenseVector(-0.2060, -0.2760)), + LabeledVector(1.0000, DenseVector(-0.4060, -0.1760)), + LabeledVector(1.0000, DenseVector(-0.0060, -0.1760)), + LabeledVector(1.0000, DenseVector(-0.9060, -0.3760)), + LabeledVector(1.0000, DenseVector(-0.3060, -0.1760)), + LabeledVector(1.0000, DenseVector(-0.4060, -0.3760)), + LabeledVector(1.0000, DenseVector(-0.2060, -0.0760)), + LabeledVector(1.0000, DenseVector(-1.6060, -0.6760)), + LabeledVector(1.0000, DenseVector(-0.3060, -0.3760)), + LabeledVector(1.0000, DenseVector(-1.0060, -0.2760)), + LabeledVector(1.0000, DenseVector(-1.4060, -0.6760)), + LabeledVector(1.0000, DenseVector(-0.7060, -0.1760)), + LabeledVector(1.0000, DenseVector(-0.9060, -0.6760)), + LabeledVector(1.0000, DenseVector(-0.2060, -0.2760)), + LabeledVector(1.0000, DenseVector(-1.3060, -0.3760)), + LabeledVector(1.0000, DenseVector(-0.5060, -0.2760)), + LabeledVector(1.0000, DenseVector(-0.4060, -0.1760)), + LabeledVector(1.0000, DenseVector(-0.8060, -0.6760)), + LabeledVector(1.0000, DenseVector(-0.4060, -0.1760)), + LabeledVector(1.0000, DenseVector(-1.0060, -0.5760)), + LabeledVector(1.0000, DenseVector(-0.1060, 0.1240)), + LabeledVector(1.0000, DenseVector(-0.9060, -0.3760)), + LabeledVector(1.0000, DenseVector(-0.0060, -0.1760)), + LabeledVector(1.0000, DenseVector(-0.2060, -0.4760)), + LabeledVector(1.0000, DenseVector(-0.6060, -0.3760)), + LabeledVector(1.0000, DenseVector(-0.5060, -0.2760)), + LabeledVector(1.0000, DenseVector(-0.1060, -0.2760)), + LabeledVector(1.0000, DenseVector(0.0940, 0.0240)), + LabeledVector(1.0000, DenseVector(-0.4060, -0.1760)), + LabeledVector(1.0000, DenseVector(-1.4060, -0.6760)), + LabeledVector(1.0000, DenseVector(-1.1060, -0.5760)), + LabeledVector(1.0000, DenseVector(-1.2060, -0.6760)), + LabeledVector(1.0000, DenseVector(-1.0060, -0.4760)), + LabeledVector(1.0000, DenseVector(0.1940, -0.0760)), + LabeledVector(1.0000, DenseVector(-0.4060, -0.1760)), + LabeledVector(1.0000, DenseVector(-0.4060, -0.0760)), + LabeledVector(1.0000, DenseVector(-0.2060, -0.1760)), + LabeledVector(1.0000, DenseVector(-0.5060, -0.3760)), + LabeledVector(1.0000, DenseVector(-0.8060, -0.3760)), + LabeledVector(1.0000, DenseVector(-0.9060, -0.3760)), + LabeledVector(1.0000, DenseVector(-0.5060, -0.4760)), + LabeledVector(1.0000, DenseVector(-0.3060, -0.2760)), + LabeledVector(1.0000, DenseVector(-0.9060, -0.4760)), + LabeledVector(1.0000, DenseVector(-1.6060, -0.6760)), + LabeledVector(1.0000, DenseVector(-0.7060, -0.3760)), + LabeledVector(1.0000, DenseVector(-0.7060, -0.4760)), + LabeledVector(1.0000, DenseVector(-0.7060, -0.3760)), + LabeledVector(1.0000, DenseVector(-0.6060, -0.3760)), + LabeledVector(1.0000, DenseVector(-1.9060, -0.5760)), + LabeledVector(1.0000, DenseVector(-0.8060, -0.3760)), + LabeledVector(-1.0000, DenseVector(1.0940, 0.8240)), + LabeledVector(-1.0000, DenseVector(0.1940, 0.2240)), + LabeledVector(-1.0000, DenseVector(0.9940, 0.4240)), + LabeledVector(-1.0000, DenseVector(0.6940, 0.1240)), + LabeledVector(-1.0000, DenseVector(0.8940, 0.5240)), + LabeledVector(-1.0000, DenseVector(1.6940, 0.4240)), + LabeledVector(-1.0000, DenseVector(-0.4060, 
0.0240)), + LabeledVector(-1.0000, DenseVector(1.3940, 0.1240)), + LabeledVector(-1.0000, DenseVector(0.8940, 0.1240)), + LabeledVector(-1.0000, DenseVector(1.1940, 0.8240)), + LabeledVector(-1.0000, DenseVector(0.1940, 0.3240)), + LabeledVector(-1.0000, DenseVector(0.3940, 0.2240)), + LabeledVector(-1.0000, DenseVector(0.5940, 0.4240)), + LabeledVector(-1.0000, DenseVector(0.0940, 0.3240)), + LabeledVector(-1.0000, DenseVector(0.1940, 0.7240)), + LabeledVector(-1.0000, DenseVector(0.3940, 0.6240)), + LabeledVector(-1.0000, DenseVector(0.5940, 0.1240)), + LabeledVector(-1.0000, DenseVector(1.7940, 0.5240)), + LabeledVector(-1.0000, DenseVector(1.9940, 0.6240)), + LabeledVector(-1.0000, DenseVector(0.0940, -0.1760)), + LabeledVector(-1.0000, DenseVector(0.7940, 0.6240)), + LabeledVector(-1.0000, DenseVector(-0.0060, 0.3240)), + LabeledVector(-1.0000, DenseVector(1.7940, 0.3240)), + LabeledVector(-1.0000, DenseVector(-0.0060, 0.1240)), + LabeledVector(-1.0000, DenseVector(0.7940, 0.4240)), + LabeledVector(-1.0000, DenseVector(1.0940, 0.1240)), + LabeledVector(-1.0000, DenseVector(-0.1060, 0.1240)), + LabeledVector(-1.0000, DenseVector(-0.0060, 0.1240)), + LabeledVector(-1.0000, DenseVector(0.6940, 0.4240)), + LabeledVector(-1.0000, DenseVector(0.8940, -0.0760)), + LabeledVector(-1.0000, DenseVector(1.1940, 0.2240)), + LabeledVector(-1.0000, DenseVector(1.4940, 0.3240)), + LabeledVector(-1.0000, DenseVector(0.6940, 0.5240)), + LabeledVector(-1.0000, DenseVector(0.1940, -0.1760)), + LabeledVector(-1.0000, DenseVector(0.6940, -0.2760)), + LabeledVector(-1.0000, DenseVector(1.1940, 0.6240)), + LabeledVector(-1.0000, DenseVector(0.6940, 0.7240)), + LabeledVector(-1.0000, DenseVector(0.5940, 0.1240)), + LabeledVector(-1.0000, DenseVector(-0.1060, 0.1240)), + LabeledVector(-1.0000, DenseVector(0.4940, 0.4240)), + LabeledVector(-1.0000, DenseVector(0.6940, 0.7240)), + LabeledVector(-1.0000, DenseVector(0.1940, 0.6240)), + LabeledVector(-1.0000, DenseVector(0.1940, 0.2240)), + LabeledVector(-1.0000, DenseVector(0.9940, 0.6240)), + LabeledVector(-1.0000, DenseVector(0.7940, 0.8240)), + LabeledVector(-1.0000, DenseVector(0.2940, 0.6240)), + LabeledVector(-1.0000, DenseVector(0.0940, 0.2240)), + LabeledVector(-1.0000, DenseVector(0.2940, 0.3240)), + LabeledVector(-1.0000, DenseVector(0.4940, 0.6240)), + LabeledVector(-1.0000, DenseVector(0.1940, 0.1240)) + ) + + val expectedWeightVector = DenseVector(-1.95, -3.45) +} diff --git a/src/test/scala/org/apache/flink/ml/classification/SVMITSuite.scala b/src/test/scala/org/apache/flink/ml/classification/SVMITSuite.scala new file mode 100644 index 0000000000000..79a4fb7668c69 --- /dev/null +++ b/src/test/scala/org/apache/flink/ml/classification/SVMITSuite.scala @@ -0,0 +1,104 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.ml.classification + +import org.apache.flink.ml.util.FlinkTestBase +import org.scalatest.{FlatSpec, Matchers} +import org.apache.flink.ml.math.DenseVector + +import org.apache.flink.api.scala._ + +class SVMITSuite extends FlatSpec with Matchers with FlinkTestBase { + + behavior of "The SVM using CoCoA implementation" + + it should "train a SVM" in { + val env = ExecutionEnvironment.getExecutionEnvironment + + val svm = SVM(). + setBlocks(env.getParallelism). + setIterations(100). + setLocalIterations(100). + setRegularization(0.002). + setStepsize(0.1). + setSeed(0) + + val trainingDS = env.fromCollection(Classification.trainingData) + + svm.fit(trainingDS) + + val weightVector = svm.weightsOption.get.collect().head + + weightVector.valueIterator.zip(Classification.expectedWeightVector.valueIterator).foreach { + case (weight, expectedWeight) => + weight should be(expectedWeight +- 0.1) + } + } + + it should "make (mostly) correct predictions" in { + val env = ExecutionEnvironment.getExecutionEnvironment + + val svm = SVM(). + setBlocks(env.getParallelism). + setIterations(100). + setLocalIterations(100). + setRegularization(0.002). + setStepsize(0.1). + setSeed(0) + + val trainingDS = env.fromCollection(Classification.trainingData) + + val test = trainingDS.map(x => (x.vector, x.label)) + + svm.fit(trainingDS) + + val predictionPairs = svm.evaluate(test) + + val absoluteErrorSum = predictionPairs.collect().map{ + case (truth, prediction) => Math.abs(truth - prediction)}.sum + + absoluteErrorSum should be < 15.0 + } + + it should "be possible to get the raw decision function values" in { + val env = ExecutionEnvironment.getExecutionEnvironment + + val svm = SVM(). + setBlocks(env.getParallelism) + .setOutputDecisionFunction(false) + + val customWeights = env.fromElements(DenseVector(1.0, 1.0, 1.0)) + + svm.weightsOption = Option(customWeights) + + val test = env.fromElements(DenseVector(5.0, 5.0, 5.0)) + + val thresholdedPrediction = svm.predict(test).map(vectorLabel => vectorLabel._2).collect().head + + thresholdedPrediction should be (1.0 +- 1e-9) + + svm.setOutputDecisionFunction(true) + + val rawPrediction = svm.predict(test).map(vectorLabel => vectorLabel._2).collect().head + + rawPrediction should be (15.0 +- 1e-9) + + + } +} diff --git a/src/test/scala/org/apache/flink/ml/common/FlinkMLToolsSuite.scala b/src/test/scala/org/apache/flink/ml/common/FlinkMLToolsSuite.scala new file mode 100644 index 0000000000000..0ca04b2c05179 --- /dev/null +++ b/src/test/scala/org/apache/flink/ml/common/FlinkMLToolsSuite.scala @@ -0,0 +1,60 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+package org.apache.flink.ml.common
+
+import org.apache.flink.api.java.typeutils.runtime.kryo.KryoSerializer
+import org.apache.flink.api.scala.ExecutionEnvironment
+import org.apache.flink.ml.util.FlinkTestBase
+import org.scalatest.{FlatSpec, Matchers}
+
+class FlinkMLToolsSuite extends FlatSpec with Matchers with FlinkTestBase {
+  behavior of "FlinkMLTools"
+
+  it should "register the required types" in {
+    val env = ExecutionEnvironment.getExecutionEnvironment
+
+    FlinkMLTools.registerFlinkMLTypes(env)
+
+    val executionConfig = env.getConfig
+
+    val serializer = new KryoSerializer[Nothing](classOf[Nothing], executionConfig)
+
+    val kryo = serializer.getKryo()
+
+    kryo.getRegistration(classOf[org.apache.flink.ml.math.DenseVector]).getId > 0 should be(true)
+    kryo.getRegistration(classOf[org.apache.flink.ml.math.SparseVector]).getId > 0 should be(true)
+    kryo.getRegistration(classOf[org.apache.flink.ml.math.DenseMatrix]).getId > 0 should be(true)
+    kryo.getRegistration(classOf[org.apache.flink.ml.math.SparseMatrix]).getId > 0 should be(true)
+
+    kryo.getRegistration(classOf[breeze.linalg.DenseMatrix[_]]).getId > 0 should be(true)
+    kryo.getRegistration(classOf[breeze.linalg.CSCMatrix[_]]).getId > 0 should be(true)
+    kryo.getRegistration(classOf[breeze.linalg.DenseVector[_]]).getId > 0 should be(true)
+    kryo.getRegistration(classOf[breeze.linalg.SparseVector[_]]).getId > 0 should be(true)
+
+    kryo.getRegistration(breeze.linalg.DenseVector.zeros[Double](0).getClass).getId > 0 should
+      be(true)
+    kryo.getRegistration(breeze.linalg.SparseVector.zeros[Double](0).getClass).getId > 0 should
+      be(true)
+    kryo.getRegistration(breeze.linalg.DenseMatrix.zeros[Double](0, 0).getClass).getId > 0 should
+      be(true)
+    kryo.getRegistration(breeze.linalg.CSCMatrix.zeros[Double](0, 0).getClass).getId > 0 should
+      be(true)
+  }
+
+}
diff --git a/src/test/scala/org/apache/flink/ml/math/BreezeMathSuite.scala b/src/test/scala/org/apache/flink/ml/math/BreezeMathSuite.scala
new file mode 100644
index 0000000000000..0fcdaf073d92a
--- /dev/null
+++ b/src/test/scala/org/apache/flink/ml/math/BreezeMathSuite.scala
@@ -0,0 +1,98 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +package org.apache.flink.ml.math + +import Breeze._ +import breeze.linalg + +import org.scalatest.{Matchers, FlatSpec} + +class BreezeMathSuite extends FlatSpec with Matchers { + + behavior of "Breeze vector conversion" + + it should "convert a DenseMatrix into breeze.linalg.DenseMatrix and vice versa" in { + val numRows = 5 + val numCols = 4 + + val data = Array.range(0, numRows * numCols) + val expectedData = Array.range(0, numRows * numCols).map(_ * 2) + + val denseMatrix = DenseMatrix(numRows, numCols, data) + val expectedMatrix = DenseMatrix(numRows, numCols, expectedData) + + val m = denseMatrix.asBreeze + + val result = (m * 2.0).fromBreeze + + result should equal(expectedMatrix) + } + + it should "convert a SparseMatrix into breeze.linalg.CSCMatrix" in { + val numRows = 5 + val numCols = 4 + + val sparseMatrix = SparseMatrix.fromCOO(numRows, numCols, + (0, 1, 1), + (4, 3, 13), + (3, 2, 45), + (4, 0, 12)) + + val expectedMatrix = SparseMatrix.fromCOO(numRows, numCols, + (0, 1, 2), + (4, 3, 26), + (3, 2, 90), + (4, 0, 24)) + + val sm = sparseMatrix.asBreeze + + val result = (sm * 2.0).fromBreeze + + result should equal(expectedMatrix) + } + + it should "convert a dense Flink vector into a dense Breeze vector and vice versa" in { + val vector = DenseVector(1, 2, 3) + + val breezeVector = vector.asBreeze + + val flinkVector = breezeVector.fromBreeze + + breezeVector.getClass should be(new linalg.DenseVector[Double](0).getClass()) + flinkVector.getClass should be (classOf[DenseVector]) + + flinkVector should equal(vector) + } + + it should "convert a sparse Flink vector into a sparse Breeze vector and given the right " + + "converter back into a dense Flink vector" in { + implicit val converter = implicitly[BreezeVectorConverter[DenseVector]] + + val vector = SparseVector.fromCOO(3, (1, 1.0), (2, 2.0)) + + val breezeVector = vector.asBreeze + + val flinkVector = breezeVector.fromBreeze + + breezeVector.getClass should be(new linalg.SparseVector[Double](null).getClass()) + flinkVector.getClass should be (classOf[DenseVector]) + + flinkVector.equalsVector(vector) should be(true) + } +} diff --git a/src/test/scala/org/apache/flink/ml/math/DenseMatrixSuite.scala b/src/test/scala/org/apache/flink/ml/math/DenseMatrixSuite.scala new file mode 100644 index 0000000000000..684a1df9419f5 --- /dev/null +++ b/src/test/scala/org/apache/flink/ml/math/DenseMatrixSuite.scala @@ -0,0 +1,86 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.flink.ml.math + +import org.scalatest.{Matchers, FlatSpec} + +class DenseMatrixSuite extends FlatSpec with Matchers { + + behavior of "Flink's DenseMatrix" + + it should "contain the initialization data" in { + val numRows = 10 + val numCols = 13 + + val data = Array.range(0, numRows*numCols) + + val matrix = DenseMatrix(numRows, numCols, data) + + assertResult(numRows)(matrix.numRows) + assertResult(numCols)(matrix.numCols) + + for(row <- 0 until numRows; col <- 0 until numCols) { + assertResult(data(col*numRows + row))(matrix(row, col)) + } + } + + it should "fail in case of invalid element access" in { + val numRows = 10 + val numCols = 13 + + val matrix = DenseMatrix.zeros(numRows, numCols) + + intercept[IllegalArgumentException] { + matrix(-1, 2) + } + + intercept[IllegalArgumentException] { + matrix(0, -1) + } + + intercept[IllegalArgumentException] { + matrix(numRows, 0) + } + + intercept[IllegalArgumentException] { + matrix(0, numCols) + } + + intercept[IllegalArgumentException] { + matrix(numRows, numCols) + } + } + + it should "be copyable" in { + val numRows = 4 + val numCols = 5 + + val data = Array.range(0, numRows*numCols) + + val denseMatrix = DenseMatrix.apply(numRows, numCols, data) + + val copy = denseMatrix.copy + + denseMatrix should equal(copy) + + copy(0, 0) = 1 + + denseMatrix should not equal copy + } +} diff --git a/src/test/scala/org/apache/flink/ml/math/DenseVectorSuite.scala b/src/test/scala/org/apache/flink/ml/math/DenseVectorSuite.scala new file mode 100644 index 0000000000000..add5947e44f68 --- /dev/null +++ b/src/test/scala/org/apache/flink/ml/math/DenseVectorSuite.scala @@ -0,0 +1,176 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.flink.ml.math + +import org.scalatest.{Matchers, FlatSpec} + +class DenseVectorSuite extends FlatSpec with Matchers { + + behavior of "Flink's DenseVector" + + it should "contain the initialization data" in { + val data = Array.range(1, 10) + + val vector = DenseVector(data) + + assertResult(data.length)(vector.size) + + data.zip(vector.map(_._2)).foreach { case (expected, actual) => assertResult(expected)(actual) } + } + + it should "fail in case of an illegal element access" in { + val size = 10 + + val vector = DenseVector.zeros(size) + + intercept[IllegalArgumentException] { + vector(-1) + } + + intercept[IllegalArgumentException] { + vector(size) + } + } + + it should "calculate dot product with DenseVector" in { + val vec1 = DenseVector(Array(1, 0, 1)) + val vec2 = DenseVector(Array(0, 1, 0)) + + vec1.dot(vec2) should be(0) + } + + it should "calculate dot product with SparseVector" in { + val vec1 = DenseVector(Array(1, 0, 1)) + val vec2 = SparseVector.fromCOO(3, (0, 1), (1, 1)) + + vec1.dot(vec2) should be(1) + } + + it should "calculate dot product with SparseVector 2" in { + val vec1 = DenseVector(Array(1, 0, 1, 0, 0)) + val vec2 = SparseVector.fromCOO(5, (2, 1), (4, 1)) + + vec1.dot(vec2) should be(1) + } + + it should "fail in case of calculation dot product with different size vector" in { + val vec1 = DenseVector(Array(1, 0)) + val vec2 = DenseVector(Array(0)) + + intercept[IllegalArgumentException] { + vec1.dot(vec2) + } + } + + it should "calculate outer product with DenseVector correctly as DenseMatrix" in { + val vec1 = DenseVector(Array(1, 0, 1)) + val vec2 = DenseVector(Array(0, 1, 0)) + + vec1.outer(vec2) should be(an[DenseMatrix]) + vec1.outer(vec2) should be(DenseMatrix(3, 3, Array(0, 1, 0, 0, 0, 0, 0, 1, 0))) + } + + it should "calculate outer product with SparseVector correctly as SparseMatrix" in { + val vec1 = DenseVector(Array(1, 0, 1)) + val vec2 = SparseVector(3, Array(1), Array(1)) + + vec1.outer(vec2) should be(an[SparseMatrix]) + vec1.outer(vec2) should be(SparseMatrix.fromCOO(3, 3, (0, 1, 1), (2, 1, 1))) + } + + it should "calculate outer product with a DenseVector correctly as DenseMatrix 2" in { + val vec1 = DenseVector(Array(1, 0, 1, 0, 0)) + val vec2 = DenseVector(Array(0, 0, 1, 0, 1)) + + val values = Array(0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) + vec1.outer(vec2) should be(DenseMatrix(5, 5, values)) + } + + it should "calculate outer product with a SparseVector correctly as SparseMatrix 2" in { + val vec1 = DenseVector(Array(1, 0, 1, 0, 0)) + val vec2 = SparseVector.fromCOO(5, (2, 1), (4, 1)) + + val entries = Iterable((0, 2, 1.0), (0, 4, 1.0), (2, 2, 1.0), (2, 4, 1.0)) + vec1.outer(vec2) should be(SparseMatrix.fromCOO(5, 5, entries)) + } + + it should "DenseVector right outer product with one-dimensional DenseVector as identity" in { + val vec = DenseVector(Array(1, 0, 1, 0, 0)) + val unit = DenseVector(1) + + vec.outer(unit) should equal(DenseMatrix(vec.size, 1, vec.data)) + } + + it should "DenseVector right outer product with one-dimensional SparseVector as identity" in { + val vec = DenseVector(Array(1, 0, 1, 0, 0)) + val unit = SparseVector(1, Array(0), Array(1)) + + vec.outer(unit) should equal(SparseMatrix.fromCOO(vec.size, 1, (0, 0, 1), (2, 0, 1))) + } + + it should "DenseVector left outer product with one-dimensional unit DenseVector as identity" in { + val vec = DenseVector(Array(1, 2, 3, 4, 5)) + val unit = DenseVector(1) + + unit.outer(vec) should equal(DenseMatrix(1, 
vec.size, vec.data)) + } + + it should "SparseVector left outer product with one-dimensional unit DenseVector as identity" in { + val vec = SparseVector(5, Array(0, 1, 2, 3, 4), Array(1, 2, 3, 4, 5)) + val unit = DenseVector(1) + + val entries = Iterable((0, 0, 1.0), (0, 1, 2.0), (0, 2, 3.0), (0, 3, 4.0), (0, 4, 5.0)) + unit.outer(vec) should equal(SparseMatrix.fromCOO(1, vec.size, entries)) + } + + it should "DenseVector outer product via multiplication if both vectors are one-dimensional" in { + val vec1 = DenseVector(Array(2)) + val vec2 = DenseVector(Array(3)) + + vec1.outer(vec2) should be(DenseMatrix(1, 1, 2 * 3)) + } + + it should "SparseVector outer product via multiplication if both vectors are one-dimensional" in { + val vec1 = DenseVector(Array(2)) + val vec2 = SparseVector(1, Array(0), Array(3)) + + vec1.outer(vec2) should be(SparseMatrix.fromCOO(1, 1, (0, 0, 2 * 3))) + } + + it should "calculate magnitude of vector" in { + val vec = DenseVector(Array(1, 4, 8)) + + vec.magnitude should be(Math.sqrt((1 * 1) + (4 * 4) + (8 * 8))) + } + + it should "convert from and to Breeze vector" in { + import Breeze._ + + val flinkVector = DenseVector(1, 2, 3) + val breezeVector = breeze.linalg.DenseVector.apply(1.0, 2.0, 3.0) + + // use the vector BreezeVectorConverter + flinkVector should equal(breezeVector.fromBreeze) + + // use the sparse vector BreezeVectorConverter + flinkVector should equal(breezeVector.fromBreeze(DenseVector.denseVectorConverter)) + + flinkVector.asBreeze should be(breezeVector) + } +} diff --git a/src/test/scala/org/apache/flink/ml/math/SparseMatrixSuite.scala b/src/test/scala/org/apache/flink/ml/math/SparseMatrixSuite.scala new file mode 100644 index 0000000000000..68f5571ba0db7 --- /dev/null +++ b/src/test/scala/org/apache/flink/ml/math/SparseMatrixSuite.scala @@ -0,0 +1,134 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.flink.ml.math + +import org.scalatest.{Matchers, FlatSpec} + +class SparseMatrixSuite extends FlatSpec with Matchers { + + behavior of "Flink's SparseMatrix" + + it should "contain a single element provided as a coordinate list (COO)" in { + val sparseMatrix = SparseMatrix.fromCOO(4, 4, (0, 0, 1)) + + sparseMatrix(0, 0) should equal(1) + + for(i <- 1 until sparseMatrix.size) { + val row = i / sparseMatrix.numCols + val col = i % sparseMatrix.numCols + + sparseMatrix(row, col) should equal(0) + } + } + + it should "be initialized from a coordinate list representation (COO)" in { + val data = List[(Int, Int, Double)]((0, 0, 0), (0, 1, 0), (3, 4, 43), (2, 1, 17), + (3, 3, 88), (4 , 2, 99), (1, 4, 91), (3, 4, -1)) + + val numRows = 5 + val numCols = 5 + + val sparseMatrix = SparseMatrix.fromCOO(numRows, numCols, data) + + val expectedSparseMatrix = SparseMatrix.fromCOO(5, 5, (3, 4, 42), (2, 1, 17), (3, 3, 88), + (4, 2, 99), (1, 4, 91), (0, 0, 0), (0, 1, 0)) + + val expectedDenseMatrix = DenseMatrix.zeros(5, 5) + expectedDenseMatrix(3, 4) = 42 + expectedDenseMatrix(2, 1) = 17 + expectedDenseMatrix(3, 3) = 88 + expectedDenseMatrix(4, 2) = 99 + expectedDenseMatrix(1, 4) = 91 + + sparseMatrix should equal(expectedSparseMatrix) + sparseMatrix.equalsMatrix(expectedDenseMatrix) should be(true) + + sparseMatrix.toDenseMatrix.data.sameElements(expectedDenseMatrix.data) should be(true) + + val dataMap = data. + map{ case (row, col, value) => (row, col) -> value }. + groupBy{_._1}. + mapValues{ + entries => + entries.map(_._2).sum + } + + for(row <- 0 until numRows; col <- 0 until numCols) { + sparseMatrix(row, col) should be(dataMap.getOrElse((row, col), 0)) + } + + // test access to defined field even though it was set to 0 + sparseMatrix(0, 1) = 10 + + // test that a non-defined field is not accessible + intercept[IllegalArgumentException]{ + sparseMatrix(1, 1) = 1 + } + } + + it should "fail when accessing zero elements or using invalid indices" in { + val data = List[(Int, Int, Double)]((0, 0, 0), (0, 1, 0), (3, 4, 43), (2, 1, 17), + (3, 3, 88), (4 , 2, 99), (1, 4, 91), (3, 4, -1)) + + val numRows = 5 + val numCols = 5 + + val sparseMatrix = SparseMatrix.fromCOO(numRows, numCols, data) + + intercept[IllegalArgumentException] { + sparseMatrix(-1, 4) + } + + intercept[IllegalArgumentException] { + sparseMatrix(numRows, 0) + } + + intercept[IllegalArgumentException] { + sparseMatrix(0, numCols) + } + + intercept[IllegalArgumentException] { + sparseMatrix(3, -1) + } + } + + it should "fail when elements of the COO list have invalid indices" in { + intercept[IllegalArgumentException]{ + val sparseMatrix = SparseMatrix.fromCOO(5 ,5, (5, 0, 10), (0, 0, 0), (0, 1, 0), (3, 4, 43), + (2, 1, 17)) + } + + intercept[IllegalArgumentException]{ + val sparseMatrix = SparseMatrix.fromCOO(5, 5, (0, 0, 0), (0, 1, 0), (3, 4, 43), (2, 1, 17), + (-1, 4, 20)) + } + } + + it should "be copyable" in { + val sparseMatrix = SparseMatrix.fromCOO(4, 4, (0, 1, 2), (2, 3, 1), (2, 0, 42), (1, 3, 3)) + + val copy = sparseMatrix.copy + + sparseMatrix should equal(copy) + + copy(2, 3) = 2 + + sparseMatrix should not equal copy + } +} diff --git a/src/test/scala/org/apache/flink/ml/math/SparseVectorSuite.scala b/src/test/scala/org/apache/flink/ml/math/SparseVectorSuite.scala new file mode 100644 index 0000000000000..bf2e38ae45d6e --- /dev/null +++ b/src/test/scala/org/apache/flink/ml/math/SparseVectorSuite.scala @@ -0,0 +1,227 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more 
contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.ml.math + +import org.scalatest.{Matchers, FlatSpec} + +class SparseVectorSuite extends FlatSpec with Matchers { + + behavior of "Flink's SparseVector" + + it should "contain a single element provided as coordinate list (COO)" in { + val sparseVector = SparseVector.fromCOO(3, (0, 1)) + + sparseVector(0) should equal(1) + + for (index <- 1 until 3) { + sparseVector(index) should equal(0) + } + } + + it should "contain the initialization data provided as coordinate list (COO)" in { + val data = List[(Int, Double)]((0, 1), (2, 0), (4, 42), (0, 3)) + val size = 5 + val sparseVector = SparseVector.fromCOO(size, data) + + val expectedSparseVector = SparseVector.fromCOO(5, (0, 4), (4, 42), (2, 0)) + val expectedDenseVector = DenseVector.zeros(5) + + expectedDenseVector(0) = 4 + expectedDenseVector(4) = 42 + + sparseVector should equal(expectedSparseVector) + sparseVector.equalsVector(expectedDenseVector) should be(true) + + val denseVector = sparseVector.toDenseVector + + denseVector should equal(expectedDenseVector) + + val dataMap = data. + groupBy { + _._1 + }. 
+ mapValues { + entries => + entries.map(_._2).sum + } + + for (index <- 0 until size) { + sparseVector(index) should be(dataMap.getOrElse(index, 0)) + } + } + + it should "fail when accessing elements using an invalid index" in { + val sparseVector = SparseVector.fromCOO(5, (1, 1), (3, 3), (4, 4)) + + intercept[IllegalArgumentException] { + sparseVector(-1) + } + + intercept[IllegalArgumentException] { + sparseVector(5) + } + } + + it should "fail when the COO list contains elements with invalid indices" in { + intercept[IllegalArgumentException] { + val sparseVector = SparseVector.fromCOO(5, (0, 1), (-1, 34), (3, 2)) + } + + intercept[IllegalArgumentException] { + val sparseVector = SparseVector.fromCOO(5, (0, 1), (4, 3), (5, 1)) + } + } + + it should "be copyable" in { + val sparseVector = SparseVector.fromCOO(5, (0, 1), (4, 3), (3, 2)) + + val copy = sparseVector.copy + + sparseVector should equal(copy) + + copy(3) = 3 + + sparseVector should not equal (copy) + } + + it should "calculate dot product with SparseVector" in { + val vec1 = SparseVector.fromCOO(4, (0, 1), (2, 1)) + val vec2 = SparseVector.fromCOO(4, (1, 1), (3, 1)) + + vec1.dot(vec2) should be(0) + } + + it should "calculate dot product with SparseVector 2" in { + val vec1 = SparseVector.fromCOO(5, (2, 3), (4, 1)) + val vec2 = SparseVector.fromCOO(5, (4, 2), (2, 1)) + + vec1.dot(vec2) should be(5) + } + + it should "calculate dot product with DenseVector" in { + val vec1 = SparseVector.fromCOO(4, (0, 1), (2, 1)) + val vec2 = DenseVector(Array(0, 1, 0, 1)) + + vec1.dot(vec2) should be(0) + } + + it should "fail in case of calculation dot product with different size vector" in { + val vec1 = SparseVector.fromCOO(4, (0, 1), (2, 1)) + val vec2 = DenseVector(Array(0, 1, 0)) + + intercept[IllegalArgumentException] { + vec1.dot(vec2) + } + } + + it should "calculate outer product with SparseVector correctly as SparseMatrix" in { + val vec1 = SparseVector(3, Array(0, 2), Array(1, 1)) + val vec2 = SparseVector(3, Array(1), Array(1)) + + vec1.outer(vec2) should be(an[SparseMatrix]) + vec1.outer(vec2) should be(SparseMatrix.fromCOO(3, 3, (0, 1, 1), (2, 1, 1))) + } + + it should "calculate outer product with DenseVector correctly as SparseMatrix" in { + val vec1 = SparseVector(3, Array(0, 2), Array(1, 1)) + val vec2 = DenseVector(Array(0, 1, 0)) + + vec1.outer(vec2) should be(an[SparseMatrix]) + vec1.outer(vec2) should be(SparseMatrix.fromCOO(3, 3, (0, 1, 1), (2, 1, 1))) + } + + it should "calculate outer product with a DenseVector correctly as SparseMatrix 2" in { + val vec1 = SparseVector(5, Array(0, 2), Array(1, 1)) + val vec2 = DenseVector(Array(0, 0, 1, 0, 1)) + + val entries = Iterable((0, 2, 1.0), (0, 4, 1.0), (2, 2, 1.0), (2, 4, 1.0)) + vec1.outer(vec2) should be(SparseMatrix.fromCOO(5, 5, entries)) + } + + it should "calculate outer product with a SparseVector correctly as SparseMatrix 2" in { + val vec1 = SparseVector(5, Array(0, 2), Array(1, 1)) + val vec2 = SparseVector.fromCOO(5, (2, 1), (4, 1)) + + val entries = Iterable((0, 2, 1.0), (0, 4, 1.0), (2, 2, 1.0), (2, 4, 1.0)) + vec1.outer(vec2) should be(SparseMatrix.fromCOO(5, 5, entries)) + } + + + it should s"""calculate right outer product with DenseVector + |with one-dimensional unit DenseVector as identity""".stripMargin in { + val vec = SparseVector(5, Array(0, 2), Array(1, 1)) + val unit = DenseVector(1) + + vec.outer(unit) should equal(SparseMatrix.fromCOO(vec.size, 1, (0, 0, 1), (2, 0, 1))) + } + + it should s"""calculate right outer product with DenseVector + |with 
one-dimensional unit SparseVector as identity""".stripMargin in { + val vec = SparseVector(5, Array(0, 2), Array(1, 1)) + val unit = SparseVector(1, Array(0), Array(1)) + + vec.outer(unit) should equal(SparseMatrix.fromCOO(vec.size, 1, (0, 0, 1), (2, 0, 1))) + } + + it should s"""calculate left outer product for SparseVector + |with one-dimensional unit DenseVector as identity""".stripMargin in { + val vec = SparseVector(5, Array(0, 1, 2, 3, 4), Array(1, 2, 3, 4, 5)) + val unit = DenseVector(1) + + val entries = Iterable((0, 0, 1.0), (0, 1, 2.0), (0, 2, 3.0), (0, 3, 4.0), (0, 4, 5.0)) + unit.outer(vec) should equal(SparseMatrix.fromCOO(1, vec.size, entries)) + } + + it should s"""calculate outer product with SparseVector + |via multiplication if both vectors are one-dimensional""".stripMargin in { + val vec1 = SparseVector.fromCOO(1, (0, 2)) + val vec2 = SparseVector.fromCOO(1, (0, 3)) + + vec1.outer(vec2) should be(SparseMatrix.fromCOO(1, 1, (0, 0, 2 * 3))) + } + + it should s"""calculate outer product with DenseVector + |via multiplication if both vectors are one-dimensional""".stripMargin in { + val vec1 = SparseVector(1, Array(0), Array(2)) + val vec2 = DenseVector(Array(3)) + + vec1.outer(vec2) should be(SparseMatrix.fromCOO(1, 1, (0, 0, 2 * 3))) + } + + it should "calculate magnitude of vector" in { + val vec = SparseVector.fromCOO(3, (0, 1), (1, 4), (2, 8)) + + vec.magnitude should be(9) + } + + it should "convert from and to Breeze vectors" in { + import Breeze._ + + val flinkVector = SparseVector.fromCOO(3, (1, 1.0), (2, 2.0)) + val breezeVector = breeze.linalg.SparseVector(3)(1 -> 1.0, 2 -> 2.0) + + // use the vector BreezeVectorConverter + flinkVector should equal(breezeVector.fromBreeze) + + // use the sparse vector BreezeVectorConverter + flinkVector should equal(breezeVector.fromBreeze(SparseVector.sparseVectorConverter)) + + flinkVector.asBreeze should be(breezeVector) + } +} diff --git a/src/test/scala/org/apache/flink/ml/math/distributed/DistributedRowMatrixSuite.scala b/src/test/scala/org/apache/flink/ml/math/distributed/DistributedRowMatrixSuite.scala new file mode 100644 index 0000000000000..cc4339e182d81 --- /dev/null +++ b/src/test/scala/org/apache/flink/ml/math/distributed/DistributedRowMatrixSuite.scala @@ -0,0 +1,104 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.flink.ml.math.distributed + +import org.apache.flink.api.scala._ +import org.apache.flink.ml.util.FlinkTestBase +import org.scalatest.{FlatSpec, Matchers} + +class DistributedRowMatrixSuite extends FlatSpec with Matchers with FlinkTestBase { + behavior of "DistributedRowMatrix" + + val rawSampleData = List( + (0, 0, 3.0), + (0, 1, 3.0), + (0, 3, 4.0), + (2, 3, 4.0), + (1, 4, 3.0), + (1, 1, 3.0), + (2, 1, 3.0), + (2, 2, 3.0) + ) + + it should "contain the initialization data" in { + val env = ExecutionEnvironment.getExecutionEnvironment + val rowDataset = env.fromCollection(rawSampleData) + val dmatrix = DistributedRowMatrix.fromCOO(rowDataset, 3, 5) + + dmatrix.toCOO.toSet.filter(_._3 != 0) shouldBe rawSampleData.toSet + } + + it should "return the correct dimensions when provided by the user" in { + val env = ExecutionEnvironment.getExecutionEnvironment + val rowDataset = env.fromCollection(rawSampleData) + val dmatrix = DistributedRowMatrix.fromCOO(rowDataset, 3, 5) + + dmatrix.numCols shouldBe 5 + dmatrix.numRows shouldBe 3 + } + + + it should "return a sparse local matrix containing the initialization data" in { + val env = ExecutionEnvironment.getExecutionEnvironment + val rowDataset = env.fromCollection(rawSampleData) + val dmatrix = DistributedRowMatrix.fromCOO(rowDataset, 3, 5) + + dmatrix.toLocalSparseMatrix.iterator.filter(_._3 != 0).toSet shouldBe rawSampleData.toSet + } + + it should "return a dense local matrix containing the initialization data" in { + val env = ExecutionEnvironment.getExecutionEnvironment + val rowDataset = env.fromCollection(rawSampleData) + val dmatrix = DistributedRowMatrix.fromCOO(rowDataset, 3, 5) + + dmatrix.toLocalDenseMatrix.iterator.filter(_._3 != 0).toSet shouldBe rawSampleData.toSet + } + + "add" should "correctly add two distributed row matrices" in { + val env = ExecutionEnvironment.getExecutionEnvironment + val rawSampleSum1 = List( + (0, 0, 1.0), + (7, 4, 3.0), + (0, 1, 8.0), + (2, 8, 12.0) + ) + + val rawSampleSum2 = List( + (0, 0, 2.0), + (3, 4, 4.0), + (2, 8, 8.0) + ) + + val addBlockMatrix1 = DistributedRowMatrix.fromCOO(env.fromCollection(rawSampleSum1), 10, 10) + val addBlockMatrix2 = DistributedRowMatrix.fromCOO(env.fromCollection(rawSampleSum2), 10, 10) + + val expected = List( + (0, 0, 3.0), + (0, 1, 8.0), + (3, 4, 4.0), + (2, 8, 20.0), + (7, 4, 3.0) + ) + val result = addBlockMatrix1 + .add(addBlockMatrix2) + .toLocalSparseMatrix + .filter(_._3 != 0.0) + result.toSet shouldEqual expected.toSet + } +} diff --git a/src/test/scala/org/apache/flink/ml/metrics/distances/DistanceMetricSuite.scala b/src/test/scala/org/apache/flink/ml/metrics/distances/DistanceMetricSuite.scala new file mode 100644 index 0000000000000..dc94d265bf4f4 --- /dev/null +++ b/src/test/scala/org/apache/flink/ml/metrics/distances/DistanceMetricSuite.scala @@ -0,0 +1,95 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.ml.metrics.distances + +import java.io.{ByteArrayInputStream, ByteArrayOutputStream, ObjectInputStream, ObjectOutputStream} + +import org.apache.flink.ml.math.DenseVector +import org.scalatest.{FlatSpec, Matchers} + +class DistanceMetricSuite extends FlatSpec with Matchers { + val EPSILON = 1e-8 + + behavior of "Distance Measures" + + it should "calculate Euclidean distance correctly" in { + val vec1 = DenseVector(1, 9) + val vec2 = DenseVector(5, 6) + + EuclideanDistanceMetric().distance(vec1, vec2) should be(5) + } + + it should "calculate square value of Euclidean distance correctly" in { + val vec1 = DenseVector(1, 9) + val vec2 = DenseVector(5, 6) + + SquaredEuclideanDistanceMetric().distance(vec1, vec2) should be(25) + } + + it should "calculate Chebyshev distance correctly" in { + val vec1 = DenseVector(0, 3, 6) + val vec2 = DenseVector(0, 0, 0) + + ChebyshevDistanceMetric().distance(vec1, vec2) should be(6) + } + + it should "calculate Cosine distance correctly" in { + val vec1 = DenseVector(1, 0) + val vec2 = DenseVector(2, 2) + + CosineDistanceMetric().distance(vec1, vec2) should be((1 - math.sqrt(2) / 2) +- EPSILON) + } + + it should "calculate Manhattan distance correctly" in { + val vec1 = DenseVector(0, 0, 0, 1, 1, 1) + val vec2 = DenseVector(1, 1, 1, 0, 0, 0) + + ManhattanDistanceMetric().distance(vec1, vec2) should be(6) + } + + it should "calculate Minkowski distance correctly" in { + val vec1 = DenseVector(0, 0, 1, 1, 0) + val vec2 = DenseVector(1, 1, 0, 1, 2) + + MinkowskiDistanceMetric(3).distance(vec1, vec2) should be(math.pow(11, 1.0 / 3) +- EPSILON) + } + + it should "calculate Tanimoto distance correctly" in { + val vec1 = DenseVector(0, 1, 1) + val vec2 = DenseVector(1, 1, 0) + + TanimotoDistanceMetric().distance(vec1, vec2) should be(1 - (1.0 / (2 + 2 - 1)) +- EPSILON) + } + + it should "be serialized" in { + val metric = EuclideanDistanceMetric() + val byteOutput = new ByteArrayOutputStream() + val output = new ObjectOutputStream(byteOutput) + + output.writeObject(metric) + output.close() + + val byteInput = new ByteArrayInputStream(byteOutput.toByteArray) + val input = new ObjectInputStream(byteInput) + + val restoredMetric = input.readObject().asInstanceOf[DistanceMetric] + + restoredMetric should be(an[EuclideanDistanceMetric]) + } +} diff --git a/src/test/scala/org/apache/flink/ml/nn/KNNITSuite.scala b/src/test/scala/org/apache/flink/ml/nn/KNNITSuite.scala new file mode 100644 index 0000000000000..6244ee3d513e7 --- /dev/null +++ b/src/test/scala/org/apache/flink/ml/nn/KNNITSuite.scala @@ -0,0 +1,117 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.ml.nn + +import org.apache.flink.api.common.operators.base.CrossOperatorBase.CrossHint +import org.apache.flink.api.scala._ +import org.apache.flink.ml.classification.Classification +import org.apache.flink.ml.math.DenseVector +import org.apache.flink.ml.metrics.distances.{ManhattanDistanceMetric, +SquaredEuclideanDistanceMetric} +import org.apache.flink.ml.util.FlinkTestBase +import org.scalatest.{FlatSpec, Matchers} + +class KNNITSuite extends FlatSpec with Matchers with FlinkTestBase { + behavior of "The KNN Join Implementation" + + it should "throw an exception when the given K is not valid" in { + intercept[IllegalArgumentException] { + KNN().setK(0) + } + } + + it should "throw an exception when the given count of blocks is not valid" in { + intercept[IllegalArgumentException] { + KNN().setBlocks(0) + } + } + + // calculate answer + val answer = Classification.trainingData.map { + v => (v.vector, SquaredEuclideanDistanceMetric().distance(DenseVector(0.0, 0.0), v.vector)) + }.sortBy(_._2).take(3).map(_._1).toArray + + it should "calculate kNN join correctly without using a Quadtree" in { + val env = ExecutionEnvironment.getExecutionEnvironment + + // prepare data + val trainingSet = env.fromCollection(Classification.trainingData).map(_.vector) + val testingSet = env.fromElements(DenseVector(0.0, 0.0)) + + val knn = KNN() + .setK(3) + .setBlocks(10) + .setDistanceMetric(SquaredEuclideanDistanceMetric()) + .setUseQuadTree(false) + .setSizeHint(CrossHint.SECOND_IS_SMALL) + + // run knn join + knn.fit(trainingSet) + val result = knn.predict(testingSet).collect() + + result.size should be(1) + result.head._1 should be(DenseVector(0.0, 0.0)) + result.head._2 should be(answer) + } + + it should "calculate kNN join correctly with a Quadtree" in { + val env = ExecutionEnvironment.getExecutionEnvironment + + // prepare data + val trainingSet = env.fromCollection(Classification.trainingData).map(_.vector) + val testingSet = env.fromElements(DenseVector(0.0, 0.0)) + + val knn = KNN() + .setK(3) + .setBlocks(2) // blocks set to 2 to make sure initial quadtree box is partitioned + .setDistanceMetric(SquaredEuclideanDistanceMetric()) + .setUseQuadTree(true) + .setSizeHint(CrossHint.SECOND_IS_SMALL) + + // run knn join + knn.fit(trainingSet) + val result = knn.predict(testingSet).collect() + + result.size should be(1) + result.head._1 should be(DenseVector(0.0, 0.0)) + result.head._2 should be(answer) + } + + it should "throw an exception when using a Quadtree with an incompatible metric" in { + intercept[IllegalArgumentException] { + val env = ExecutionEnvironment.getExecutionEnvironment + + // prepare data + val trainingSet = env.fromCollection(Classification.trainingData).map(_.vector) + val testingSet = env.fromElements(DenseVector(0.0, 0.0)) + + val knn = KNN() + .setK(3) + .setBlocks(10) + .setDistanceMetric(ManhattanDistanceMetric()) + .setUseQuadTree(true) + + // run knn join + knn.fit(trainingSet) + knn.predict(testingSet).collect() + } + } + +} + diff --git a/src/test/scala/org/apache/flink/ml/nn/QuadTreeSuite.scala 
b/src/test/scala/org/apache/flink/ml/nn/QuadTreeSuite.scala
new file mode 100644
index 0000000000000..e3a510d7428da
--- /dev/null
+++ b/src/test/scala/org/apache/flink/ml/nn/QuadTreeSuite.scala
@@ -0,0 +1,93 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.flink.ml.nn
+
+import org.apache.flink.ml.metrics.distances.EuclideanDistanceMetric
+import org.apache.flink.ml.math.{Vector, DenseVector}
+import org.apache.flink.ml.util.FlinkTestBase
+
+import org.scalatest.{Matchers, FlatSpec}
+
+/** Tests of the [[QuadTree]] class
+ *
+ * The [[QuadTree]] under test is constructed as:
+ * {{{
+ * new QuadTree(minVec, maxVec, EuclideanDistanceMetric(), 3)
+ * }}}
+ */
+
+class QuadTreeSuite extends FlatSpec with Matchers with FlinkTestBase {
+  behavior of "QuadTree Class"
+
+  it should "partition into equal size sub-boxes and search for nearby objects properly" in {
+    val minVec = DenseVector(-1.0, -0.5)
+    val maxVec = DenseVector(1.0, 0.5)
+
+    val myTree = new QuadTree(minVec, maxVec, EuclideanDistanceMetric(), 3)
+
+    myTree.insert(DenseVector(-0.25, 0.3).asInstanceOf[Vector])
+    myTree.insert(DenseVector(-0.20, 0.31).asInstanceOf[Vector])
+    myTree.insert(DenseVector(-0.21, 0.29).asInstanceOf[Vector])
+
+    /* Tree will partition once the 4th point is added */
+    myTree.insert(DenseVector(0.2, 0.27).asInstanceOf[Vector])
+    myTree.insert(DenseVector(0.2, 0.26).asInstanceOf[Vector])
+    myTree.insert(DenseVector(-0.21, 0.289).asInstanceOf[Vector])
+    myTree.insert(DenseVector(-0.1, 0.289).asInstanceOf[Vector])
+    myTree.insert(DenseVector(0.7, 0.45).asInstanceOf[Vector])
+
+    /* Exact values of (centers, dimensions) of root + children nodes, to test
+     * partitionBox and makeChildren methods; exact values are given to avoid
+     * essentially copying and pasting the code to automatically generate them
+     * from minVec/maxVec
+     */
+    val knownCentersLengths = Set((DenseVector(0.0, 0.0), DenseVector(2.0, 1.0)),
+      (DenseVector(-0.5, -0.25), DenseVector(1.0, 0.5)),
+      (DenseVector(-0.5, 0.25), DenseVector(1.0, 0.5)),
+      (DenseVector(0.5, -0.25), DenseVector(1.0, 0.5)),
+      (DenseVector(0.5, 0.25), DenseVector(1.0, 0.5))
+    )
+
+    /* (centers, dimensions) computed from QuadTree.makeChildren */
+    var computedCentersLength = Set((DenseVector(0.0, 0.0), DenseVector(2.0, 1.0)))
+    for (child <- myTree.root.children) {
+      computedCentersLength += child.getCenterWidth().asInstanceOf[(DenseVector, DenseVector)]
+    }
+
+    /* Tests the search for nearby neighbors; make sure the right object is contained in the
+     * neighbor search. The neighbor search will contain more points.
+     */
+    val neighborsComputed = myTree.searchNeighbors(DenseVector(0.7001, 0.45001), 0.001)
+    val isNeighborInSearch = neighborsComputed.contains(DenseVector(0.7, 0.45))
+
+    /* Test ability to get all
objects in minimal bounding box + objects in siblings' block method + * In this case, drawing a picture of the QuadTree shows that + * (-0.2, 0.31), (-0.21, 0.29), (-0.21, 0.289) + * are objects near (-0.2001, 0.31001) + */ + val siblingsObjectsComputed = myTree.searchNeighborsSiblingQueue(DenseVector(-0.2001, 0.31001)) + val isSiblingsInSearch = siblingsObjectsComputed.contains(DenseVector(-0.2, 0.31)) && + siblingsObjectsComputed.contains(DenseVector(-0.21, 0.29)) && + siblingsObjectsComputed.contains(DenseVector(-0.21, 0.289)) + + computedCentersLength should be(knownCentersLengths) + isNeighborInSearch should be(true) + isSiblingsInSearch should be(true) + } +} diff --git a/src/test/scala/org/apache/flink/ml/optimization/GradientDescentITSuite.scala b/src/test/scala/org/apache/flink/ml/optimization/GradientDescentITSuite.scala new file mode 100644 index 0000000000000..021502a04ca84 --- /dev/null +++ b/src/test/scala/org/apache/flink/ml/optimization/GradientDescentITSuite.scala @@ -0,0 +1,278 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.flink.ml.optimization + +import org.apache.flink.ml.common.{LabeledVector, WeightVector} +import org.apache.flink.ml.math.DenseVector +import org.apache.flink.ml.regression.RegressionData._ +import org.apache.flink.ml.util.FlinkTestBase +import org.scalatest.{Matchers, FlatSpec} + +import org.apache.flink.api.scala._ + + +class GradientDescentITSuite extends FlatSpec with Matchers with FlinkTestBase { + + // TODO(tvas): Check results again once sampling operators are in place + + behavior of "The Stochastic Gradient Descent implementation" + + it should "correctly solve an L1 regularized regression problem" in { + val env = ExecutionEnvironment.getExecutionEnvironment + + env.setParallelism(2) + + val lossFunction = GenericLossFunction(SquaredLoss, LinearPrediction) + + val sgd = GradientDescent() + .setStepsize(0.01) + .setIterations(2000) + .setLossFunction(lossFunction) + .setRegularizationPenalty(L1Regularization) + .setRegularizationConstant(0.3) + + val inputDS: DataSet[LabeledVector] = env.fromCollection(regularizationData) + + val weightDS = sgd.optimize(inputDS, None) + + val weightList: Seq[WeightVector] = weightDS.collect() + + val weightVector: WeightVector = weightList.head + + val intercept = weightVector.intercept + val weights = weightVector.weights.asInstanceOf[DenseVector].data + + expectedRegWeights zip weights foreach { + case (expectedWeight, weight) => + weight should be (expectedWeight +- 0.01) + } + + intercept should be (expectedRegWeight0 +- 0.1) + } + + it should "correctly perform one step with L2 regularization" in { + val env = ExecutionEnvironment.getExecutionEnvironment + + env.setParallelism(2) + + val lossFunction = GenericLossFunction(SquaredLoss, LinearPrediction) + + val sgd = GradientDescent() + .setStepsize(0.1) + .setIterations(1) + .setLossFunction(lossFunction) + .setRegularizationPenalty(L2Regularization) + .setRegularizationConstant(1.0) + + val inputDS: DataSet[LabeledVector] = env.fromElements(LabeledVector(1.0, DenseVector(2.0))) + val currentWeights = new WeightVector(DenseVector(1.0), 1.0) + val currentWeightsDS = env.fromElements(currentWeights) + + val weightDS = sgd.optimize(inputDS, Some(currentWeightsDS)) + + val weightList: Seq[WeightVector] = weightDS.collect() + + weightList.size should equal(1) + + val WeightVector(updatedWeights, updatedIntercept) = weightList.head + + updatedWeights(0) should be (0.5 +- 0.001) + updatedIntercept should be (0.8 +- 0.01) + } + + it should "estimate a linear function" in { + val env = ExecutionEnvironment.getExecutionEnvironment + + env.setParallelism(2) + + val lossFunction = GenericLossFunction(SquaredLoss, LinearPrediction) + + val sgd = GradientDescent() + .setStepsize(1.0) + .setIterations(800) + .setLossFunction(lossFunction) + + val inputDS = env.fromCollection(data) + val weightDS = sgd.optimize(inputDS, None) + + val weightList: Seq[WeightVector] = weightDS.collect() + + weightList.size should equal(1) + + val weightVector: WeightVector = weightList.head + + val weights = weightVector.weights.asInstanceOf[DenseVector].data + val weight0 = weightVector.intercept + + expectedWeights zip weights foreach { + case (expectedWeight, weight) => + weight should be (expectedWeight +- 0.1) + } + weight0 should be (expectedWeight0 +- 0.1) + } + + it should "estimate a linear function without an intercept" in { + val env = ExecutionEnvironment.getExecutionEnvironment + + env.setParallelism(2) + + val lossFunction = GenericLossFunction(SquaredLoss, LinearPrediction) + + val sgd 
= GradientDescent() + .setStepsize(0.0001) + .setIterations(100) + .setLossFunction(lossFunction) + + val inputDS = env.fromCollection(noInterceptData) + val weightDS = sgd.optimize(inputDS, None) + + val weightList: Seq[WeightVector] = weightDS.collect() + + weightList.size should equal(1) + + val weightVector: WeightVector = weightList.head + + val weights = weightVector.weights.asInstanceOf[DenseVector].data + val weight0 = weightVector.intercept + + expectedNoInterceptWeights zip weights foreach { + case (expectedWeight, weight) => + weight should be (expectedWeight +- 0.1) + } + weight0 should be (expectedNoInterceptWeight0 +- 0.1) + } + + it should "correctly perform one step of the algorithm with initial weights provided" in { + val env = ExecutionEnvironment.getExecutionEnvironment + + env.setParallelism(2) + + val lossFunction = GenericLossFunction(SquaredLoss, LinearPrediction) + + val sgd = GradientDescent() + .setStepsize(0.1) + .setIterations(1) + .setLossFunction(lossFunction) + + val inputDS: DataSet[LabeledVector] = env.fromElements(LabeledVector(1.0, DenseVector(2.0))) + val currentWeights = new WeightVector(DenseVector(1.0), 1.0) + val currentWeightsDS = env.fromElements(currentWeights) + + val weightDS = sgd.optimize(inputDS, Some(currentWeightsDS)) + + val weightList: Seq[WeightVector] = weightDS.collect() + + weightList.size should equal(1) + + val weightVector: WeightVector = weightList.head + + val updatedIntercept = weightVector.intercept + val updatedWeight = weightVector.weights(0) + + updatedWeight should be (0.6 +- 0.01) + updatedIntercept should be (0.8 +- 0.01) + + } + + it should "terminate early if the convergence criterion is reached" in { + // TODO(tvas): We need a better way to check the convergence of the weights. 
+ // Ideally we want to have a Breeze-like system, where the optimizers carry a history and that + // can tell us whether we have converged and at which iteration + + val env = ExecutionEnvironment.getExecutionEnvironment + + env.setParallelism(2) + + val lossFunction = GenericLossFunction(SquaredLoss, LinearPrediction) + + val sgdEarlyTerminate = GradientDescent() + .setConvergenceThreshold(1e2) + .setStepsize(1.0) + .setIterations(800) + .setLossFunction(lossFunction) + + val inputDS = env.fromCollection(data) + + val weightDSEarlyTerminate = sgdEarlyTerminate.optimize(inputDS, None) + + val weightListEarly: Seq[WeightVector] = weightDSEarlyTerminate.collect() + + weightListEarly.size should equal(1) + + val weightVectorEarly: WeightVector = weightListEarly.head + val weightsEarly = weightVectorEarly.weights.asInstanceOf[DenseVector].data + val weight0Early = weightVectorEarly.intercept + + val sgdNoConvergence = GradientDescent() + .setStepsize(1.0) + .setIterations(800) + .setLossFunction(lossFunction) + + val weightDSNoConvergence = sgdNoConvergence.optimize(inputDS, None) + + val weightListNoConvergence: Seq[WeightVector] = weightDSNoConvergence.collect() + + weightListNoConvergence.size should equal(1) + + val weightVectorNoConvergence: WeightVector = weightListNoConvergence.head + val weightsNoConvergence = weightVectorNoConvergence.weights.asInstanceOf[DenseVector].data + val weight0NoConvergence = weightVectorNoConvergence.intercept + + // Since the first optimizer was set to terminate early, its weights should be different + weightsEarly zip weightsNoConvergence foreach { + case (earlyWeight, weightNoConvergence) => + weightNoConvergence should not be (earlyWeight +- 0.1) + } + weight0NoConvergence should not be (weight0Early +- 0.1) + } + + it should "come up with similar parameter estimates with xu step-size strategy" in { + val env = ExecutionEnvironment.getExecutionEnvironment + + env.setParallelism(2) + + val lossFunction = GenericLossFunction(SquaredLoss, LinearPrediction) + + val sgd = GradientDescent() + .setStepsize(1.0) + .setIterations(800) + .setLossFunction(lossFunction) + .setLearningRateMethod(LearningRateMethod.Xu(-0.75)) + + val inputDS = env.fromCollection(data) + val weightDS = sgd.optimize(inputDS, None) + + val weightList: Seq[WeightVector] = weightDS.collect() + + weightList.size should equal(1) + + val weightVector: WeightVector = weightList.head + + val weights = weightVector.weights.asInstanceOf[DenseVector].data + val weight0 = weightVector.intercept + + expectedWeights zip weights foreach { + case (expectedWeight, weight) => + weight should be (expectedWeight +- 0.1) + } + weight0 should be (expectedWeight0 +- 0.1) + } + // TODO: Need more corner cases, see sklearn tests for SGD linear model + +} diff --git a/src/test/scala/org/apache/flink/ml/optimization/LossFunctionTest.scala b/src/test/scala/org/apache/flink/ml/optimization/LossFunctionTest.scala new file mode 100644 index 0000000000000..947b12fcd3c78 --- /dev/null +++ b/src/test/scala/org/apache/flink/ml/optimization/LossFunctionTest.scala @@ -0,0 +1,102 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.ml.optimization + +import org.apache.flink.ml.common.{LabeledVector, WeightVector} +import org.apache.flink.ml.math.DenseVector +import org.apache.flink.ml.util.FlinkTestBase +import org.scalatest.{Matchers, FlatSpec} + + +class LossFunctionTest extends FlatSpec with Matchers { + + behavior of "The optimization Loss Function implementations" + + it should "calculate squared loss and gradient correctly" in { + + val lossFunction = GenericLossFunction(SquaredLoss, LinearPrediction) + + val example = LabeledVector(1.0, DenseVector(2)) + val weightVector = new WeightVector(DenseVector(1.0), 1.0) + + val gradient = lossFunction.gradient(example, weightVector) + val loss = lossFunction.loss(example, weightVector) + + loss should be (2.0 +- 0.001) + + gradient.weights(0) should be (4.0 +- 0.001) + } + + it should "calculate logistic loss and gradient correctly" in { + + val lossFunction = GenericLossFunction(LogisticLoss, LinearPrediction) + + val examples = List( + LabeledVector(1.0, DenseVector(2)), + LabeledVector(1.0, DenseVector(20)), + LabeledVector(1.0, DenseVector(-25)) + ) + + val weightVector = new WeightVector(DenseVector(1.0), 1.0) + val expectedLosses = List(0.049, 7.58e-10, 24.0) + val expectedGradients = List(-0.095, -1.52e-8, 25.0) + + expectedLosses zip examples foreach { + case (expectedLoss, example) => { + val loss = lossFunction.loss(example, weightVector) + loss should be (expectedLoss +- 0.001) + } + } + + expectedGradients zip examples foreach { + case (expectedGradient, example) => { + val gradient = lossFunction.gradient(example, weightVector) + gradient.weights(0) should be (expectedGradient +- 0.001) + } + } + } + + it should "calculate hinge loss and gradient correctly" in { + + val lossFunction = GenericLossFunction(HingeLoss, LinearPrediction) + + val examples = List( + LabeledVector(1.0, DenseVector(2)), + LabeledVector(1.0, DenseVector(-2)) + ) + + val weightVector = new WeightVector(DenseVector(1.0), 1.0) + val expectedLosses = List(0.0, 2.0) + val expectedGradients = List(0.0, 2.0) + + expectedLosses zip examples foreach { + case (expectedLoss, example) => { + val loss = lossFunction.loss(example, weightVector) + loss should be (expectedLoss +- 0.001) + } + } + + expectedGradients zip examples foreach { + case (expectedGradient, example) => { + val gradient = lossFunction.gradient(example, weightVector) + gradient.weights(0) should be (expectedGradient +- 0.001) + } + } + } +} diff --git a/src/test/scala/org/apache/flink/ml/optimization/PredictionFunctionITSuite.scala b/src/test/scala/org/apache/flink/ml/optimization/PredictionFunctionITSuite.scala new file mode 100644 index 0000000000000..4a43f601c748f --- /dev/null +++ b/src/test/scala/org/apache/flink/ml/optimization/PredictionFunctionITSuite.scala @@ -0,0 +1,62 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.ml.optimization + +import org.apache.flink.ml.common.WeightVector +import org.apache.flink.ml.math.DenseVector +import org.apache.flink.api.scala._ +import org.apache.flink.ml.util.FlinkTestBase + +import org.scalatest.{Matchers, FlatSpec} + +class PredictionFunctionITSuite extends FlatSpec with Matchers with FlinkTestBase { + + behavior of "The optimization framework prediction functions" + + it should "correctly calculate linear predictions" in { + val env = ExecutionEnvironment.getExecutionEnvironment + + env.setParallelism(2) + + val predFunction = LinearPrediction + + val weightVector = new WeightVector(DenseVector(-1.0, 1.0, 0.4, -0.4, 0.0), 1.0) + val features = DenseVector(1.0, 1.0, 1.0, 1.0, 1.0) + + val prediction = predFunction.predict(features, weightVector) + + prediction should be (1.0 +- 0.001) + } + + it should "correctly calculate the gradient for linear predictions" in { + val env = ExecutionEnvironment.getExecutionEnvironment + + env.setParallelism(2) + + val predFunction = LinearPrediction + + val weightVector = new WeightVector(DenseVector(-1.0, 1.0, 0.4, -0.4, 0.0), 1.0) + val features = DenseVector(1.0, 1.0, 1.0, 1.0, 1.0) + + val gradient = predFunction.gradient(features, weightVector) + + gradient shouldEqual WeightVector(DenseVector(1.0, 1.0, 1.0, 1.0, 1.0), 1.0) + } + +} diff --git a/src/test/scala/org/apache/flink/ml/optimization/RegularizationPenaltyTest.scala b/src/test/scala/org/apache/flink/ml/optimization/RegularizationPenaltyTest.scala new file mode 100644 index 0000000000000..e001558deeb98 --- /dev/null +++ b/src/test/scala/org/apache/flink/ml/optimization/RegularizationPenaltyTest.scala @@ -0,0 +1,64 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.flink.ml.optimization + +import org.apache.flink.ml.math.DenseVector +import org.scalatest.{FlatSpec, Matchers} + + +class RegularizationPenaltyTest extends FlatSpec with Matchers { + + behavior of "The Regularization Penalty Function implementations" + + it should "correctly update weights and loss with L2 regularization penalty" in { + val loss = 3.4 + val weights = DenseVector(0.8) + val gradient = DenseVector(2.0) + + val updatedWeights = L2Regularization.takeStep(weights, gradient, 0.3, 0.01) + val updatedLoss = L2Regularization.regLoss(loss, updatedWeights, 0.3) + + updatedWeights(0) should be (0.7776 +- 0.001) + updatedLoss should be (3.4907 +- 0.001) + } + + it should "correctly update weights and loss with L1 regularization penalty" in { + val loss = 3.4 + val weights = DenseVector(0.8) + val gradient = DenseVector(2.0) + + val updatedWeights = L1Regularization.takeStep(weights, gradient, 0.3, 0.01) + val updatedLoss = L1Regularization.regLoss(loss, updatedWeights, 0.3) + + updatedWeights(0) should be (0.777 +- 0.001) + updatedLoss should be (3.6331 +- 0.001) + } + + it should "correctly update weights and loss with no regularization penalty" in { + val loss = 3.4 + val weights = DenseVector(0.8) + val gradient = DenseVector(2.0) + + val updatedWeights = NoRegularization.takeStep(weights, gradient, 0.3, 0.01) + val updatedLoss = NoRegularization.regLoss(loss, updatedWeights, 0.3) + + updatedWeights(0) should be (0.78 +- 0.001) + updatedLoss should be (3.4 +- 0.001) + } +} diff --git a/src/test/scala/org/apache/flink/ml/outlier/StochasticOutlierSelectionITSuite.scala b/src/test/scala/org/apache/flink/ml/outlier/StochasticOutlierSelectionITSuite.scala new file mode 100644 index 0000000000000..664de0654aeaf --- /dev/null +++ b/src/test/scala/org/apache/flink/ml/outlier/StochasticOutlierSelectionITSuite.scala @@ -0,0 +1,240 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.ml.outlier + +import breeze.linalg.{sum, DenseVector => BreezeDenseVector} +import org.apache.flink.api.scala._ +import org.apache.flink.ml.common.LabeledVector +import org.apache.flink.ml.math.DenseVector +import org.apache.flink.ml.outlier.StochasticOutlierSelection.BreezeLabeledVector +import org.apache.flink.ml.util.FlinkTestBase +import org.scalatest.{FlatSpec, Matchers} + +class StochasticOutlierSelectionITSuite extends FlatSpec with Matchers with FlinkTestBase { + behavior of "Stochastic Outlier Selection algorithm" + val EPSILON = 1e-16 + + /* + Unit-tests created based on the Python scripts of the algorithms author' + https://github.com/jeroenjanssens/scikit-sos + + For more information about SOS, see https://github.com/jeroenjanssens/sos + J.H.M. Janssens, F. Huszar, E.O. Postma, and H.J. 
van den Herik. Stochastic + Outlier Selection. Technical Report TiCC TR 2012-001, Tilburg University, + Tilburg, the Netherlands, 2012. + */ + + val perplexity = 3 + val errorTolerance = 0 + val maxIterations = 5000 + val parameters = new StochasticOutlierSelection().setPerplexity(perplexity).parameters + + it should "Compute the perplexity of the vector and return the correct error" in { + val vector = BreezeDenseVector(Array(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 8.0, 9.0, 10.0)) + + val output = Array( + 0.39682901665799636, + 0.15747326846175236, + 0.06248996227359784, + 0.024797830280027126, + 0.009840498605275054, + 0.0039049953849556816, + 6.149323865970302E-4, + 2.4402301428445443E-4, + 9.683541280042027E-5 + ) + + val search = StochasticOutlierSelection.binarySearch( + vector, + Math.log(perplexity), + maxIterations, + errorTolerance + ).toArray + + search should be(output) + } + + it should "Compute the distance matrix and give symmetrical distances" in { + val env = ExecutionEnvironment.getExecutionEnvironment + + val data = env.fromCollection(List( + BreezeLabeledVector(0, BreezeDenseVector(Array(1.0, 3.0))), + BreezeLabeledVector(1, BreezeDenseVector(Array(5.0, 1.0))) + )) + + val distanceMatrix = StochasticOutlierSelection + .computeDissimilarityVectors(data) + .map(_.data) + .collect() + .toArray + + distanceMatrix(0) should be(distanceMatrix(1)) + } + + it should "Compute the distance matrix and give the correct distances" in { + val env = ExecutionEnvironment.getExecutionEnvironment + + val expectedDistanceMatrix = Array( + Array(Math.sqrt(2.0), Math.sqrt(10.0)), + Array(Math.sqrt(2.0), Math.sqrt(16.0)), + Array(Math.sqrt(16.0), Math.sqrt(10.0)) + ) + + val data = env.fromCollection(Array( + BreezeLabeledVector(0, BreezeDenseVector(Array(1.0, 1.0))), + BreezeLabeledVector(1, BreezeDenseVector(Array(2.0, 2.0))), + BreezeLabeledVector(2, BreezeDenseVector(Array(5.0, 1.0))) + )) + + val distanceMatrix = StochasticOutlierSelection + .computeDissimilarityVectors(data) + .map(_.data.toArray) + .collect() + .sortBy(dist => sum(dist)) + .toArray + + distanceMatrix should be(expectedDistanceMatrix) + } + + it should "Computing the affinity matrix and return the correct affinity" in { + val env = ExecutionEnvironment.getExecutionEnvironment + + val data = env.fromCollection(List( + BreezeLabeledVector(0, BreezeDenseVector(Array(1.0, 1.0))), + BreezeLabeledVector(1, BreezeDenseVector(Array(2.0, 1.0))), + BreezeLabeledVector(2, BreezeDenseVector(Array(1.0, 2.0))), + BreezeLabeledVector(3, BreezeDenseVector(Array(2.0, 2.0))), + BreezeLabeledVector(4, BreezeDenseVector(Array(5.0, 8.0))) // The outlier! 
+ )) + + val distanceMatrix = StochasticOutlierSelection.computeDissimilarityVectors(data) + + + val affinityMatrix = StochasticOutlierSelection.computeAffinity(distanceMatrix, parameters) + .collect() + .map(_.data.toArray) + .sortBy(dist => sum(dist)) + .toArray + + val expectedAffinityMatrix = Array( + Array( + 1.6502458086204375E-6, 3.4496775759599478E-6, 6.730049701933432E-6, 1.544221669904019E-5), + Array(0.2837044890495805, 0.4103155587026411, 0.4103155587026411, 0.0025393148189994897), + Array(0.43192525601205634, 0.30506325262816036, 0.43192525601205634, 0.0023490595181415333), + Array(0.44804626736879755, 0.3212891538762665, 0.44804626736879755, 0.0022108233460722557), + Array(0.46466276524577704, 0.46466276524577704, 0.3382687394674377, 0.002071952211368232) + ) + + affinityMatrix should be(expectedAffinityMatrix) + } + + it should "Compute the binding probabilities and return the correct probabilities" in { + val env = ExecutionEnvironment.getExecutionEnvironment + + val expectedBindingProbabilityMatrix = Array( + Array(0.00000000000000000, 0.3659685430819966, 0.36596854308199660, + 0.2664300527549236, 0.0016328610810832176), + Array(0.06050907527090226, 0.1264893287483121, 0.24677254025174370, + 0.5662290557290419, 0.0000000000000000000), + Array(0.25630819225892230, 0.3706990977807361, 0.37069909778073610, + 0.0000000000000000, 0.0022936121796053232), + Array(0.36737364041784460, 0.0000000000000000, 0.26343993596023335, + 0.3673736404178446, 0.0018127832040774768), + Array(0.36877315905154990, 0.2604492865700658, 0.00000000000000000, + 0.3687731590515499, 0.0020043953268345785) + ) + + // The distance matrix + val data = env.fromCollection(List( + BreezeLabeledVector(0, new BreezeDenseVector( + Array(0.00000000e+00, 4.64702705e-01, 4.64702705e-01, 3.38309859e-01, 2.07338848e-03))), + BreezeLabeledVector(1, new BreezeDenseVector( + Array(4.48047312e-01, 0.00000000e+00, 3.21290213e-01, 4.48047312e-01, 2.21086260e-03))), + BreezeLabeledVector(2, new BreezeDenseVector( + Array(4.31883411e-01, 3.05021457e-01, 0.00000000e+00, 4.31883411e-01, 2.34741892e-03))), + BreezeLabeledVector(3, new BreezeDenseVector( + Array(2.83688288e-01, 4.10298990e-01, 4.10298990e-01, 0.00000000e+00, 2.53862706e-03))), + BreezeLabeledVector(4, new BreezeDenseVector( + Array(1.65000529e-06, 3.44920263e-06, 6.72917236e-06, 1.54403440e-05, 0.00000000e+00))) + )) + + val bindingProbabilityMatrix = StochasticOutlierSelection.computeBindingProbabilities(data) + .map(_.data.toArray) + .collect() + .sortBy(_ (0)) // Sort by the first element, because the sum is always equal to 1 + .toArray + + bindingProbabilityMatrix should be(expectedBindingProbabilityMatrix) + } + + + it should "Compute the product of the vector, should return the correct values" in { + val env = ExecutionEnvironment.getExecutionEnvironment + + val data = env.fromCollection(List( + BreezeLabeledVector(0, BreezeDenseVector(0.5, 0.3)), + BreezeLabeledVector(1, BreezeDenseVector(0.25, 0.1)), + BreezeLabeledVector(2, BreezeDenseVector(0.8, 0.8)) + )) + + val outlierMatrix = StochasticOutlierSelection.computeOutlierProbability(data) + .map(_._2) + .collect() + .sortBy(dist => dist) + .toArray + + // The math by hand + val expectedOutlierMatrix = Array( + (1.0 - 0.5) * (1.0 - 0.0) * (1.0 - 0.8), + (1.0 - 0.0) * (1.0 - 0.25) * (1.0 - 0.8), + (1.0 - 0.3) * (1.0 - 0.1) * (1.0 - 0) + ) + + outlierMatrix should be(expectedOutlierMatrix) + } + + it should "Verifying the output of the SOS algorithm assign the one true outlier" in { + val env = 
ExecutionEnvironment.getExecutionEnvironment + + val data = env.fromCollection(List( + LabeledVector(0.0, DenseVector(1.0, 1.0)), + LabeledVector(1.0, DenseVector(2.0, 1.0)), + LabeledVector(2.0, DenseVector(1.0, 2.0)), + LabeledVector(3.0, DenseVector(2.0, 2.0)), + LabeledVector(4.0, DenseVector(5.0, 8.0)) // The outlier! + )) + + val sos = new StochasticOutlierSelection().setPerplexity(3) + + val outputVector = sos + .transform(data) + .collect() + + val expectedOutputVector = Map( + 0 -> 0.2790094479202896, + 1 -> 0.25775014551682535, + 2 -> 0.22136130977995766, + 3 -> 0.12707053787018444, + 4 -> 0.9922779902453757 // The outlier! + ) + + outputVector.foreach(output => + expectedOutputVector(output._1) should be(output._2 +- EPSILON)) + } +} diff --git a/src/test/scala/org/apache/flink/ml/pipeline/PipelineITSuite.scala b/src/test/scala/org/apache/flink/ml/pipeline/PipelineITSuite.scala new file mode 100644 index 0000000000000..35c04ad44f783 --- /dev/null +++ b/src/test/scala/org/apache/flink/ml/pipeline/PipelineITSuite.scala @@ -0,0 +1,211 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.flink.ml.pipeline + +import org.apache.flink.api.scala._ +import org.apache.flink.ml.classification.SVM +import org.apache.flink.ml.common.{ParameterMap, LabeledVector} +import org.apache.flink.ml.math._ +import org.apache.flink.ml.preprocessing.{PolynomialFeatures, StandardScaler} +import org.apache.flink.ml.regression.MultipleLinearRegression +import org.apache.flink.ml.util.FlinkTestBase +import org.scalatest.{Matchers, FlatSpec} + +class PipelineITSuite extends FlatSpec with Matchers with FlinkTestBase { + behavior of "Flink's pipelines" + + it should "support chaining of compatible transformer" in { + val env = ExecutionEnvironment.getExecutionEnvironment + + val vData = List(DenseVector(1.0, 2.0, 3.0), DenseVector(2.0, 3.0, 4.0)) + val lvData = List(LabeledVector(1.0, DenseVector(1.0, 1.0, 1.0)), + LabeledVector(2.0, DenseVector(2.0, 2.0, 2.0))) + + val vectorData = env.fromCollection(vData) + val labeledVectorData = env.fromCollection(lvData) + + val expectedScaledVectorSet = Set( + DenseVector(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, -1.0, -1.0, -1.0), + DenseVector(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0) + ) + + val expectedScaledLabeledVectorSet = Set( + LabeledVector(1.0, DenseVector(1.0, 3.0, 5.0, 9.0, 15.0, 25.0, -1.0, -3.0, -5.0)), + LabeledVector(2.0, DenseVector(1.0, -1.0, -3.0, 1.0, 3.0, 9.0, 1.0, -1.0, -3.0)) + ) + + val scaler = StandardScaler() + val polyFeatures = PolynomialFeatures().setDegree(2) + + val pipeline = scaler.chainTransformer(polyFeatures) + + pipeline.fit(vectorData) + + val scaledVectorDataDS = pipeline.transform(vectorData) + val scaledLabeledVectorDataDS = pipeline.transform(labeledVectorData) + + val scaledVectorData = scaledVectorDataDS.collect() + val scaledLabeledVectorData = scaledLabeledVectorDataDS.collect() + + scaledVectorData.size should be(expectedScaledVectorSet.size) + + for(scaledVector <- scaledVectorData){ + expectedScaledVectorSet should contain(scaledVector) + } + + scaledLabeledVectorData.size should be(expectedScaledLabeledVectorSet.size) + + for(scaledLabeledVector <- scaledLabeledVectorData) { + expectedScaledLabeledVectorSet should contain(scaledLabeledVector) + } + } + + it should "throw an exception when the pipeline operators are not compatible" in { + val env = ExecutionEnvironment.getExecutionEnvironment + + val scaler = StandardScaler() + val mlr = MultipleLinearRegression() + + val vData = List(DenseVector(1.0, 2.0, 3.0), DenseVector(2.0, 3.0, 4.0)) + val vectorData = env.fromCollection(vData) + val labeledVectors = List(LabeledVector(1.0, DenseVector(1.0, 2.0)), + LabeledVector(2.0, DenseVector(2.0, 3.0)), + LabeledVector(3.0, DenseVector(3.0, 4.0))) + val labeledData = env.fromCollection(labeledVectors) + val doubles = List(1.0, 2.0, 3.0) + val doubleData = env.fromCollection(doubles) + + val pipeline = scaler.chainPredictor(mlr) + + val exceptionFit = intercept[RuntimeException] { + pipeline.fit(vectorData) + } + + exceptionFit.getMessage should equal("There is no FitOperation defined for org.apache." 
+ + "flink.ml.regression.MultipleLinearRegression which trains on a " + + "DataSet[org.apache.flink.ml.math.DenseVector]") + + // fit the pipeline so that the StandardScaler won't fail when predict is called on the pipeline + pipeline.fit(labeledData) + + // make sure that we have TransformOperation[StandardScaler, Double, Double] + implicit val standardScalerDoubleTransform = + new TransformDataSetOperation[StandardScaler, Double, Double] { + override def transformDataSet(instance: StandardScaler, transformParameters: ParameterMap, + input: DataSet[Double]): DataSet[Double] = { + input + } + } + + val exceptionPredict = intercept[RuntimeException] { + pipeline.predict(doubleData) + } + + exceptionPredict.getMessage should equal("There is no PredictOperation defined for " + + "org.apache.flink.ml.regression.MultipleLinearRegression which takes a " + + "DataSet[Double] as input.") + } + + it should "throw an exception when the input data is not supported" in { + val env = ExecutionEnvironment.getExecutionEnvironment + + val dData = List(1.0, 2.0, 3.0) + val doubleData = env.fromCollection(dData) + + val scaler = StandardScaler() + val polyFeatures = PolynomialFeatures() + + val pipeline = scaler.chainTransformer(polyFeatures) + + val exceptionFit = intercept[RuntimeException] { + pipeline.fit(doubleData) + } + + exceptionFit.getMessage should equal("There is no FitOperation defined for org.apache." + + "flink.ml.preprocessing.StandardScaler which trains on a DataSet[Double]") + + val exceptionTransform = intercept[RuntimeException] { + pipeline.transform(doubleData) + } + + exceptionTransform.getMessage should equal("There is no TransformOperation defined for " + + "org.apache.flink.ml.preprocessing.StandardScaler which takes a DataSet[Double] as input.") + } + + it should "support multiple transformers and a predictor" in { + val env = ExecutionEnvironment.getExecutionEnvironment + + val data = List(LabeledVector(1.0, DenseVector(1.0, 2.0)), + LabeledVector(2.0, DenseVector(2.0, 3.0)), + LabeledVector(3.0, DenseVector(3.0, 4.0))) + val testing = data.map(_.vector) + val evaluation = data.map(x => (x.vector, x.label)) + + val trainingData = env.fromCollection(data) + val testingData = env.fromCollection(testing) + val evaluationData = env.fromCollection(evaluation) + + val chainedScalers2 = StandardScaler().chainTransformer(StandardScaler()) + val chainedScalers3 = chainedScalers2.chainTransformer(StandardScaler()) + val chainedScalers4 = chainedScalers3.chainTransformer(StandardScaler()) + val chainedScalers5 = chainedScalers4.chainTransformer(StandardScaler()) + + val predictor = MultipleLinearRegression() + + val pipeline = chainedScalers5.chainPredictor(predictor) + + pipeline.fit(trainingData) + + val weightVector = predictor.weightsOption.get.collect().head + + weightVector.weights.valueIterator.foreach{ + _ should be (0.268050 +- 0.01) + } + + weightVector.intercept should be (0.807924 +- 0.01) + + val predictionDS = pipeline.predict(testingData) + + val predictionResult = predictionDS.collect() + + val evaluationDS = pipeline.evaluate(evaluationData) + + val evaluationResult = evaluationDS.collect() + + predictionResult.size should be(testing.size) + evaluationResult.size should be(evaluation.size) + } + + it should "throw an exception when the input data is not supported by a predictor" in { + val env = ExecutionEnvironment.getExecutionEnvironment + + val data = List(1.0, 2.0, 3.0) + val doubleData = env.fromCollection(data) + + val svm = SVM() + + intercept[RuntimeException] { + 
svm.fit(doubleData) + } + + intercept[RuntimeException] { + svm.predict(doubleData) + } + } +} diff --git a/src/test/scala/org/apache/flink/ml/preprocessing/MinMaxScalerITSuite.scala b/src/test/scala/org/apache/flink/ml/preprocessing/MinMaxScalerITSuite.scala new file mode 100644 index 0000000000000..dc51ee65136c4 --- /dev/null +++ b/src/test/scala/org/apache/flink/ml/preprocessing/MinMaxScalerITSuite.scala @@ -0,0 +1,243 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.flink.ml.preprocessing + +import breeze.linalg.{max, min} +import org.apache.flink.api.scala._ +import org.apache.flink.ml.common.LabeledVector +import org.apache.flink.ml.math.Breeze._ +import org.apache.flink.ml.math.{DenseVector, Vector} +import org.apache.flink.ml.util.FlinkTestBase +import org.scalatest.{FlatSpec, Matchers} + + +class MinMaxScalerITSuite + extends FlatSpec + with Matchers + with FlinkTestBase { + + behavior of "Flink's MinMax Scaler" + + import MinMaxScalerData._ + + it should "scale the vectors' values to be restricted in the [0.0,1.0] range" in { + + val env = ExecutionEnvironment.getExecutionEnvironment + + val dataSet = env.fromCollection(data) + val minMaxScaler = MinMaxScaler() + minMaxScaler.fit(dataSet) + val scaledVectors = minMaxScaler.transform(dataSet).collect + + scaledVectors.length should equal(data.length) + + //ensure data lies in the user-specified range + for (vector <- scaledVectors) { + val test = vector.asBreeze.forall(fv => { + fv >= 0.0 && fv <= 1.0 + }) + test shouldEqual true + } + + var expectedMin = data.head.asBreeze + var expectedMax = data.head.asBreeze + + for (v <- data.tail) { + val tempVector = v.asBreeze + expectedMin = min(expectedMin, tempVector) + expectedMax = max(expectedMax, tempVector) + } + + //ensure that estimated Min and Max vectors equal the expected ones + val estimatedMinMax = minMaxScaler.metricsOption.get.collect() + estimatedMinMax.head shouldEqual(expectedMin, expectedMax) + + //handle the case where a feature takes only one value + val expectedRangePerFeature = (expectedMax - expectedMin) + for (i <- 0 until expectedRangePerFeature.size) { + if (expectedRangePerFeature(i) == 0.0) { + expectedRangePerFeature(i)= 1.0 + } + } + + //ensure that vectors where scaled correctly + for (i <- 0 until data.length) { + var expectedVector = data(i).asBreeze - expectedMin + expectedVector :/= expectedRangePerFeature + expectedVector = expectedVector :* (1.0 - 0.0) + + expectedVector.fromBreeze.toSeq should contain theSameElementsInOrderAs scaledVectors(i) + } + } + + it should "scale vectors' values in the [-1.0,1.0] range" in { + + val env = ExecutionEnvironment.getExecutionEnvironment + + val dataSet = env.fromCollection(labeledData) + val minMaxScaler = MinMaxScaler().setMin(-1.0) + 
minMaxScaler.fit(dataSet) + val scaledVectors = minMaxScaler.transform(dataSet).collect + + scaledVectors.length should equal(labeledData.length) + + //ensure data lies in the user-specified range + for (labeledVector <- scaledVectors) { + val test = labeledVector.vector.asBreeze.forall(lv => { + lv >= -1.0 && lv <= 1.0 + }) + test shouldEqual true + } + + var expectedMin = labeledData.head.vector.asBreeze + var expectedMax = labeledData.head.vector.asBreeze + + for (v <- labeledData.tail) { + val tempVector = v.vector.asBreeze + expectedMin = min(expectedMin, tempVector) + expectedMax = max(expectedMax, tempVector) + } + + //ensure that estimated Min and Max vectors equal the expected ones + val estimatedMinMax = minMaxScaler.metricsOption.get.collect() + estimatedMinMax.head shouldEqual(expectedMin, expectedMax) + + //handle the case where a feature takes only one value + val expectedRangePerFeature = (expectedMax - expectedMin) + for (i <- 0 until expectedRangePerFeature.size) { + if (expectedRangePerFeature(i) == 0.0) { + expectedRangePerFeature(i)= 1.0 + } + } + + //ensure that LabeledVectors where scaled correctly + for (i <- 0 until labeledData.length) { + var expectedVector = labeledData(i).vector.asBreeze - expectedMin + expectedVector :/= expectedRangePerFeature + expectedVector = (expectedVector :* (1.0 + 1.0)) - 1.0 + + labeledData(i).label shouldEqual scaledVectors(i).label + expectedVector.fromBreeze.toSeq should contain theSameElementsInOrderAs scaledVectors(i) + .vector + } + } +} + + +object MinMaxScalerData { + + val data: Seq[Vector] = List( + DenseVector(Array(2104.00, 3.00, 0.0)), + DenseVector(Array(1600.00, 3.00, 0.0)), + DenseVector(Array(2400.00, 3.00, 0.0)), + DenseVector(Array(1416.00, 2.00, 0.0)), + DenseVector(Array(3000.00, 4.00, 0.0)), + DenseVector(Array(1985.00, 4.00, 0.0)), + DenseVector(Array(1534.00, 3.00, 0.0)), + DenseVector(Array(1427.00, 3.00, 0.0)), + DenseVector(Array(1380.00, 3.00, 0.0)), + DenseVector(Array(1494.00, 3.00, 0.0)), + DenseVector(Array(1940.00, 4.00, 0.0)), + DenseVector(Array(2000.00, 3.00, 0.0)), + DenseVector(Array(1890.00, 3.00, 0.0)), + DenseVector(Array(4478.00, 5.00, 0.0)), + DenseVector(Array(1268.00, 3.00, 0.0)), + DenseVector(Array(2300.00, 4.00, 0.0)), + DenseVector(Array(1320.00, 2.00, 0.0)), + DenseVector(Array(1236.00, 3.00, 0.0)), + DenseVector(Array(2609.00, 4.00, 0.0)), + DenseVector(Array(3031.00, 4.00, 0.0)), + DenseVector(Array(1767.00, 3.00, 0.0)), + DenseVector(Array(1888.00, 2.00, 0.0)), + DenseVector(Array(1604.00, 3.00, 0.0)), + DenseVector(Array(1962.00, 4.00, 0.0)), + DenseVector(Array(3890.00, 3.00, 0.0)), + DenseVector(Array(1100.00, 3.00, 0.0)), + DenseVector(Array(1458.00, 3.00, 0.0)), + DenseVector(Array(2526.00, 3.00, 0.0)), + DenseVector(Array(2200.00, 3.00, 0.0)), + DenseVector(Array(2637.00, 3.00, 0.0)), + DenseVector(Array(1839.00, 2.00, 0.0)), + DenseVector(Array(1000.00, 1.00, 0.0)), + DenseVector(Array(2040.00, 4.00, 0.0)), + DenseVector(Array(3137.00, 3.00, 0.0)), + DenseVector(Array(1811.00, 4.00, 0.0)), + DenseVector(Array(1437.00, 3.00, 0.0)), + DenseVector(Array(1239.00, 3.00, 0.0)), + DenseVector(Array(2132.00, 4.00, 0.0)), + DenseVector(Array(4215.00, 4.00, 0.0)), + DenseVector(Array(2162.00, 4.00, 0.0)), + DenseVector(Array(1664.00, 2.00, 0.0)), + DenseVector(Array(2238.00, 3.00, 0.0)), + DenseVector(Array(2567.00, 4.00, 0.0)), + DenseVector(Array(1200.00, 3.00, 0.0)), + DenseVector(Array(852.00, 2.00, 0.0)), + DenseVector(Array(1852.00, 4.00, 0.0)), + DenseVector(Array(1203.00, 3.00, 
0.0)) + ) + + val labeledData: Seq[LabeledVector] = List( + LabeledVector(1.0, DenseVector(Array(2104.00, 3.00, 0.0))), + LabeledVector(1.0, DenseVector(Array(1600.00, 3.00, 0.0))), + LabeledVector(1.0, DenseVector(Array(2400.00, 3.00, 0.0))), + LabeledVector(0.0, DenseVector(Array(1416.00, 2.00, 0.0))), + LabeledVector(1.0, DenseVector(Array(3000.00, 4.00, 0.0))), + LabeledVector(1.0, DenseVector(Array(1985.00, 4.00, 0.0))), + LabeledVector(1.0, DenseVector(Array(1534.00, 3.00, 0.0))), + LabeledVector(1.0, DenseVector(Array(1427.00, 3.00, 0.0))), + LabeledVector(1.0, DenseVector(Array(1380.00, 3.00, 0.0))), + LabeledVector(1.0, DenseVector(Array(1494.00, 3.00, 0.0))), + LabeledVector(1.0, DenseVector(Array(1940.00, 4.00, 0.0))), + LabeledVector(1.0, DenseVector(Array(2000.00, 3.00, 0.0))), + LabeledVector(1.0, DenseVector(Array(1890.00, 3.00, 0.0))), + LabeledVector(1.0, DenseVector(Array(4478.00, 5.00, 0.0))), + LabeledVector(1.0, DenseVector(Array(1268.00, 3.00, 0.0))), + LabeledVector(1.0, DenseVector(Array(2300.00, 4.00, 0.0))), + LabeledVector(0.0, DenseVector(Array(1320.00, 2.00, 0.0))), + LabeledVector(1.0, DenseVector(Array(1236.00, 3.00, 0.0))), + LabeledVector(1.0, DenseVector(Array(2609.00, 4.00, 0.0))), + LabeledVector(1.0, DenseVector(Array(3031.00, 4.00, 0.0))), + LabeledVector(1.0, DenseVector(Array(1767.00, 3.00, 0.0))), + LabeledVector(0.0, DenseVector(Array(1888.00, 2.00, 0.0))), + LabeledVector(1.0, DenseVector(Array(1604.00, 3.00, 0.0))), + LabeledVector(1.0, DenseVector(Array(1962.00, 4.00, 0.0))), + LabeledVector(1.0, DenseVector(Array(3890.00, 3.00, 0.0))), + LabeledVector(1.0, DenseVector(Array(1100.00, 3.00, 0.0))), + LabeledVector(1.0, DenseVector(Array(1458.00, 3.00, 0.0))), + LabeledVector(1.0, DenseVector(Array(2526.00, 3.00, 0.0))), + LabeledVector(1.0, DenseVector(Array(2200.00, 3.00, 0.0))), + LabeledVector(1.0, DenseVector(Array(2637.00, 3.00, 0.0))), + LabeledVector(0.0, DenseVector(Array(1839.00, 2.00, 0.0))), + LabeledVector(0.0, DenseVector(Array(1000.00, 1.00, 0.0))), + LabeledVector(1.0, DenseVector(Array(2040.00, 4.00, 0.0))), + LabeledVector(1.0, DenseVector(Array(3137.00, 3.00, 0.0))), + LabeledVector(1.0, DenseVector(Array(1811.00, 4.00, 0.0))), + LabeledVector(1.0, DenseVector(Array(1437.00, 3.00, 0.0))), + LabeledVector(1.0, DenseVector(Array(1239.00, 3.00, 0.0))), + LabeledVector(1.0, DenseVector(Array(2132.00, 4.00, 0.0))), + LabeledVector(1.0, DenseVector(Array(4215.00, 4.00, 0.0))), + LabeledVector(1.0, DenseVector(Array(2162.00, 4.00, 0.0))), + LabeledVector(0.0, DenseVector(Array(1664.00, 2.00, 0.0))), + LabeledVector(1.0, DenseVector(Array(2238.00, 3.00, 0.0))), + LabeledVector(1.0, DenseVector(Array(2567.00, 4.00, 0.0))), + LabeledVector(1.0, DenseVector(Array(1200.00, 3.00, 0.0))), + LabeledVector(0.0, DenseVector(Array(852.00, 2.00, 0.0))), + LabeledVector(1.0, DenseVector(Array(1852.00, 4.00, 0.0))), + LabeledVector(1.0, DenseVector(Array(1203.00, 3.00, 0.0))) + ) +} diff --git a/src/test/scala/org/apache/flink/ml/preprocessing/PolynomialFeaturesITSuite.scala b/src/test/scala/org/apache/flink/ml/preprocessing/PolynomialFeaturesITSuite.scala new file mode 100644 index 0000000000000..f6c6193986d90 --- /dev/null +++ b/src/test/scala/org/apache/flink/ml/preprocessing/PolynomialFeaturesITSuite.scala @@ -0,0 +1,124 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.ml.preprocessing + +import org.apache.flink.api.scala.{ExecutionEnvironment, _} +import org.apache.flink.ml.common.LabeledVector +import org.apache.flink.ml.math.DenseVector +import org.apache.flink.ml.util.FlinkTestBase +import org.scalatest.{FlatSpec, Matchers} + +class PolynomialFeaturesITSuite + extends FlatSpec + with Matchers + with FlinkTestBase { + + behavior of "The polynomial base implementation" + + it should "map single element vectors to the polynomial vector space" in { + val env = ExecutionEnvironment.getExecutionEnvironment + + env.setParallelism(2) + + val input = Seq( + LabeledVector(1.0, DenseVector(1)), + LabeledVector(2.0, DenseVector(2)) + ) + + val inputDS = env.fromCollection(input) + + val transformer = PolynomialFeatures() + .setDegree(3) + + val transformedDS = transformer.transform(inputDS) + + val expectedMap = Map( + 1.0 -> DenseVector(1.0, 1.0, 1.0), + 2.0 -> DenseVector(8.0, 4.0, 2.0) + ) + + val result = transformedDS.collect() + + for (entry <- result) { + expectedMap.contains(entry.label) should be(true) + entry.vector should equal(expectedMap(entry.label)) + } + } + + it should "map vectors to the polynomial vector space" in { + val env = ExecutionEnvironment.getExecutionEnvironment + + env.setParallelism(2) + + val input = Seq( + LabeledVector(1.0, DenseVector(2, 3)), + LabeledVector(2.0, DenseVector(2, 3, 4)) + ) + + val expectedMap = List( + (1.0 -> DenseVector(8.0, 12.0, 18.0, 27.0, 4.0, 6.0, 9.0, 2.0, 3.0)), + (2.0 -> DenseVector(8.0, 12.0, 16.0, 18.0, 24.0, 32.0, 27.0, 36.0, 48.0, 64.0, 4.0, 6.0, 8.0, + 9.0, 12.0, 16.0, 2.0, 3.0, 4.0)) + ) toMap + + val inputDS = env.fromCollection(input) + + val transformer = PolynomialFeatures() + .setDegree(3) + + val transformedDS = transformer.transform(inputDS) + + val result = transformedDS.collect() + + for (entry <- result) { + expectedMap.contains(entry.label) should be(true) + entry.vector should equal(expectedMap(entry.label)) + } + } + + it should "return an empty vector if the max degree is zero" in { + val env = ExecutionEnvironment.getExecutionEnvironment + + env.setParallelism(2) + + val input = Seq( + LabeledVector(1.0, DenseVector(2, 3)), + LabeledVector(2.0, DenseVector(2, 3, 4)) + ) + + val inputDS = env.fromCollection(input) + + val transformer = PolynomialFeatures() + .setDegree(0) + + val transformedDS = transformer.transform(inputDS) + + val result = transformedDS.collect() + + val expectedMap = Map( + 1.0 -> DenseVector(), + 2.0 -> DenseVector() + ) + + for (entry <- result) { + expectedMap.contains(entry.label) should be(true) + entry.vector should equal(expectedMap(entry.label)) + } + } +} diff --git a/src/test/scala/org/apache/flink/ml/preprocessing/SplitterITSuite.scala b/src/test/scala/org/apache/flink/ml/preprocessing/SplitterITSuite.scala new file mode 100644 index 0000000000000..42863f906c8ba --- /dev/null +++ 
b/src/test/scala/org/apache/flink/ml/preprocessing/SplitterITSuite.scala @@ -0,0 +1,97 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.flink.ml.preprocessing + +import org.apache.flink.api.scala.ExecutionEnvironment +import org.apache.flink.api.scala._ +import org.apache.flink.ml.util.FlinkTestBase +import org.scalatest.{Matchers, FlatSpec} +import org.apache.flink.api.scala.utils._ + + +class SplitterITSuite extends FlatSpec + with Matchers + with FlinkTestBase { + + behavior of "Flink's DataSet Splitter" + + import MinMaxScalerData._ + + it should "result in datasets with no elements in common and all elements used" in { + val env = ExecutionEnvironment.getExecutionEnvironment + + val dataSet = env.fromCollection(data) + + val splitDataSets = Splitter.randomSplit(dataSet.zipWithUniqueId, 0.5) + + (splitDataSets(0).union(splitDataSets(1)).count()) should equal(dataSet.count()) + + + splitDataSets(0).join(splitDataSets(1)).where(0).equalTo(0).count() should equal(0) + } + + it should "result in datasets of an expected size when precise" in { + val env = ExecutionEnvironment.getExecutionEnvironment + + val dataSet = env.fromCollection(data) + + val splitDataSets = Splitter.randomSplit(dataSet, 0.5, true) + + val expectedLength = data.size.toDouble * 0.5 + + splitDataSets(0).count().toDouble should equal(expectedLength +- 1.0) + } + + it should "result in expected number of datasets" in { + val env = ExecutionEnvironment.getExecutionEnvironment + + val dataSet = env.fromCollection(data) + + val fracArray = Array(0.5, 0.25, 0.25) + + val splitDataSets = Splitter.multiRandomSplit(dataSet, fracArray) + + splitDataSets.length should equal(fracArray.length) + } + + it should "produce TrainTestDataSets in which training size is greater than testing size" in { + val env = ExecutionEnvironment.getExecutionEnvironment + + val dataSet = env.fromCollection(data) + + val dataSetArray = Splitter.kFoldSplit(dataSet, 4) + + (dataSetArray(1).testing.count() < dataSetArray(1).training.count()) should be(true) + + } + + it should "throw an exception if sample fraction ins nonsensical" in { + val env = ExecutionEnvironment.getExecutionEnvironment + + val dataSet = env.fromCollection(data) + + intercept[IllegalArgumentException] { + val splitDataSets = Splitter.randomSplit(dataSet, -0.2) + } + + intercept[IllegalArgumentException] { + val splitDataSets = Splitter.randomSplit(dataSet, -1.2) + } + + } +} diff --git a/src/test/scala/org/apache/flink/ml/preprocessing/StandardScalerITSuite.scala b/src/test/scala/org/apache/flink/ml/preprocessing/StandardScalerITSuite.scala new file mode 100644 index 0000000000000..01cb5a55ac0d3 --- /dev/null +++ b/src/test/scala/org/apache/flink/ml/preprocessing/StandardScalerITSuite.scala @@ -0,0 +1,166 @@ 
+/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.flink.ml.preprocessing + +import breeze.linalg +import breeze.numerics.sqrt +import breeze.numerics.sqrt._ +import org.apache.flink.api.scala._ +import org.apache.flink.ml.common.LabeledVector +import org.apache.flink.ml.math.{Vector => FlinkVector, DenseVector} +import org.apache.flink.ml.math.Breeze._ +import org.apache.flink.ml.util.FlinkTestBase +import org.scalatest._ + + +class StandardScalerITSuite + extends FlatSpec + with Matchers + with FlinkTestBase { + + behavior of "Flink's Standard Scaler" + + import StandardScalerData._ + + def checkVectors( + scaledVectors: Seq[FlinkVector], + expectedMean: Double, + expectedStd: Double): Unit = { + scaledVectors.length should equal(data.length) + + val numberOfFeatures = scaledVectors.head.size + var scaledMean: linalg.Vector[Double] = linalg.DenseVector.zeros(numberOfFeatures) + var scaledStd: linalg.Vector[Double] = linalg.DenseVector.zeros(numberOfFeatures) + + for (vector <- scaledVectors) { + scaledMean += vector.asBreeze + } + scaledMean /= scaledVectors.size.asInstanceOf[Double] + + for (vector <- scaledVectors) { + val temp = vector.asBreeze - scaledMean + scaledStd += temp :* temp + } + scaledStd /= scaledVectors.size.asInstanceOf[Double] + scaledStd = sqrt(scaledStd) + + for (i <- 0 until numberOfFeatures) { + scaledMean(i) should be(expectedMean +- 1e-9) + scaledStd(i) should be(expectedStd +- 1e-9) + } + } + + it should "scale the vectors to have mean equal to 0 and std equal to 1" in { + + val env = ExecutionEnvironment.getExecutionEnvironment + + val dataSet = env.fromCollection(data) + val scaler = StandardScaler() + scaler.fit(dataSet) + val scaledVectors = scaler.transform(dataSet).collect() + + checkVectors(scaledVectors, 0.0, 1.0) + } + + it should "scale the vectors to have mean equal to 10 and standard deviation equal to 2" in { + + val env = ExecutionEnvironment.getExecutionEnvironment + + val dataSet = env.fromCollection(data) + val scaler = StandardScaler().setMean(10.0).setStd(2.0) + scaler.fit(dataSet) + val scaledVectors = scaler.transform(dataSet).collect() + + checkVectors(scaledVectors, 10.0, 2.0) + } + + it should "work with LabeledVector" in { + val env = ExecutionEnvironment.getExecutionEnvironment + + val dataSet = env.fromCollection(data).map(v => LabeledVector(1.0, v)) + val scaler = StandardScaler() + scaler.fit(dataSet) + val scaledVectors = scaler.transform(dataSet).map(_.vector).collect() + + checkVectors(scaledVectors, 0.0, 1.0) + } + + it should "work with (FlinkVector, Double) tuples" in { + val env = ExecutionEnvironment.getExecutionEnvironment + + val dataSet = env.fromCollection(data).map(v => (v, 1.0)) + val scaler = StandardScaler() + scaler.fit(dataSet) + val 
scaledVectors = scaler.transform(dataSet).map(_._1).collect() + + checkVectors(scaledVectors, 0.0, 1.0) + } +} + +object StandardScalerData { + + val data: Seq[FlinkVector] = List( + DenseVector(Array(2104.00, 3.00)), + DenseVector(Array(1600.00, 3.00)), + DenseVector(Array(2400.00, 3.00)), + DenseVector(Array(1416.00, 2.00)), + DenseVector(Array(3000.00, 4.00)), + DenseVector(Array(1985.00, 4.00)), + DenseVector(Array(1534.00, 3.00)), + DenseVector(Array(1427.00, 3.00)), + DenseVector(Array(1380.00, 3.00)), + DenseVector(Array(1494.00, 3.00)), + DenseVector(Array(1940.00, 4.00)), + DenseVector(Array(2000.00, 3.00)), + DenseVector(Array(1890.00, 3.00)), + DenseVector(Array(4478.00, 5.00)), + DenseVector(Array(1268.00, 3.00)), + DenseVector(Array(2300.00, 4.00)), + DenseVector(Array(1320.00, 2.00)), + DenseVector(Array(1236.00, 3.00)), + DenseVector(Array(2609.00, 4.00)), + DenseVector(Array(3031.00, 4.00)), + DenseVector(Array(1767.00, 3.00)), + DenseVector(Array(1888.00, 2.00)), + DenseVector(Array(1604.00, 3.00)), + DenseVector(Array(1962.00, 4.00)), + DenseVector(Array(3890.00, 3.00)), + DenseVector(Array(1100.00, 3.00)), + DenseVector(Array(1458.00, 3.00)), + DenseVector(Array(2526.00, 3.00)), + DenseVector(Array(2200.00, 3.00)), + DenseVector(Array(2637.00, 3.00)), + DenseVector(Array(1839.00, 2.00)), + DenseVector(Array(1000.00, 1.00)), + DenseVector(Array(2040.00, 4.00)), + DenseVector(Array(3137.00, 3.00)), + DenseVector(Array(1811.00, 4.00)), + DenseVector(Array(1437.00, 3.00)), + DenseVector(Array(1239.00, 3.00)), + DenseVector(Array(2132.00, 4.00)), + DenseVector(Array(4215.00, 4.00)), + DenseVector(Array(2162.00, 4.00)), + DenseVector(Array(1664.00, 2.00)), + DenseVector(Array(2238.00, 3.00)), + DenseVector(Array(2567.00, 4.00)), + DenseVector(Array(1200.00, 3.00)), + DenseVector(Array(852.00, 2.00)), + DenseVector(Array(1852.00, 4.00)), + DenseVector(Array(1203.00, 3.00)) + ) +} diff --git a/src/test/scala/org/apache/flink/ml/recommendation/ALSITSuite.scala b/src/test/scala/org/apache/flink/ml/recommendation/ALSITSuite.scala new file mode 100644 index 0000000000000..5d5bb5fe32994 --- /dev/null +++ b/src/test/scala/org/apache/flink/ml/recommendation/ALSITSuite.scala @@ -0,0 +1,116 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.flink.ml.recommendation + +import org.apache.flink.ml.util.FlinkTestBase +import org.scalatest._ + +import scala.language.postfixOps + +import org.apache.flink.api.scala._ + +class ALSITSuite extends FlatSpec with Matchers with FlinkTestBase { + + override val parallelism = 2 + + behavior of "The alternating least squares (ALS) implementation" + + it should "properly factorize a matrix" in { + import Recommendation._ + + val env = ExecutionEnvironment.getExecutionEnvironment + + val als = ALS() + .setIterations(iterations) + .setLambda(lambda) + .setBlocks(4) + .setNumFactors(numFactors) + + val inputDS = env.fromCollection(dataLong) + + als.fit(inputDS) + + val testData = env.fromCollection(expectedResultLong.map { + case (userID, itemID, rating) => (userID, itemID) + }) + + val predictions = als.predict(testData).collect() + + predictions.length should equal(expectedResultLong.length) + + val resultMap = expectedResultLong.map { + case (uID, iID, value) => (uID, iID) -> value + }.toMap + + predictions foreach { + case (uID, iID, value) => { + resultMap.isDefinedAt((uID, iID)) should be(true) + + value should be(resultMap((uID, iID)) +- 0.1) + } + } + + val risk = als.empiricalRisk(inputDS).collect().head + + risk should be(expectedEmpiricalRisk +- 1) + } + + it should "properly factorize a matrix (integer indices)" in { + import Recommendation._ + + val env = ExecutionEnvironment.getExecutionEnvironment + + val als = ALS() + .setIterations(iterations) + .setLambda(lambda) + .setBlocks(4) + .setNumFactors(numFactors) + + val inputDS = env.fromCollection(data) + + als.fit(inputDS) + + + val testData = env.fromCollection(expectedResult.map { + case (userID, itemID, rating) => (userID, itemID) + }) + + val predictions = als.predict(testData).collect() + + predictions.length should equal(expectedResult.length) + + val resultMap = expectedResultLong.map { + case (uID, iID, value) => (uID, iID) -> value + }.toMap + + predictions foreach { + case (uID, iID, value) => { + resultMap.isDefinedAt((uID, iID)) should be(true) + + value should be(resultMap((uID, iID)) +- 0.1) + } + } + + val risk = als.empiricalRisk( + inputDS.map( x => (x._1.toLong, x._2.toLong, x._3))) + .collect().head + + risk should be(expectedEmpiricalRisk +- 1) + } +} diff --git a/src/test/scala/org/apache/flink/ml/recommendation/Recommendation.scala b/src/test/scala/org/apache/flink/ml/recommendation/Recommendation.scala new file mode 100644 index 0000000000000..13ab7d7c943ea --- /dev/null +++ b/src/test/scala/org/apache/flink/ml/recommendation/Recommendation.scala @@ -0,0 +1,153 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.flink.ml.recommendation + +object Recommendation { + val iterations = 9 + val lambda = 1.0 + val numFactors = 5 + + val data: Seq[(Int, Int, Double)] = { + Seq( + (2,13,534.3937734561154), + (6,14,509.63176469621936), + (4,14,515.8246770897443), + (7,3,495.05234565105), + (2,3,532.3281786219485), + (5,3,497.1906356844367), + (3,3,512.0640508585093), + (10,3,500.2906742233019), + (1,4,521.9189079662882), + (2,4,515.0734651491396), + (1,7,522.7532725967008), + (8,4,492.65683825096403), + (4,8,492.65683825096403), + (10,8,507.03319667905413), + (7,1,522.7532725967008), + (1,1,572.2230209271174), + (2,1,563.5849190220224), + (6,1,518.4844061038742), + (9,1,529.2443732217674), + (8,1,543.3202505434103), + (7,2,516.0188923307859), + (1,2,563.5849190220224), + (1,11,515.1023793011227), + (8,2,536.8571133978352), + (2,11,507.90776961762225), + (3,2,532.3281786219485), + (5,11,476.24185144363304), + (4,2,515.0734651491396), + (4,11,469.92049343738233), + (3,12,509.4713776280098), + (4,12,494.6533165132021), + (7,5,482.2907867916308), + (6,5,477.5940040923741), + (4,5,480.9040684364228), + (1,6,518.4844061038742), + (6,6,470.6605085832807), + (8,6,489.6360564705307), + (4,6,472.74052954447046), + (7,9,482.5837650471611), + (5,9,487.00175463269863), + (9,9,500.69514584780944), + (4,9,477.71644808419325), + (7,10,485.3852917539852), + (8,10,507.03319667905413), + (3,10,500.2906742233019), + (5,15,488.08215944254437), + (6,15,480.16929757607346) + ) + } + + val expectedResult: Seq[(Int, Int, Double)] = { + Seq( + (2, 2, 526.1037), + (5, 9, 468.5680), + (10, 3, 484.8975), + (5, 13, 451.6228), + (1, 15, 493.4956), + (4, 11, 456.3862) + ) + } + + val dataLong: Seq[(Long, Long, Double)] = { + Seq( + (2,13,534.3937734561154), + (6,14,509.63176469621936), + (4,14,515.8246770897443), + (7,3,495.05234565105), + (2,3,532.3281786219485), + (5,3,497.1906356844367), + (3,3,512.0640508585093), + (10,3,500.2906742233019), + (1,4,521.9189079662882), + (2,4,515.0734651491396), + (1,7,522.7532725967008), + (8,4,492.65683825096403), + (4,8,492.65683825096403), + (10,8,507.03319667905413), + (7,1,522.7532725967008), + (1,1,572.2230209271174), + (2,1,563.5849190220224), + (6,1,518.4844061038742), + (9,1,529.2443732217674), + (8,1,543.3202505434103), + (7,2,516.0188923307859), + (1,2,563.5849190220224), + (1,11,515.1023793011227), + (8,2,536.8571133978352), + (2,11,507.90776961762225), + (3,2,532.3281786219485), + (5,11,476.24185144363304), + (4,2,515.0734651491396), + (4,11,469.92049343738233), + (3,12,509.4713776280098), + (4,12,494.6533165132021), + (7,5,482.2907867916308), + (6,5,477.5940040923741), + (4,5,480.9040684364228), + (1,6,518.4844061038742), + (6,6,470.6605085832807), + (8,6,489.6360564705307), + (4,6,472.74052954447046), + (7,9,482.5837650471611), + (5,9,487.00175463269863), + (9,9,500.69514584780944), + (4,9,477.71644808419325), + (7,10,485.3852917539852), + (8,10,507.03319667905413), + (3,10,500.2906742233019), + (5,15,488.08215944254437), + (6,15,480.16929757607346) + ) + } + + val expectedResultLong: Seq[(Long, Long, Double)] = { + Seq( + (2, 2, 526.1037), + (5, 9, 468.5680), + (10, 3, 484.8975), + (5, 13, 451.6228), + (1, 15, 493.4956), + (4, 11, 456.3862) + ) + } + + val expectedEmpiricalRisk = 505374.1877 +} diff --git a/src/test/scala/org/apache/flink/ml/regression/MultipleLinearRegressionITSuite.scala b/src/test/scala/org/apache/flink/ml/regression/MultipleLinearRegressionITSuite.scala new file mode 100644 index 0000000000000..7e1787a41b02f --- /dev/null +++ 
b/src/test/scala/org/apache/flink/ml/regression/MultipleLinearRegressionITSuite.scala @@ -0,0 +1,159 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.ml.regression + +import org.apache.flink.api.scala._ +import org.apache.flink.ml.common.{ParameterMap, WeightVector} +import org.apache.flink.ml.preprocessing.PolynomialFeatures +import org.apache.flink.ml.util.FlinkTestBase +import org.scalatest.{FlatSpec, Matchers} + +class MultipleLinearRegressionITSuite + extends FlatSpec + with Matchers + with FlinkTestBase { + + behavior of "The multiple linear regression implementation" + + it should "estimate a linear function" in { + val env = ExecutionEnvironment.getExecutionEnvironment + + env.setParallelism(2) + + val mlr = MultipleLinearRegression() + + import RegressionData._ + + val parameters = ParameterMap() + + parameters.add(MultipleLinearRegression.Stepsize, 2.0) + parameters.add(MultipleLinearRegression.Iterations, 10) + parameters.add(MultipleLinearRegression.ConvergenceThreshold, 0.001) + + val inputDS = env.fromCollection(data) + mlr.fit(inputDS, parameters) + + val weightList = mlr.weightsOption.get.collect() + + weightList.size should equal(1) + + val WeightVector(weights, intercept) = weightList.head + + expectedWeights.toIterator zip weights.valueIterator foreach { + case (expectedWeight, weight) => + weight should be (expectedWeight +- 1) + } + intercept should be (expectedWeight0 +- 0.4) + + val srs = mlr.squaredResidualSum(inputDS).collect().head + + srs should be (expectedSquaredResidualSum +- 2) + } + + it should "work with sparse vectors as input" in { + val env = ExecutionEnvironment.getExecutionEnvironment + + val mlr = MultipleLinearRegression() + + val sparseInputDS = env.fromCollection(RegressionData.sparseData) + + val parameters = ParameterMap() + + parameters.add(MultipleLinearRegression.Stepsize, 2.0) + parameters.add(MultipleLinearRegression.Iterations, 10) + parameters.add(MultipleLinearRegression.ConvergenceThreshold, 0.001) + + mlr.fit(sparseInputDS, parameters) + + val weightList = mlr.weightsOption.get.collect() + + val WeightVector(weights, intercept) = weightList.head + + RegressionData.expectedWeightsSparseInput.toIterator zip weights.valueIterator foreach { + case (expectedWeight, weight) => + weight should be (expectedWeight +- 1) + } + intercept should be (RegressionData.expectedInterceptSparseInput +- 0.4) + } + + it should "estimate a cubic function" in { + val env = ExecutionEnvironment.getExecutionEnvironment + + env.setParallelism(2) + + val polynomialBase = PolynomialFeatures() + val mlr = MultipleLinearRegression() + + val pipeline = polynomialBase.chainPredictor(mlr) + + val inputDS = env.fromCollection(RegressionData.polynomialData) + + val parameters = 
ParameterMap() + .add(PolynomialFeatures.Degree, 3) + .add(MultipleLinearRegression.Stepsize, 0.004) + .add(MultipleLinearRegression.Iterations, 100) + + pipeline.fit(inputDS, parameters) + + val weightList = mlr.weightsOption.get.collect() + + weightList.size should equal(1) + + val WeightVector(weights, intercept) = weightList.head + + RegressionData.expectedPolynomialWeights.toIterator.zip(weights.valueIterator) foreach { + case (expectedWeight, weight) => + weight should be(expectedWeight +- 0.1) + } + + intercept should be(RegressionData.expectedPolynomialWeight0 +- 0.1) + + val transformedInput = polynomialBase.transform(inputDS, parameters) + + val srs = mlr.squaredResidualSum(transformedInput).collect().head + + srs should be(RegressionData.expectedPolynomialSquaredResidualSum +- 5) + } + + it should "make (mostly) correct predictions" in { + val env = ExecutionEnvironment.getExecutionEnvironment + + val mlr = MultipleLinearRegression() + + import RegressionData._ + + val parameters = ParameterMap() + + parameters.add(MultipleLinearRegression.Stepsize, 1.0) + parameters.add(MultipleLinearRegression.Iterations, 10) + parameters.add(MultipleLinearRegression.ConvergenceThreshold, 0.001) + + val inputDS = env.fromCollection(data) + val evaluationDS = inputDS.map(x => (x.vector, x.label)) + + mlr.fit(inputDS, parameters) + + val predictionPairs = mlr.evaluate(evaluationDS) + + val absoluteErrorSum = predictionPairs.collect().map{ + case (truth, prediction) => Math.abs(truth - prediction)}.sum + + absoluteErrorSum should be < 50.0 + } +} diff --git a/src/test/scala/org/apache/flink/ml/regression/RegressionData.scala b/src/test/scala/org/apache/flink/ml/regression/RegressionData.scala new file mode 100644 index 0000000000000..bce4f980fc033 --- /dev/null +++ b/src/test/scala/org/apache/flink/ml/regression/RegressionData.scala @@ -0,0 +1,206 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.flink.ml.regression + +import org.apache.flink.ml.common.LabeledVector +import org.apache.flink.ml.math.{SparseVector, DenseVector} + +object RegressionData { + + val expectedWeights = Array[Double](3.0094) + val expectedWeight0: Double = 9.8158 + val expectedSquaredResidualSum: Double = 49.7596/2 + + val sparseData: Seq[LabeledVector] = Seq( + new LabeledVector(1.0, new SparseVector(10, Array(0, 2, 3), Array(1.0, 1.0, 1.0))), + new LabeledVector(1.0, new SparseVector(10, Array(0, 1, 5, 9), Array(1.0, 1.0, 1.0, 1.0))), + new LabeledVector(0.0, new SparseVector(10, Array(0, 2), Array(0.0, 1.0))), + new LabeledVector(0.0, new SparseVector(10, Array(0), Array(0.0))), + new LabeledVector(0.0, new SparseVector(10, Array(0, 2), Array(0.0, 1.0))), + new LabeledVector(0.0, new SparseVector(10, Array(0), Array(0.0)))) + + val expectedWeightsSparseInput = Array(0.5448906338353784, 0.15718880164669916, + 0.034001300318125725, 0.38770183218867915, 0.0, + 0.15718880164669916, 0.0, 0.0, 0.0, 0.15718880164669916) + + val expectedInterceptSparseInput = -0.006918274867886108 + + + val data: Seq[LabeledVector] = Seq( + LabeledVector(10.7949, DenseVector(0.2714)), + LabeledVector(10.6426, DenseVector(0.1008)), + LabeledVector(10.5603, DenseVector(0.5078)), + LabeledVector(12.8707, DenseVector(0.5856)), + LabeledVector(10.7026, DenseVector(0.7629)), + LabeledVector(9.8571, DenseVector(0.0830)), + LabeledVector(10.5001, DenseVector(0.6616)), + LabeledVector(11.2063, DenseVector(0.5170)), + LabeledVector(9.1892, DenseVector(0.1710)), + LabeledVector(12.2408, DenseVector(0.9386)), + LabeledVector(11.0307, DenseVector(0.5905)), + LabeledVector(10.1369, DenseVector(0.4406)), + LabeledVector(10.7609, DenseVector(0.9419)), + LabeledVector(12.5328, DenseVector(0.6559)), + LabeledVector(13.3560, DenseVector(0.4519)), + LabeledVector(14.7424, DenseVector(0.8397)), + LabeledVector(11.1057, DenseVector(0.5326)), + LabeledVector(11.6157, DenseVector(0.5539)), + LabeledVector(11.5744, DenseVector(0.6801)), + LabeledVector(11.1775, DenseVector(0.3672)), + LabeledVector(9.7991, DenseVector(0.2393)), + LabeledVector(9.8173, DenseVector(0.5789)), + LabeledVector(12.5642, DenseVector(0.8669)), + LabeledVector(9.9952, DenseVector(0.4068)), + LabeledVector(8.4354, DenseVector(0.1126)), + LabeledVector(13.7058, DenseVector(0.4438)), + LabeledVector(10.6672, DenseVector(0.3002)), + LabeledVector(11.6080, DenseVector(0.4014)), + LabeledVector(13.6926, DenseVector(0.8334)), + LabeledVector(9.5261, DenseVector(0.4036)), + LabeledVector(11.5837, DenseVector(0.3902)), + LabeledVector(11.5831, DenseVector(0.3604)), + LabeledVector(10.5038, DenseVector(0.1403)), + LabeledVector(10.9382, DenseVector(0.2601)), + LabeledVector(9.7325, DenseVector(0.0868)), + LabeledVector(12.0113, DenseVector(0.4294)), + LabeledVector(9.9219, DenseVector(0.2573)), + LabeledVector(10.0963, DenseVector(0.2976)), + LabeledVector(11.9999, DenseVector(0.4249)), + LabeledVector(12.0442, DenseVector(0.1192)) + ) + + val expectedNoInterceptWeights = Array[Double](5.0) + val expectedNoInterceptWeight0: Double = 0.0 + + val noInterceptData: Seq[LabeledVector] = Seq( + LabeledVector(217.228709, DenseVector(43.4457419)), + LabeledVector(450.037048, DenseVector(90.0074095)), + LabeledVector( 67.553478, DenseVector(13.5106955)), + LabeledVector( 26.976958, DenseVector( 5.3953916)), + LabeledVector(403.808709, DenseVector(80.7617418)), + LabeledVector(203.932158, DenseVector(40.7864316)), + LabeledVector(146.974958, DenseVector(29.3949916)), + 
LabeledVector( 46.869291, DenseVector( 9.3738582)), + LabeledVector(450.780834, DenseVector(90.1561667)), + LabeledVector(386.535619, DenseVector(77.3071239)), + LabeledVector(202.644342, DenseVector(40.5288684)), + LabeledVector(227.586507, DenseVector(45.5173013)), + LabeledVector(408.801080, DenseVector(81.7602161)), + LabeledVector(146.118550, DenseVector(29.2237100)), + LabeledVector(156.475382, DenseVector(31.2950763)), + LabeledVector(291.822515, DenseVector(58.3645030)), + LabeledVector( 61.506887, DenseVector(12.3013775)), + LabeledVector(363.949913, DenseVector(72.7899827)), + LabeledVector(398.050744, DenseVector(79.6101487)), + LabeledVector(246.053111, DenseVector(49.2106221)), + LabeledVector(225.494661, DenseVector(45.0989323)), + LabeledVector(265.986844, DenseVector(53.1973689)), + LabeledVector(110.459912, DenseVector(22.0919823)), + LabeledVector(122.716974, DenseVector(24.5433947)), + LabeledVector(128.014314, DenseVector(25.6028628)), + LabeledVector(252.538913, DenseVector(50.5077825)), + LabeledVector(393.632082, DenseVector(78.7264163)), + LabeledVector( 77.698941, DenseVector(15.5397881)), + LabeledVector(206.187568, DenseVector(41.2375135)), + LabeledVector(244.073426, DenseVector(48.8146851)), + LabeledVector(364.946890, DenseVector(72.9893780)), + LabeledVector( 4.627494, DenseVector( 0.9254987)), + LabeledVector(485.359565, DenseVector(97.0719130)), + LabeledVector(347.359190, DenseVector(69.4718380)), + LabeledVector(419.663211, DenseVector(83.9326422)), + LabeledVector(488.518318, DenseVector(97.7036635)), + LabeledVector( 28.082962, DenseVector( 5.6165925)), + LabeledVector(211.002441, DenseVector(42.2004881)), + LabeledVector(250.624124, DenseVector(50.1248248)), + LabeledVector(489.776669, DenseVector(97.9553337)) + ) + + + val expectedPolynomialWeights = Seq(0.2375, -0.3493, -0.1674) + val expectedPolynomialWeight0 = 0.0233 + val expectedPolynomialSquaredResidualSum = 1.5389e+03/2 + + val polynomialData: Seq[LabeledVector] = Seq( + LabeledVector(2.1415, DenseVector(3.6663)), + LabeledVector(10.9835, DenseVector(4.0761)), + LabeledVector(7.2507, DenseVector(0.5714)), + LabeledVector(11.9274, DenseVector(4.1102)), + LabeledVector(-4.2798, DenseVector(2.8456)), + LabeledVector(7.1929, DenseVector(0.4389)), + LabeledVector(4.5097, DenseVector(1.2532)), + LabeledVector(-3.6059, DenseVector(2.4610)), + LabeledVector(18.1132, DenseVector(4.3088)), + LabeledVector(19.2674, DenseVector(4.3420)), + LabeledVector(7.0664, DenseVector(0.7093)), + LabeledVector(20.1836, DenseVector(4.3677)), + LabeledVector(18.0609, DenseVector(4.3073)), + LabeledVector(-2.2090, DenseVector(2.1842)), + LabeledVector(1.1306, DenseVector(3.6013)), + LabeledVector(7.1903, DenseVector(0.6385)), + LabeledVector(-0.2668, DenseVector(1.8979)), + LabeledVector(12.2281, DenseVector(4.1208)), + LabeledVector(0.6086, DenseVector(3.5649)), + LabeledVector(18.4202, DenseVector(4.3177)), + LabeledVector(-4.1284, DenseVector(2.9508)), + LabeledVector(6.1964, DenseVector(0.1607)), + LabeledVector(4.9638, DenseVector(3.8211)), + LabeledVector(14.6677, DenseVector(4.2030)), + LabeledVector(-3.8132, DenseVector(3.0543)), + LabeledVector(-1.2891, DenseVector(3.4098)), + LabeledVector(-1.9390, DenseVector(3.3441)), + LabeledVector(0.7293, DenseVector(1.7650)), + LabeledVector(-4.1310, DenseVector(2.9497)), + LabeledVector(6.9131, DenseVector(0.7703)), + LabeledVector(-3.2060, DenseVector(3.1772)), + LabeledVector(6.0899, DenseVector(0.1432)), + LabeledVector(4.5567, DenseVector(1.2462)), + 
LabeledVector(6.4562, DenseVector(0.2078)), + LabeledVector(7.1903, DenseVector(0.4371)), + LabeledVector(2.8017, DenseVector(3.7056)), + LabeledVector(-3.4873, DenseVector(3.1267)), + LabeledVector(3.2918, DenseVector(1.4269)), + LabeledVector(17.0085, DenseVector(4.2760)), + LabeledVector(6.1622, DenseVector(0.1550)), + LabeledVector(-0.8192, DenseVector(1.9743)), + LabeledVector(1.0957, DenseVector(1.7170)), + LabeledVector(-0.9065, DenseVector(3.4448)), + LabeledVector(0.7986, DenseVector(3.5784)), + LabeledVector(6.6861, DenseVector(0.8409)), + LabeledVector(-2.3274, DenseVector(2.2039)), + LabeledVector(-1.0359, DenseVector(2.0051)), + LabeledVector(-4.2092, DenseVector(2.9084)), + LabeledVector(-3.1140, DenseVector(3.1921)), + LabeledVector(-1.4323, DenseVector(3.3961)) + ) + + val expectedRegWeights = Array[Double](0.0, 0.0, 0.0, 0.18, 0.2, 0.24) + val expectedRegWeight0 = 0.74 + + // Example values from scikit-learn L1 test: http://git.io/vf4V2 + val regularizationData: Seq[LabeledVector] = Seq( + LabeledVector(1.0, DenseVector(1.0,0.9 ,0.8 ,0.0 ,0.0 ,0.0)), + LabeledVector(1.0, DenseVector(1.0,0.84,0.98,0.0 ,0.0 ,0.0)), + LabeledVector(1.0, DenseVector(1.0,0.96,0.88,0.0 ,0.0 ,0.0)), + LabeledVector(1.0, DenseVector(1.0,0.91,0.99,0.0 ,0.0 ,0.0)), + LabeledVector(2.0, DenseVector(0.0,0.0 ,0.0 ,0.89,0.91,1.0)), + LabeledVector(2.0, DenseVector(0.0,0.0 ,0.0 ,0.79,0.84,1.0)), + LabeledVector(2.0, DenseVector(0.0,0.0 ,0.0 ,0.91,0.95,1.0)), + LabeledVector(2.0, DenseVector(0.0,0.0 ,0.0 ,0.93,1.0 ,1.0)) + ) +} diff --git a/src/test/scala/org/apache/flink/ml/util/FlinkTestBase.scala b/src/test/scala/org/apache/flink/ml/util/FlinkTestBase.scala new file mode 100644 index 0000000000000..498fa70f87d36 --- /dev/null +++ b/src/test/scala/org/apache/flink/ml/util/FlinkTestBase.scala @@ -0,0 +1,75 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.ml.util + +import org.apache.flink.runtime.minicluster.LocalFlinkMiniCluster +import org.apache.flink.test.util.{TestBaseUtils, TestEnvironment} +import org.scalatest.{BeforeAndAfter, Suite} + +/** Mixin to start and stop a LocalFlinkMiniCluster automatically for Scala based tests. + * Additionally a TestEnvironment with the started cluster is created and set as the default + * [[org.apache.flink.api.java.ExecutionEnvironment]]. + * + * This mixin starts a LocalFlinkMiniCluster with one TaskManager and a number of slots given + * by parallelism. This value can be overridden in a sub class in order to start the cluster + * with a different number of slots. + * + * The cluster is started once before starting the tests and is re-used for the individual tests. + * After all tests have been executed, the cluster is shutdown. 
+ * + * The cluster is used by obtaining the default [[org.apache.flink.api.java.ExecutionEnvironment]]. + * + * @example + * {{{ + * def testSomething: Unit = { + * // Obtain TestEnvironment with started LocalFlinkMiniCluster + * val env = ExecutionEnvironment.getExecutionEnvironment + * + * env.fromCollection(...) + * + * env.execute + * } + * }}} + * + */ +trait FlinkTestBase extends BeforeAndAfter { + that: Suite => + + var cluster: Option[LocalFlinkMiniCluster] = None + val parallelism = 4 + + before { + val cl = TestBaseUtils.startCluster( + 1, + parallelism, + false, + false, + true) + + val clusterEnvironment = new TestEnvironment(cl, parallelism) + clusterEnvironment.setAsContext() + + cluster = Some(cl) + } + + after { + cluster.foreach(c => TestBaseUtils.stopCluster(c, TestBaseUtils.DEFAULT_TIMEOUT)) + } + +} From ee2e753780aa7bce386fbae17ca684c2f0d94eef Mon Sep 17 00:00:00 2001 From: p4nna Date: Tue, 28 Mar 2017 09:29:56 +0200 Subject: [PATCH 05/12] added apache license --- .../flink-ml/src/main/java/Imputer.java | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/flink-libraries/flink-ml/src/main/java/Imputer.java b/flink-libraries/flink-ml/src/main/java/Imputer.java index 91d288353064d..882eae5547ca0 100644 --- a/flink-libraries/flink-ml/src/main/java/Imputer.java +++ b/flink-libraries/flink-ml/src/main/java/Imputer.java @@ -1,3 +1,20 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ package Imputer; import java.util.ArrayList; import java.util.Arrays; From b6d52fc67937b9ff21911306be015a979a12d8e7 Mon Sep 17 00:00:00 2001 From: p4nna Date: Tue, 28 Mar 2017 09:30:19 +0200 Subject: [PATCH 06/12] added apache license --- .../flink-ml/src/main/java/Strategy.java | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/flink-libraries/flink-ml/src/main/java/Strategy.java b/flink-libraries/flink-ml/src/main/java/Strategy.java index e29d389a0cbd6..5e86d3fe81aba 100644 --- a/flink-libraries/flink-ml/src/main/java/Strategy.java +++ b/flink-libraries/flink-ml/src/main/java/Strategy.java @@ -1,3 +1,20 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ package Imputer; public enum Strategy { From 986617fbe937d9103e8336dedbdd6c5f7922e78d Mon Sep 17 00:00:00 2001 From: p4nna Date: Tue, 28 Mar 2017 09:30:48 +0200 Subject: [PATCH 07/12] added apache license --- .../src/test/Imputer/columnwiseTest.java | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/flink-libraries/flink-ml/src/test/Imputer/columnwiseTest.java b/flink-libraries/flink-ml/src/test/Imputer/columnwiseTest.java index 71fa77418ee5c..a13b4f3d3cf20 100644 --- a/flink-libraries/flink-ml/src/test/Imputer/columnwiseTest.java +++ b/flink-libraries/flink-ml/src/test/Imputer/columnwiseTest.java @@ -1,3 +1,20 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ package Imputer; import static org.junit.Assert.*; From 6aac7182e8654d5fba7ed2362a42b803a0820d17 Mon Sep 17 00:00:00 2001 From: p4nna Date: Tue, 28 Mar 2017 09:31:12 +0200 Subject: [PATCH 08/12] added apache license --- .../flink-ml/src/test/Imputer/rowwiseTest.java | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/flink-libraries/flink-ml/src/test/Imputer/rowwiseTest.java b/flink-libraries/flink-ml/src/test/Imputer/rowwiseTest.java index 0d98e23aab0e3..490b59c91d501 100644 --- a/flink-libraries/flink-ml/src/test/Imputer/rowwiseTest.java +++ b/flink-libraries/flink-ml/src/test/Imputer/rowwiseTest.java @@ -1,3 +1,20 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ package Imputer; import static org.junit.Assert.*; From 40ea05f33e12fa3a61a01d938a10c6bd736e2d5c Mon Sep 17 00:00:00 2001 From: p4nna Date: Tue, 28 Mar 2017 13:19:09 +0200 Subject: [PATCH 09/12] deleted unnecessary file --- .../scala/org/apache/flink/ml/MLUtils.scala | 125 ------------------ 1 file changed, 125 deletions(-) diff --git a/src/main/scala/org/apache/flink/ml/MLUtils.scala b/src/main/scala/org/apache/flink/ml/MLUtils.scala index 051544f79d937..d3f5a12faa997 100644 --- a/src/main/scala/org/apache/flink/ml/MLUtils.scala +++ b/src/main/scala/org/apache/flink/ml/MLUtils.scala @@ -1,126 +1 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.flink.ml - -import org.apache.flink.api.common.functions.{RichFlatMapFunction, RichMapFunction} -import org.apache.flink.api.java.operators.DataSink -import org.apache.flink.api.scala._ -import org.apache.flink.configuration.Configuration -import org.apache.flink.ml.common.LabeledVector -import org.apache.flink.ml.math.SparseVector -import org.apache.flink.util.Collector - -/** Convenience functions for machine learning tasks - * - * This object contains convenience functions for machine learning tasks: - * - * - readLibSVM: - * Reads a libSVM/SVMLight input file and returns a data set of [[LabeledVector]]. - * The file format is specified [http://svmlight.joachims.org/ here]. - * - * - writeLibSVM: - * Writes a data set of [[LabeledVector]] in libSVM/SVMLight format to disk. THe file format - * is specified [http://svmlight.joachims.org/ here]. - */ -object MLUtils { - - val DIMENSION = "dimension" - - /** Reads a file in libSVM/SVMLight format and converts the data into a data set of - * [[LabeledVector]]. The dimension of the [[LabeledVector]] is determined automatically. - * - * Since the libSVM/SVMLight format stores a vector in its sparse form, the [[LabeledVector]] - * will also be instantiated with a [[SparseVector]]. 
- * - * @param env executionEnvironment [[ExecutionEnvironment]] - * @param filePath Path to the input file - * @return [[DataSet]] of [[LabeledVector]] containing the information of the libSVM/SVMLight - * file - */ - def readLibSVM(env: ExecutionEnvironment, filePath: String): DataSet[LabeledVector] = { - val labelCOODS = env.readTextFile(filePath).flatMap( - new RichFlatMapFunction[String, (Double, Array[(Int, Double)])] { - val splitPattern = "\\s+".r - - override def flatMap( - line: String, - out: Collector[(Double, Array[(Int, Double)])] - ): Unit = { - val commentFreeLine = line.takeWhile(_ != '#').trim - - if (commentFreeLine.nonEmpty) { - val splits = splitPattern.split(commentFreeLine) - val label = splits.head.toDouble - val sparseFeatures = splits.tail - val coos = sparseFeatures.flatMap { str => - val pair = str.split(':') - require(pair.length == 2, "Each feature entry has to have the form :") - - // libSVM index is 1-based, but we expect it to be 0-based - val index = pair(0).toInt - 1 - val value = pair(1).toDouble - - Some((index, value)) - } - - out.collect((label, coos)) - } - } - }) - - // Calculate maximum dimension of vectors - val dimensionDS = labelCOODS.map { - labelCOO => - labelCOO._2.map( _._1 + 1 ).max - }.reduce(scala.math.max(_, _)) - - labelCOODS.map{ new RichMapFunction[(Double, Array[(Int, Double)]), LabeledVector] { - var dimension = 0 - - override def open(configuration: Configuration): Unit = { - dimension = getRuntimeContext.getBroadcastVariable(DIMENSION).get(0) - } - - override def map(value: (Double, Array[(Int, Double)])): LabeledVector = { - new LabeledVector(value._1, SparseVector.fromCOO(dimension, value._2)) - } - }}.withBroadcastSet(dimensionDS, DIMENSION) - } - - /** Writes a [[DataSet]] of [[LabeledVector]] to a file using the libSVM/SVMLight format. - * - * @param filePath Path to output file - * @param labeledVectors [[DataSet]] of [[LabeledVector]] to write to disk - * @return - */ - def writeLibSVM(filePath: String, labeledVectors: DataSet[LabeledVector]): DataSink[String] = { - val stringRepresentation = labeledVectors.map{ - labeledVector => - val vectorStr = labeledVector.vector. - // remove zero entries - filter( _._2 != 0). - map{case (idx, value) => (idx + 1) + ":" + value}. - mkString(" ") - - labeledVector.label + " " + vectorStr - } - - stringRepresentation.writeAsText(filePath) - } -} From 36dc30c054335c1c88a0fb1f73e52591c0d24061 Mon Sep 17 00:00:00 2001 From: p4nna Date: Tue, 28 Mar 2017 13:19:26 +0200 Subject: [PATCH 10/12] deleted unnecessary file --- .../apache/flink/ml/classification/SVM.scala | 551 ------------------ 1 file changed, 551 deletions(-) diff --git a/src/main/scala/org/apache/flink/ml/classification/SVM.scala b/src/main/scala/org/apache/flink/ml/classification/SVM.scala index eff9fbd258585..d3f5a12faa997 100644 --- a/src/main/scala/org/apache/flink/ml/classification/SVM.scala +++ b/src/main/scala/org/apache/flink/ml/classification/SVM.scala @@ -1,552 +1 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.flink.ml.classification - -import org.apache.flink.api.common.functions.RichMapFunction -import org.apache.flink.api.scala._ -import org.apache.flink.configuration.Configuration -import org.apache.flink.ml.common.FlinkMLTools.ModuloKeyPartitioner -import org.apache.flink.ml.common._ -import org.apache.flink.ml.math.Breeze._ -import org.apache.flink.ml.math.{DenseVector, Vector} -import org.apache.flink.ml.pipeline.{FitOperation, PredictOperation, Predictor} - -import scala.collection.mutable.ArrayBuffer -import scala.util.Random - -import breeze.linalg.{DenseVector => BreezeDenseVector, Vector => BreezeVector} - -/** Implements a soft-margin SVM using the communication-efficient distributed dual coordinate - * ascent algorithm (CoCoA) with hinge-loss function. - * - * It can be used for binary classification problems, with the labels set as +1.0 to indiciate a - * positive example and -1.0 to indicate a negative example. - * - * The algorithm solves the following minimization problem: - * - * `min_{w in bbb"R"^d} lambda/2 ||w||^2 + 1/n sum_(i=1)^n l_{i}(w^Tx_i)` - * - * with `w` being the weight vector, `lambda` being the regularization constant, - * `x_{i} in bbb"R"^d` being the data points and `l_{i}` being the convex loss functions, which - * can also depend on the labels `y_{i} in bbb"R"`. - * In the current implementation the regularizer is the 2-norm and the loss functions are the - * hinge-loss functions: - * - * `l_{i} = max(0, 1 - y_{i} * w^Tx_i` - * - * With these choices, the problem definition is equivalent to a SVM with soft-margin. - * Thus, the algorithm allows us to train a SVM with soft-margin. - * - * The minimization problem is solved by applying stochastic dual coordinate ascent (SDCA). - * In order to make the algorithm efficient in a distributed setting, the CoCoA algorithm - * calculates several iterations of SDCA locally on a data block before merging the local - * updates into a valid global state. - * This state is redistributed to the different data partitions where the next round of local - * SDCA iterations is then executed. - * The number of outer iterations and local SDCA iterations control the overall network costs, - * because there is only network communication required for each outer iteration. - * The local SDCA iterations are embarrassingly parallel once the individual data partitions have - * been distributed across the cluster. - * - * Further details of the algorithm can be found [[http://arxiv.org/abs/1409.1458 here]]. - * - * @example - * {{{ - * val trainingDS: DataSet[LabeledVector] = env.readLibSVM(pathToTrainingFile) - * - * val svm = SVM() - * .setBlocks(10) - * - * svm.fit(trainingDS) - * - * val testingDS: DataSet[Vector] = env.readLibSVM(pathToTestingFile) - * .map(lv => lv.vector) - * - * val predictionDS: DataSet[(Vector, Double)] = svm.predict(testingDS) - * }}} - * - * =Parameters= - * - * - [[org.apache.flink.ml.classification.SVM.Blocks]]: - * Sets the number of blocks into which the input data will be split. On each block the local - * stochastic dual coordinate ascent method is executed. 
This number should be set at least to - * the degree of parallelism. If no value is specified, then the parallelism of the input - * [[DataSet]] is used as the number of blocks. (Default value: '''None''') - * - * - [[org.apache.flink.ml.classification.SVM.Iterations]]: - * Defines the maximum number of iterations of the outer loop method. In other words, it defines - * how often the SDCA method is applied to the blocked data. After each iteration, the locally - * computed weight vector updates have to be reduced to update the global weight vector value. - * The new weight vector is broadcast to all SDCA tasks at the beginning of each iteration. - * (Default value: '''10''') - * - * - [[org.apache.flink.ml.classification.SVM.LocalIterations]]: - * Defines the maximum number of SDCA iterations. In other words, it defines how many data points - * are drawn from each local data block to calculate the stochastic dual coordinate ascent. - * (Default value: '''10''') - * - * - [[org.apache.flink.ml.classification.SVM.Regularization]]: - * Defines the regularization constant of the SVM algorithm. The higher the value, the smaller - * will the 2-norm of the weight vector be. In case of a SVM with hinge loss this means that the - * SVM margin will be wider even though it might contain some false classifications. - * (Default value: '''1.0''') - * - * - [[org.apache.flink.ml.classification.SVM.Stepsize]]: - * Defines the initial step size for the updates of the weight vector. The larger the step size - * is, the larger will be the contribution of the weight vector updates to the next weight vector - * value. The effective scaling of the updates is `stepsize/blocks`. This value has to be tuned - * in case that the algorithm becomes instable. (Default value: '''1.0''') - * - * - [[org.apache.flink.ml.classification.SVM.Seed]]: - * Defines the seed to initialize the random number generator. The seed directly controls which - * data points are chosen for the SDCA method. (Default value: '''Random value''') - * - * - [[org.apache.flink.ml.classification.SVM.ThresholdValue]]: - * Defines the limiting value for the decision function above which examples are labeled as - * positive (+1.0). Examples with a decision function value below this value are classified as - * negative(-1.0). In order to get the raw decision function values you need to indicate it by - * using the [[org.apache.flink.ml.classification.SVM.OutputDecisionFunction]]. - * (Default value: '''0.0''') - * - * - [[org.apache.flink.ml.classification.SVM.OutputDecisionFunction]]: - * Determines whether the predict and evaluate functions of the SVM should return the distance - * to the separating hyperplane, or binary class labels. Setting this to true will return the raw - * distance to the hyperplane for each example. Setting it to false will return the binary - * class label (+1.0, -1.0) (Default value: '''false''') - */ -class SVM extends Predictor[SVM] { - - import SVM._ - - /** Stores the learned weight vector after the fit operation */ - var weightsOption: Option[DataSet[DenseVector]] = None - - /** Sets the number of data blocks/partitions - * - * @param blocks the number of blocks into which the input data will be split. 
- * @return itself - */ - def setBlocks(blocks: Int): SVM = { - parameters.add(Blocks, blocks) - this - } - - /** Sets the number of outer iterations - * - * @param iterations the maximum number of iterations of the outer loop method - * @return itself - */ - def setIterations(iterations: Int): SVM = { - parameters.add(Iterations, iterations) - this - } - - /** Sets the number of local SDCA iterations - * - * @param localIterations the maximum number of SDCA iterations - * @return itself - */ - def setLocalIterations(localIterations: Int): SVM = { - parameters.add(LocalIterations, localIterations) - this - } - - /** Sets the regularization constant - * - * @param regularization the regularization constant of the SVM algorithm - * @return itself - */ - def setRegularization(regularization: Double): SVM = { - parameters.add(Regularization, regularization) - this - } - - /** Sets the stepsize for the weight vector updates - * - * @param stepsize the initial step size for the updates of the weight vector - * @return itself - */ - def setStepsize(stepsize: Double): SVM = { - parameters.add(Stepsize, stepsize) - this - } - - /** Sets the seed value for the random number generator - * - * @param seed the seed to initialize the random number generator - * @return itself - */ - def setSeed(seed: Long): SVM = { - parameters.add(Seed, seed) - this - } - - /** Sets the threshold above which elements are classified as positive. - * - * The [[predict ]] and [[evaluate]] functions will return +1.0 for items with a decision - * function value above this threshold, and -1.0 for items below it. - * @param threshold the limiting value for the decision function above which examples are - * labeled as positive - * @return itself - */ - def setThreshold(threshold: Double): SVM = { - parameters.add(ThresholdValue, threshold) - this - } - - /** Sets whether the predictions should return the raw decision function value or the - * thresholded binary value. - * - * When setting this to true, predict and evaluate return the raw decision value, which is - * the distance from the separating hyperplane. - * When setting this to false, they return thresholded (+1.0, -1.0) values. - * - * @param outputDecisionFunction When set to true, [[predict ]] and [[evaluate]] return the raw - * decision function values. When set to false, they return the - * thresholded binary values (+1.0, -1.0). - * @return itself - */ - def setOutputDecisionFunction(outputDecisionFunction: Boolean): SVM = { - parameters.add(OutputDecisionFunction, outputDecisionFunction) - this - } -} - -/** Companion object of SVM. Contains convenience functions and the parameter type definitions - * of the algorithm. 
- */ -object SVM{ - - val WEIGHT_VECTOR_BROADCAST_NAME = "weightVector" - - // ========================================== Parameters ========================================= - - case object Blocks extends Parameter[Int] { - val defaultValue: Option[Int] = None - } - - case object Iterations extends Parameter[Int] { - val defaultValue = Some(10) - } - - case object LocalIterations extends Parameter[Int] { - val defaultValue = Some(10) - } - - case object Regularization extends Parameter[Double] { - val defaultValue = Some(1.0) - } - - case object Stepsize extends Parameter[Double] { - val defaultValue = Some(1.0) - } - - case object Seed extends Parameter[Long] { - val defaultValue = Some(Random.nextLong()) - } - - case object ThresholdValue extends Parameter[Double] { - val defaultValue = Some(0.0) - } - - case object OutputDecisionFunction extends Parameter[Boolean] { - val defaultValue = Some(false) - } - - // ========================================== Factory methods ==================================== - - def apply(): SVM = { - new SVM() - } - - // ========================================== Operations ========================================= - - /** Provides the operation that makes the predictions for individual examples. - * - * @tparam T Input data type which is a subtype of [[Vector]] - * @return A PredictOperation, through which it is possible to predict a value, given a - * feature vector - */ - implicit def predictVectors[T <: Vector] = { - new PredictOperation[SVM, DenseVector, T, Double](){ - - var thresholdValue: Double = _ - var outputDecisionFunction: Boolean = _ - - override def getModel(self: SVM, predictParameters: ParameterMap): DataSet[DenseVector] = { - thresholdValue = predictParameters(ThresholdValue) - outputDecisionFunction = predictParameters(OutputDecisionFunction) - self.weightsOption match { - case Some(model) => model - case None => { - throw new RuntimeException("The SVM model has not been trained. Call first fit" + - "before calling the predict operation.") - } - } - } - - override def predict(value: T, model: DenseVector): Double = { - val rawValue = value.asBreeze dot model.asBreeze - - if (outputDecisionFunction) { - rawValue - } else { - if (rawValue > thresholdValue) 1.0 else -1.0 - } - } - } - } - - /** [[FitOperation]] which trains a SVM with soft-margin based on the given training data set. 
- * - */ - implicit val fitSVM = { - new FitOperation[SVM, LabeledVector] { - override def fit( - instance: SVM, - fitParameters: ParameterMap, - input: DataSet[LabeledVector]) - : Unit = { - val resultingParameters = instance.parameters ++ fitParameters - - // Check if the number of blocks/partitions has been specified - val blocks = resultingParameters.get(Blocks) match { - case Some(value) => value - case None => input.getParallelism - } - - val scaling = resultingParameters(Stepsize)/blocks - val iterations = resultingParameters(Iterations) - val localIterations = resultingParameters(LocalIterations) - val regularization = resultingParameters(Regularization) - val seed = resultingParameters(Seed) - - // Obtain DataSet with the dimension of the data points - val dimension = input.map{_.vector.size}.reduce{ - (a, b) => { - require(a == b, "Dimensions of feature vectors have to be equal.") - a - } - } - - val initialWeights = createInitialWeights(dimension) - - // Count the number of vectors, but keep the value in a DataSet to broadcast it later - // TODO: Once efficient count and intermediate result partitions are implemented, use count - val numberVectors = input map { x => 1 } reduce { _ + _ } - - // Group the input data into blocks in round robin fashion - val blockedInputNumberElements = FlinkMLTools.block( - input, - blocks, - Some(ModuloKeyPartitioner)). - cross(numberVectors). - map { x => x } - - val resultingWeights = initialWeights.iterate(iterations) { - weights => { - // compute the local SDCA to obtain the weight vector updates - val deltaWs = localDualMethod( - weights, - blockedInputNumberElements, - localIterations, - regularization, - scaling, - seed - ) - - // scale the weight vectors - val weightedDeltaWs = deltaWs map { - deltaW => { - deltaW :*= scaling - } - } - - // calculate the new weight vector by adding the weight vector updates to the weight - // vector value - weights.union(weightedDeltaWs).reduce { _ + _ } - } - } - - // Store the learned weight vector in hte given instance - instance.weightsOption = Some(resultingWeights.map(_.fromBreeze[DenseVector])) - } - } - } - - /** Creates a zero vector of length dimension - * - * @param dimension [[DataSet]] containing the dimension of the initial weight vector - * @return Zero vector of length dimension - */ - private def createInitialWeights(dimension: DataSet[Int]): DataSet[BreezeDenseVector[Double]] = { - dimension.map { - d => BreezeDenseVector.zeros[Double](d) - } - } - - /** Computes the local SDCA on the individual data blocks/partitions - * - * @param w Current weight vector - * @param blockedInputNumberElements Blocked/Partitioned input data - * @param localIterations Number of local SDCA iterations - * @param regularization Regularization constant - * @param scaling Scaling value for new weight vector updates - * @param seed Random number generator seed - * @return [[DataSet]] of weight vector updates. The weight vector updates are double arrays - */ - private def localDualMethod( - w: DataSet[BreezeDenseVector[Double]], - blockedInputNumberElements: DataSet[(Block[LabeledVector], Int)], - localIterations: Int, - regularization: Double, - scaling: Double, - seed: Long) - : DataSet[BreezeDenseVector[Double]] = { - /* - Rich mapper calculating for each data block the local SDCA. We use a RichMapFunction here, - because we broadcast the current value of the weight vector to all mappers. 
- */ - val localSDCA = new RichMapFunction[(Block[LabeledVector], Int), BreezeDenseVector[Double]] { - var originalW: BreezeDenseVector[Double] = _ - // we keep the alphas across the outer loop iterations - val alphasArray = ArrayBuffer[BreezeDenseVector[Double]]() - // there might be several data blocks in one Flink partition, therefore store mapping - val idMapping = scala.collection.mutable.HashMap[Int, Int]() - var counter = 0 - - var r: Random = _ - - override def open(parameters: Configuration): Unit = { - originalW = getRuntimeContext.getBroadcastVariable(WEIGHT_VECTOR_BROADCAST_NAME).get(0) - - if(r == null){ - r = new Random(seed ^ getRuntimeContext.getIndexOfThisSubtask) - } - } - - override def map(blockNumberElements: (Block[LabeledVector], Int)) - : BreezeDenseVector[Double] = { - val (block, numberElements) = blockNumberElements - - // check if we already processed a data block with the corresponding block index - val localIndex = idMapping.get(block.index) match { - case Some(idx) => idx - case None => - idMapping += (block.index -> counter) - counter += 1 - - alphasArray += BreezeDenseVector.zeros[Double](block.values.length) - - counter - 1 - } - - // create temporary alpha array for the local SDCA iterations - val tempAlphas = alphasArray(localIndex).copy - - val numLocalDatapoints = tempAlphas.length - val deltaAlphas = BreezeDenseVector.zeros[Double](numLocalDatapoints) - - val w = originalW.copy - - val deltaW = BreezeDenseVector.zeros[Double](originalW.length) - - for(i <- 1 to localIterations) { - // pick random data point for SDCA - val idx = r.nextInt(numLocalDatapoints) - - val LabeledVector(label, vector) = block.values(idx) - val alpha = tempAlphas(idx) - - // maximize the dual problem and retrieve alpha and weight vector updates - val (deltaAlpha, deltaWUpdate) = maximize( - vector.asBreeze, - label, - regularization, - alpha, - w, - numberElements) - - // update alpha values - tempAlphas(idx) += deltaAlpha - deltaAlphas(idx) += deltaAlpha - - // deltaWUpdate is already scaled with 1/lambda/n - w += deltaWUpdate - deltaW += deltaWUpdate - } - - // update local alpha values - alphasArray(localIndex) += deltaAlphas * scaling - - deltaW - } - } - - blockedInputNumberElements.map(localSDCA).withBroadcastSet(w, WEIGHT_VECTOR_BROADCAST_NAME) - } - - /** Maximizes the dual problem using hinge loss functions. It returns the alpha and weight - * vector updates. 
- * - * @param x Selected data point - * @param y Label of selected data point - * @param regularization Regularization constant - * @param alpha Alpha value of selected data point - * @param w Current weight vector value - * @param numberElements Number of elements in the training data set - * @return Alpha and weight vector updates - */ - private def maximize( - x: BreezeVector[Double], - y: Double, regularization: Double, - alpha: Double, - w: BreezeVector[Double], - numberElements: Int) - : (Double, BreezeVector[Double]) = { - // compute hinge loss gradient - val dotProduct = x dot w - val grad = (y * dotProduct - 1.0) * (regularization * numberElements) - - // compute projected gradient - var proj_grad = if(alpha <= 0.0){ - scala.math.min(grad, 0) - } else if(alpha >= 1.0) { - scala.math.max(grad, 0) - } else { - grad - } - - if(scala.math.abs(grad) != 0.0){ - val qii = x dot x - val newAlpha = if(qii != 0.0){ - scala.math.min(scala.math.max(alpha - (grad / qii), 0.0), 1.0) - } else { - 1.0 - } - - val deltaW = x * y * (newAlpha - alpha) / (regularization * numberElements) - - (newAlpha - alpha, deltaW) - } else { - (0.0 , BreezeVector.zeros(w.length)) - } - } - -} From e2a6f542828b4f613b3e1db63c7ab64475974b99 Mon Sep 17 00:00:00 2001 From: p4nna Date: Tue, 28 Mar 2017 13:22:15 +0200 Subject: [PATCH 11/12] deleted unnecessary file From caea8b12098f2094c79043c220ade9192f95bd80 Mon Sep 17 00:00:00 2001 From: p4nna Date: Tue, 28 Mar 2017 13:35:07 +0200 Subject: [PATCH 12/12] Update MLUtils.scala
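The scaladoc of the SVM.scala file removed in patch 10 states the CoCoA/SDCA objective only in ASCII math, and its maximize() helper applies the corresponding hinge-loss dual coordinate step. As a reading aid, a minimal LaTeX sketch of both is given below (assuming amsmath); the symbols lambda, n and q_ii are shorthand introduced here for the regularization constant, the numberElements argument and x_i^T x_i, and are not identifiers from the patch itself.

```latex
\begin{align*}
  % Soft-margin objective from the removed SVM.scala scaladoc, hinge loss written out:
  \min_{w \in \mathbb{R}^d}\;& \frac{\lambda}{2}\lVert w\rVert^{2}
      + \frac{1}{n}\sum_{i=1}^{n} \max\bigl(0,\; 1 - y_i\, w^{\top} x_i\bigr) \\[4pt]
  % Per-coordinate dual step carried out by maximize() for a sampled point (x_i, y_i),
  % with current dual value \alpha_i and q_{ii} = x_i^{\top} x_i:
  g_i &= \bigl(y_i\, w^{\top} x_i - 1\bigr)\,\lambda n \\
  \alpha_i^{\text{new}} &= \min\!\bigl(\max(\alpha_i - g_i / q_{ii},\, 0),\, 1\bigr) \\
  \Delta w &= \frac{\bigl(\alpha_i^{\text{new}} - \alpha_i\bigr)\, y_i}{\lambda n}\; x_i
\end{align*}
```

The clipping of the new dual value to [0, 1] mirrors the min/max calls in maximize(), and the weight update is returned already scaled by 1/(lambda n), which is why the caller only adds it to the local weight vector.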