From 183df3bfbb56921aa85095e62deefe09a37435b0 Mon Sep 17 00:00:00 2001 From: somideshmukh Date: Mon, 25 Jan 2016 17:15:32 +0530 Subject: [PATCH] [SPARK-12632][Python][Make Parameter Descriptions Consistent for PySpark MLlib FPM and Recommendation] --- python/pyspark/mllib/fpm.py | 44 ++++++++++++-------- python/pyspark/mllib/recommendation.py | 56 ++++++++++++++++++-------- 2 files changed, 67 insertions(+), 33 deletions(-) diff --git a/python/pyspark/mllib/fpm.py b/python/pyspark/mllib/fpm.py index 2039decc0cb3c..5637a63b4ee0c 100644 --- a/python/pyspark/mllib/fpm.py +++ b/python/pyspark/mllib/fpm.py @@ -67,12 +67,14 @@ class FPGrowth(object): def train(cls, data, minSupport=0.3, numPartitions=-1): """ Computes an FP-Growth model that contains frequent itemsets. - - :param data: The input data set, each element contains a - transaction. - :param minSupport: The minimal support level (default: `0.3`). - :param numPartitions: The number of partitions used by - parallel FP-growth (default: same as input data). + :param data: + The input data set, each element contains a transaction. + :param minSupport: + The minimal support level. + (default: 0.3) + :param numPartitions: + The number of partitions used by parallel FP-growth. + (default: same as input data) """ model = callMLlibFunc("trainFPGrowthModel", data, float(minSupport), int(numPartitions)) return FPGrowthModel(model) @@ -128,17 +130,25 @@ class PrefixSpan(object): @since("1.6.0") def train(cls, data, minSupport=0.1, maxPatternLength=10, maxLocalProjDBSize=32000000): """ - Finds the complete set of frequent sequential patterns in the input sequences of itemsets. - - :param data: The input data set, each element contains a sequnce of itemsets. 
- :param minSupport: the minimal support level of the sequential pattern, any pattern appears - more than (minSupport * size-of-the-dataset) times will be output (default: `0.1`) - :param maxPatternLength: the maximal length of the sequential pattern, any pattern appears - less than maxPatternLength will be output. (default: `10`) - :param maxLocalProjDBSize: The maximum number of items (including delimiters used in - the internal storage format) allowed in a projected database before local - processing. If a projected database exceeds this size, another - iteration of distributed prefix growth is run. (default: `32000000`) + Finds the complete set of frequent sequential patterns in the input + sequences of itemsets. + :param data: + The input data set, each element contains a sequence of itemsets. + :param minSupport: + The minimal support level of the sequential pattern, any pattern that + appears more than (minSupport * size-of-the-dataset) times will be + output. + (default: 0.1) + :param maxPatternLength: + The maximal length of the sequential pattern, any pattern + appears less than maxPatternLength will be output. + (default: 10) + :param maxLocalProjDBSize: + The maximum number of items (including delimiters used in the internal + storage format) allowed in a projected database before local + processing. If a projected database exceeds this size, another + iteration of distributed prefix growth is run. 
+ (default: 32000000) """ model = callMLlibFunc("trainPrefixSpanModel", data, minSupport, maxPatternLength, maxLocalProjDBSize) diff --git a/python/pyspark/mllib/recommendation.py b/python/pyspark/mllib/recommendation.py index 93e47a797f490..680fa01931522 100644 --- a/python/pyspark/mllib/recommendation.py +++ b/python/pyspark/mllib/recommendation.py @@ -165,28 +165,30 @@ def productFeatures(self): @since("1.4.0") def recommendUsers(self, product, num): """ - Recommends the top "num" number of users for a given product and returns a list - of Rating objects sorted by the predicted rating in descending order. + Recommends the top "num" number of users for a given product and returns a + list of Rating objects sorted by the predicted rating in descending order. """ return list(self.call("recommendUsers", product, num)) @since("1.4.0") def recommendProducts(self, user, num): """ - Recommends the top "num" number of products for a given user and returns a list - of Rating objects sorted by the predicted rating in descending order. + Recommends the top "num" number of products for a given user and returns a + list of Rating objects sorted by the predicted rating in descending order. """ return list(self.call("recommendProducts", user, num)) def recommendProductsForUsers(self, num): """ - Recommends top "num" products for all users. The number returned may be less than this. + Recommends top "num" products for all users. The number returned may be + less than this. """ return self.call("wrappedRecommendProductsForUsers", num) def recommendUsersForProducts(self, num): """ - Recommends top "num" users for all products. The number returned may be less than this. + Recommends top "num" users for all products. The number returned may be + less than this. 
""" return self.call("wrappedRecommendUsersForProducts", num) @@ -234,11 +236,22 @@ def _prepare(cls, ratings): def train(cls, ratings, rank, iterations=5, lambda_=0.01, blocks=-1, nonnegative=False, seed=None): """ - Train a matrix factorization model given an RDD of ratings given by users to some products, - in the form of (userID, productID, rating) pairs. We approximate the ratings matrix as the - product of two lower-rank matrices of a given rank (number of features). To solve for these - features, we run a given number of iterations of ALS. This is done using a level of - parallelism given by `blocks`. + Train a matrix factorization model given an RDD of ratings given by users + to some products, in the form of (userID, productID, rating) pairs. We + approximate the ratings matrix as the product of two lower-rank matrices + of a given rank (number of features). To solve for these features, we run + a given number of iterations of ALS. This is done using a level of + parallelism given by `blocks`. + + :param iterations: + Number of iterations of ALS. + (default: 5) + :param lambda_: + The regularization parameter. + (default: 0.01) + :param seed: + Random seed for initial matrix factorization model. + (default: None) """ model = callMLlibFunc("trainALSModel", cls._prepare(ratings), rank, iterations, lambda_, blocks, nonnegative, seed) @@ -249,11 +262,22 @@ def train(cls, ratings, rank, iterations=5, lambda_=0.01, blocks=-1, nonnegative def trainImplicit(cls, ratings, rank, iterations=5, lambda_=0.01, blocks=-1, alpha=0.01, nonnegative=False, seed=None): """ - Train a matrix factorization model given an RDD of 'implicit preferences' given by users - to some products, in the form of (userID, productID, preference) pairs. We approximate the - ratings matrix as the product of two lower-rank matrices of a given rank (number of - features). To solve for these features, we run a given number of iterations of ALS. 
- This is done using a level of parallelism given by `blocks`. + Train a matrix factorization model given an RDD of 'implicit preferences' + given by users to some products, in the form of (userID, productID, + preference) pairs. We approximate the ratings matrix as the product of + two lower-rank matrices of a given rank (number of features). To solve + for these features, we run a given number of iterations of ALS. This is + done using a level of parallelism given by `blocks`. + + :param iterations: + Number of iterations of ALS. + (default: 5) + :param lambda_: + The regularization parameter. + (default: 0.01) + :param seed: + Random seed for initial matrix factorization model. + (default: None) """ model = callMLlibFunc("trainImplicitALSModel", cls._prepare(ratings), rank, iterations, lambda_, blocks, alpha, nonnegative, seed)