In [None]:
import sys
import argparse
from datetime import datetime, timedelta
from pyspark.sql import DataFrame
from functools import reduce
from pyspark.sql import functions as F
from pyspark.sql import SparkSession
from pyspark.sql.types import DoubleType
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import Evaluator
from pyspark.mllib.evaluation import RankingMetrics
from pyspark.sql import Window
from pyspark.ml.recommendation import ALS
import boto3
import pandas as pd

In [None]:
class RankingEvaluator(Evaluator):
    """
    Evaluator that scores predictions with NDCG@k on per-device rankings.

    For every device the segments ordered by ``prediction`` are compared
    with the segments ordered by ``frequency``; only the ordering is
    evaluated, not the predicted values themselves.

    Parameters
    k: number of top-ranked segments kept per device; also the cut-off
       passed to ``RankingMetrics.ndcgAt``. Must be set before evaluating.
    """

    def __init__(self, k=None):
        # Evaluator derives from Params: its initializer creates the uid and
        # the param maps that copy() / evaluate(dataset, params) rely on.
        super().__init__()
        self.k = k

    def isLargerBetter(self):
        """NDCG grows with ranking quality, so larger values are better."""
        return True

    def _top_k_items(self, scoredDF, order_col, items_col):
        """
        Build one row per device holding its top-k segment ids.

        Parameters
        scoredDF: DataFrame with device_id, segment_id and ``order_col``.
        order_col: column the segments are ranked by (descending).
        items_col: name given to the resulting array column.

        Returns a DataFrame with columns device_id and ``items_col``.
        """
        windowSpec = Window.partitionBy('device_id').orderBy(
            F.col(order_col).desc())

        # rank() gives tied rows the same rank, so a device can contribute
        # more than k segments when values tie at the cut-off.
        ranked = scoredDF \
            .select('device_id', 'segment_id',
                    F.rank().over(windowSpec).alias('rank')) \
            .where(F.col('rank') <= self.k)

        # collect_list gives no ordering guarantee after the groupBy shuffle,
        # while NDCG depends on item order. Collect (rank, segment_id) pairs
        # and sort them so the array is always in rank order.
        per_user = ranked.groupBy('device_id').agg(
            F.sort_array(
                F.collect_list(F.struct('rank', 'segment_id'))
            ).alias('ranked_items'),
            F.count('segment_id').alias('num_items'))

        # Devices with five or fewer ranked segments are left out of the
        # metric (threshold kept from the original implementation).
        return per_user \
            .filter(F.col('num_items') > 5) \
            .select('device_id',
                    F.col('ranked_items.segment_id').alias(items_col))

    def _evaluate(self, predictedDF):
        """
        Compute NDCG@k over all devices present in ``predictedDF``.

        Parameters
        predictedDF: DataFrame with columns device_id, segment_id,
                     prediction and frequency.

        Returns the NDCG@k value, or 0.0 when no device has enough ranked
        segments in both the predicted and the actual ranking.

        Raises ValueError if ``k`` was never set.
        """
        if self.k is None:
            raise ValueError(
                "RankingEvaluator requires k to be set, e.g. "
                "RankingEvaluator(k=10)")

        per_user_predicted = self._top_k_items(
            predictedDF, 'prediction', 'predicted_items')
        per_user_actual = self._top_k_items(
            predictedDF, 'frequency', 'actual_items')

        # RankingMetrics expects an RDD of (predicted items, relevant items).
        perUserItemsRDD = per_user_predicted \
            .join(per_user_actual, 'device_id', 'inner') \
            .select('predicted_items', 'actual_items') \
            .rdd.map(lambda row: (row[0], row[1]))

        if perUserItemsRDD.isEmpty():
            return 0.0

        rankingMetrics = RankingMetrics(perUserItemsRDD)
        return rankingMetrics.ndcgAt(self.k)



In [None]:
def get_integer_device_ids(data):
    """
    Attach an integer surrogate key to every distinct device_id.

    Spark's ALS implementation needs integer user ids, so each distinct
    device_id in ``data`` gets a ``device_id_int`` generated with
    monotonically_increasing_id.

    Parameters
    data: DataFrame containing a device_id column.

    Returns
    devices: persisted DataFrame with columns device_id and device_id_int.
    data: the input DataFrame left-joined with ``devices`` on device_id.
    """
    # Use the argument rather than a notebook-level variable so the function
    # works on whatever DataFrame the caller passes in.
    devices = data.select('device_id').distinct()

    # monotonically_increasing_id encodes the partition index in the upper
    # bits of the id; a single partition keeps the generated ids small and
    # consecutive. persist() avoids re-generating ids when ``devices`` is
    # reused later.
    devices = devices.repartition(1).withColumn(
        "device_id_int", F.monotonically_increasing_id()).persist()

    data = data.join(devices, on="device_id", how="left")
    return devices, data


In [None]:
def train_als_model(training, test, ranks, reguls, iters,
                    alphas, evaluator):
    """
    Train the ALS collaborative filtering model with implicit ratings.

    The hyper-parameters are selected with 3-fold cross-validation on
    ``training``; the best model is then scored once on ``test``.

    Parameters
    training: training data set.
    test: the validation data set.
    ranks: candidate dimensions for the low dimensional device/segment
           representation.
    reguls: candidate regularization values.
    iters: candidate maximum numbers of ALS iterations.
    alphas: candidate confidence estimates for implicit feedback.
    evaluator: the evaluator for the ALS model.

    Returns the best model found by the cross-validation.
    """
    als = ALS(userCol="device_id_int", itemCol="segment_id",
              ratingCol="frequency", coldStartStrategy="drop",
              implicitPrefs=True, nonnegative=False)

    # Build the search grid from the caller-supplied candidates.
    # CrossValidator tries every combination and picks the best model
    # according to the evaluator.
    paramGrid = ParamGridBuilder() \
        .addGrid(als.regParam, reguls) \
        .addGrid(als.rank, ranks) \
        .addGrid(als.alpha, alphas) \
        .addGrid(als.maxIter, iters) \
        .build()

    crossval = CrossValidator(estimator=als,
                              estimatorParamMaps=paramGrid,
                              evaluator=evaluator,
                              numFolds=3)  # use 3+ folds in practice

    # Run cross-validation, and choose the best set of parameters.
    model = crossval.fit(training)
    finalModel = model.bestModel

    predictions = finalModel.transform(test)

    metric = evaluator.evaluate(predictions)
    print("NDCG ", metric)

    # The winning hyper-parameters are read from the JVM-side estimator
    # that produced the best model; fetch that parent object only once.
    best_estimator = finalModel._java_obj.parent()
    best_alpha = best_estimator.getAlpha()
    best_regul = best_estimator.getRegParam()
    best_iter = best_estimator.getMaxIter()
    best_rank = best_estimator.getRank()
    print(f"""
    best alpha: {best_alpha}\nbest regul: {best_regul}
    best iter: {best_iter}\nbest rank: {best_rank}
    """)
    return finalModel



In [None]:
# Fit the ALS model. ``training``, ``test`` and ``evaluator`` come from
# earlier notebook cells.
finalModel = train_als_model(
    training=training,
    test=test,
    ranks=[15],
    reguls=[0.1, 0.15],
    iters=[15],
    alphas=[1.0, 1.5],
    evaluator=evaluator,
)

print("Finished training the collaborative filtering model.")

# For every segment, request up to 400000 recommended devices, then turn the
# per-segment array of (device_id_int, rating) structs into one row each.
reco = finalModel.recommendForAllItems(400000)
reco = reco.withColumn("user_scores", F.explode(F.col("recommendations")))
reco = reco.withColumn(
    "device_id_int", F.col("user_scores").getField("device_id_int"))

# Flatten the rating, drop the nested columns and attach the category and
# device lookups (``categories`` and ``devices`` are defined in other cells).
recommendations = (
    reco
    .withColumn("rating", F.col("user_scores").getField("rating"))
    .drop("recommendations", "user_scores")
    .join(categories.hint("broadcast"), on="segment_id", how="left")
    .join(devices, on="device_id_int", how="left")
)

# Keep only scores of at least 0.6 and expose the final three columns.
recommendations = (
    recommendations
    .where(F.col("rating") >= 0.6)
    .withColumnRenamed("segment_id", "category_id")
    .select("device_id", "category_id", "rating")
)