In [1]:
import os
os.environ['PYSPARK_SUBMIT_ARGS'] = '--jars /home/jovyan/elasticsearch-spark-30_2.12-8.4.1.jar pyspark-shell'

from pyspark.sql import *
from pyspark.sql.functions import *
spark = SparkSession.builder \
    .appName("recommendation_system") \
    .config("spark.jars", "/home/jovyan/elasticsearch-spark-30_2.12-8.4.1.jar") \
    .config("spark.es.nodes.wan.only","true") \
    .config("spark.es.nodes","172.22.0.2") \
    .config("spark.es.port","9200") \
    .getOrCreate()
spark

In [2]:
import gzip
import json
from pyspark.sql.types import *


def parse(path):
    g = gzip.open(path, 'rb')
    for l in g:
        yield json.loads(l)

        
def getMetaData(path):
    data = []
    data_schema =  [
                       StructField("asin", StringType(), True),
                       StructField("title", StringType(), True),
                       StructField("brand", StringType(), True),
                       StructField("category", ArrayType(StringType(), True), True),
                       StructField("main_category", StringType(), True),
                       StructField("image", ArrayType(StringType(), True), True)
                   ]
    final_schema = StructType(fields=data_schema)
    for d in parse(path):
        review = {}
        review['asin'] = d['asin']
        review['title'] = d['title']
        review['brand'] = d['brand']
        review['category'] = d['category']
#         print(d['category'])
        review['main_category'] = next(reversed(d['category']), None) if len(d['category'])!= 0 else ''
    
        #try:
        #    review['image'] = d['image']
        #except KeyError:
        #    review['image'] = ['none']

        data.append(review)
#   print(df)
    return spark.createDataFrame(data, schema=final_schema)

product_data = getMetaData('./data/meta_Appliances.json.gz')
product_data = product_data.dropDuplicates(['asin'])
product_data.limit(1).toPandas()
# product_data.printSchema()

Unnamed: 0,asin,title,brand,category,main_category,image
0,8792559360,The Cigar - Moments of Pleasure,The Cigar Book,"[Appliances, Parts &amp; Accessories]",Parts &amp; Accessories,


In [3]:
product_data.groupBy("main_category").count().orderBy(col('count').desc()).show(100)

+--------------------+-----+
|       main_category|count|
+--------------------+-----+
| Parts & Accessories| 4513|
|Refrigerator Part...| 3733|
|Washer Parts & Ac...| 2270|
|Dishwasher Parts ...| 1790|
|Range Parts & Acc...| 1710|
|       Water Filters| 1572|
|   Replacement Parts| 1556|
|Cooktop Parts & A...| 1171|
|         Range Hoods|  951|
|Humidifier Parts ...|  887|
|                    |  805|
|       Refrigerators|  722|
|Oven Parts & Acce...|  645|
|          Ice Makers|  453|
|            Cooktops|  436|
| Freestanding Ranges|  412|
|               Knobs|  406|
|Freezer Parts & A...|  360|
|Built-In Dishwashers|  357|
|         Accessories|  341|
|             Washers|  302|
|                Bins|  273|
|              Dryers|  253|
|               Vents|  243|
|Dryer Parts & Acc...|  235|
|              Motors|  224|
|             Filters|  213|
|     Humidity Meters|  185|
|   Replacement Wicks|  177|
|Refrigerators, Fr...|  175|
|Ranges, Ovens & C...|  169|
|Range Hood Pa

In [4]:
from elasticsearch import Elasticsearch

# test your ES instance is running
es = Elasticsearch('http://172.22.0.2:9200')
es.info(pretty=True)

ObjectApiResponse({'name': '8d5e3008021d', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'eoXjfjhuR5ifktX3ElvQsA', 'version': {'number': '8.4.1', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '2bd229c8e56650b42e40992322a76e7914258f0c', 'build_date': '2022-08-26T12:11:43.232597118Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [5]:
if es.indices.exists(index="products"):
    es.indices.delete(index="products")
VECTOR_DIM = 25

product_mapping = {
    # this mapping definition sets up the metadata fields for the products
    "properties": {
        "asin": {
            "type": "keyword"
        },
        "title": {
            "type": "keyword"
        },
        "image": {
            "type": "keyword"
        },
        "brand": {
            "type": "keyword"
        },
        "category": {
            "type": "keyword"
        },
        "main_category": {
            "type": "keyword"
        },
        # the following fields define our model factor vectors and metadata
        "model_factor": {
            "type": "dense_vector",
            "dims" : VECTOR_DIM
        },
        "model_version": {
            "type": "keyword"
        },
        "model_timestamp": {
            "type": "date"
        }          
    }
}

res_products = es.indices.create(index="products", mappings=product_mapping)

print("Created indices:")
print(res_products)

Created indices:
{'acknowledged': True, 'shards_acknowledged': True, 'index': 'products'}


In [6]:
 es.count(index="products")['count']

0

In [7]:
product_data.write.format("es").option("es.mapping.id", "asin").save("products")
num_products_df = product_data.count()
num_products_es = es.count(index="products")['count']
# check load went ok
print("Product DF count: {}".format(num_products_df))
print("ES index count: {}".format(num_products_es))

Product DF count: 30239
ES index count: 30239


In [8]:
es.search(index="products", q="main_category:Refrigerators", size=3)

ObjectApiResponse({'took': 43, 'timed_out': False, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}, 'hits': {'total': {'value': 722, 'relation': 'eq'}, 'max_score': 3.7342029, 'hits': [{'_index': 'products', '_id': 'B000E846DA', '_score': 3.7342029, '_source': {'asin': 'B000E846DA', 'title': 'Fisher Paykel E522BLX 17.6 cu ft Bottom-Freezer Refrigerator - Stainless Steel with Left Hinge', 'brand': 'Fisher Paykel', 'category': ['Appliances', 'Refrigerators, Freezers & Ice Makers', 'Refrigerators'], 'main_category': 'Refrigerators'}}, {'_index': 'products', '_id': 'B000EMNKOC', '_score': 3.7342029, '_source': {'asin': 'B000EMNKOC', 'title': 'Frigidaire FRS6R5ESB 26 Cu. Ft. Side-by-Side Refrigerator (Stainless Steel)', 'brand': 'Frigidaire', 'category': ['Appliances', 'Refrigerators, Freezers & Ice Makers', 'Refrigerators'], 'main_category': 'Refrigerators'}}, {'_index': 'products', '_id': 'B000EPN8ZK', '_score': 3.7342029, '_source': {'asin': 'B000EPN8ZK', 'title': '4.

In [9]:
def getRatingData(path):
    data = []
    data_schema = [
               StructField("asin", StringType(), True),
               StructField("reviewerId", StringType(), True),
               StructField("rating", FloatType(), True)]
    final_schema = StructType(fields=data_schema)
    for d in parse(path):
        review = {}
        review['asin'] = d['asin']
        review['reviewerId'] = d['reviewerID']
        review['rating'] = d['overall']
        data.append(review)
#   print(df)
    return spark.createDataFrame(data, schema=final_schema)

df_rating= getRatingData('./data/Appliances.json.gz')
df_rating.limit(3).toPandas()

Unnamed: 0,asin,reviewerId,rating
0,1118461304,A3NHUQ33CFH3VM,5.0
1,1118461304,A3SK6VNBQDNBJE,5.0
2,1118461304,A3SOFHUR27FO3K,5.0


In [10]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS

from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline
from pyspark.sql.functions import col

indexer = [StringIndexer(inputCol=column, outputCol=column+"_index") for column in list(set(df_rating.columns)-set(['rating'])) ]
pipeline = Pipeline(stages=indexer)
transformed = pipeline.fit(df_rating).transform(df_rating)
transformed.show()

+----------+--------------+------+----------------+----------+
|      asin|    reviewerId|rating|reviewerId_index|asin_index|
+----------+--------------+------+----------------+----------+
|1118461304|A3NHUQ33CFH3VM|   5.0|           118.0|    2229.0|
|1118461304|A3SK6VNBQDNBJE|   5.0|        396610.0|    2229.0|
|1118461304|A3SOFHUR27FO3K|   5.0|        397010.0|    2229.0|
|1118461304|A1HOG1PYCAE157|   5.0|        122436.0|    2229.0|
|1118461304|A26JGAM6GZMM4V|   5.0|        204413.0|    2229.0|
|1118461304|A17K8WANMYHTX2|   5.0|         88869.0|    2229.0|
|1118461304|A13IW3A6W43U0G|   5.0|         75566.0|    2229.0|
|1118461304|A1ECEGG1MP7J8J|   5.0|        111423.0|    2229.0|
|1118461304|A2D5X9G9S3A7RN|   5.0|        226519.0|    2229.0|
|1118461304| AP2F86JFRQ205|   5.0|        479108.0|    2229.0|
|1118461304|A3VF3A5A3O04E1|   4.0|        406281.0|    2229.0|
|1118461304|A14DW5UMQ1M96O|   5.0|         78358.0|    2229.0|
|1118461304|A2V7UVKOFG57IW|   4.0|        286083.0|    

In [11]:
als=ALS(maxIter=5,regParam=0.09,rank=25,userCol="reviewerId_index",itemCol="asin_index",ratingCol="rating",coldStartStrategy="drop",nonnegative=True)
model=als.fit(transformed)

In [12]:
model.itemFactors.count()

30252

In [13]:
evaluator=RegressionEvaluator(metricName="rmse",labelCol="rating",predictionCol="prediction")
predictions=model.transform(transformed)
rmse=evaluator.evaluate(predictions)
print("RMSE="+str(rmse))
predictions.show()

RMSE=0.2029201610101436
+----------+--------------+------+----------------+----------+----------+
|      asin|    reviewerId|rating|reviewerId_index|asin_index|prediction|
+----------+--------------+------+----------------+----------+----------+
|B005AR7B0U|A2SEK167AI6DZM|   5.0|            17.0|     833.0|  4.786325|
|B00Y1RCAEW|A2UEH0U5CV3053|   5.0|            43.0|   18654.0| 4.7185335|
|B005AR7B0U|A11MR9NLWY2TCB|   5.0|            44.0|     833.0|  4.931913|
|B01GC1087U|A1P5WN345EBHDJ|   5.0|            48.0|    3918.0|  4.739944|
|B005BNMVEY|A2ZXGRVD55NQ5T|   5.0|            91.0|     463.0|  4.286341|
|B005BNMVEY|A2KSW3CSIMJ3AT|   5.0|           139.0|     463.0|   4.32248|
|B003ZB3TTA|A2M0K9OMYO81IY|   1.0|           140.0|   15846.0| 1.0170279|
|B00OMR0AXY|A3CIEPFEOAKP4T|   5.0|           339.0|     496.0| 4.5304193|
|B004XLE0I0|A3P9GXL2BDXKQF|   4.0|           358.0|    1580.0| 3.9781501|
|B005AR7B0U|A3TAF3L8D5QEBY|   4.0|           368.0|     833.0| 3.9198346|
|B00L4GDO34| A

In [14]:
from pyspark.sql.functions import lit, current_timestamp, unix_timestamp
ver = model.uid
ts = unix_timestamp(current_timestamp())
product_vectors = model.itemFactors.select("id",\
                                         col("features").alias("model_factor"),\
                                         lit(ver).alias("model_version"),\
                                         ts.alias("model_timestamp"))
product_vectors.show(2)

+---+--------------------+----------------+---------------+
| id|        model_factor|   model_version|model_timestamp|
+---+--------------------+----------------+---------------+
|  0|[0.29134184, 0.27...|ALS_d0e9b000de29|     1663287881|
| 10|[1.7965407, 0.392...|ALS_d0e9b000de29|     1663287881|
+---+--------------------+----------------+---------------+
only showing top 2 rows



In [15]:
asin_index_meta = [
    f.metadata for f in transformed.schema.fields if f.name == "asin_index"]
asin_index_labels = asin_index_meta[0]["ml_attr"]["vals"]

from pyspark.ml.feature import IndexToString

reviewerId_converter = IndexToString(inputCol="id", outputCol="asin",   labels=asin_index_labels)
PredictedLabels = reviewerId_converter.transform(product_vectors)
PredictedLabels = PredictedLabels.drop('id')
PredictedLabels.show(10)

+--------------------+----------------+---------------+----------+
|        model_factor|   model_version|model_timestamp|      asin|
+--------------------+----------------+---------------+----------+
|[0.29134184, 0.27...|ALS_d0e9b000de29|     1663287885|B000AST3AK|
|[1.7965407, 0.392...|ALS_d0e9b000de29|     1663287885|B0006GVNOA|
|[0.5861694, 0.073...|ALS_d0e9b000de29|     1663287885|B01CTNA1VI|
|[0.40666553, 0.03...|ALS_d0e9b000de29|     1663287885|B00126NABC|
|[0.12317319, 0.34...|ALS_d0e9b000de29|     1663287885|B00UB441HS|
|[0.2624271, 0.149...|ALS_d0e9b000de29|     1663287885|B0042U16YI|
|[0.20160528, 0.01...|ALS_d0e9b000de29|     1663287885|B00W0W8LMK|
|[0.16788186, 0.06...|ALS_d0e9b000de29|     1663287885|B004XLDE5A|
|[0.017832814, 0.4...|ALS_d0e9b000de29|     1663287885|B00W0WXHCO|
|[0.23772983, 0.14...|ALS_d0e9b000de29|     1663287885|B00NIZ0DV0|
+--------------------+----------------+---------------+----------+
only showing top 10 rows



In [16]:
PredictedLabels.count()

30252

In [17]:
PredictedLabels.write.format("es") \
    .option("es.mapping.id", "asin") \
    .option("es.write.operation", "upsert") \
    .save("products", mode="append")