### Modeling using PySpark
Here I use PySpark to build an ALS (alternating least squares) model for a recommender system.

In [2]:
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.ml.feature import StringIndexer
from pyspark.sql.functions import col

In [4]:
# Start (or reuse) the Spark session for this notebook, registered under the app name "ALS".
spark = (
    SparkSession.builder
    .appName("ALS")
    .getOrCreate()
)

# Load the ratings CSV: the first row supplies the column names and the
# column types are inferred from the data.
ratings_df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv('df_no_cold_start.csv')
)

In [5]:
# Step 1: map each product_id to a numeric index, stored as an integer column,
# and add a string copy of customer_id for the second indexer to consume.
product_indexer = StringIndexer(inputCol="product_id", outputCol="product_id_indexed")
products_indexed_df = (
    product_indexer.fit(ratings_df)
    .transform(ratings_df)
    .withColumn("product_id_indexed", col("product_id_indexed").cast("integer"))
    .withColumn("customer_id_str", col("customer_id").cast("string"))
)

# Step 2: map each customer (via its string form) to a numeric index, again cast to integer.
# The two *_indexed columns are the ones passed to ALS as user/item columns.
indexer = StringIndexer(inputCol="customer_id_str", outputCol="customer_id_indexed")
indexed_df = (
    indexer.fit(products_indexed_df)
    .transform(products_indexed_df)
    .withColumn("customer_id_indexed", col("customer_id_indexed").cast("integer"))
)

# Preview the first rows to sanity-check the new columns.
indexed_df.head(5)

[Row(_c0=57, customer_id=38443412, product_id='B00HQVTMK0', star_rating=2, product_id_indexed=8296, customer_id_str='38443412', customer_id_indexed=9953),
 Row(_c0=63, customer_id=35789310, product_id='B00F877VES', star_rating=4, product_id_indexed=4780, customer_id_str='35789310', customer_id_indexed=692),
 Row(_c0=68, customer_id=38575075, product_id='B008EPW7TA', star_rating=5, product_id_indexed=528, customer_id_str='38575075', customer_id_indexed=184),
 Row(_c0=74, customer_id=12166830, product_id='B008UGPBXM', star_rating=5, product_id_indexed=1649, customer_id_str='12166830', customer_id_indexed=2679),
 Row(_c0=107, customer_id=19951707, product_id='B00CTKBTH2', star_rating=4, product_id_indexed=4713, customer_id_str='19951707', customer_id_indexed=7354)]

In [6]:
# Split the indexed ratings 60/20/20 into training, validation and test sets.
# A fixed seed makes the split repeatable across notebook runs (for the same
# input data and partitioning), so the reported RMSE values can be reproduced.
training, val, test = indexed_df.randomSplit([0.6, 0.2, 0.2], seed=42)

In [92]:
# Configure the ALS estimator on the indexed customer/product columns with
# star_rating as the target. rank, regParam and maxIter are the hand-picked
# hyperparameters for this run.
# seed is fixed so that repeated fits on the same data start from the same
# initial factors and give comparable metrics.
als = ALS(
    userCol="customer_id_indexed",
    itemCol="product_id_indexed",
    ratingCol="star_rating",
    rank=20,
    regParam=0.6,
    maxIter=18,
    coldStartStrategy="drop",
    seed=42,
)

In [93]:
# Fit the configured ALS estimator on the training split only; val/test stay held out.
model = als.fit(training)

In [94]:
# RMSE between the actual star_rating and the model's "prediction" column.
evaluator = RegressionEvaluator(metricName="rmse", labelCol="star_rating", predictionCol="prediction")

# Score each split with the fitted model (order: train, test, val).
predictions_train, predictions_test, predictions_val = (
    model.transform(split_df) for split_df in (training, test, val)
)

# Evaluate the three prediction frames in the same order.
rmse_train, rmse_test, rmse_val = (
    evaluator.evaluate(scored_df)
    for scored_df in (predictions_train, predictions_test, predictions_val)
)

# Report train next to the held-out splits so the generalisation gap is visible.
print(f"Root-mean-square error for train = {rmse_train}")
print(f"Root-mean-square error for test = {rmse_test}")
print(f"Root-mean-square error for val = {rmse_val}")

Root-mean-square error for train = 0.7744603925322685
Root-mean-square error for test = 1.8740797549677606
Root-mean-square error for val = 1.8516329601462063
