In [1]:
from pyspark.sql.functions import rand
from pyspark.ml.feature import StringIndexer, IndexToString
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.functions import col, lit
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, LongType, DoubleType, TimestampType

In [2]:
# List the working directory to confirm the CSV inputs are present.
! ls -la

total 1115100
drwxrwxr-x  4 big big      4096 May 13 17:29 .
drwxr-xr-x 33 big big      4096 May 12 14:59 ..
drwxrwxr-x  8 big big      4096 May 12 13:49 .git
-rw-rw-r--  1 big big         6 May  8 20:47 .gitignore
drwxrwxr-x  2 big big      4096 May 11 21:44 .ipynb_checkpoints
-rw-rw-r--  1 big big 570895494 May  8 17:13 local_part.csv
-rw-r--r--  1 big big 171002105 May 11 21:13 local_test.csv
-rw-r--r--  1 big big 399893389 May 11 21:13 local_train.csv
-rw-rw-r--  1 big big      3716 May 12 22:05 Preare_data.ipynb
-rw-rw-r--  1 big big     26684 May 13 17:29 Recommender.ipynb


In [3]:
# Reference: all field types available in pyspark.sql.types
#__all__ = [
#   "DataType", "NullType", "StringType", "BinaryType", "BooleanType", "DateType",
#   "TimestampType", "DecimalType", "DoubleType", "FloatType", "ByteType", "IntegerType",
#   "LongType", "ShortType", "ArrayType", "MapType", "StructField", "StructType"]

In [4]:
# Explicit schema for the event-log CSV.
# Every field is declared nullable: the loaded frame reports all columns as
# nullable regardless of this flag (see the df.printSchema() output), so a
# nullable=False declaration here would be misleading rather than enforced.
schema = StructType([
    StructField("event_time", TimestampType(), True),
    StructField("event_type", StringType(), True),
    StructField("product_id", IntegerType(), True),
    StructField("category_id", LongType(), True),
    StructField("category_code", StringType(), True),
    StructField("brand", StringType(), True),
    StructField("price", DoubleType(), True),
    StructField("user_id", IntegerType(), True),
    StructField("user_session", StringType(), True),
])

In [5]:
# Read the dataset using the explicit schema.
# inferSchema is dropped: it only matters when no schema is supplied, so
# passing both options was contradictory.
df = spark.read.csv('local_part.csv', schema=schema)

In [6]:
# Confirm the column types of the loaded frame.
df.printSchema()

root
 |-- event_time: timestamp (nullable = true)
 |-- event_type: string (nullable = true)
 |-- product_id: integer (nullable = true)
 |-- category_id: long (nullable = true)
 |-- category_code: string (nullable = true)
 |-- brand: string (nullable = true)
 |-- price: double (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- user_session: string (nullable = true)



In [7]:
# Preview the first rows of the raw event log.
df.show()

+-------------------+----------+----------+-------------------+--------------------+---------+------+---------+--------------------+
|         event_time|event_type|product_id|        category_id|       category_code|    brand| price|  user_id|        user_session|
+-------------------+----------+----------+-------------------+--------------------+---------+------+---------+--------------------+
|2019-10-01 01:00:01|      view|   1307067|2053013558920217191|  computers.notebook|   lenovo|251.74|550050854|7c90fc70-0e80-459...|
|2019-10-01 01:00:20|      view|   1003306|2053013555631882655|electronics.smart...|    apple|588.77|555446831|6ec635da-ea15-4a5...|
|2019-10-01 01:00:25|      view|  27500014|2053013554692358509|                null|  redmond| 37.98|555217733|74d40a28-41f9-432...|
|2019-10-01 01:00:31|      view|  28718079|2053013565362668491|  apparel.shoes.keds|  respect| 66.67|545323115|75fb5d0c-e907-429...|
|2019-10-01 01:00:31|      view|   3900746|2053013552326770905|applia

In [8]:
# Most active users: number of events per user, largest first.
(
    df.groupBy("user_id")
    .count()
    .orderBy(col("count").desc())
    .show(n=10, truncate=False)
)

+---------+-----+
|user_id  |count|
+---------+-----+
|512475445|806  |
|512365995|421  |
|526731152|306  |
|513021392|293  |
|512505687|283  |
|546270188|258  |
|546159478|257  |
|516308435|253  |
|514649263|240  |
|551211823|238  |
+---------+-----+
only showing top 10 rows



In [9]:
# Event-type breakdown for the most active user found above.
(
    df.where(col("user_id") == 512475445)
    .groupBy("event_type")
    .count()
    .orderBy(col("count").desc())
    .show(n=10, truncate=False)
)

+----------+-----+
|event_type|count|
+----------+-----+
|view      |806  |
+----------+-----+



In [10]:
# Number of events per brand, largest first.
(
    df.groupBy("brand")
    .count()
    .orderBy(col("count").desc())
    .show(n=10, truncate=False)
)

+-------+------+
|brand  |count |
+-------+------+
|null   |610994|
|samsung|527583|
|apple  |411810|
|xiaomi |308436|
|huawei |110888|
|lucente|65673 |
|lg     |56602 |
|bosch  |56118 |
|oppo   |48640 |
|sony   |45535 |
+-------+------+
only showing top 10 rows



In [11]:
# Number of events per event type, largest first.
(
    df.groupBy("event_type")
    .count()
    .orderBy(col("count").desc())
    .show(n=10, truncate=False)
)

+----------+-------+
|event_type|count  |
+----------+-------+
|view      |4078750|
|cart      |92702  |
|purchase  |74196  |
+----------+-------+



In [12]:
# Convert event_type into a binary rating: purchase = 1, view and cart = 0,
# i.e. a user "rated" an item only if they bought it.
dictionary = {"purchase": "1", "view": "0", "cart": "0"}

# `subset` must be passed by keyword: the second positional argument of
# na.replace() is `value`, so the column name was previously not used as a
# column restriction and the mapping was applied to every string column.
df2 = df.na.replace(dictionary, subset="event_type")

# Cast the rating to integer and keep only the columns the recommender needs.
df_data = df2.select(
    df2["event_type"].cast(IntegerType()),
    df2["user_id"],
    df2["product_id"],
)
df_data.show()



+----------+---------+----------+
|event_type|  user_id|product_id|
+----------+---------+----------+
|         0|550050854|   1307067|
|         0|555446831|   1003306|
|         0|555217733|  27500014|
|         0|545323115|  28718079|
|         0|555444559|   3900746|
|         0|515454339|  12712064|
|         0|551377651|   1003141|
|         0|519885473|   4100126|
|         0|555447577|  28717211|
|         0|512558158|   1004659|
|         0|525856698|  26500144|
|         0|544648245|   4300070|
|         0|513457407|   1004792|
|         0|550050854|   1306631|
|         0|514336739|   1004321|
|         0|537918940|   1004545|
|         0|555447570|  28715758|
|         0|519885473|   4100274|
|         0|516896785|   1004792|
|         0|555447748|  26201000|
+----------+---------+----------+
only showing top 20 rows



In [13]:
# Sanity check: count the rows per rating value after the conversion.
(
    df_data.groupBy("event_type")
    .count()
    .orderBy(col("count").desc())
    .show(n=10, truncate=False)
)

+----------+-------+
|event_type|count  |
+----------+-------+
|0         |4171452|
|1         |74196  |
+----------+-------+



In [14]:
# Split the ratings 80/20 into train and test sets (fixed seed).
train, test = df_data.randomSplit([0.8, 0.2], seed=10)

In [15]:
# Row counts of the train and test splits, one per line.
print(train.count(), test.count(), sep="\n")

3395720
849928


In [16]:
# Configure the ALS recommender in implicit-feedback mode.
# NOTE(review): the rating column is 0 for every view/cart event — confirm
# that implicit ALS treats zero ratings the way this notebook intends.
recomend = ALS(
    userCol='user_id',
    itemCol='product_id',
    ratingCol='event_type',
    implicitPrefs=True,
    alpha=0.9,
    regParam=0.5,
    maxIter=3,
    nonnegative=True,
    coldStartStrategy="drop",
)

In [17]:
# Train the model on the training split only.
recomend_model=recomend.fit(train)

In [18]:
# Score the held-out test split; the result gains a `prediction` column.
predicted_ratings=recomend_model.transform(test)

In [19]:
# predicted_ratings.printSchema()

In [20]:
# Show a random sample of predictions for actual purchases (rating == 1).
(
    predicted_ratings.where(col("event_type") == 1)
    .orderBy(rand())
    .show(10)
)

+----------+---------+----------+------------+
|event_type|  user_id|product_id|  prediction|
+----------+---------+----------+------------+
|         1|512492404|   4804055|         0.0|
|         1|514119878|   4700422|         0.0|
|         1|513102832|   6100216|         0.0|
|         1|522197947|   1005205|5.7550933E-4|
|         1|558029983|   1004237|         0.0|
|         1|554025548|   5100576|         0.0|
|         1|517611413|   4804055|         0.0|
|         1|536901707|  15700078|         0.0|
|         1|520665320|  14701435|         0.0|
|         1|518845114|   1004240|         0.0|
+----------+---------+----------+------------+
only showing top 10 rows



In [21]:
# Evaluator: root-mean-square error between the prediction and the 0/1 label.
evaluator = RegressionEvaluator(
    labelCol='event_type',
    predictionCol='prediction',
    metricName='rmse',
)

In [22]:
rmse=evaluator.evaluate(predicted_ratings)
# RMSE values recorded from earlier runs with explicit feedback
# (presumably ratings of 100 / 1 / 0 — TODO confirm the mapping):
# 20.390250369964022 - maxIter = 10
# 28.886748140963718 - maxIter = 3

# RMSE of the current implicit-feedback model
print(rmse)

0.13331617684682437


In [23]:
# Number of unique products (items). The column is renamed to product_id_1
# so it stays distinguishable from product_id in the later join.
unique_items = (
    df_data.select(col("product_id").alias("product_id_1"))
    .distinct()
)
print(unique_items.count())

125057


In [24]:
# ID of the user to generate recommendations for
user_id=526731152

In [25]:
# Distinct products this user interacted with. Despite the variable name,
# the filter covers every event type, not only purchases.
purchased_items = (
    df_data.where(col("user_id") == user_id)
    .select("product_id")
    .distinct()
)

In [26]:
# Number of distinct products the selected user interacted with.
print(purchased_items.count())

112


In [27]:
# Left-join every catalogue item against the user's items: items the user
# never interacted with end up with a null product_id.
total_items = unique_items.join(
    purchased_items,
    on=unique_items.product_id_1 == purchased_items.product_id,
    how="left",
)
print(total_items.count())

125057


In [28]:
# Preview the joined frame; a null product_id means the user has no event for that item.
total_items.show(10,False)

+------------+----------+
|product_id_1|product_id|
+------------+----------+
|1003938     |null      |
|1004666     |null      |
|1004739     |null      |
|1005158     |null      |
|1201512     |null      |
|1305803     |null      |
|1306176     |null      |
|1307005     |null      |
|1307184     |null      |
|1307463     |null      |
+------------+----------+
only showing top 10 rows



In [29]:
# Keep only the items the user has not interacted with (no match in the join).
remaining_items = (
    total_items.filter(col("product_id").isNull())
    .select("product_id_1")
    .distinct()
)
print(remaining_items.count())

124945


In [30]:
# Attach the target user's id and restore the product_id column name, giving
# the (product_id, user_id) pairs the model is asked to score.
remaining_items = remaining_items.select(
    col("product_id_1").alias("product_id"),
    lit(int(user_id)).alias("user_id"),
)

In [31]:
# Preview the candidate (product_id, user_id) pairs to be scored.
remaining_items.show(10,False)

+----------+---------+
|product_id|user_id  |
+----------+---------+
|1003938   |526731152|
|1004666   |526731152|
|1004739   |526731152|
|1005158   |526731152|
|1201512   |526731152|
|1305803   |526731152|
|1306176   |526731152|
|1307005   |526731152|
|1307184   |526731152|
|1307463   |526731152|
+----------+---------+
only showing top 10 rows



In [32]:
# Score every remaining item for the user and collect all rows onto the driver.
# NOTE(review): `predictions` is not used by any later cell visible here, and
# the same transform is recomputed for `recommendations` — confirm it is needed.
predictions = recomend_model.transform(remaining_items).collect()

In [33]:
# Rank the user's candidate items by predicted score, highest first.
recommendations = (
    recomend_model.transform(remaining_items)
    .orderBy(col("prediction").desc())
)

In [34]:
# Display the top-ranked recommendations for the selected user.
recommendations.show()

+----------+---------+----------+
|product_id|  user_id|prediction|
+----------+---------+----------+
|   1003051|526731152|       0.0|
|   1004324|526731152|       0.0|
|   1004441|526731152|       0.0|
|   1004867|526731152|       0.0|
|   1005133|526731152|       0.0|
|   1201471|526731152|       0.0|
|   1304411|526731152|       0.0|
|   1306249|526731152|       0.0|
|   1306312|526731152|       0.0|
|   1306436|526731152|       0.0|
|   1306648|526731152|       0.0|
|   1307039|526731152|       0.0|
|   1801604|526731152|       0.0|
|   1801689|526731152|       0.0|
|   1801691|526731152|       0.0|
|   1801789|526731152|       0.0|
|   1801816|526731152|       0.0|
|   2240015|526731152|       0.0|
|   2240020|526731152|       0.0|
|   2400604|526731152|       0.0|
+----------+---------+----------+
only showing top 20 rows

