In [1]:
from pyspark.sql.functions import rand
from pyspark.ml.feature import StringIndexer, IndexToString
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.functions import col, lit
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, LongType, DoubleType, TimestampType

In [2]:
# Shell escape: long-format listing (including hidden entries) of the current
# working directory, e.g. to check which CSV files are available to load.
! ls -la

total 1115084
drwxrwxr-x  4 big big      4096 May 12 09:47 .
drwxr-xr-x 32 big big      4096 May 11 22:33 ..
drwxrwxr-x  8 big big      4096 May 11 22:33 .git
-rw-rw-r--  1 big big         6 May  8 20:47 .gitignore
drwxrwxr-x  2 big big      4096 May 11 21:44 .ipynb_checkpoints
-rw-rw-r--  1 big big 570895494 May  8 17:13 local_part.csv
-rw-r--r--  1 big big 171002105 May 11 21:13 local_test.csv
-rw-r--r--  1 big big 399893389 May 11 21:13 local_train.csv
-rw-rw-r--  1 big big      3714 May 11 22:18 Preare_data.ipynb
-rw-rw-r--  1 big big     10456 May 12 09:47 Recommender.ipynb


In [3]:
# Reference: type names available from pyspark.sql.types (copy of that module's __all__ list)
#__all__ = [
#   "DataType", "NullType", "StringType", "BinaryType", "BooleanType", "DateType",
#   "TimestampType", "DecimalType", "DoubleType", "FloatType", "ByteType", "IntegerType",
#   "LongType", "ShortType", "ArrayType", "MapType", "StructField", "StructType"]

In [4]:
# Explicit schema for the events CSV: one (column name, Spark type, nullable)
# triple per column, in the order the columns are declared to the reader.
# Note: event_type is declared non-nullable here, but the printSchema() output
# after the CSV read reports it as "nullable = true".
schema = StructType([
    StructField(column_name, column_type, is_nullable)
    for column_name, column_type, is_nullable in (
        ("event_time", TimestampType(), True),
        ("event_type", StringType(), False),
        ("product_id", IntegerType(), True),
        ("category_id", LongType(), True),
        ("category_code", StringType(), True),
        ("brand", StringType(), True),
        ("price", DoubleType(), True),
        ("user_id", IntegerType(), True),
        ("user_session", StringType(), True),
    )
])

In [5]:
# Read the training dataset with the explicit `schema` defined earlier.
# `inferSchema` is deliberately not passed: Spark skips schema inference when a
# schema is supplied, so the option was a no-op that only obscured where the
# column types come from.
# NOTE(review): no `header` option is set, so the first line of the file is
# read as data — confirm local_train.csv really has no header row.
df = spark.read.csv('local_train.csv', schema=schema)

In [6]:
# Print the loaded DataFrame's columns with their types and nullability.
# All columns are reported as nullable here, including event_type, which the
# schema cell declares with nullable=False.
df.printSchema()

root
 |-- event_time: timestamp (nullable = true)
 |-- event_type: string (nullable = true)
 |-- product_id: integer (nullable = true)
 |-- category_id: long (nullable = true)
 |-- category_code: string (nullable = true)
 |-- brand: string (nullable = true)
 |-- price: double (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- user_session: string (nullable = true)



In [7]:
# Preview the first rows of the events table. The arguments spell out the
# defaults of DataFrame.show(): 20 rows, long cell values truncated.
df.show(n=20, truncate=True)

+-------------------+----------+----------+-------------------+--------------------+---------+------+---------+--------------------+
|         event_time|event_type|product_id|        category_id|       category_code|    brand| price|  user_id|        user_session|
+-------------------+----------+----------+-------------------+--------------------+---------+------+---------+--------------------+
|2019-10-01 01:00:01|      view|   1307067|2053013558920217191|  computers.notebook|   lenovo|251.74|550050854|7c90fc70-0e80-459...|
|2019-10-01 01:00:20|      view|   1003306|2053013555631882655|electronics.smart...|    apple|588.77|555446831|6ec635da-ea15-4a5...|
|2019-10-01 01:00:25|      view|  27500014|2053013554692358509|                null|  redmond| 37.98|555217733|74d40a28-41f9-432...|
|2019-10-01 01:00:36|      view|  12712064|2053013553559896355|                null| triangle| 30.89|515454339|828dbd8e-8683-409...|
|2019-10-01 01:00:41|      view|   1003141|2053013555631882655|electr

In [8]:
# Number of events per user: the 10 users with the most events, untruncated.
(
    df.groupBy('user_id')
      .count()
      .orderBy(col('count').desc())
      .show(n=10, truncate=False)
)

+---------+-----+
|user_id  |count|
+---------+-----+
|512475445|554  |
|512365995|301  |
|526731152|227  |
|513021392|207  |
|546159478|193  |
|512505687|192  |
|546270188|182  |
|516308435|177  |
|514649263|171  |
|551211823|165  |
+---------+-----+
only showing top 10 rows



In [9]:
# Number of events per brand: the 10 most frequent brand values, untruncated.
# Rows with a null brand form their own group, as the output shows.
(
    df.groupBy('brand')
      .count()
      .orderBy(col('count').desc())
      .show(n=10, truncate=False)
)

+-------+------+
|brand  |count |
+-------+------+
|null   |427856|
|samsung|369262|
|apple  |288448|
|xiaomi |216395|
|huawei |77701 |
|lucente|46132 |
|lg     |39563 |
|bosch  |39530 |
|oppo   |33985 |
|sony   |31908 |
+-------+------+
only showing top 10 rows



In [10]:
# Number of events per event_type, most frequent first (at most 10 rows shown).
(
    df.groupBy('event_type')
      .count()
      .orderBy(col('count').desc())
      .show(n=10, truncate=False)
)

+----------+-------+
|event_type|count  |
+----------+-------+
|view      |2857147|
|cart      |64843  |
|purchase  |51968  |
+----------+-------+

