In [2]:
from pyspark.sql import SparkSession

# Add here your team number teamx
team = 6

# location of your Hive database in HDFS
warehouse = "project/hive/warehouse"

spark = SparkSession.builder\
        .appName("{} - spark ML".format(team))\
        .master("yarn")\
        .config("hive.metastore.uris", "thrift://hadoop-02.uni.innopolis.ru:9883")\
        .config("spark.sql.warehouse.dir", warehouse)\
        .config("spark.sql.avro.compression.codec", "snappy")\
        .config("spark.yarn.queue", "master_teams")\
        .config("spark.executor.instances", "10")\
        .config("spark.executor.cores", "10")\
        .enableHiveSupport()\
        .getOrCreate()


## Read the hive tables as dataframes.

In [55]:
items = spark.read.format("avro").table('team6_projectdb.items')
items.createOrReplaceTempView('items')

In [56]:
items.printSchema()

root
 |-- itemid: integer (nullable = true)
 |-- shopid: integer (nullable = true)
 |-- item_name: string (nullable = true)
 |-- item_description: string (nullable = true)
 |-- item_variation: string (nullable = true)
 |-- price: float (nullable = true)
 |-- stock: integer (nullable = true)
 |-- cb_option: boolean (nullable = true)
 |-- is_preferred: boolean (nullable = true)
 |-- sold_count: integer (nullable = true)
 |-- item_creation_date: timestamp (nullable = true)
 |-- category: string (nullable = true)



In [57]:
items.show(2)

+--------+--------+--------------------+--------------------+--------------+-----+-----+---------+------------+----------+-------------------+----------+
|  itemid|  shopid|           item_name|    item_description|item_variation|price|stock|cb_option|is_preferred|sold_count| item_creation_date|  category|
+--------+--------+--------------------+--------------------+--------------+-----+-----+---------+------------+----------+-------------------+----------+
|88025115|11509993|Geometric Pattern...|Clothes Type:Padd...|            {}| 27.0|    0|     true|       false|         0|2016-11-09 16:01:00|Men's Wear|
|88025112|11509993|Geometric Pattern...|Clothes Type:Padd...|            {}| 28.0|    1|     true|       false|         0|2016-11-09 16:01:00|Men's Wear|
+--------+--------+--------------------+--------------------+--------------+-----+-----+---------+------------+----------+-------------------+----------+
only showing top 2 rows



## Build and fit a feature extraction pipeline.

### Vectorize Item Description

In [58]:
input_col = "item_description"
tokens_col = "desc_tokens"
output_col = "desc_enc"

In [59]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import Tokenizer


pipeline = Pipeline(stages=[
    Tokenizer(
        inputCol=input_col,
        outputCol=tokens_col
    )
])

items = pipeline.fit(items)\
            .transform(items)\
            .drop(input_col)

items.show(2)

+---------+--------+--------------------+--------------------+-----+-----+---------+------------+----------+-------------------+-----------+--------------------+
|   itemid|  shopid|           item_name|      item_variation|price|stock|cb_option|is_preferred|sold_count| item_creation_date|   category|         desc_tokens|
+---------+--------+--------------------+--------------------+-----+-----+---------+------------+----------+-------------------+-----------+--------------------+
|487531175|16174997|Creative Unisex F...|{40-41: 10.6, 36-...| 10.6| 5000|     true|       false|         0|2017-09-14 17:33:00|Men's Shoes|[specification:10...|
|487531172|16174997|Creative Unisex F...|{28-29: 7.9, 30-3...|  7.9| 1000|     true|       false|         0|2017-09-14 17:33:00|Men's Shoes|[specification:10...|
+---------+--------+--------------------+--------------------+-----+-----+---------+------------+----------+-------------------+-----------+--------------------+
only showing top 2 rows



In [60]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import Tokenizer
from pyspark.ml.feature import Word2Vec


pipeline = Pipeline(stages=[
    Word2Vec(
        vectorSize=5,
        seed=42,
        minCount=10,
        inputCol=tokens_col,
        outputCol=output_col
    )
])

items = pipeline.fit(items)\
            .transform(items)\
            .drop(tokens_col)

items.show(2)

+---------+--------+--------------------+---------------+-----+-----+---------+------------+----------+-------------------+---------------+--------------------+
|   itemid|  shopid|           item_name| item_variation|price|stock|cb_option|is_preferred|sold_count| item_creation_date|       category|            desc_enc|
+---------+--------+--------------------+---------------+-----+-----+---------+------------+----------+-------------------+---------------+--------------------+
|581423084|16174997|Cross Stitch Tool...| {Default: 8.0}|  8.0| 1000|     true|       false|         3|2017-10-13 13:29:00|Design & Crafts|[-0.2476372364494...|
|581423071|16174997|Cross Stitch Tool...|{Default: 11.4}| 11.4| 1000|     true|       false|         0|2017-10-13 13:29:00|Design & Crafts|[-0.1775885929167...|
+---------+--------+--------------------+---------------+-----+-----+---------+------------+----------+-------------------+---------------+--------------------+
only showing top 2 rows



### Vectorize Item Variation

In [61]:
input_col = "item_variation"
tokens_col = "var_tokens"
output_col = "var_enc"

In [62]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import Tokenizer


pipeline = Pipeline(stages=[
    Tokenizer(
        inputCol=input_col,
        outputCol=tokens_col
    )
])

items = pipeline.fit(items)\
            .transform(items)\
            .drop(input_col)

items.show(2)

+---------+--------+--------------------+-----+-----+---------+------------+----------+-------------------+--------------------+--------------------+------------------+
|   itemid|  shopid|           item_name|price|stock|cb_option|is_preferred|sold_count| item_creation_date|            category|            desc_enc|        var_tokens|
+---------+--------+--------------------+-----+-----+---------+------------+----------+-------------------+--------------------+--------------------+------------------+
|534623211|16174997|2.4G Air Mouse Wi...|  7.8| 1000|     true|       false|         1|2017-09-30 07:22:00|Computers & Perip...|[-0.1966305688373...|   [{black:, 7.8}]|
|455041448|16174997|2.4G Air Mouse Wi...| 12.0| 1000|     true|       false|         0|2017-09-01 08:55:00|Computers & Perip...|[-0.2856807188963...|[{default:, 12.0}]|
+---------+--------+--------------------+-----+-----+---------+------------+----------+-------------------+--------------------+--------------------+------

In [63]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import Tokenizer
from pyspark.ml.feature import Word2Vec


pipeline = Pipeline(stages=[
    Word2Vec(
        vectorSize=5,
        seed=42,
        minCount=1,
        inputCol=tokens_col,
        outputCol=output_col
    )
])

items = pipeline.fit(items)\
            .transform(items)\
            .drop(tokens_col)

items.show(2)

+---------+--------+--------------------+-----+-----+---------+------------+----------+-------------------+--------+--------------------+--------------------+
|   itemid|  shopid|           item_name|price|stock|cb_option|is_preferred|sold_count| item_creation_date|category|            desc_enc|             var_enc|
+---------+--------+--------------------+-----+-----+---------+------------+----------+-------------------+--------+--------------------+--------------------+
|744783814|16174997|SKMEI Mens Waterp...|14.02| 2000|     true|       false|         0|2017-12-09 15:43:00| Watches|[-0.2929523125930...|[-0.2924000592902...|
|744783812|16174997|SKMEI Mens Waterp...|13.21| 2000|     true|       false|         0|2017-12-09 15:43:00| Watches|[-0.2977395940328...|[-0.0685735863633...|
+---------+--------+--------------------+-----+-----+---------+------------+----------+-------------------+--------+--------------------+--------------------+
only showing top 2 rows



### Vectorize Item Name

In [64]:
input_col = "item_name"
tokens_col = "name_tokens"
output_col = "name_enc"

In [65]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import Tokenizer


pipeline = Pipeline(stages=[
    Tokenizer(
        inputCol=input_col,
        outputCol=tokens_col
    )
])

items = pipeline.fit(items)\
            .transform(items)\
            .drop(input_col)

items.show(2)

+---------+--------+-----+-----+---------+------------+----------+-------------------+-------------+--------------------+--------------------+--------------------+
|   itemid|  shopid|price|stock|cb_option|is_preferred|sold_count| item_creation_date|     category|            desc_enc|             var_enc|         name_tokens|
+---------+--------+-----+-----+---------+------------+----------+-------------------+-------------+--------------------+--------------------+--------------------+
|455044645|16174997| 45.7| 1000|     true|       false|         0|2017-09-01 08:57:00|Home & Living|[-0.1504048824241...|[0.53768449742347...|[escam, q8, hd, 9...|
|455044643|16174997| 45.7| 3000|     true|       false|         0|2017-09-01 08:57:00|Home & Living|[-0.1507162324350...|[0.53768449742347...|[escam, q8, hd, 9...|
+---------+--------+-----+-----+---------+------------+----------+-------------------+-------------+--------------------+--------------------+--------------------+
only showing top

In [66]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import Tokenizer
from pyspark.ml.feature import Word2Vec


pipeline = Pipeline(stages=[
    Word2Vec(
        vectorSize=5,
        seed=42,
        minCount=1,
        inputCol=tokens_col,
        outputCol=output_col
    )
])

items = pipeline.fit(items)\
            .transform(items)\
            .drop(tokens_col)

items.show(2)

+---------+--------+-----+-----+---------+------------+----------+-------------------+---------------+--------------------+--------------------+--------------------+
|   itemid|  shopid|price|stock|cb_option|is_preferred|sold_count| item_creation_date|       category|            desc_enc|             var_enc|            name_enc|
+---------+--------+-----+-----+---------+------------+----------+-------------------+---------------+--------------------+--------------------+--------------------+
|229344431|20242999| 3.16|  100|     true|       false|         0|2017-04-21 23:00:00|Health & Beauty|[0.10731946888633...|[-0.0046485066413...|[-0.3792011671596...|
|228869648|20242999|  3.2|  100|     true|       false|         0|2017-04-21 14:54:00|Health & Beauty|[0.06137828536045...|[-0.0046485066413...|[-0.3792011671596...|
+---------+--------+-----+-----+---------+------------+----------+-------------------+---------------+--------------------+--------------------+--------------------+
only

### Encode timestamp

In [67]:
from pyspark.sql.functions import year, month, dayofmonth, hour, minute

# there is no sense in "seconds" column because it is always zero
items = items.withColumn("year", year("item_creation_date")) \
        .withColumn("month", month("item_creation_date")) \
        .withColumn("day", dayofmonth("item_creation_date")) \
        .withColumn("hour", hour("item_creation_date")) \
        .withColumn("minute", minute("item_creation_date")) \
        .drop("item_creation_date")

items.show(2)

+---------+--------+-----+-----+---------+------------+----------+--------+--------------------+--------------------+--------------------+----+-----+---+----+------+
|   itemid|  shopid|price|stock|cb_option|is_preferred|sold_count|category|            desc_enc|             var_enc|            name_enc|year|month|day|hour|minute|
+---------+--------+-----+-----+---------+------------+----------+--------+--------------------+--------------------+--------------------+----+-----+---+----+------+
|744783814|16174997|14.02| 2000|     true|       false|         0| Watches|[-0.2929523125930...|[-0.2924000592902...|[-0.1417144838720...|2017|   12|  9|  15|    43|
|744783812|16174997|13.21| 2000|     true|       false|         0| Watches|[-0.2977395940328...|[-0.0685735863633...|[-0.1417144838720...|2017|   12|  9|  15|    43|
+---------+--------+-----+-----+---------+------------+----------+--------+--------------------+--------------------+--------------------+----+-----+---+----+------+
only

In [68]:
from pyspark import keyword_only
from pyspark.ml import Transformer
from pyspark.ml.param.shared import HasInputCol, HasOutputCol, Param, Params, TypeConverters
from pyspark.ml.util import DefaultParamsReadable, DefaultParamsWritable
from pyspark.sql import DataFrame
from pyspark.sql.types import DoubleType
import pyspark.sql.functions as F
import math


class TimeEncoderTransformer(
    Transformer,
    HasInputCol,
    HasOutputCol,
    DefaultParamsReadable,
    DefaultParamsWritable
):
    input_col = Param(
        Params._dummy(),
        "input_col",
        "input column name.",
        typeConverter=TypeConverters.toString
    )
    output_col_sin = Param(
        Params._dummy(),
        "output_col_sin",
        "output column name for sin wave.",
        typeConverter=TypeConverters.toString
    )
    output_col_cos = Param(
        Params._dummy(),
        "output_col_cos",
        "output column name for cos wave.",
        typeConverter=TypeConverters.toString
    )
    timestamp_part = Param(
        Params._dummy(),
        "timestamp_part",
        "part of the timestamp like month, day, hour, minute",
        typeConverter=TypeConverters.toString
    )

    @keyword_only
    def __init__(
        self,
        input_col: str = "input",
        output_col_sin: str = "sin",
        output_col_cos: str = "cos",
        timestamp_part: str = "month"
    ):
        super(TimeEncoderTransformer, self).__init__()
        self._setDefault(
            input_col=None,
            output_col_sin=None,
            output_col_cos=None,
            timestamp_part=None
        )
        kwargs = self._input_kwargs
        self.set_params(**kwargs)

    @keyword_only
    def set_params(
        self,
        input_col: str = "input",
        output_col_sin: str = "sin",
        output_col_cos: str = "cos",
        timestamp_part: str = "month"
    ):
        kwargs = self._input_kwargs
        self._set(**kwargs)

    def get_input_col(self):
        return self.getOrDefault(self.input_col)

    def get_output_col_sin(self):
        return self.getOrDefault(self.output_col_sin)

    def get_output_col_cos(self):
        return self.getOrDefault(self.output_col_cos)

    def get_timestamp_part(self):
        return self.getOrDefault(self.timestamp_part)

    def _transform(self, df: DataFrame):
        input_col = self.get_input_col()
        output_col_sin = self.get_output_col_sin()
        output_col_cos = self.get_output_col_cos()
        timestamp_part = self.get_timestamp_part()

        if timestamp_part == 'month':
            denominator = 12
        elif timestamp_part == 'day':
            denominator = 31
        elif timestamp_part in ['hour', 'minute']:
            denominator = 60
        else:
            raise Exception()

        sin_udf = F.udf(
            lambda x: math.sin(2 * math.pi * x / denominator),
            DoubleType()
        )
        cos_udf = F.udf(
            lambda x: math.cos(2 * math.pi * x / denominator),
            DoubleType()
        )

        df = df.withColumn(output_col_sin, sin_udf(F.col(input_col)))
        df = df.withColumn(output_col_cos, cos_udf(F.col(input_col)))

        return df

In [69]:
from pyspark.ml import Pipeline

input_col = "month"

pipeline = Pipeline(stages=[
    TimeEncoderTransformer(
        input_col=input_col,
        output_col_sin=f"{input_col}_sin",
        output_col_cos=f"{input_col}_cos",
        timestamp_part=input_col
    )
])

items = pipeline.fit(items)\
            .transform(items)\
            .drop(input_col)

items.show(2)

+---------+-------+-----+-----+---------+------------+----------+----------------+--------------------+--------------------+--------------------+----+---+----+------+--------------------+------------------+
|   itemid| shopid|price|stock|cb_option|is_preferred|sold_count|        category|            desc_enc|             var_enc|            name_enc|year|day|hour|minute|           month_sin|         month_cos|
+---------+-------+-----+-----+---------+------------+----------+----------------+--------------------+--------------------+--------------------+----+---+----+------+--------------------+------------------+
|821115857|3344977| 1.25|   20|    false|       false|         0|Food & Beverages|[-0.0847395360469...|[-0.0046485066413...|[-0.2669184163212...|2018|  9|  18|     8| 0.49999999999999994|0.8660254037844387|
|780592157|3344977| 1.25|   90|    false|       false|         0|Food & Beverages|[-0.2529235662598...|[0.12054621366163...|[-0.2669184163212...|2017| 23|   7|    59|-2.449

In [70]:
from pyspark.ml import Pipeline

input_col = "day"

pipeline = Pipeline(stages=[
    TimeEncoderTransformer(
        input_col=input_col,
        output_col_sin=f"{input_col}_sin",
        output_col_cos=f"{input_col}_cos",
        timestamp_part=input_col
    )
])

items = pipeline.fit(items)\
            .transform(items)\
            .drop(input_col)

items.show(2)

+---------+--------+-----+-----+---------+------------+----------+--------+--------------------+--------------------+--------------------+----+----+------+--------------------+---------+------------------+-------------------+
|   itemid|  shopid|price|stock|cb_option|is_preferred|sold_count|category|            desc_enc|             var_enc|            name_enc|year|hour|minute|           month_sin|month_cos|           day_sin|            day_cos|
+---------+--------+-----+-----+---------+------------+----------+--------+--------------------+--------------------+--------------------+----+----+------+--------------------+---------+------------------+-------------------+
|744783814|16174997|14.02| 2000|     true|       false|         0| Watches|[-0.2929523125930...|[-0.2924000592902...|[-0.1417144838720...|2017|  15|    43|-2.44929359829470...|      1.0|0.9680771188662043|-0.2506525322587204|
|744783812|16174997|13.21| 2000|     true|       false|         0| Watches|[-0.2977395940328...|

In [71]:
from pyspark.ml import Pipeline

input_col = "hour"

pipeline = Pipeline(stages=[
    TimeEncoderTransformer(
        input_col=input_col,
        output_col_sin=f"{input_col}_sin",
        output_col_cos=f"{input_col}_cos",
        timestamp_part=input_col
    )
])

items = pipeline.fit(items)\
            .transform(items)\
            .drop(input_col)

items.show(2)

+--------+--------+-----+-----+---------+------------+----------+----------+--------------------+--------------------+--------------------+----+------+-------------------+------------------+------------------+-------------------+------------------+--------------------+
|  itemid|  shopid|price|stock|cb_option|is_preferred|sold_count|  category|            desc_enc|             var_enc|            name_enc|year|minute|          month_sin|         month_cos|           day_sin|            day_cos|          hour_sin|            hour_cos|
+--------+--------+-----+-----+---------+------------+----------+----------+--------------------+--------------------+--------------------+----+------+-------------------+------------------+------------------+-------------------+------------------+--------------------+
|88025115|11509993| 27.0|    0|     true|       false|         0|Men's Wear|[0.03715182226151...|[-0.0046485066413...|[-0.4407809637486...|2016|     1|-0.5000000000000004|0.8660254037844384|

In [72]:
from pyspark.ml import Pipeline

input_col = "minute"

pipeline = Pipeline(stages=[
    TimeEncoderTransformer(
        input_col=input_col,
        output_col_sin=f"{input_col}_sin",
        output_col_cos=f"{input_col}_cos",
        timestamp_part=input_col
    )
])

items = pipeline.fit(items)\
            .transform(items)

items.show(2)

+---------+--------+-----+-----+---------+------------+----------+-------------+--------------------+--------------------+--------------------+----+------+---------+--------------------+-------------------+------------------+------------------+------------------+-------------------+------------------+
|   itemid|  shopid|price|stock|cb_option|is_preferred|sold_count|     category|            desc_enc|             var_enc|            name_enc|year|minute|month_sin|           month_cos|            day_sin|           day_cos|          hour_sin|          hour_cos|         minute_sin|        minute_cos|
+---------+--------+-----+-----+---------+------------+----------+-------------+--------------------+--------------------+--------------------+----+------+---------+--------------------+-------------------+------------------+------------------+------------------+-------------------+------------------+
|455044645|16174997| 45.7| 1000|     true|       false|         0|Home & Living|[-0.1504048

### Encode Category

In [73]:
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline

input_col = "category"

pipeline = Pipeline(stages=[
    StringIndexer(
        inputCol=input_col,
        outputCol=f"{input_col}_indexed")
])

items = pipeline.fit(items)\
            .transform(items)\
            .drop(input_col)

items.show(2)

+---------+--------+-----+-----+---------+------------+----------+--------------------+--------------------+--------------------+----+------+---------+--------------------+------------------+-------------------+-------------------+--------------------+-------------------+-------------------+----------------+
|   itemid|  shopid|price|stock|cb_option|is_preferred|sold_count|            desc_enc|             var_enc|            name_enc|year|minute|month_sin|           month_cos|           day_sin|            day_cos|           hour_sin|            hour_cos|         minute_sin|         minute_cos|category_indexed|
+---------+--------+-----+-----+---------+------------+----------+--------------------+--------------------+--------------------+----+------+---------+--------------------+------------------+-------------------+-------------------+--------------------+-------------------+-------------------+----------------+
|185120551|16503999|  8.0|    5|    false|       false|         0|[0.0

In [74]:
items.printSchema()

root
 |-- itemid: integer (nullable = true)
 |-- shopid: integer (nullable = true)
 |-- price: float (nullable = true)
 |-- stock: integer (nullable = true)
 |-- cb_option: boolean (nullable = true)
 |-- is_preferred: boolean (nullable = true)
 |-- sold_count: integer (nullable = true)
 |-- desc_enc: vector (nullable = true)
 |-- var_enc: vector (nullable = true)
 |-- name_enc: vector (nullable = true)
 |-- year: integer (nullable = true)
 |-- minute: integer (nullable = true)
 |-- month_sin: double (nullable = true)
 |-- month_cos: double (nullable = true)
 |-- day_sin: double (nullable = true)
 |-- day_cos: double (nullable = true)
 |-- hour_sin: double (nullable = true)
 |-- hour_cos: double (nullable = true)
 |-- minute_sin: double (nullable = true)
 |-- minute_cos: double (nullable = true)
 |-- category_indexed: double (nullable = false)



In [75]:
query = """CREATE EXTERNAL TABLE IF NOT EXISTS items_feat_extracted (
    itemid INTEGER,
    shopid INTEGER,
    price FLOAT,
    stock INTEGER,
    cb_option BOOLEAN,
    is_oreferred BOOLEAN,
    sold_count INTEGER,
    desc_enc ARRAY<DOUBLE>,
    var_enc ARRAY<DOUBLE>,
    name_enc ARRAY<DOUBLE>,
    year INTEGER,
    month_sin DOUBLE,
    month_cos DOUBLE,
    day_sin DOUBLE,
    day_cos DOUBLE,
    hour_sin DOUBLE,
    hour_cos DOUBLE,
    minute_sin DOUBLE,
    minute_cos DOUBLE,
    category_indexed DOUBLE
)
STORED AS TEXTFILE
LOCATION 'project/hive/warehouse/items_feat_extracted'
"""

# spark.sql(query)

In [76]:
import os

def run(command):
    return os.popen(command).read()

with open('../scripts/secrets/.hive.pass') as f:
    password = f.read()

run(f"beeline -u jdbc:hive2://hadoop-03.uni.innopolis.ru:10001 -n team6 -p {password} -e {query}")

'0: jdbc:hive2://hadoop-03.uni.innopolis.ru:10> '

In [77]:
items.limit(1)\
        .write\
        .mode("overwrite")\
        .saveAsTable("team6_projectdb.items_feat_extracted")

### Assemble Features

In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler

inputCols = [
    'itemid',
    'shopid',
    'price',
    'stock',
    'cb_option',
    'is_preferred',
    'sold_count',
    'year',
    'month_sin',
    'month_cos',
    'day_sin',
    'day_cos',
    'hour_sin',
    'hour_cos',
    'minute_sin',
    'minute_cos',
    'name_enc',
    'desc_enc',
    'var_enc'
]


pipeline = Pipeline(stages=[
    VectorAssembler(
        inputCols=inputCols,
        outputCol="features"
    )
])

items = pipeline.fit(items).transform(items)

for col in inputCols:
    items = items.drop(col)

items.show(2)

## Split the input dataset into train and test datasets.

In [27]:
(train_data, test_data) = items.randomSplit([0.8, 0.2], seed=10)

In [28]:
train_data.select("features", "category_indexed")\
    .repartition(4)\
    .write\
    .mode("overwrite")\
    .format("json")\
    .save("project/data/train")

# Run it from root directory of the repository
!hdfs dfs -get project/data/train/*.json ../data/train

test_data.select("features", "category_indexed")\
    .repartition(4)\
    .write\
    .mode("overwrite")\
    .format("json")\
    .save("project/data/test")

# Run it from root directory of the repository
!hdfs dfs -get project/data/test/*.json ../data/test

## Select two types of ML models based on the ML task specified in project.info sheet.

1. Logistic Regression
2. Decision Tree

## First model type

### Build and train the model.

In [29]:
from pyspark.ml.classification import LogisticRegression

lr = LogisticRegression(labelCol="category_indexed", featuresCol="features")
model = lr.fit(train_data)

### Predict for the test data.

In [40]:
# Make predictions on the test set
predictions = model.transform(test_data)

### Evaluate the model.

In [46]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Evaluate the model using F1-score
evaluator = MulticlassClassificationEvaluator(
    labelCol="category_indexed",
    predictionCol="prediction",
    metricName="f1"
)
f1_score = evaluator.evaluate(predictions)
print(f"Test set F1-score: {f1_score}")

Test set F1-score: 0.6388818106996959


In [53]:
categories = items.select('category_indexed').distinct()
categories.show()

+----------------+
|category_indexed|
+----------------+
|            16.0|
|            15.0|
|             3.0|
|            14.0|
|            18.0|
|             0.0|
|             8.0|
|             2.0|
|            13.0|
|            19.0|
|             9.0|
|            12.0|
|            10.0|
|             1.0|
|             5.0|
|            11.0|
|             7.0|
|             6.0|
|            17.0|
|             4.0|
+----------------+
only showing top 20 rows



In [50]:
# Evaluate the model using per-category precision
for metricLabel in range(21):
    evaluator = MulticlassClassificationEvaluator(
        labelCol="category_indexed",
        predictionCol="prediction",
        metricName="precisionByLabel",
        metricLabel=metricLabel
    )
    precisionByLabel = evaluator.evaluate(predictions)
    print(f"Test set precision for category {metricLabel}: {precisionByLabel}")

Test set precision for category 0: 0.683888447704022
Test set precision for category 1: 0.8509016826378933
Test set precision for category 2: 0.6119703062171358
Test set precision for category 3: 0.7564494971578487
Test set precision for category 4: 0.3930151338766007
Test set precision for category 5: 0.38817733990147785
Test set precision for category 6: 0.6101610493109746
Test set precision for category 7: 0.6299212598425197
Test set precision for category 8: 0.7067178502879079
Test set precision for category 9: 0.5828260173021468
Test set precision for category 10: 0.81212976022567
Test set precision for category 11: 0.31530139103554866
Test set precision for category 12: 0.1553398058252427
Test set precision for category 13: 0.3215258855585831
Test set precision for category 14: 0.5639810426540285
Test set precision for category 15: 0.25862068965517243
Test set precision for category 16: 0.0
Test set precision for category 17: 0.6
Test set precision for category 18: 0.142857142857

In [54]:
# Evaluate the model using per-category recall
for metricLabel in range(21):
    evaluator = MulticlassClassificationEvaluator(
        labelCol="category_indexed",
        predictionCol="prediction",
        metricName="recallByLabel",
        metricLabel=metricLabel
    )
    recallByLabel = evaluator.evaluate(predictions)
    print(f"Test set recall for category {metricLabel}: {recallByLabel}")

Test set recall for category 0: 0.8454311085648811
Test set recall for category 1: 0.9136352588511707
Test set recall for category 2: 0.72397490680971
Test set recall for category 3: 0.638945991403476
Test set recall for category 4: 0.28433814980900185
Test set recall for category 5: 0.463402489626556
Test set recall for category 6: 0.6575713769425371
Test set recall for category 7: 0.629147571035747
Test set recall for category 8: 0.7471287940935193
Test set recall for category 9: 0.45863900834809007
Test set recall for category 10: 0.836563494359271
Test set recall for category 11: 0.11639549436795996
Test set recall for category 12: 0.047653429602888084
Test set recall for category 13: 0.09515570934256055
Test set recall for category 14: 0.1631644004944376
Test set recall for category 15: 0.029469548133595286
Test set recall for category 16: 0.0
Test set recall for category 17: 0.009592326139088728
Test set recall for category 18: 0.0078125
Test set recall for category 19: 0.0
Test 

### Specify at least 2 hyperparameters for it and the settings of grid search and cross validation.

1. RegParam
2. ElasticNetParam

### Optimize its hyperparameters using cross validation and grid search on the training data only.

In [50]:
from pyspark.ml.tuning import ParamGridBuilder

paramGrid = ParamGridBuilder() \
    .addGrid(lr.regParam, [0.0, 0.5, 1.0]) \
    .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0]) \
    .build()

In [51]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(
    labelCol="category_indexed",
    predictionCol="prediction",
    metricName="f1"
)

In [52]:
from pyspark.ml.tuning import CrossValidator

# Create the CrossValidator
cv = CrossValidator(
    estimator=lr,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=3
)

In [53]:
# Fit the model using cross-validation on the training data
cvModel = cv.fit(train_data)

In [54]:
from pyspark.sql.types import StructType, StructField, StringType, DoubleType

schema = StructType([
    StructField("parameter", StringType(), False),
    StructField("value", DoubleType(), False)
])

data = [
    ("RegParam", cvModel.bestModel.getRegParam()),
    ("ElasticNetParam", cvModel.bestModel.getElasticNetParam())
]

df = spark.createDataFrame(data, schema)

df.coalesce(1)\
    .write\
    .mode("overwrite")\
    .format("csv")\
    .save("project/output/model1_hyperparameters")

!rm -rf ../output/model1_hyperparameters 
!mkdir ../output/model1_hyperparameters
!hdfs dfs -get project/output/model1_hyperparameters/* ../output/model1_hyperparameters/

mkdir: cannot create directory ‘../output/model1_hyperparameters’: File exists
get: `../output/model1_hyperparameters/_SUCCESS': File exists


### Select the best model (model1) from grid search.

In [55]:
# Get the best model
bestModel = cvModel.bestModel

### Save the model1 to HDFS in location like project/models/model1 and later put it in models/model1 folder in the repository.

In [36]:
bestModel.write().overwrite().save("project/models/model1")

!hdfs dfs -get project/models/model1 ../models/model1

### Predict for the test data using the model1.

In [56]:
# Make predictions on the test set
predictions = bestModel.transform(test_data)

### Save the prediction results in HDFS in a CSV file like project/output/model1_predictions and later save it in output/model1_predictions.csv folder in the repository…

In [59]:
predictions.select("category_indexed", "prediction")\
    .repartition(4)\
    .write\
    .mode("overwrite")\
    .format("csv")\
    .save("project/output/model1_predictions")

!rm -rf ../output/model1_predictions
!mkdir ../output/model1_predictions
!hdfs dfs -get project/output/model1_predictions/* ../output/model1_predictions

### Evaluate the best model (model1) on the test data.

In [60]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Make predictions on the test set
predictions = bestModel.transform(test_data)

# Evaluate the model using F1-score
evaluator = MulticlassClassificationEvaluator(
    labelCol="category_indexed",
    predictionCol="prediction",
    metricName="f1"
)
f1_score_1 = evaluator.evaluate(predictions)
print(f"Test set F1-score: {f1_score_1}")

Test set F1-score: 0.6361978263932841


## Second model type.

### Build and train the model.

In [61]:
from pyspark.ml.classification import DecisionTreeClassifier

dt = DecisionTreeClassifier(labelCol="category_indexed", featuresCol="features")
model = dt.fit(train_data)

### Predict for the test data.

In [62]:
# Make predictions on the test set
predictions = model.transform(test_data)

### Evaluate the model.

In [63]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Evaluate the model using F1-score
evaluator = MulticlassClassificationEvaluator(
    labelCol="category_indexed",
    predictionCol="prediction",
    metricName="f1"
)
f1_score = evaluator.evaluate(predictions)
print(f"Test set F1-score: {f1_score}")

Test set F1-score: 0.518044828295119


### Specify at least 2 hyperparameters for it and the settings of grid search and cross validation.

1. MaxDepth
2. MaxInfoGain

### Optimize its hyperparameters using cross validation and grid search on the training data only.

In [64]:
from pyspark.ml.tuning import ParamGridBuilder

paramGrid = ParamGridBuilder() \
    .addGrid(dt.maxDepth, [2, 5, 10]) \
    .addGrid(dt.minInfoGain, [0.0, 0.01, 0.1]) \
    .build()

In [65]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(
    labelCol="category_indexed",
    predictionCol="prediction",
    metricName="f1"
)

In [66]:
from pyspark.ml.tuning import CrossValidator

# Create the CrossValidator
cv = CrossValidator(
    estimator=dt,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=3
)

In [67]:
# Fit the model using cross-validation on the training data
cvModel = cv.fit(train_data)

In [69]:
from pyspark.sql.functions import col

data = [
    ("MaxDepth", cvModel.bestModel.getMaxDepth()),
    ("MinInfoGain", cvModel.bestModel.getMinInfoGain())
]

# Convert integer values to double
data = [(k, float(v)) for k, v in data]

df = spark.createDataFrame(data, schema)

df.coalesce(1)\
    .write\
    .mode("overwrite")\
    .format("csv")\
    .save("project/output/model2_hyperparameters")

!rm -rf ../output/model2_hyperparameters 
!mkdir ../output/model2_hyperparameters
!hdfs dfs -get project/output/model2_hyperparameters/* ../output/model2_hyperparameters/

### Select the best model (model2) from grid search.

In [70]:
# Get the best model
bestModel = cvModel.bestModel

### Save the model2 to HDFS in location like project/models/model2 and later put it in models/model2 folder in the repository.

In [56]:
bestModel.write().overwrite().save("project/models/model2")

!hdfs dfs -get project/models/model2 ../models/model2

### Predict for the test data using the model2.

In [71]:
# Make predictions on the test set
predictions = bestModel.transform(test_data)

### Save the prediction results in HDFS in a CSV file like project/output/model2_predictions and later save it in output/model2_predictions.csv folder in the repository.

In [72]:
predictions.select("category_indexed", "prediction")\
    .repartition(4)\
    .write\
    .mode("overwrite")\
    .format("csv")\
    .save("project/output/model2_predictions")

!rm -rf ../output/model2_predictions 
!mkdir ../output/model2_predictions
!hdfs dfs -get project/output/model2_predictions/* ../output/model2_predictions

### Evaluate the best model (model2) on the test data.

In [73]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Make predictions on the test set
predictions = bestModel.transform(test_data)

# Evaluate the model using F1-score
evaluator = MulticlassClassificationEvaluator(
    labelCol="category_indexed",
    predictionCol="prediction",
    metricName="f1"
)
f1_score_2 = evaluator.evaluate(predictions)
print(f"Test set F1-score: {f1_score_2}")

Test set F1-score: 0.6746254145968918


## Compare the models (model1, model2) on the test data.

In [74]:
from pyspark.sql.types import StructType, StructField, StringType, DoubleType

schema = StructType([
    StructField("model", StringType(), False),
    StructField("f1_score", DoubleType(), False)
])

data = [
    ("Logistic Regression", f1_score_1),
    ("Decision Tree", f1_score_2)
]

df = spark.createDataFrame(data, schema)

In [75]:
df.coalesce(1)\
    .write\
    .mode("overwrite")\
    .format("csv")\
    .save("project/output/evaluation")

!rm -rf ../output/comparison 
!mkdir ../output/comparison
!hdfs dfs -get project/output/evaluation/* ../output/comparison