## Create session

In [1]:
import os
import sys
import numpy as np

# Select the Python interpreter for PySpark and the Spark installation to use.
os.environ.update({
    "PYSPARK_PYTHON": "/opt/anaconda/envs/bd9/bin/python",
    "SPARK_HOME": "/usr/hdp/current/spark2-client",
})

spark_home = os.environ.get("SPARK_HOME")
if not spark_home:
    raise ValueError("SPARK_HOME environment variable is not set")

# Put the PySpark sources and the bundled Py4J archive at the front of sys.path
# (Py4J first, then the PySpark directory) so the imports below resolve.
_spark_python_dir = os.path.join(spark_home, "python")
_py4j_archive = os.path.join(spark_home, "python/lib/py4j-0.10.7-src.zip")
sys.path[:0] = [_py4j_archive, _spark_python_dir]

import pyspark
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Resource sizing for the driver and executors; a threshold of -1 turns off
# automatic broadcast joins.
spark_settings = [
    ("spark.driver.cores", "2"),
    ("spark.driver.memory", "1g"),
    ("spark.executor.instances", "4"),
    ("spark.executor.cores", "5"),
    ("spark.executor.memory", "2g"),
    ("spark.executor.memoryOverhead", "2g"),
    ("spark.sql.autoBroadcastJoinThreshold", -1),
]

conf = SparkConf().setAppName("artem.spitsin_lab03")
for setting_key, setting_value in spark_settings:
    conf.set(setting_key, setting_value)

# Reuse an existing session if one is already running, otherwise start a new one.
ss = (
    SparkSession.builder
    .appName("artem.spitsin_lab03")
    .config(conf=conf)
    .getOrCreate()
)

sc = ss.sparkContext

ss

## Functions

In [2]:
def create_agg_feature(data, groupby_cols:list, agg_expression):
    """Aggregate ``data`` per group with a single expression.

    Args:
        data: Frame-like object exposing ``groupby(*cols)``.
        groupby_cols: Column names to group by; unpacked into ``groupby``.
        agg_expression: Aggregate expression handed to ``.agg`` (alias it to
            control the name of the resulting column).

    Returns:
        Whatever ``data.groupby(*groupby_cols).agg(agg_expression)`` returns.
    """
    grouped = data.groupby(*groupby_cols)
    return grouped.agg(agg_expression)

def multiple_join_features(core_data, data_features:list, how:str="left"):
    """Join every feature frame onto ``core_data``, one after another.

    The join key is chosen per feature frame: ``"item_id"`` when that column
    is present in the frame, ``"user_id"`` otherwise.

    Args:
        core_data: Base frame that the features are attached to.
        data_features: Feature frames, joined in the order given.
        how: Join type passed through to ``join`` (default ``"left"``).

    Returns:
        The result of the chained joins; ``core_data`` itself when
        ``data_features`` is empty.
    """
    joined = core_data
    for feature_frame in data_features:
        if "item_id" in feature_frame.columns:
            key = "item_id"
        else:
            key = "user_id"
        joined = joined.join(feature_frame, on=key, how=how)

    return joined

def feature_importance(model, name_features:list) -> "spark.DataFrame":
    """Build a table of feature importances, most important first.

    Args:
        model: Fitted model exposing an indexable ``featureImportances``.
        name_features: Feature names; position ``i`` is paired with
            ``model.featureImportances[i]``.

    Returns:
        A DataFrame created through the notebook-level session ``ss`` with
        columns ``feature`` and ``importance``, sorted by ``importance``
        in descending order.
    """
    importances = model.featureImportances

    rows = []
    for position, feature_name in enumerate(name_features):
        rows.append([feature_name, float(importances[position])])

    importance_frame = ss.createDataFrame(
        data=rows,
        schema=["feature", "importance"]
    )
    return importance_frame.sort(F.col("importance").desc())

@F.pandas_udf(returnType=DoubleType())
def sigmoid(values):
    """Element-wise logistic function ``1 / (1 + exp(-x))`` as a pandas UDF.

    Args:
        values: Batch of numeric values (the UDF is applied to a numeric column).

    Returns:
        The transformed batch as float64, matching the declared ``DoubleType``
        return type so no precision is lost to an intermediate float32 cast.
    """
    return (1 / (1 + np.exp(-values))).astype("float64")

## Loading data

In [3]:
base_path = "/labs/slaba03"

# Both files are comma-separated with a header row; Spark infers column types.
csv_options = dict(header=True, sep=",", inferSchema="true")

train_data = ss.read.csv(f"{base_path}/laba03_train.csv", **csv_options)
test_data  = ss.read.csv(f"{base_path}/laba03_test.csv", **csv_options)

In [4]:
# Peek at the first rows and the inferred schema of the training set.
train_data.show(3)
train_data.printSchema()

+-------+-------+--------+
|user_id|item_id|purchase|
+-------+-------+--------+
|   1654|  74107|       0|
|   1654|  89249|       0|
|   1654|  99982|       0|
+-------+-------+--------+
only showing top 3 rows

root
 |-- user_id: integer (nullable = true)
 |-- item_id: integer (nullable = true)
 |-- purchase: integer (nullable = true)



In [5]:
# Peek at the first rows and the inferred schema of the test set.
test_data.show(3)
test_data.printSchema()

+-------+-------+--------+
|user_id|item_id|purchase|
+-------+-------+--------+
|   1654|  94814|    null|
|   1654|  93629|    null|
|   1654|   9980|    null|
+-------+-------+--------+
only showing top 3 rows

root
 |-- user_id: integer (nullable = true)
 |-- item_id: integer (nullable = true)
 |-- purchase: string (nullable = true)



## Feature engineering

In [6]:
# Features based on laba03_train: total purchases and purchase ratio,
# computed once per user and once per item.
agg_expressions = [
    (F.sum("purchase"), "num_purchases"),
    (F.sum("purchase") / F.count("purchase"), "ratio_purchases")
]

calc_agg_features = []
agg_feature_names = []
for expression, name_feature in agg_expressions:
    for group_key, suffix in (("user_id", "by_user"), ("item_id", "by_item")):
        column_name = f"{name_feature}_{suffix}"
        agg_feature_names.append(column_name)
        calc_agg_features.append(
            create_agg_feature(train_data, [group_key], expression.alias(column_name))
        )

# Join features.
# A left join leaves nulls for any user/item that has no rows in train
# (possible for test); fill them with 0 ("no purchases observed") so that
# null feature values never reach the downstream feature vector.
train = multiple_join_features(train_data, calc_agg_features, how="left")\
        .fillna(0, subset=agg_feature_names)
test  = multiple_join_features(test_data, calc_agg_features, how="left")\
        .fillna(0, subset=agg_feature_names)

## Preparing data for the model

In [7]:
# Only the purchase-count features are fed to the model.
name_features = ["num_purchases_by_user", "num_purchases_by_item"]

# Packs the selected columns into a single vector column named "features".
assembler = VectorAssembler(outputCol="features", inputCols=name_features)

In [8]:
%%time

train_sample = assembler.transform(train).select("features", "purchase").cache()
test_sample  = assembler.transform(test).select("user_id", "item_id", "features").cache()

train.count(), test.count()

CPU times: user 14.5 ms, sys: 0 ns, total: 14.5 ms
Wall time: 1min 15s


## Training model

In [9]:
%%time

params = dict(
    featuresCol="features",
    labelCol="purchase",
    predictionCol="raw_predictions",
    numTrees=100,
    maxDepth=5
)

# Training of regressor shows very good results :D
model = RandomForestRegressor(**params).fit(train_sample)

CPU times: user 18.4 ms, sys: 4.87 ms, total: 23.3 ms
Wall time: 2min 20s


In [10]:
# One record per feature, without truncating long values.
importance_table = feature_importance(model, name_features)
importance_table.show(vertical=True, truncate=False)

-RECORD 0---------------------------
 feature    | num_purchases_by_user 
 importance | 0.5037890713161708    
-RECORD 1---------------------------
 feature    | num_purchases_by_item 
 importance | 0.4962109286838291    



## Evaluating

In [11]:
# NOTE: this scores the same rows the model was fitted on, so the ROC AUC
# below is a training metric, not a hold-out estimate.
train_predictions = model.transform(train_sample)

evaluator_params = dict(
    labelCol="purchase",
    rawPredictionCol="raw_predictions",
    metricName="areaUnderROC"
)
evaluator = BinaryClassificationEvaluator(**evaluator_params)

evaluator.evaluate(train_predictions)

0.929630472361225

## Submitting

In [12]:
%%time

predictions = model.transform(test_sample)

predictions\
.select(
   "user_id", 
   "item_id",
   sigmoid("raw_predictions").alias("purchase")
)\
.orderBy(["user_id", "item_id"], ascending=True)\
.toPandas()\
.to_csv("lab03.csv", header=True)

CPU times: user 27.3 s, sys: 1.37 s, total: 28.6 s
Wall time: 1min 35s


## Stopping session

In [13]:
# Drop everything cached in this session, then shut the session down.
ss.catalog.clearCache()
ss.stop()