# Обучение моделей

## Загрузка библиотек

In [1]:
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline

from pyspark.ml.feature import VectorAssembler

from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.regression import GBTRegressor

In [2]:
# Start the Spark session for this notebook, or attach to one that already exists.
spark = (
    SparkSession.builder
    .appName("PySparkML")
    .getOrCreate()
)

In [3]:
# Bare expression as the last line of the cell: the notebook displays the session object.
spark

## Загрузка данных

In [5]:
# Read the pre-split train and test sets from Parquet files (paths are relative
# to the notebook's working directory).
train_data, test_data = (
    spark.read.parquet(parquet_path)
    for parquet_path in ('train.parquet', 'test.parquet')
)

In [6]:
# Preview the first five rows of the training set to check the schema and value ranges.
train_data.show(n=5)

+-----+---------------------+---------+------+------+----------------+---------+-----------------+
|ad_id|target_audience_count|has_video|is_cpm|is_cpc|         ad_cost|day_count|              ctr|
+-----+---------------------+---------+------+------+----------------+---------+-----------------+
|    1|     10707.2440058622|        1|     1|     0|201.829292651124|       15|0.431740082807281|
|    5|     10643.3872649482|        1|     1|     0|192.577221699704|       15|0.809264519216201|
|    6|     11418.7085911347|        1|     1|     0|204.104562956739|       11|0.909738306804039|
|    7|     10109.3278687796|        1|     1|     0|194.255798599684|       12|0.941221039774456|
|    8|     10665.1119991977|        1|     1|     0|202.658042557742|       14|0.986790019690954|
+-----+---------------------+---------+------+------+----------------+---------+-----------------+
only showing top 5 rows



## Пайплайны

Подготовка общих частей пайплайнов.

In [7]:
# Feature columns: everything except the row identifier ('ad_id') and the label ('ctr').
# The list is built by filtering train_data.columns so that it follows the DataFrame's
# column order. A set difference would yield an arbitrary order, which would make the
# layout of the assembled vector (and anything that depends on feature positions)
# vary between kernel sessions.
feature_cols = [col for col in train_data.columns if col not in {'ad_id', 'ctr'}]

# Shared first pipeline stage: packs the feature columns into a single 'features' vector.
features = VectorAssembler(inputCols=feature_cols, outputCol='features')

# Shared metric: RMSE between the 'ctr' label and the 'prediction' column.
# Used both for model selection inside cross-validation and for the final test score.
evaluator = RegressionEvaluator(labelCol='ctr', predictionCol='prediction', metricName='rmse')

### Дерево решений

In [13]:
%%time

# Base estimator: a single regression tree predicting 'ctr' from the 'features' vector.
dt = DecisionTreeRegressor(featuresCol='features',
                           labelCol='ctr',
                           predictionCol='prediction',
                           seed=42)

# Hyper-parameter grid: range(4, 11, 2) -> maxDepth in {4, 6, 8, 10}.
dt_grid = (ParamGridBuilder()
           .addGrid(dt.maxDepth, range(4, 11, 2))
           .build())

# 3-fold cross-validation over the grid, scored with the shared RMSE evaluator.
# The explicit seed pins the random fold assignment so that model selection is
# reproducible across kernel restarts, matching the seed already set on the estimator.
dt_cv = CrossValidator(estimator=dt,
                       estimatorParamMaps=dt_grid,
                       evaluator=evaluator,
                       numFolds=3,
                       seed=42)

# Fit the feature assembler followed by the cross-validated tree on the training data.
# Note: despite its name, dt_pipeline holds the fitted model returned by .fit().
dt_pipeline = Pipeline(stages=[features, dt_cv]).fit(train_data)

CPU times: user 324 ms, sys: 98.9 ms, total: 423 ms
Wall time: 10.1 s


In [9]:
# Apply the fitted decision-tree pipeline to the test set and score it
# with the shared RMSE evaluator.
dt_pred = dt_pipeline.transform(dataset=test_data)
dt_rmse = evaluator.evaluate(dataset=dt_pred)

print('RMSE дерева решений:', dt_rmse)

RMSE дерева решений: 0.25441025590338934


### Случайный лес

In [12]:
%%time

# Base estimator: a random forest predicting 'ctr' from the 'features' vector.
rf = RandomForestRegressor(featuresCol='features',
                           labelCol='ctr',
                           predictionCol='prediction',
                           seed=42)

# Hyper-parameter grid (4 depths x 2 forest sizes = 8 combinations):
#   range(4, 11, 2)  -> maxDepth in {4, 6, 8, 10}
#   range(1, 21, 10) -> numTrees in {1, 11}
# NOTE(review): numTrees=1 is a single-tree forest; confirm that {1, 11} is the
# intended set rather than e.g. {10, 20}.
rf_grid = (ParamGridBuilder()
           .addGrid(rf.maxDepth, range(4, 11, 2))
           .addGrid(rf.numTrees, range(1, 21, 10))
           .build())

# 3-fold cross-validation over the grid, scored with the shared RMSE evaluator.
# The explicit seed pins the random fold assignment so that model selection is
# reproducible across kernel restarts, matching the seed already set on the estimator.
rf_cv = CrossValidator(estimator=rf,
                       estimatorParamMaps=rf_grid,
                       evaluator=evaluator,
                       numFolds=3,
                       seed=42)

# Fit the feature assembler followed by the cross-validated forest on the training data.
# Note: despite its name, rf_pipeline holds the fitted model returned by .fit().
rf_pipeline = Pipeline(stages=[features, rf_cv]).fit(train_data)

CPU times: user 737 ms, sys: 212 ms, total: 949 ms
Wall time: 32.5 s


In [10]:
# Apply the fitted random-forest pipeline to the test set and score it
# with the shared RMSE evaluator.
rf_pred = rf_pipeline.transform(dataset=test_data)
rf_rmse = evaluator.evaluate(dataset=rf_pred)

print('RMSE случайного леса:', rf_rmse)

RMSE случайного леса: 0.2554683186368623


### Градиентный бустинг

In [11]:
%%time

# Base estimator: gradient-boosted trees predicting 'ctr' from the 'features' vector.
# The seed is set explicitly, as for the decision tree and the random forest, so that
# all three models are trained under the same reproducibility settings.
gbt = GBTRegressor(featuresCol='features',
                   labelCol='ctr',
                   predictionCol='prediction',
                   seed=42)

# Hyper-parameter grid (3 combinations):
#   range(2, 11, 4) -> maxDepth in {2, 6, 10}
#   stepSize is fixed at 0.1 (single-value grid entry).
gbt_grid = (ParamGridBuilder()
            .addGrid(gbt.maxDepth, range(2, 11, 4))
            .addGrid(gbt.stepSize, [0.1])
            .build())

# 3-fold cross-validation over the grid, scored with the shared RMSE evaluator.
# The explicit seed pins the random fold assignment so that model selection is
# reproducible across kernel restarts.
gbt_cv = CrossValidator(estimator=gbt,
                        estimatorParamMaps=gbt_grid,
                        evaluator=evaluator,
                        numFolds=3,
                        seed=42)

# Fit the feature assembler followed by the cross-validated booster on the training data.
# Note: despite its name, gbt_pipeline holds the fitted model returned by .fit().
gbt_pipeline = Pipeline(stages=[features, gbt_cv]).fit(train_data)

CPU times: user 276 ms, sys: 87 ms, total: 363 ms
Wall time: 1min 43s


In [12]:
# Apply the fitted gradient-boosting pipeline to the test set and score it
# with the shared RMSE evaluator.
gbt_pred = gbt_pipeline.transform(test_data)
gbt_rmse = evaluator.evaluate(gbt_pred)

# The label names the gradient-boosting model so that this figure cannot be
# mistaken for the random-forest RMSE.
print('RMSE градиентного бустинга:', gbt_rmse)

RMSE случайного леса: 0.2543837508625071
