In [1]:
import io
import sys
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, datediff
from pyspark.sql import functions as F

In [2]:
# Relative weights passed to randomSplit, in (train, test) order.
TRAIN_TEST_SPLIT = [
    0.75,  # train share
    0.25,  # test share
]

In [3]:
# Create the Spark session for this notebook, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName('PySparkTasks')
    .getOrCreate()
)

In [4]:
# First, load the source data from the parquet file.
df = (
    spark.read
    .parquet('clickstream.parquet')
)

In [5]:
# Preview the first rows of the loaded data.
df.show(n=5)

+----------+-------------------+-----+--------+------+---------------+-----------------+------------+-------+---------+---------------------+
|      date|               time|event|platform| ad_id|client_union_id|compaign_union_id|ad_cost_type|ad_cost|has_video|target_audience_count|
+----------+-------------------+-----+--------+------+---------------+-----------------+------------+-------+---------+---------------------+
|2019-04-01|2019-03-31 21:00:48| view| android| 45061|          34734|            45061|         CPM|  200.6|        0|              1955269|
|2019-04-01|2019-03-31 21:00:48| view|     web|121288|         121288|           121288|         CPM|  187.4|        0|               232011|
|2019-04-01|2019-03-31 21:01:03| view| android|102737|         102535|           102564|         CPC|   60.7|        0|                 4410|
|2019-04-01|2019-03-31 21:01:03| view| android|107564|         106914|           107564|         CPM|  217.3|        0|                62711|
|2019-

In [6]:
# Create new features: 0/1 indicator columns for the ad cost type
# (CPM / CPC) and for the event kind (view / click).
ndf = (
    df
    .withColumn('is_cpm', F.when(df['ad_cost_type'] == 'CPM', 1).otherwise(0))
    .withColumn('is_cpc', F.when(df['ad_cost_type'] == 'CPC', 1).otherwise(0))
    .withColumn('view', F.when(df['event'] == 'view', 1).otherwise(0))
    .withColumn('click', F.when(df['event'] == 'click', 1).otherwise(0))
)

In [7]:
# Preview the data with the new indicator columns appended.
ndf.show(n=5)

+----------+-------------------+-----+--------+------+---------------+-----------------+------------+-------+---------+---------------------+------+------+----+-----+
|      date|               time|event|platform| ad_id|client_union_id|compaign_union_id|ad_cost_type|ad_cost|has_video|target_audience_count|is_cpm|is_cpc|view|click|
+----------+-------------------+-----+--------+------+---------------+-----------------+------------+-------+---------+---------------------+------+------+----+-----+
|2019-04-01|2019-03-31 21:00:48| view| android| 45061|          34734|            45061|         CPM|  200.6|        0|              1955269|     1|     0|   1|    0|
|2019-04-01|2019-03-31 21:00:48| view|     web|121288|         121288|           121288|         CPM|  187.4|        0|               232011|     1|     0|   1|    0|
|2019-04-01|2019-03-31 21:01:03| view| android|102737|         102535|           102564|         CPC|   60.7|        0|                 4410|     0|     1|   1|    0

In [8]:
ndf_grouped = ndf.groupBy('ad_id')\
                 .agg(
                        F.max(ndf.target_audience_count),
                        F.max(ndf.has_video),
                        F.max(ndf.is_cpm),
                        F.max(ndf.is_cpc),
                        F.sum(ndf.ad_cost),
                        F.countDistinct(ndf.date),
                        F.sum(ndf.view),
                        F.sum(ndf.click)
                      )\
            .withColumnRenamed('max(target_audience_count)', 'target_audience_count')\
            .withColumnRenamed('max(has_video)', 'has_video')\
            .withColumnRenamed('max(is_cpm)', 'is_cpm')\
            .withColumnRenamed('max(is_cpc)', 'is_cpc')\
            .withColumnRenamed('sum(ad_cost)', 'ad_cost')\
            .withColumnRenamed('count(date)', 'day_count')\
            .withColumnRenamed('sum(view)', 'views')\
            .withColumnRenamed('sum(click)', 'clicks')\
            .withColumn('CTR', F.coalesce(col('clicks') / col('views'), col('views')))\
            .drop('clicks', 'views')

In [9]:
ndf_grouped.where(col('max(is_cpc)') == 1).show(5)

+------+---------------------+---------+------+------+------------------+---------+-------------------+
| ad_id|target_audience_count|has_video|is_cpm|is_cpc|           ad_cost|day_count|                CTR|
+------+---------------------+---------+------+------+------------------+---------+-------------------+
|114166|                 7350|        0|     0|     1|            1326.0|        2|                0.0|
| 18759|               625481|        0|     0|     1|4541.0000000000055|        2|                0.0|
| 47178|                52401|        0|     0|     1|11335.300000000037|        2|0.03137254901960784|
| 19912|                18860|        0|     0|     1|1316.5999999999995|        2|               0.16|
|117364|              7816835|        0|     0|     1|             355.6|        2|                0.0|
+------+---------------------+---------+------+------+------------------+---------+-------------------+
only showing top 5 rows



In [10]:
# Number of rows in the aggregated DataFrame.
ndf_grouped.count()

965

In [11]:
# Split the aggregated data into train and test parts using the
# configured weights; the fixed seed is passed to randomSplit.
train, test = ndf_grouped.randomSplit(
    weights=TRAIN_TEST_SPLIT,
    seed=42,
)

In [12]:
# Preview a few rows of the train split.
train.show(n=4)

+-----+---------------------+---------+------+------+-----------------+---------+--------------------+
|ad_id|target_audience_count|has_video|is_cpm|is_cpc|          ad_cost|day_count|                 CTR|
+-----+---------------------+---------+------+------+-----------------+---------+--------------------+
|33412|                 7195|        0|     1|     0|7493.500000000003|        2|                 0.0|
|33602|              3277386|        0|     1|     0|91834.20000000055|        2|             0.01875|
|47217|                 7121|        0|     1|     0|5188.799999999999|        2|0.045454545454545456|
|43921|                 7807|        0|     1|     0|4327.399999999999|        1|                 0.0|
+-----+---------------------+---------+------+------+-----------------+---------+--------------------+
only showing top 4 rows



In [13]:
# Preview a few rows of the test split.
test.show(n=4)

+------+---------------------+---------+------+------+------------------+---------+--------------------+
| ad_id|target_audience_count|has_video|is_cpm|is_cpc|           ad_cost|day_count|                 CTR|
+------+---------------------+---------+------+------+------------------+---------+--------------------+
| 40515|                11533|        0|     1|     0|30931.199999999943|        2| 0.02857142857142857|
| 20596|              1106999|        0|     1|     0|135707.99999999907|        2|  0.0062402496099844|
|119169|                35019|        0|     1|     0|129866.60000000098|        2|0.007861635220125786|
| 15162|             32214433|        0|     1|     0|           16605.0|        2|                 0.0|
+------+---------------------+---------+------+------+------------------+---------+--------------------+
only showing top 4 rows



In [14]:
# Output directory; the train and test splits are written beneath it.
target_path = 'result'

In [15]:
# Write the train split from a single partition. Overwrite any previous
# output: the default save mode raises if the path already exists, which
# would make the notebook fail on a second "Restart & Run All".
train.coalesce(1).write.mode('overwrite').parquet(f'{target_path}/train')

In [16]:
# Write the test split from a single partition. Overwrite any previous
# output: the default save mode raises if the path already exists, which
# would make the notebook fail on a second "Restart & Run All".
test.coalesce(1).write.mode('overwrite').parquet(f'{target_path}/test')