In [1]:
OUTPUT_BUCKET_FOLDER = "gs://kaggle-ocp-data/output/"
DATA_BUCKET_FOLDER = "gs://kaggle-ocp-data/data/"

In [2]:
from IPython.display import display
from pyspark.sql.types import *
import pyspark.sql.functions as F
from pyspark.ml.linalg import Vectors, SparseVector, VectorUDT
import numpy as np
import scipy.sparse
import math
import datetime
import time
import itertools
import pickle

In [3]:
import random
random.seed(42)

In [4]:
import pandas as pd
%matplotlib inline

# UDFs

In [5]:
def date_time_to_unix_epoch(date_time):
    return int(time.mktime(date_time.timetuple()))

def date_time_to_unix_epoch_treated(dt):
    if dt != None:
        try:
            epoch = date_time_to_unix_epoch(dt)
            return epoch
        except Exception as e:
            print("Error processing dt={}".format(dt), e)
            return 0
    else:
        return 0

In [6]:
timestamp_null_to_zero_int_udf = F.udf(lambda x: date_time_to_unix_epoch_treated(x), IntegerType())

In [7]:
INT_DEFAULT_NULL_VALUE = -1
int_null_to_minus_one_udf = F.udf(lambda x: x if x != None else INT_DEFAULT_NULL_VALUE, IntegerType())
int_list_null_to_empty_list_udf = F.udf(lambda x: x if x != None else [], ArrayType(IntegerType()))
float_list_null_to_empty_list_udf = F.udf(lambda x: x if x != None else [], ArrayType(FloatType()))
str_list_null_to_empty_list_udf = F.udf(lambda x: x if x != None else [], ArrayType(StringType()))

In [8]:
def truncate_day_from_timestamp(ts):
    return int(ts / 1000 / 60 / 60 / 24)

In [9]:
truncate_day_from_timestamp_udf = F.udf(lambda ts: truncate_day_from_timestamp(ts), IntegerType())

In [10]:
extract_country_udf = F.udf(lambda geo: geo.strip()[:2] if geo != None else '', StringType())

In [11]:
extract_country_state_udf = F.udf(lambda geo: geo.strip()[:5] if geo != None else '', StringType())

In [12]:
list_len_udf = F.udf(lambda x: len(x) if x != None else 0, IntegerType())

In [13]:
def convert_odd_timestamp(timestamp_ms_relative):
    TIMESTAMP_DELTA=1465876799998
    return datetime.datetime.fromtimestamp((int(timestamp_ms_relative)+TIMESTAMP_DELTA)//1000)

# Loading Files

### Loading UTC/BST for each country and US / CA states (local time)

In [14]:
country_utc_dst_df = pd.read_csv('country_codes_utc_dst_tz_delta.csv', keep_default_na=False)

In [30]:
country_utc_dst_df.head()

Unnamed: 0,country_code,utc_dst_time_offset_cleaned
0,AX,3.0
1,AF,4.5
2,AL,2.0
3,DZ,1.0
4,AD,2.0


In [31]:
country_utc_dst_df.describe()

Unnamed: 0,utc_dst_time_offset_cleaned
count,246.0
mean,1.861789
std,5.021302
min,-11.0
25%,-1.5
50%,2.0
75%,4.0
max,13.0


In [32]:
country_utc_dst_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 246 entries, 0 to 245
Data columns (total 2 columns):
country_code                   246 non-null object
utc_dst_time_offset_cleaned    246 non-null float64
dtypes: float64(1), object(1)
memory usage: 3.9+ KB


In [15]:
countries_utc_dst_dict = dict(zip(country_utc_dst_df['country_code'].tolist(), country_utc_dst_df['utc_dst_time_offset_cleaned'].tolist()))
countries_utc_dst_broad = sc.broadcast(countries_utc_dst_dict) 

In [34]:
type(countries_utc_dst_broad) # 나라별 현지시간 spark broadcast 선언

pyspark.broadcast.Broadcast

In [16]:
us_states_utc_dst_df = pd.read_csv('us_states_abbrev_bst.csv', keep_default_na=False)

In [36]:
us_states_utc_dst_df.head()

Unnamed: 0,state_abb,utc_dst_time_offset_cleaned
0,AL,-5.0
1,AK,-8.0
2,AZ,-7.0
3,AR,-5.0
4,CA,-7.0


In [37]:
us_states_utc_dst_df.describe()

Unnamed: 0,utc_dst_time_offset_cleaned
count,51.0
mean,-5.039216
std,1.280012
min,-10.0
25%,-5.5
50%,-5.0
75%,-4.0
max,-4.0


In [38]:
us_states_utc_dst_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51 entries, 0 to 50
Data columns (total 2 columns):
state_abb                      51 non-null object
utc_dst_time_offset_cleaned    51 non-null float64
dtypes: float64(1), object(1)
memory usage: 888.0+ bytes


In [17]:
us_states_utc_dst_dict = dict(zip(us_states_utc_dst_df['state_abb'].tolist(), us_states_utc_dst_df['utc_dst_time_offset_cleaned'].tolist()))
us_states_utc_dst_broad = sc.broadcast(us_states_utc_dst_dict)

In [40]:
type(us_states_utc_dst_broad) # 미국 주별 시차 spark broadcast 선언

pyspark.broadcast.Broadcast

In [18]:
ca_states_utc_dst_df = pd.read_csv('ca_states_abbrev_bst.csv', keep_default_na=False)

In [42]:
ca_states_utc_dst_df.head()

Unnamed: 0,state_abb,utc_dst_time_offset_cleaned
0,AB,-6.0
1,BC,-7.0
2,MB,-5.0
3,NB,-3.0
4,NL,-3.0


In [43]:
ca_states_utc_dst_df.describe()

Unnamed: 0,utc_dst_time_offset_cleaned
count,12.0
mean,-4.666667
std,1.556998
min,-7.0
25%,-6.0
50%,-4.5
75%,-3.0
max,-3.0


In [44]:
ca_states_utc_dst_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12 entries, 0 to 11
Data columns (total 2 columns):
state_abb                      12 non-null object
utc_dst_time_offset_cleaned    12 non-null float64
dtypes: float64(1), object(1)
memory usage: 264.0+ bytes


In [19]:
ca_countries_utc_dst_dict = dict(zip(ca_states_utc_dst_df['state_abb'].tolist(), ca_states_utc_dst_df['utc_dst_time_offset_cleaned'].tolist()))
ca_countries_utc_dst_broad = sc.broadcast(ca_countries_utc_dst_dict)

In [46]:
type(ca_countries_utc_dst_broad) # 캐나다 주별 시차 spark broadcast 선언

pyspark.broadcast.Broadcast

### Loading competition csvs

In [20]:
events_schema = StructType(
                    [StructField("display_id", IntegerType(), True),
                    StructField("uuid_event", StringType(), True),                    
                    StructField("document_id_event", IntegerType(), True),
                    StructField("timestamp_event", IntegerType(), True),
                    StructField("platform_event", IntegerType(), True),
                    StructField("geo_location_event", StringType(), True)]
                    )

events_df = spark.read.schema(events_schema).options(header='true', inferschema='false', nullValue='\\N') \
                .csv(DATA_BUCKET_FOLDER + "events.csv") \
                .withColumn('dummyEvents', F.lit(1)) \
                .withColumn('day_event', truncate_day_from_timestamp_udf('timestamp_event')) \
                .withColumn('event_country', extract_country_udf('geo_location_event')) \
                .withColumn('event_country_state', extract_country_state_udf('geo_location_event')) \
                .alias('events')  
                    
# day_event : timesstamp_event / 1000 / 60 / 60 / 24
# extract_country_udf : left 2글자로 국가 정보 생성
# extract_country_state_udf : left 6글자로 국가>주 정보 생성

In [48]:
events_df.printSchema()

root
 |-- display_id: integer (nullable = true)
 |-- uuid_event: string (nullable = true)
 |-- document_id_event: integer (nullable = true)
 |-- timestamp_event: integer (nullable = true)
 |-- platform_event: integer (nullable = true)
 |-- geo_location_event: string (nullable = true)
 |-- dummyEvents: integer (nullable = false)
 |-- day_event: integer (nullable = true)
 |-- event_country: string (nullable = true)
 |-- event_country_state: string (nullable = true)



In [50]:
print((events_df.count(), len(events_df.columns)))

(23120126, 10)


In [51]:
events_df.show()

+----------+--------------+-----------------+---------------+--------------+------------------+-----------+---------+-------------+-------------------+
|display_id|    uuid_event|document_id_event|timestamp_event|platform_event|geo_location_event|dummyEvents|day_event|event_country|event_country_state|
+----------+--------------+-----------------+---------------+--------------+------------------+-----------+---------+-------------+-------------------+
|         1|cb8c55702adb93|           379743|             61|             3|         US>SC>519|          1|        0|           US|              US>SC|
|         2|79a85fa78311b9|          1794259|             81|             2|         US>CA>807|          1|        0|           US|              US>CA|
|         3|822932ce3d8757|          1179111|            182|             2|         US>MI>505|          1|        0|           US|              US>MI|
|         4|85281d0a49f7ac|          1777797|            234|             2|         US>

In [21]:
page_views_schema = StructType(
                    [StructField("uuid_pv", StringType(), True),
                    StructField("document_id_pv", IntegerType(), True),
                    StructField("timestamp_pv", IntegerType(), True),
                    StructField("platform_pv", IntegerType(), True),
                    StructField("geo_location_pv", StringType(), True),
                    StructField("traffic_source_pv", IntegerType(), True)]
                    )
page_views_df = spark.read.schema(page_views_schema).options(header='true', inferschema='false', nullValue='\\N') \
                .csv(DATA_BUCKET_FOLDER+"page_views.csv") \
                .withColumn('day_pv', truncate_day_from_timestamp_udf('timestamp_pv')) \
                .alias('page_views')        
            
page_views_df.createOrReplaceTempView('page_views')

In [53]:
page_views_df.printSchema()

root
 |-- uuid_pv: string (nullable = true)
 |-- document_id_pv: integer (nullable = true)
 |-- timestamp_pv: integer (nullable = true)
 |-- platform_pv: integer (nullable = true)
 |-- geo_location_pv: string (nullable = true)
 |-- traffic_source_pv: integer (nullable = true)
 |-- day_pv: integer (nullable = true)



In [54]:
print((page_views_df.count(), len(page_views_df.columns)))

(2034275448, 7)


In [55]:
page_views_df.show()

+--------------+--------------+------------+-----------+---------------+-----------------+------+
|       uuid_pv|document_id_pv|timestamp_pv|platform_pv|geo_location_pv|traffic_source_pv|day_pv|
+--------------+--------------+------------+-----------+---------------+-----------------+------+
|1fd5f051fba643|           120|    31905835|          1|             RS|                2|     0|
|8557aa9004be3b|           120|    32053104|          1|          VN>44|                2|     0|
|c351b277a358f0|           120|    54013023|          1|          KR>12|                1|     0|
|8205775c5387f9|           120|    44196592|          1|          IN>16|                2|     0|
|9cb0ccd8458371|           120|    65817371|          1|      US>CA>807|                2|     0|
|2aa611f32875c7|           120|    71495491|          1|          CA>ON|                2|     0|
|f55a6eaf2b34ab|           120|    73309199|          1|          BR>27|                2|     0|
|cc01b582c8cbff|    

In [22]:
page_views_users_df  = spark.sql('''
                    SELECT uuid_pv, document_id_pv, max(timestamp_pv) as max_timestamp_pv, 1 as dummyPageView
                    FROM page_views p 
                    GROUP BY uuid_pv, document_id_pv
                    ''').alias('page_views_users')

In [57]:
page_views_users_df.printSchema()

root
 |-- uuid_pv: string (nullable = true)
 |-- document_id_pv: integer (nullable = true)
 |-- max_timestamp_pv: integer (nullable = true)
 |-- dummyPageView: integer (nullable = false)



In [58]:
print((page_views_users_df.count(), len(page_views_users_df.columns)))

(1949299181, 4)


In [59]:
page_views_users_df.show()

+--------------+--------------+----------------+-------------+
|       uuid_pv|document_id_pv|max_timestamp_pv|dummyPageView|
+--------------+--------------+----------------+-------------+
|20b525e43e4b36|        824710|       923192155|            1|
|7066c252715a0e|        824710|       901480520|            1|
|87da960cbe5ed1|        824710|       944133593|            1|
|ccee4e6767b39a|        824710|       949665837|            1|
|5429c686ee412b|        824710|       937211799|            1|
|418df96a37d0b4|        824710|       884116598|            1|
|efa274f080bdd1|        824710|       904405946|            1|
|3d3ba3ee3e5395|        824710|       944944673|            1|
|dedea73b58d6f1|        824710|       934285584|            1|
|2cb00c0d8f1ee8|        824710|      1071755772|            1|
|7983373d9d5be2|        824710|       872154848|            1|
|88911b2cc71be4|        824710|       864023440|            1|
|560c330d04cd2d|        824710|       949168071|       

In [23]:
promoted_content_schema = StructType(
                    [StructField("ad_id", IntegerType(), True),
                    StructField("document_id_promo", IntegerType(), True),                    
                    StructField("campaign_id", IntegerType(), True),
                    StructField("advertiser_id", IntegerType(), True)]
                    )

promoted_content_df = spark.read.schema(promoted_content_schema).options(header='true', inferschema='false', nullValue='\\N') \
                .csv(DATA_BUCKET_FOLDER+"promoted_content.csv") \
                .withColumn('dummyPromotedContent', F.lit(1)).alias('promoted_content').cache()

In [61]:
promoted_content_df.printSchema()

root
 |-- ad_id: integer (nullable = true)
 |-- document_id_promo: integer (nullable = true)
 |-- campaign_id: integer (nullable = true)
 |-- advertiser_id: integer (nullable = true)
 |-- dummyPromotedContent: integer (nullable = false)



In [62]:
print((promoted_content_df.count(), len(promoted_content_df.columns)))

(559583, 5)


In [63]:
promoted_content_df.show()

+-----+-----------------+-----------+-------------+--------------------+
|ad_id|document_id_promo|campaign_id|advertiser_id|dummyPromotedContent|
+-----+-----------------+-----------+-------------+--------------------+
|    1|             6614|          1|            7|                   1|
|    2|           471467|          2|            7|                   1|
|    3|             7692|          3|            7|                   1|
|    4|           471471|          2|            7|                   1|
|    5|           471472|          2|            7|                   1|
|    6|            12736|          1|            7|                   1|
|    7|            12808|          1|            7|                   1|
|    8|           471477|          2|            7|                   1|
|    9|            13379|          1|            7|                   1|
|   10|            13885|          1|            7|                   1|
|   11|            14230|          1|            7|

In [24]:
documents_meta_schema = StructType(
                    [StructField("document_id_doc", IntegerType(), True),
                    StructField("source_id", IntegerType(), True),                    
                    StructField("publisher_id", IntegerType(), True),
                    StructField("publish_time", TimestampType(), True)]
                    )

documents_meta_df = spark.read.schema(documents_meta_schema).options(header='true', inferschema='false', nullValue='\\N') \
                .csv(DATA_BUCKET_FOLDER+"documents_meta.csv") \
                .withColumn('dummyDocumentsMeta', F.lit(1)).alias('documents_meta').cache()

In [65]:
documents_meta_df.printSchema()

root
 |-- document_id_doc: integer (nullable = true)
 |-- source_id: integer (nullable = true)
 |-- publisher_id: integer (nullable = true)
 |-- publish_time: timestamp (nullable = true)
 |-- dummyDocumentsMeta: integer (nullable = false)



In [66]:
print((documents_meta_df.count(), len(documents_meta_df.columns)))

(2999334, 5)


In [67]:
documents_meta_df.show()

+---------------+---------+------------+--------------------+------------------+
|document_id_doc|source_id|publisher_id|        publish_time|dummyDocumentsMeta|
+---------------+---------+------------+--------------------+------------------+
|        1595802|        1|         603|2016-06-05 00:00:...|                 1|
|        1524246|        1|         603|2016-05-26 11:00:...|                 1|
|        1617787|        1|         603|2016-05-27 00:00:...|                 1|
|        1615583|        1|         603|2016-06-07 00:00:...|                 1|
|        1615460|        1|         603|2016-06-20 00:00:...|                 1|
|        1615354|        1|         603|2016-06-10 00:00:...|                 1|
|        1614611|        1|         603|2016-06-05 13:00:...|                 1|
|        1614235|        1|         603|2016-06-09 00:00:...|                 1|
|        1614225|        1|         603|2016-06-09 00:00:...|                 1|
|        1488264|        1| 

In [25]:
#Joining with Page Views to get traffic_source_pv
events_joined_df = events_df.join(documents_meta_df \
                                  .withColumnRenamed('source_id', 'source_id_doc_event') \
                                  .withColumnRenamed('publisher_id', 'publisher_doc_event') \
                                  .withColumnRenamed('publish_time', 'publish_time_doc_event')
                                  , on=F.col("document_id_event") == F.col("document_id_doc"), how='left') \
                            .join(page_views_df, 
                                           on=[F.col('uuid_event') == F.col('uuid_pv'),
                                               F.col('document_id_event') == F.col('document_id_pv'),
                                               F.col('platform_event') == F.col('platform_pv'),
                                               F.col('geo_location_event') == F.col('geo_location_pv'),
                                               F.col('day_event') == F.col('day_pv')],
                                           how='left') \
                                    .alias('events').cache()

In [69]:
events_joined_df.printSchema()

root
 |-- display_id: integer (nullable = true)
 |-- uuid_event: string (nullable = true)
 |-- document_id_event: integer (nullable = true)
 |-- timestamp_event: integer (nullable = true)
 |-- platform_event: integer (nullable = true)
 |-- geo_location_event: string (nullable = true)
 |-- dummyEvents: integer (nullable = false)
 |-- day_event: integer (nullable = true)
 |-- event_country: string (nullable = true)
 |-- event_country_state: string (nullable = true)
 |-- document_id_doc: integer (nullable = true)
 |-- source_id_doc_event: integer (nullable = true)
 |-- publisher_doc_event: integer (nullable = true)
 |-- publish_time_doc_event: timestamp (nullable = true)
 |-- dummyDocumentsMeta: integer (nullable = true)
 |-- uuid_pv: string (nullable = true)
 |-- document_id_pv: integer (nullable = true)
 |-- timestamp_pv: integer (nullable = true)
 |-- platform_pv: integer (nullable = true)
 |-- geo_location_pv: string (nullable = true)
 |-- traffic_source_pv: integer (nullable = true)


In [70]:
print((events_joined_df.count(), len(events_joined_df.columns)))

(23120126, 22)


In [71]:
events_joined_df.take(3)

[Row(display_id=12610276, uuid_event=u'1000d5a7147ee0', document_id_event=1340752, timestamp_event=834427439, platform_event=3, geo_location_event=u'US>TN>659', dummyEvents=1, day_event=9, event_country=u'US', event_country_state=u'US>TN', document_id_doc=1340752, source_id_doc_event=10314, publisher_doc_event=1178, publish_time_doc_event=None, dummyDocumentsMeta=1, uuid_pv=u'1000d5a7147ee0', document_id_pv=1340752, timestamp_pv=834427439, platform_pv=3, geo_location_pv=u'US>TN>659', traffic_source_pv=1, day_pv=9),
 Row(display_id=13660182, uuid_event=u'100100be253e64', document_id_event=1147029, timestamp_event=904492080, platform_event=2, geo_location_event=u'GB', dummyEvents=1, day_event=10, event_country=u'GB', event_country_state=u'GB', document_id_doc=1147029, source_id_doc_event=1296, publisher_doc_event=777, publish_time_doc_event=None, dummyDocumentsMeta=1, uuid_pv=u'100100be253e64', document_id_pv=1147029, timestamp_pv=904492080, platform_pv=2, geo_location_pv=u'GB', traffic_

In [26]:
documents_categories_schema = StructType(
                    [StructField("document_id_cat", IntegerType(), True),
                    StructField("category_id", IntegerType(), True),                    
                    StructField("confidence_level_cat", FloatType(), True)]
                    )

documents_categories_df = spark.read.schema(documents_categories_schema).options(header='true', inferschema='false', nullValue='\\N') \
                .csv(DATA_BUCKET_FOLDER+"documents_categories.csv") \
                .alias('documents_categories').cache()

In [73]:
documents_categories_df.printSchema()

root
 |-- document_id_cat: integer (nullable = true)
 |-- category_id: integer (nullable = true)
 |-- confidence_level_cat: float (nullable = true)



In [74]:
print((documents_categories_df.count(), len(documents_categories_df.columns)))

(5481475, 3)


In [75]:
documents_categories_df.show()

+---------------+-----------+--------------------+
|document_id_cat|category_id|confidence_level_cat|
+---------------+-----------+--------------------+
|        1595802|       1611|                0.92|
|        1595802|       1610|                0.07|
|        1524246|       1807|                0.92|
|        1524246|       1608|                0.07|
|        1617787|       1807|                0.92|
|        1617787|       1608|                0.07|
|        1615583|       1305|                0.92|
|        1615583|       1806|                0.07|
|        1615460|       1613|           0.5406464|
|        1615460|       1603|          0.04113614|
|        1615354|       1608|          0.50502783|
|        1615354|       1807|          0.03842603|
|        1614611|       1606|           0.9191688|
|        1614611|       1608|          0.06993675|
|        1614235|       1606|                0.92|
|        1614235|       1608|                0.07|
|        1614225|       1608|  

In [27]:
documents_categories_grouped_df = documents_categories_df.groupBy('document_id_cat') \
                                            .agg(F.collect_list('category_id').alias('category_id_list'),
                                                 F.collect_list('confidence_level_cat').alias('confidence_level_cat_list')) \
                                            .withColumn('dummyDocumentsCategory', F.lit(1)) \
                                            .alias('documents_categories_grouped')

In [77]:
documents_categories_grouped_df.printSchema()

root
 |-- document_id_cat: integer (nullable = true)
 |-- category_id_list: array (nullable = true)
 |    |-- element: integer (containsNull = true)
 |-- confidence_level_cat_list: array (nullable = true)
 |    |-- element: float (containsNull = true)
 |-- dummyDocumentsCategory: integer (nullable = false)



In [78]:
print((documents_categories_grouped_df.count(), len(documents_categories_grouped_df.columns)))

(2828649, 4)


In [79]:
documents_categories_grouped_df.show()

+---------------+----------------+-------------------------+----------------------+
|document_id_cat|category_id_list|confidence_level_cat_list|dummyDocumentsCategory|
+---------------+----------------+-------------------------+----------------------+
|            148|    [1403, 1702]|             [0.92, 0.07]|                     1|
|            463|    [1513, 1808]|     [0.8932095, 0.067...|                     1|
|            471|    [1504, 1609]|             [0.92, 0.07]|                     1|
|            496|    [1210, 1203]|             [0.92, 0.07]|                     1|
|            833|    [1305, 2004]|             [0.92, 0.07]|                     1|
|           1088|    [2006, 1210]|     [0.8364613, 0.063...|                     1|
|           1238|    [1100, 1407]|     [0.34836665, 0.02...|                     1|
|           1342|    [1408, 2004]|     [0.42835742, 0.03...|                     1|
|           1580|    [1403, 1402]|     [0.65625566, 0.04...|                

In [28]:
documents_topics_schema = StructType(
                    [StructField("document_id_top", IntegerType(), True),
                    StructField("topic_id", IntegerType(), True),                    
                    StructField("confidence_level_top", FloatType(), True)]
                    )

documents_topics_df = spark.read.schema(documents_topics_schema).options(header='true', inferschema='false', nullValue='\\N') \
                .csv(DATA_BUCKET_FOLDER+"documents_topics.csv")  \
                .alias('documents_topics').cache()

In [81]:
documents_topics_df.printSchema()

root
 |-- document_id_top: integer (nullable = true)
 |-- topic_id: integer (nullable = true)
 |-- confidence_level_top: float (nullable = true)



In [82]:
print((documents_topics_df.count(), len(documents_topics_df.columns)))

(11325960, 3)


In [83]:
documents_topics_df.show()

+---------------+--------+--------------------+
|document_id_top|topic_id|confidence_level_top|
+---------------+--------+--------------------+
|        1595802|     140|          0.07311316|
|        1595802|      16|         0.059416488|
|        1595802|     143|         0.045420755|
|        1595802|     170|          0.03886743|
|        1524246|     113|           0.1964504|
|        1524246|     260|          0.14287816|
|        1524246|      92|          0.03315913|
|        1524246|     168|        0.0140903415|
|        1524246|      54|          0.00878222|
|        1524246|     207|         0.008282372|
|        1617787|     113|          0.21689232|
|        1617787|     260|          0.09631251|
|        1617787|     258|         0.048729967|
|        1617787|      10|         0.031135853|
|        1617787|     168|         0.013120055|
|        1617787|     148|         0.013110327|
|        1615583|      89|           0.3163065|
|        1615583|     198|         0.015

In [29]:
documents_topics_grouped_df = documents_topics_df.groupBy('document_id_top') \
                                            .agg(F.collect_list('topic_id').alias('topic_id_list'),
                                                 F.collect_list('confidence_level_top').alias('confidence_level_top_list')) \
                                            .withColumn('dummyDocumentsTopics', F.lit(1)) \
                                            .alias('documents_topics_grouped')

In [85]:
documents_topics_grouped_df.printSchema()

root
 |-- document_id_top: integer (nullable = true)
 |-- topic_id_list: array (nullable = true)
 |    |-- element: integer (containsNull = true)
 |-- confidence_level_top_list: array (nullable = true)
 |    |-- element: float (containsNull = true)
 |-- dummyDocumentsTopics: integer (nullable = false)



In [86]:
print((documents_topics_grouped_df.count(), len(documents_topics_grouped_df.columns)))

(2495423, 4)


In [87]:
documents_topics_grouped_df.show()

+---------------+--------------------+-------------------------+--------------------+
|document_id_top|       topic_id_list|confidence_level_top_list|dummyDocumentsTopics|
+---------------+--------------------+-------------------------+--------------------+
|            148|[153, 140, 8, 172...|     [0.07523697, 0.07...|                   1|
|            463|[181, 292, 24, 25...|     [0.11870128, 0.05...|                   1|
|            471|[285, 238, 153, 193]|     [0.15588789, 0.04...|                   1|
|            496|[244, 294, 196, 1...|     [0.18284231, 0.11...|                   1|
|            833|[294, 89, 174, 86...|     [0.11430275, 0.04...|                   1|
|           1088|[107, 75, 153, 64...|     [0.10822894, 0.06...|                   1|
|           1238| [89, 221, 192, 236]|     [0.023348164, 0.0...|                   1|
|           1342|[271, 283, 181, 2...|     [0.0457309, 0.025...|                   1|
|           1580|[8, 37, 136, 12, ...|     [0.08965496

In [30]:
documents_entities_schema = StructType(
                    [StructField("document_id_ent", IntegerType(), True),
                    StructField("entity_id", StringType(), True),                    
                    StructField("confidence_level_ent", FloatType(), True)]
                    )

documents_entities_df = spark.read.schema(documents_entities_schema).options(header='true', inferschema='false', nullValue='\\N') \
                .csv(DATA_BUCKET_FOLDER+"documents_entities.csv")  \
                .alias('documents_entities').cache()

In [89]:
documents_entities_df.printSchema()

root
 |-- document_id_ent: integer (nullable = true)
 |-- entity_id: string (nullable = true)
 |-- confidence_level_ent: float (nullable = true)



In [90]:
print((documents_entities_df.count(), len(documents_entities_df.columns)))

(5537552, 3)


In [91]:
documents_entities_df.show()

+---------------+--------------------+--------------------+
|document_id_ent|           entity_id|confidence_level_ent|
+---------------+--------------------+--------------------+
|        1524246|f9eec25663db4cd83...|          0.67286533|
|        1524246|55ebcfbdaff1d6f60...|           0.3991137|
|        1524246|839907a972930b17b...|          0.39209574|
|        1524246|04d8f9a1ad48f126d...|          0.21399638|
|        1617787|612a1d17685a498af...|          0.38619283|
|        1617787|fb8c6cb0879e0de87...|          0.36411646|
|        1617787|793c6a6cf386edb82...|          0.34916824|
|        1617787|b525b84d5ed52a345...|          0.28700453|
|        1617787|758cb9cb3014607cb...|          0.23795699|
|        1617787|d523aaba6d3916f8b...|          0.23579852|
|        1617787|053e3ebba26a8d00e...|           0.2299529|
|        1617787|935b2d2f0f49a80c2...|          0.22793758|
|        1617787|3d0d43e01b616b4ff...|          0.22574973|
|        1617787|74b442766892bbf15...|  

In [31]:
documents_entities_grouped_df = documents_entities_df.groupBy('document_id_ent') \
                                            .agg(F.collect_list('entity_id').alias('entity_id_list'),
                                                 F.collect_list('confidence_level_ent').alias('confidence_level_ent_list')) \
                                            .withColumn('dummyDocumentsEntities', F.lit(1)) \
                                            .alias('documents_entities_grouped')

In [93]:
documents_entities_grouped_df.printSchema()

root
 |-- document_id_ent: integer (nullable = true)
 |-- entity_id_list: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- confidence_level_ent_list: array (nullable = true)
 |    |-- element: float (containsNull = true)
 |-- dummyDocumentsEntities: integer (nullable = false)



In [94]:
print((documents_entities_grouped_df.count(), len(documents_entities_grouped_df.columns)))

(1791420, 4)


In [95]:
documents_entities_grouped_df.show()

+---------------+--------------------+-------------------------+----------------------+
|document_id_ent|      entity_id_list|confidence_level_ent_list|dummyDocumentsEntities|
+---------------+--------------------+-------------------------+----------------------+
|            148|[e1c74838563ef5d2...|     [0.6320258, 0.404...|                     1|
|            463|[aaa0246895d43735...|              [0.6939791]|                     1|
|            496|[0ffa5e294bd46905...|              [0.3608937]|                     1|
|            833|[430da13f06eed7d5...|     [0.5932388, 0.240...|                     1|
|           1088|[94101adfc2f6bccb...|              [0.9564353]|                     1|
|           1580|[86b630e436676e43...|     [0.92001617, 0.44...|                     1|
|           1645|[976e5e062b216f23...|     [0.66670954, 0.61...|                     1|
|           1959|[806f6ef8cca7644d...|             [0.31478134]|                     1|
|           2122|[bad3651e69ae38

In [32]:
clicks_train_schema = StructType(
                    [StructField("display_id", IntegerType(), True),
                    StructField("ad_id", IntegerType(), True),                    
                    StructField("clicked", IntegerType(), True)]
                    )

clicks_train_df = spark.read.schema(clicks_train_schema).options(header='true', inferschema='false', nullValue='\\N') \
                .csv(DATA_BUCKET_FOLDER+"clicks_train.csv") \
                .withColumn('dummyClicksTrain', F.lit(1)).alias('clicks_train')

In [97]:
clicks_train_df.printSchema()

root
 |-- display_id: integer (nullable = true)
 |-- ad_id: integer (nullable = true)
 |-- clicked: integer (nullable = true)
 |-- dummyClicksTrain: integer (nullable = false)



In [98]:
print((clicks_train_df.count(), len(clicks_train_df.columns)))

(87141731, 4)


In [99]:
clicks_train_df.show()

+----------+------+-------+----------------+
|display_id| ad_id|clicked|dummyClicksTrain|
+----------+------+-------+----------------+
|         1| 42337|      0|               1|
|         1|139684|      0|               1|
|         1|144739|      1|               1|
|         1|156824|      0|               1|
|         1|279295|      0|               1|
|         1|296965|      0|               1|
|         2|125211|      0|               1|
|         2|156535|      0|               1|
|         2|169564|      0|               1|
|         2|308455|      1|               1|
|         3| 71547|      0|               1|
|         3| 95814|      0|               1|
|         3|152141|      0|               1|
|         3|183846|      0|               1|
|         3|228657|      1|               1|
|         3|250082|      0|               1|
|         4|149930|      0|               1|
|         4|153623|      1|               1|
|         4|184709|      0|               1|
|         

In [33]:
clicks_train_joined_df = clicks_train_df \
                         .join(promoted_content_df, on='ad_id', how='left') \
                         .join(documents_meta_df, on=F.col("promoted_content.document_id_promo") == F.col("documents_meta.document_id_doc"), how='left') \
                         .join(events_joined_df, on='display_id', how='left')                         
clicks_train_joined_df.createOrReplaceTempView('clicks_train_joined')

In [34]:
train_set_df = clicks_train_joined_df.cache() 
print("train_set_df.count() =", train_set_df.count())

('train_set_df.count() =', 87141731)


In [102]:
clicks_train_joined_df.printSchema()

root
 |-- display_id: integer (nullable = true)
 |-- ad_id: integer (nullable = true)
 |-- clicked: integer (nullable = true)
 |-- dummyClicksTrain: integer (nullable = false)
 |-- document_id_promo: integer (nullable = true)
 |-- campaign_id: integer (nullable = true)
 |-- advertiser_id: integer (nullable = true)
 |-- dummyPromotedContent: integer (nullable = true)
 |-- document_id_doc: integer (nullable = true)
 |-- source_id: integer (nullable = true)
 |-- publisher_id: integer (nullable = true)
 |-- publish_time: timestamp (nullable = true)
 |-- dummyDocumentsMeta: integer (nullable = true)
 |-- uuid_event: string (nullable = true)
 |-- document_id_event: integer (nullable = true)
 |-- timestamp_event: integer (nullable = true)
 |-- platform_event: integer (nullable = true)
 |-- geo_location_event: string (nullable = true)
 |-- dummyEvents: integer (nullable = true)
 |-- day_event: integer (nullable = true)
 |-- event_country: string (nullable = true)
 |-- event_country_state: stri

In [103]:
print((clicks_train_joined_df.count(), len(clicks_train_joined_df.columns)))

(87141731, 34)


In [104]:
clicks_train_joined_df.take(3)

[Row(display_id=148, ad_id=89351, clicked=1, dummyClicksTrain=1, document_id_promo=990613, campaign_id=7617, advertiser_id=2181, dummyPromotedContent=1, document_id_doc=990613, source_id=9457, publisher_id=None, publish_time=datetime.datetime(2015, 12, 9, 0, 0), dummyDocumentsMeta=1, uuid_event=u'9adce6a5363308', document_id_event=1205772, timestamp_event=11202, platform_event=2, geo_location_event=u'US>LA>612', dummyEvents=1, day_event=0, event_country=u'US', event_country_state=u'US>LA', document_id_doc=1205772, source_id_doc_event=9135, publisher_doc_event=1042, publish_time_doc_event=datetime.datetime(2016, 3, 29, 9, 0), dummyDocumentsMeta=1, uuid_pv=u'9adce6a5363308', document_id_pv=1205772, timestamp_pv=11202, platform_pv=2, geo_location_pv=u'US>LA>612', traffic_source_pv=1, day_pv=0),
 Row(display_id=148, ad_id=152140, clicked=0, dummyClicksTrain=1, document_id_promo=1060089, campaign_id=19032, advertiser_id=1593, dummyPromotedContent=1, document_id_doc=1060089, source_id=7744, 

In [105]:
table_name = 'user_profiles'

user_profiles_df = spark.read.parquet(OUTPUT_BUCKET_FOLDER+table_name) \
                    .withColumn('dummyUserProfiles', F.lit(1)).alias('user_profiles')

In [106]:
user_profiles_df.printSchema()

root
 |-- uuid: string (nullable = true)
 |-- doc_ids: array (nullable = true)
 |    |-- element: integer (containsNull = true)
 |-- views: integer (nullable = true)
 |-- categories: map (nullable = true)
 |    |-- key: integer
 |    |-- value: array (valueContainsNull = true)
 |    |    |-- element: float (containsNull = true)
 |-- topics: map (nullable = true)
 |    |-- key: integer
 |    |-- value: array (valueContainsNull = true)
 |    |    |-- element: float (containsNull = true)
 |-- entities: map (nullable = true)
 |    |-- key: string
 |    |-- value: array (valueContainsNull = true)
 |    |    |-- element: float (containsNull = true)
 |-- dummyUserProfiles: integer (nullable = false)



In [107]:
print((user_profiles_df.count(), len(user_profiles_df.columns)))

(5799892, 7)


In [108]:
user_profiles_df.take(3)

[Row(uuid=u'10013d303c2ba1', doc_ids=[310127, 359657, 1089917, 1711891, 682801, 983284, 850397, 2447637, 1407237, 2321683, 2785127, 2308397, 2601056, 2838528, 343231, 2131946, 230739, 1831392, 1437722, 797277, 2165085, 545939, 688403, 812208, 2693392], views=25, categories={1408: [5.914079666137695, 0.4950000047683716], 1807: [4.223730087280273, 0.056340292096138], 1808: [10.069393157958984, 0.13506388664245605], 1702: [3.987312078475952, 0.4914774000644684], 1703: [3.4640963077545166, 0.07000000029802322], 1609: [3.8473405838012695, 0.8618168830871582], 1210: [8.12431812286377, 0.07000000029802322], 1602: [3.6978328227996826, 0.9200000166893005], 1100: [5.296771049499512, 0.3502991199493408], 1612: [9.049626350402832, 0.7232668399810791], 2004: [2.935142993927002, 0.9200000166893005], 1503: [3.9572343826293945, 0.07000000029802322], 1505: [12.203084945678711, 0.7605463266372681], 1510: [7.15358829498291, 0.06941717118024826], 1511: [9.554322242736816, 0.036232683807611465], 1513: [9.5

### Test set

In [121]:
clicks_test_schema = StructType(
                    [StructField("display_id", IntegerType(), True),
                    StructField("ad_id", IntegerType(), True)]
                    )

clicks_test_df = spark.read.schema(clicks_test_schema).options(header='true', inferschema='false', nullValue='\\N') \
                    .csv(DATA_BUCKET_FOLDER + "clicks_test.csv") \
                    .withColumn('dummyClicksTest', F.lit(1)) \
                    .withColumn('clicked', F.lit(-999)) \
                    .alias('clicks_test')

In [110]:
clicks_test_df.printSchema()

root
 |-- display_id: integer (nullable = true)
 |-- ad_id: integer (nullable = true)
 |-- dummyClicksTest: integer (nullable = false)
 |-- clicked: integer (nullable = false)



In [111]:
print((clicks_test_df.count(), len(clicks_test_df.columns)))

(32225162, 4)


In [112]:
clicks_test_df.show()

+----------+------+---------------+-------+
|display_id| ad_id|dummyClicksTest|clicked|
+----------+------+---------------+-------+
|  16874594| 66758|              1|   -999|
|  16874594|150083|              1|   -999|
|  16874594|162754|              1|   -999|
|  16874594|170392|              1|   -999|
|  16874594|172888|              1|   -999|
|  16874594|180797|              1|   -999|
|  16874595|  8846|              1|   -999|
|  16874595| 30609|              1|   -999|
|  16874595|143982|              1|   -999|
|  16874596| 11430|              1|   -999|
|  16874596| 57197|              1|   -999|
|  16874596|132820|              1|   -999|
|  16874596|153260|              1|   -999|
|  16874596|173005|              1|   -999|
|  16874596|288385|              1|   -999|
|  16874596|289122|              1|   -999|
|  16874596|289915|              1|   -999|
|  16874597|137858|              1|   -999|
|  16874597|143981|              1|   -999|
|  16874597|155945|             

In [122]:
test_set_df = clicks_test_df \
                         .join(promoted_content_df, on='ad_id', how='left') \
                         .join(documents_meta_df, on=F.col("promoted_content.document_id_promo") == F.col("documents_meta.document_id_doc"), how='left') \
                         .join(documents_categories_grouped_df, on=F.col("document_id_promo") == F.col("documents_categories_grouped.document_id_cat"), how='left') \
                         .join(documents_topics_grouped_df, on=F.col("document_id_promo") == F.col("documents_topics_grouped.document_id_top"), how='left') \
                         .join(documents_entities_grouped_df, on=F.col("document_id_promo") == F.col("documents_entities_grouped.document_id_ent"), how='left') \
                         .join(events_joined_df, on='display_id', how='left') \
                         .join(documents_categories_grouped_df \
                                   .withColumnRenamed('category_id_list', 'doc_event_category_id_list')
                                   .withColumnRenamed('confidence_level_cat_list', 'doc_event_confidence_level_cat_list') \
                                   .alias('documents_event_categories_grouped'), 
                               on=F.col("document_id_event") == F.col("documents_event_categories_grouped.document_id_cat"), 
                               how='left') \
                         .join(documents_topics_grouped_df \
                                   .withColumnRenamed('topic_id_list', 'doc_event_topic_id_list')
                                   .withColumnRenamed('confidence_level_top_list', 'doc_event_confidence_level_top_list') \
                                   .alias('documents_event_topics_grouped'), 
                               on=F.col("document_id_event") == F.col("documents_event_topics_grouped.document_id_top"), 
                               how='left') \
                         .join(documents_entities_grouped_df \
                                   .withColumnRenamed('entity_id_list', 'doc_event_entity_id_list')
                                   .withColumnRenamed('confidence_level_ent_list', 'doc_event_confidence_level_ent_list') \
                                   .alias('documents_event_entities_grouped'), 
                               on=F.col("document_id_event") == F.col("documents_event_entities_grouped.document_id_ent"), 
                               how='left') \
                         .join(page_views_users_df, on=[F.col("events.uuid_event") == F.col("page_views_users.uuid_pv"),
                                                        F.col("promoted_content.document_id_promo") == F.col("page_views_users.document_id_pv")], 
                                                  how='left')

In [114]:
test_set_df.printSchema()

root
 |-- display_id: integer (nullable = true)
 |-- ad_id: integer (nullable = true)
 |-- dummyClicksTest: integer (nullable = false)
 |-- clicked: integer (nullable = false)
 |-- document_id_promo: integer (nullable = true)
 |-- campaign_id: integer (nullable = true)
 |-- advertiser_id: integer (nullable = true)
 |-- dummyPromotedContent: integer (nullable = true)
 |-- document_id_doc: integer (nullable = true)
 |-- source_id: integer (nullable = true)
 |-- publisher_id: integer (nullable = true)
 |-- publish_time: timestamp (nullable = true)
 |-- dummyDocumentsMeta: integer (nullable = true)
 |-- document_id_cat: integer (nullable = true)
 |-- category_id_list: array (nullable = true)
 |    |-- element: integer (containsNull = true)
 |-- confidence_level_cat_list: array (nullable = true)
 |    |-- element: float (containsNull = true)
 |-- dummyDocumentsCategory: integer (nullable = true)
 |-- document_id_top: integer (nullable = true)
 |-- topic_id_list: array (nullable = true)
 |  

In [115]:
print((test_set_df.count(), len(test_set_df.columns)))

(32225162, 62)


In [116]:
test_set_df.take(3)

[Row(display_id=16874687, ad_id=153193, dummyClicksTest=1, clicked=-999, document_id_promo=1233659, campaign_id=19459, advertiser_id=2808, dummyPromotedContent=1, document_id_doc=1233659, source_id=10873, publisher_id=1137, publish_time=datetime.datetime(2016, 4, 1, 0, 0), dummyDocumentsMeta=1, document_id_cat=1233659, category_id_list=[1403, 1407], confidence_level_cat_list=[0.9200000166893005, 0.07000000029802322], dummyDocumentsCategory=1, document_id_top=1233659, topic_id_list=[160, 227], confidence_level_top_list=[0.4109724164009094, 0.013395454734563828], dummyDocumentsTopics=1, document_id_ent=1233659, entity_id_list=[u'a63009e1cf4402dbf540a2e97b65174a'], confidence_level_ent_list=[0.20150183141231537], dummyDocumentsEntities=1, uuid_event=u'174d97c7b30e7f', document_id_event=1790354, timestamp_event=41274, platform_event=2, geo_location_event=u'US>NC>544', dummyEvents=1, day_event=0, event_country=u'US', event_country_state=u'US>NC', document_id_doc=1790354, source_id_doc_event

### Training models

In [117]:
def is_null(value):
    return value == None or len(str(value).strip()) == 0

In [118]:
LESS_SPECIAL_CAT_VALUE = 'less'
def get_category_field_values_counts(field, df, min_threshold=10):
    category_counts = dict(list(filter(lambda x: not is_null(x[0]) and x[1] >= min_threshold, df.select(field).groupBy(field).count().rdd.map(lambda x: (x[0], x[1])).collect())))
    #Adding a special value to create a feature for values in this category that are less than min_threshold 
    category_counts[LESS_SPECIAL_CAT_VALUE] = -1
    return category_counts

### Building category values counters and indexers

In [119]:
event_country_values_counts = get_category_field_values_counts('event_country', events_df, min_threshold=10)
len(event_country_values_counts)
#All non-null categories: 230

222

In [120]:
event_country_values_counts

{u'--': 935,
 u'A1': 548,
 u'A2': 247,
 u'AD': 69,
 u'AE': 25199,
 u'AF': 515,
 u'AG': 684,
 u'AI': 87,
 u'AL': 2230,
 u'AM': 465,
 u'AN': 902,
 u'AO': 605,
 u'AP': 1201,
 u'AR': 5654,
 u'AS': 196,
 u'AT': 10814,
 u'AU': 483021,
 u'AW': 752,
 u'AX': 46,
 u'AZ': 474,
 u'BA': 1460,
 u'BB': 2410,
 u'BD': 9093,
 u'BE': 14112,
 u'BF': 105,
 u'BG': 3994,
 u'BH': 1979,
 u'BI': 90,
 u'BJ': 200,
 u'BM': 1125,
 u'BN': 1640,
 u'BO': 694,
 u'BR': 12902,
 u'BS': 2537,
 u'BT': 391,
 u'BW': 2325,
 u'BY': 320,
 u'BZ': 900,
 u'CA': 1215350,
 u'CD': 312,
 u'CG': 67,
 u'CH': 23326,
 u'CI': 456,
 u'CK': 54,
 u'CL': 3653,
 u'CM': 1358,
 u'CN': 10152,
 u'CO': 5219,
 u'CR': 3497,
 u'CV': 66,
 u'CY': 2403,
 u'CZ': 5706,
 u'DE': 82384,
 u'DJ': 69,
 u'DK': 24304,
 u'DM': 306,
 u'DO': 2960,
 u'DZ': 1410,
 u'EC': 2122,
 u'EE': 1466,
 u'EG': 5596,
 u'ES': 24648,
 u'ET': 5462,
 u'EU': 2534,
 u'FI': 13560,
 u'FJ': 1967,
 u'FK': 11,
 u'FM': 140,
 u'FO': 129,
 u'FR': 38755,
 u'FX': 390,
 u'GA': 110,
 u'GB': 1117544,
 

In [121]:
event_country_state_values_counts = get_category_field_values_counts('event_country_state', events_df, min_threshold=10)
len(event_country_state_values_counts)

1892

In [122]:
event_country_state_values_counts

{u'MQ>00': 70,
 u'VE>25': 1338,
 u'VE>20': 45,
 u'VE>23': 150,
 u'US>FL': 1128279,
 u'GB>G9': 615,
 u'GU': 388,
 u'GT': 26,
 u'GR': 2265,
 u'GQ': 38,
 u'GP': 16,
 u'JP>38': 120,
 u'JP>39': 31,
 u'GE': 177,
 u'GD': 210,
 u'GB': 141998,
 u'GA': 31,
 u'JP>30': 49,
 u'JP>31': 80,
 u'JP>32': 948,
 u'JP>33': 14,
 u'JP>34': 737,
 u'ME>00': 370,
 u'JP>36': 18,
 u'JP>37': 208,
 u'NZ>E9': 11891,
 u'IT>15': 874,
 u'IT>14': 339,
 u'GB>J4': 943,
 u'GB>J5': 2641,
 u'GB>J6': 2224,
 u'GB>J7': 4731,
 u'FI>06': 72,
 u'GB>J1': 8854,
 u'GB>J2': 2175,
 u'TH>42': 18,
 u'AR>22': 18,
 u'AR>23': 14,
 u'FI>08': 268,
 u'AR>21': 91,
 u'GB>J8': 12796,
 u'GB>J9': 5402,
 u'MG>05': 90,
 u'MG>04': 28,
 u'RU>81': 15,
 u'TH>47': 106,
 u'BM>03': 1112,
 u'BM>06': 10,
 u'RU>88': 28,
 u'ZM': 2012,
 u'TH>48': 16,
 u'ZA': 12630,
 u'ZW': 3373,
 u'BO>08': 81,
 u'BO>02': 95,
 u'BO>03': 129,
 u'BO>04': 371,
 u'PH>21': 3058,
 u'LC>06': 12,
 u'LC>03': 584,
 u'HU>24': 45,
 u'HU>25': 35,
 u'HU>22': 34,
 u'HU>23': 70,
 u'HU>20': 41,
 

In [123]:
event_geo_location_values_counts = get_category_field_values_counts('geo_location_event', events_df, min_threshold=10)
len(event_geo_location_values_counts)
#All non-null categories: 2988

2273

In [124]:
event_geo_location_values_counts

{u'US>MI>513': 40417,
 u'MQ>00': 70,
 u'VE>25': 1338,
 u'US>MT>756': 8209,
 u'US>MT>754': 7715,
 u'VE>23': 150,
 u'US>FL': 34787,
 u'GB>G9': 615,
 u'GU': 388,
 u'GT': 26,
 u'GR': 2265,
 u'GQ': 38,
 u'GP': 16,
 u'JP>38': 120,
 u'JP>39': 31,
 u'GE': 177,
 u'GD': 210,
 u'GB': 141998,
 u'GA': 31,
 u'US>CA>866': 62388,
 u'US>WI>676': 3218,
 u'JP>31': 80,
 u'JP>32': 948,
 u'JP>33': 14,
 u'JP>34': 737,
 u'ME>00': 370,
 u'JP>36': 18,
 u'JP>37': 208,
 u'NZ>E9': 11891,
 u'US>OK>612': 823,
 u'IT>15': 874,
 u'IT>14': 339,
 u'US>NY>555': 43256,
 u'GB>J4': 943,
 u'GB>J5': 2641,
 u'GB>J6': 2224,
 u'GB>J7': 4731,
 u'FI>06': 72,
 u'GB>J1': 8854,
 u'GB>J2': 2175,
 u'TH>42': 18,
 u'US>KY>531': 617,
 u'AR>23': 14,
 u'FI>08': 268,
 u'AR>21': 91,
 u'GB>J8': 12796,
 u'GB>J9': 5402,
 u'AR>24': 30,
 u'MG>04': 28,
 u'RU>81': 15,
 u'TH>47': 106,
 u'BM>03': 1112,
 u'BM>06': 10,
 u'RU>88': 28,
 u'ZM': 2012,
 u'TH>48': 16,
 u'US>SD>624': 756,
 u'ZA': 12630,
 u'ZW': 3373,
 u'BO>08': 81,
 u'US>TX>692': 14229,
 u'US>N

In [125]:
doc_entity_id_values_counts = get_category_field_values_counts('entity_id', documents_entities_df, min_threshold=10)
len(doc_entity_id_values_counts)
#All non-null categories: 1326009

52439

In [126]:
doc_entity_id_values_counts

{u'9a6b13a595f807ee62db7d1f10f5f37a': 11,
 u'c20d4b21a9ecf82877a4d816e529cf5b': 45,
 u'd7da908ad6f7afb1f783429230b0cf7e': 69,
 u'12a2b20572cbb90b30f6f8183f8f367b': 18,
 u'fa59b55b26967c2dcf5a3ce02293c20b': 14,
 u'169ce6cd2c77c3b017c1495f3fb4b00a': 11,
 u'78de74e9f8b222d39887ab8bdaf4c828': 14,
 u'0233417c3437cdbcf16450e1cb2c19cc': 527,
 u'0578474d15a9531500933ad3bb5d8706': 229,
 u'6d12d25ff3c4215a9193e7e2a2357443': 10,
 u'409ee03a271af97578fc17302a9a268c': 844,
 u'dcca18dabda5e6bb651d8a5c0d237c7d': 62,
 u'fbe170889e5ce6eb9acd623ac4908189': 36,
 u'a7c15a7b0f42578991e6e2f5d55ceef7': 40,
 u'90365f76ac3098a20e7b3ac1dda96569': 74,
 u'd4f384eaebfe088ea531e2f701d347e3': 67,
 u'50cb037d51eb2b8d2c2ea0c74c4889ff': 15,
 u'689a4c96c7c2a0b426991483e51d7412': 46,
 u'a7da3b8a135f09100b2db09182cbc34e': 12,
 u'b785a00e9f206308131cd3a177ce1f98': 12,
 u'9ab63daa6060ca31e8a8bc02d6359f5e': 53,
 u'7912899cd2bb90c62799b8ccb6f3ccdb': 166,
 u'52918e8e4db89037178692b2a4f65114': 13,
 u'beed6c21aceb64d5771ec71df57

### Processing average CTR by categories

In [127]:
def get_percentiles(df, field, quantiles_levels=None, max_error_rate=0.0):
    if quantiles_levels == None:
        quantiles_levels = np.arange(0.0, 1.1, 0.1).tolist() 
    quantiles = df.approxQuantile(field, quantiles_levels, max_error_rate)
    return dict(zip(quantiles_levels, quantiles))

In [128]:
#REG = 10
REG = 0
ctr_udf = F.udf(lambda clicks, views: clicks / float(views + REG), FloatType())

### Average CTR by ad_id

In [129]:
ad_id_popularity_df = train_set_df.groupby('ad_id').agg(F.sum('clicked').alias('clicks'), 
                                                               F.count('*').alias('views')) \
                                         .withColumn('ctr', ctr_udf('clicks','views'))

In [130]:
ad_id_popularity_df.printSchema()

root
 |-- ad_id: integer (nullable = true)
 |-- clicks: long (nullable = true)
 |-- views: long (nullable = false)
 |-- ctr: float (nullable = true)



In [131]:
print((ad_id_popularity_df.count(), len(ad_id_popularity_df.columns)))

(478950, 4)


In [132]:
ad_id_popularity_df.show()

+------+------+-----+-----------+
| ad_id|clicks|views|        ctr|
+------+------+-----+-----------+
| 80033|  6104|18315| 0.33327875|
|338512|     0|    2|        0.0|
|212010|   148|  728|  0.2032967|
| 92834|   177| 1127| 0.15705413|
| 90550|    93|  582| 0.15979381|
|309434|    25|  159|  0.1572327|
| 31367| 18838|50338|  0.3742302|
|155350|    67|  653| 0.10260337|
|261168|     3|   16|     0.1875|
|  7993|     8|  281| 0.02846975|
|175528|   167| 3137|0.053235576|
|268622|     6|  161| 0.03726708|
| 33569|   888| 4203| 0.21127766|
| 70863|    69|  243| 0.28395063|
|312383|    11|   53| 0.20754717|
|175842|   565| 2703|   0.209027|
|328279|   292| 1881| 0.15523657|
|155251|  1300| 5624|  0.2311522|
|235447|  1256|19431| 0.06463898|
| 12046|  1031| 7866| 0.13107044|
+------+------+-----+-----------+
only showing top 20 rows



In [133]:
get_percentiles(ad_id_popularity_df, 'clicks')

{0.0: 0.0,
 0.1: 0.0,
 0.2: 0.0,
 0.30000000000000004: 0.0,
 0.4: 0.0,
 0.5: 1.0,
 0.6000000000000001: 1.0,
 0.7000000000000001: 2.0,
 0.8: 4.0,
 0.9: 12.0,
 1.0: 44824.0}

In [134]:
get_percentiles(ad_id_popularity_df, 'views')

{0.0: 1.0,
 0.1: 1.0,
 0.2: 1.0,
 0.30000000000000004: 2.0,
 0.4: 3.0,
 0.5: 5.0,
 0.6000000000000001: 9.0,
 0.7000000000000001: 14.0,
 0.8: 26.0,
 0.9: 81.0,
 1.0: 211824.0}

In [135]:
ad_id_popularity = ad_id_popularity_df.filter('views > 5').select('ad_id', 'ctr', 'views') \
                    .rdd.map(lambda x: (x['ad_id'], (x['ctr'], x['views'], 1, 1))).collectAsMap()

In [136]:
ad_id_popularity_broad = sc.broadcast(ad_id_popularity)

In [137]:
list(ad_id_popularity.values())[:3]

[(0.04545454680919647, 22, 1, 1),
 (0.06832297891378403, 161, 1, 1),
 (0.125, 32, 1, 1)]

In [138]:
len(ad_id_popularity)

236228

In [139]:
ad_id_avg_ctr = sum(map(lambda x: x[0], ad_id_popularity.values())) / float(len(ad_id_popularity))
ad_id_avg_ctr

0.15279814571842562

In [140]:
ad_id_weighted_avg_ctr = sum(map(lambda x: x[0]*x[1], ad_id_popularity.values())) / float(sum(map(lambda x: x[1], ad_id_popularity.values())))
ad_id_weighted_avg_ctr

0.19398178324024012

In [141]:
ad_id_views_median = np.median(np.array(list(map(lambda x: x[1], ad_id_popularity.values()))))
ad_id_views_median

19.0

In [142]:
ad_id_views_mean = sum(map(lambda x: x[1], ad_id_popularity.values())) / float(len(ad_id_popularity))
ad_id_views_mean

366.68235348900214

### Average CTR by document_id (promoted_content)

In [143]:
document_id_popularity_df = train_set_df.groupby('document_id_promo').agg(F.sum('clicked').alias('clicks'), 
                                                               F.count('*').alias('views'),
                                                               F.countDistinct('ad_id').alias('distinct_ad_ids')) \
                                         .withColumn('ctr', ctr_udf('clicks','views'))

In [144]:
document_id_popularity_df.printSchema()

root
 |-- document_id_promo: integer (nullable = true)
 |-- clicks: long (nullable = true)
 |-- views: long (nullable = false)
 |-- distinct_ad_ids: long (nullable = false)
 |-- ctr: float (nullable = true)



In [145]:
print((document_id_popularity_df.count(), len(document_id_popularity_df.columns)))

(164476, 5)


In [146]:
document_id_popularity_df.show()

+-----------------+------+------+---------------+-----------+
|document_id_promo|clicks| views|distinct_ad_ids|        ctr|
+-----------------+------+------+---------------+-----------+
|          1294383|  2564| 10354|              1| 0.24763377|
|          1164357|    10|    55|              7| 0.18181819|
|          1151028| 23571|608297|            386|0.038749162|
|          1461403|  1430|  8724|              9| 0.16391563|
|           365428|     0|     1|              1|        0.0|
|          2377716|     0|     3|              1|        0.0|
|          1804029|     8|    67|             21| 0.11940298|
|          1102706|   156|   854|            328| 0.18266979|
|          1476904|     2|     9|              2| 0.22222222|
|           870597|  3142| 12495|             64| 0.25146058|
|          1191112|    14|   101|              6| 0.13861386|
|           351175|   269|   712|              3|   0.377809|
|          1112412|     2|    39|             10|0.051282052|
|       

In [147]:
document_id_popularity = document_id_popularity_df.filter('views > 5').select('document_id_promo', 'ctr', 'views', 'distinct_ad_ids') \
                                                .rdd.map(lambda x: (x['document_id_promo'], (x['ctr'], x['views'], x['distinct_ad_ids'], 1))).collectAsMap()
len(document_id_popularity)

88857

In [148]:
document_id_popularity

{1: (0.08571428805589676, 35, 2, 1),
 786436: (0.0, 6, 1, 1),
 2097157: (0.1538461595773697, 39, 2, 1),
 2359297: (0.1428571492433548, 7, 1, 1),
 2359304: (0.08695652335882187, 69, 2, 1),
 1310741: (0.15826086699962616, 575, 12, 1),
 1791321: (0.0, 17, 1, 1),
 1572896: (0.1666666716337204, 24, 4, 1),
 1660251: (0.0, 6, 2, 1),
 1572903: (0.016949152573943138, 59, 1, 1),
 1572905: (0.1305343508720398, 1310, 1, 1),
 1572906: (0.05882352963089943, 34, 1, 1),
 1572907: (0.09638553857803345, 498, 2, 1),
 1572908: (0.05882352963089943, 17, 1, 1),
 1572909: (0.08116883039474487, 308, 4, 1),
 1572911: (0.060975611209869385, 82, 1, 1),
 1310770: (0.1661721020936966, 337, 42, 1),
 1354419: (0.1599999964237213, 50, 1, 1),
 1572917: (0.0, 6, 1, 1),
 1048630: (0.20000000298023224, 75, 9, 1),
 1048631: (0.44184422492980957, 31493, 3, 1),
 1048632: (0.11538461595773697, 104, 2, 1),
 1572921: (0.20000000298023224, 10, 3, 1),
 1848799: (0.2613636255264282, 88, 1, 1),
 1048638: (0.043478261679410934, 23,

In [149]:
document_id_popularity_broad = sc.broadcast(document_id_popularity)

In [150]:
get_percentiles(document_id_popularity_df, 'clicks')

{0.0: 0.0,
 0.1: 0.0,
 0.2: 0.0,
 0.30000000000000004: 0.0,
 0.4: 0.0,
 0.5: 1.0,
 0.6000000000000001: 1.0,
 0.7000000000000001: 3.0,
 0.8: 7.0,
 0.9: 36.0,
 1.0: 61075.0}

In [151]:
get_percentiles(document_id_popularity_df, 'views')

{0.0: 1.0,
 0.1: 1.0,
 0.2: 1.0,
 0.30000000000000004: 2.0,
 0.4: 4.0,
 0.5: 7.0,
 0.6000000000000001: 13.0,
 0.7000000000000001: 24.0,
 0.8: 56.0,
 0.9: 242.0,
 1.0: 773127.0}

In [152]:
document_id_avg_ctr = sum(map(lambda x: x[0], document_id_popularity.values())) / float(len(document_id_popularity))
document_id_avg_ctr

0.1465989063948768

In [153]:
document_id_weighted_avg_ctr = sum(list(map(lambda x: x[0]*x[1], document_id_popularity.values()))) / float(sum(list(map(lambda x: x[1], document_id_popularity.values()))))
document_id_weighted_avg_ctr

0.193792460686562

In [154]:
document_id_views_median = np.median(np.array(list(map(lambda x: x[1], document_id_popularity.values()))))
document_id_views_median

30.0

In [155]:
document_id_views_mean = sum(map(lambda x: x[1], document_id_popularity.values())) / float(len(document_id_popularity))
document_id_views_mean

978.9117570928571

### Average CTR by (doc_event, doc_ad)

In [156]:
doc_event_doc_ad_avg_ctr_df = train_set_df.groupBy('document_id_event', 'document_id_promo') \
                                    .agg(F.sum('clicked').alias('clicks'), 
                                         F.count('*').alias('views'),
                                         F.countDistinct('ad_id').alias('distinct_ad_ids')) \
                                    .withColumn('ctr', ctr_udf('clicks','views'))

In [157]:
doc_event_doc_ad_avg_ctr_df.printSchema()

root
 |-- document_id_event: integer (nullable = true)
 |-- document_id_promo: integer (nullable = true)
 |-- clicks: long (nullable = true)
 |-- views: long (nullable = false)
 |-- distinct_ad_ids: long (nullable = false)
 |-- ctr: float (nullable = true)



In [158]:
print((doc_event_doc_ad_avg_ctr_df.count(), len(doc_event_doc_ad_avg_ctr_df.columns)))

(28491064, 6)


In [159]:
doc_event_doc_ad_avg_ctr_df.show()

+-----------------+-----------------+------+-----+---------------+-----------+
|document_id_event|document_id_promo|clicks|views|distinct_ad_ids|        ctr|
+-----------------+-----------------+------+-----+---------------+-----------+
|          1784896|           821432|     1|    2|              1|        0.5|
|           777551|           208091|     2|   35|             10|0.057142857|
|          2462925|          1151028|     0|    9|              7|        0.0|
|           552426|          1618856|     0|    1|              1|        0.0|
|          1838591|          1459616|     0|    1|              1|        0.0|
|          1954284|          1139269|     0|    6|              3|        0.0|
|          2041726|           468143|    19|   49|              3|  0.3877551|
|          2109475|          1442454|     0|    5|              2|        0.0|
|          2151753|          1128940|    23|  206|              1| 0.11165048|
|           281998|          1037977|     0|    1|  

In [160]:
doc_event_doc_ad_avg_ctr = doc_event_doc_ad_avg_ctr_df.filter('views > 5') \
                    .select('document_id_event', 'document_id_promo','ctr', 'views', 'distinct_ad_ids') \
                    .rdd.map(lambda x: ((x['document_id_event'], x['document_id_promo']), (x['ctr'], x['views'], x['distinct_ad_ids'], 1))).collectAsMap()        

len(doc_event_doc_ad_avg_ctr)

1902834

In [161]:
doc_event_doc_ad_avg_ctr

{(947072, 2299567): (0.1388888955116272, 72, 2, 1),
 (2107784, 905507): (0.375, 16, 2, 1),
 (289513, 1247375): (0.3333333432674408, 9, 1, 1),
 (1387108, 1426866): (0.30000001192092896, 10, 1, 1),
 (1042843, 1139308): (0.42465752363204956, 73, 1, 1),
 (1938192, 1044683): (0.0, 6, 3, 1),
 (635051, 1093088): (0.0, 6, 1, 1),
 (821422, 929998): (0.42105263471603394, 19, 3, 1),
 (1827718, 1666555): (0.046875, 192, 2, 1),
 (2232023, 1534322): (0.0, 25, 4, 1),
 (1671384, 1456020): (0.3333333432674408, 6, 1, 1),
 (2046208, 483905): (0.0833333358168602, 12, 7, 1),
 (275913, 1514666): (0.0, 25, 3, 1),
 (813253, 1107172): (0.1538461595773697, 13, 1, 1),
 (2813658, 861466): (0.1666666716337204, 6, 1, 1),
 (1469469, 1827718): (0.2857142984867096, 7, 3, 1),
 (1980824, 1536100): (0.10000000149011612, 40, 2, 1),
 (1692806, 1465603): (0.09375, 32, 1, 1),
 (2637529, 1776315): (0.75, 16, 1, 1),
 (92207, 1416840): (0.4000000059604645, 15, 1, 1),
 (2442337, 552073): (0.47999998927116394, 25, 1, 1),
 (192120

In [162]:
doc_event_doc_ad_avg_ctr_broad = sc.broadcast(doc_event_doc_ad_avg_ctr)

### Average CTR by country, source_id

In [163]:
source_id_by_country_popularity_df = train_set_df.select('clicked', 'source_id', 'event_country', 'ad_id') \
                                            .groupby('event_country', 'source_id').agg(F.sum('clicked').alias('clicks'), 
                                                                             F.count('*').alias('views'),
                                                                             F.countDistinct('ad_id').alias('distinct_ad_ids')) \
                                         .withColumn('ctr', ctr_udf('clicks','views'))

In [164]:
source_id_by_country_popularity_df.printSchema()

root
 |-- event_country: string (nullable = true)
 |-- source_id: integer (nullable = true)
 |-- clicks: long (nullable = true)
 |-- views: long (nullable = false)
 |-- distinct_ad_ids: long (nullable = false)
 |-- ctr: float (nullable = true)



In [165]:
print((source_id_by_country_popularity_df.count(), len(source_id_by_country_popularity_df.columns)))

(59137, 6)


In [166]:
source_id_by_country_popularity_df.show()

+-------------+---------+------+------+---------------+-----------+
|event_country|source_id|clicks| views|distinct_ad_ids|        ctr|
+-------------+---------+------+------+---------------+-----------+
|           AU|     7424|  1870|  5622|              1| 0.33262184|
|           US|    10525| 26381| 49208|            108|   0.536112|
|           CZ|    12260|    37|   127|              4|  0.2913386|
|           US|     4776| 34498|194778|           4382| 0.17711446|
|           US|     4600|  7516| 28982|            246| 0.25933337|
|           NZ|    13698|   308|   849|             54| 0.36277974|
|           FJ|     7866|    68|   428|             58|  0.1588785|
|           NZ|     7530|    22|   480|             34|0.045833334|
|           CA|     5592|   753|  5741|            348| 0.13116182|
|           PR|    13433|     0|     1|              1|        0.0|
|           KH|     6813|     0|     1|              1|        0.0|
|           PT|    12688|   142|   765|         

In [167]:
#source_id_popularity = source_id_popularity_df.filter('views > 100 and source_id is not null').select('source_id', 'ctr').rdd.collectAsMap()
source_id_by_country_popularity = source_id_by_country_popularity_df.filter('views > 5 and source_id is not null and event_country <> ""').select('event_country', 'source_id', 'ctr', 'views', 'distinct_ad_ids') \
        .rdd.map(lambda x: ((x['event_country'], x['source_id']), (x['ctr'], x['views'], x['distinct_ad_ids'], 1))).collectAsMap()
len(source_id_by_country_popularity)

34333

In [168]:
source_id_by_country_popularity

{(u'CO', 13256): (0.28188976645469666, 635, 3, 1),
 (u'EG', 9414): (0.19565217196941376, 46, 1, 1),
 (u'MX', 6560): (0.1875, 48, 1, 1),
 (u'FI', 11808): (0.095940962433815, 271, 9, 1),
 (u'BD', 144): (0.07339449226856232, 109, 2, 1),
 (u'KR', 2413): (0.2604166567325592, 96, 16, 1),
 (u'PH', 10029): (0.12063991278409958, 7626, 35, 1),
 (u'GB', 11739): (0.1028938889503479, 311, 3, 1),
 (u'CA', 12629): (0.1428571492433548, 7, 2, 1),
 (u'JO', 11355): (0.4285714328289032, 7, 1, 1),
 (u'US', 10127): (0.20000000298023224, 30, 4, 1),
 (u'CN', 3421): (0.0, 12, 1, 1),
 (u'AU', 6491): (0.3566669523715973, 11857, 838, 1),
 (u'AN', 10783): (0.11340206116437912, 97, 7, 1),
 (u'DK', 8088): (0.025641025975346565, 39, 1, 1),
 (u'GB', 4394): (0.1827586144208908, 1160, 41, 1),
 (u'VI', 13861): (0.7142857313156128, 7, 3, 1),
 (u'KR', 12028): (0.2949308753013611, 217, 5, 1),
 (u'US', 4484): (0.06923949718475342, 881, 8, 1),
 (u'CL', 8825): (0.20000000298023224, 10, 1, 1),
 (u'BN', 4536): (0.083333335816860

In [169]:
source_id_by_country_popularity_broad = sc.broadcast(source_id_by_country_popularity)

In [170]:
source_id_by_country_avg_ctr = sum(map(lambda x: x[0], source_id_by_country_popularity.values())) / float(len(source_id_by_country_popularity))
source_id_by_country_avg_ctr

0.1846588083811812

In [171]:
source_id_by_country_weighted_avg_ctr = sum(map(lambda x: x[0]*x[1], source_id_by_country_popularity.values())) / float(sum(map(lambda x: x[1], source_id_by_country_popularity.values())))
source_id_by_country_weighted_avg_ctr

0.19367626877411454

In [172]:
source_id_by_country_views_median = np.median(np.array(list(map(lambda x: x[1], source_id_by_country_popularity.values()))))
source_id_by_country_views_median

41.0

In [173]:
source_id_by_country_views_mean = sum(map(lambda x: x[1], source_id_by_country_popularity.values())) / float(len(source_id_by_country_popularity))
source_id_by_country_views_mean

2534.7824833250806

### Average CTR by source_id

In [174]:
source_id_popularity_df = train_set_df.select('clicked', 'source_id', 'ad_id') \
                                            .groupby('source_id').agg(F.sum('clicked').alias('clicks'), 
                                                                     F.count('*').alias('views'),
                                                                     F.countDistinct('ad_id').alias('distinct_ad_ids')) \
                                         .withColumn('ctr', ctr_udf('clicks','views'))

In [175]:
source_id_popularity_df.printSchema()

root
 |-- source_id: integer (nullable = true)
 |-- clicks: long (nullable = true)
 |-- views: long (nullable = false)
 |-- distinct_ad_ids: long (nullable = false)
 |-- ctr: float (nullable = true)



In [176]:
print((source_id_popularity_df.count(), len(source_id_popularity_df.columns)))

(6991, 5)


In [177]:
source_id_popularity_df.show()

+---------+------+-----+---------------+-----------+
|source_id|clicks|views|distinct_ad_ids|        ctr|
+---------+------+-----+---------------+-----------+
|     8592|  1455| 4367|            305| 0.33318067|
|    13623|     3|    8|              1|      0.375|
|     5300|   425| 5851|             20|0.072637156|
|     7993|    44|  290|            104| 0.15172414|
|     7240|   653| 4167|             10| 0.15670747|
|     7340|  3351| 7244|             35| 0.46258974|
|    13289|    13|   99|             11| 0.13131313|
|     7833|   996| 5436|              9| 0.18322296|
|      496|  1682|12462|             32|  0.1349703|
|     9376|    63|  577|              1| 0.10918544|
|    11858|  1059| 5273|             24| 0.20083444|
|     6397|    11|   61|              5| 0.18032786|
|     6658|   256| 1730|             17| 0.14797688|
|     6620|   201| 1530|              3| 0.13137256|
|    12940|   511| 3553|             11| 0.14382212|
|     7754|   541| 6182|             18|0.0875

In [178]:
source_id_popularity = source_id_popularity_df.filter('views > 10 and source_id is not null').select('source_id', 'ctr', 'views', 'distinct_ad_ids') \
                            .rdd.map(lambda x: (x['source_id'], (x['ctr'], x['views'], x['distinct_ad_ids'], 1))).collectAsMap()
len(source_id_popularity)

5855

In [179]:
source_id_popularity

{2: (0.19870968163013458, 775, 12, 1),
 3: (0.31182795763015747, 93, 7, 1),
 4: (0.13669486343860626, 15370, 112, 1),
 5: (0.10560625791549683, 2301, 30, 1),
 6: (0.22402800619602203, 74821, 53, 1),
 7: (0.20227093994617462, 20344, 42, 1),
 9: (0.12158623337745667, 26730, 137, 1),
 10: (0.1317494660615921, 1389, 25, 1),
 13: (0.18857069313526154, 15469, 8, 1),
 14: (0.07328244298696518, 655, 3, 1),
 15: (0.11295680701732635, 301, 1, 1),
 16: (0.1878787875175476, 165, 17, 1),
 17: (0.16619236767292023, 1757, 9, 1),
 19: (0.28042328357696533, 567, 3, 1),
 20: (0.08695652335882187, 23, 1, 1),
 21: (0.13406293094158173, 731, 1, 1),
 22: (0.37469491362571716, 28271, 195, 1),
 26: (0.375, 16, 2, 1),
 27: (0.0, 26, 1, 1),
 28: (0.3075379431247711, 23919, 16, 1),
 29: (0.0714285746216774, 98, 34, 1),
 30: (0.11237595230340958, 3426, 21, 1),
 31: (0.23849202692508698, 9841, 101, 1),
 32: (0.26847946643829346, 11621, 27, 1),
 34: (0.09615384787321091, 156, 1, 1),
 36: (0.09944751113653183, 181, 

In [180]:
source_id_popularity_broad = sc.broadcast(source_id_popularity)

In [181]:
get_percentiles(source_id_popularity_df, 'clicks')

{0.0: 0.0,
 0.1: 0.0,
 0.2: 2.0,
 0.30000000000000004: 8.0,
 0.4: 28.0,
 0.5: 78.0,
 0.6000000000000001: 179.0,
 0.7000000000000001: 470.0,
 0.8: 1303.0,
 0.9: 5164.0,
 1.0: 97059.0}

In [182]:
get_percentiles(source_id_popularity_df, 'views')

{0.0: 1.0,
 0.1: 4.0,
 0.2: 18.0,
 0.30000000000000004: 69.0,
 0.4: 210.0,
 0.5: 520.0,
 0.6000000000000001: 1161.0,
 0.7000000000000001: 2700.0,
 0.8: 7109.0,
 0.9: 22996.0,
 1.0: 1364504.0}

In [183]:
source_id_popularity = source_id_popularity_df.filter('views > 100 and source_id is not null').select('source_id', 'ctr').rdd.collectAsMap()

In [184]:
source_id_popularity

{8193: 0.27902621030807495,
 2: 0.19870968163013458,
 4: 0.13669486343860626,
 5: 0.10560625791549683,
 8198: 0.15815025568008423,
 8199: 0.17023329436779022,
 9: 0.12158623337745667,
 8202: 0.13105656206607819,
 8203: 0.0445205494761467,
 8204: 0.2681387960910797,
 8205: 0.18887105584144592,
 14: 0.07328244298696518,
 15: 0.11295680701732635,
 16: 0.1878787875175476,
 17: 0.16619236767292023,
 19: 0.28042328357696533,
 21: 0.13406293094158173,
 22: 0.37469491362571716,
 28: 0.3075379431247711,
 30: 0.11237595230340958,
 31: 0.23849202692508698,
 32: 0.26847946643829346,
 8225: 0.27945029735565186,
 34: 0.09615384787321091,
 36: 0.09944751113653183,
 6: 0.22402800619602203,
 38: 0.08571428805589676,
 8231: 0.10968661308288574,
 40: 0.07359569519758224,
 8233: 0.1334696114063263,
 43: 0.16478267312049866,
 8237: 0.16102683544158936,
 8238: 0.17415572702884674,
 8239: 0.28535762429237366,
 48: 0.17448918521404266,
 12296: 0.5822587013244629,
 51: 0.3217550218105316,
 52: 0.20256233215332

### Average CTR by publisher_id

In [185]:
publisher_popularity_df = train_set_df.select('clicked', 'publisher_id', 'ad_id') \
                                            .groupby('publisher_id').agg(F.sum('clicked').alias('clicks'), 
                                                                             F.count('*').alias('views'),
                                                                              F.countDistinct('ad_id').alias('distinct_ad_ids')) \
                                         .withColumn('ctr', ctr_udf('clicks','views'))

In [186]:
publisher_popularity_df.printSchema()

root
 |-- publisher_id: integer (nullable = true)
 |-- clicks: long (nullable = true)
 |-- views: long (nullable = false)
 |-- distinct_ad_ids: long (nullable = false)
 |-- ctr: float (nullable = true)



In [187]:
print((publisher_popularity_df.count(), len(publisher_popularity_df.columns)))

(883, 5)


In [188]:
publisher_popularity_df.show()

+------------+------+------+---------------+----------+
|publisher_id|clicks| views|distinct_ad_ids|       ctr|
+------------+------+------+---------------+----------+
|         148|  7596| 23315|            703|0.32579884|
|        1088| 30201| 56596|             76| 0.5336243|
|         471|   493|  4580|             12|0.10764192|
|         833|     3|     8|              1|     0.375|
|        1238|   435|  3309|              3|0.13145965|
|         463|     1|    26|              2|0.03846154|
|         897| 26982| 67083|            647|0.40221813|
|         540|   802|  1886|             44| 0.4252386|
|         243|  1159|  6654|             32|0.17418094|
|         623|  3178| 16119|              6|0.19715863|
|        1127|  1594|  6478|              4| 0.2460636|
|         737|     0|     2|              2|       0.0|
|         392|    33|   261|              4|0.12643678|
|        1025|   652|  2663|              1|0.24483664|
|        1143| 51446|368869|           1091|0.13

In [189]:
publisher_popularity = publisher_popularity_df.filter('views > 10 and publisher_id is not null').select('publisher_id', 'ctr', 'views', 'distinct_ad_ids') \
                            .rdd.map(lambda x: (x['publisher_id'], (x['ctr'], x['views'], x['distinct_ad_ids'], 1))).collectAsMap()
len(publisher_popularity)

742

In [190]:
publisher_popularity

{2: (0.21247687935829163, 29735, 61, 1),
 3: (0.20963317155838013, 19682, 9, 1),
 4: (0.21732792258262634, 9603, 11, 1),
 5: (0.46546033024787903, 16966, 139, 1),
 6: (0.1349703073501587, 12462, 32, 1),
 8: (0.15000000596046448, 20, 1, 1),
 9: (0.2965448200702667, 251419, 6834, 1),
 11: (0.07692307978868484, 13, 1, 1),
 12: (0.1317494660615921, 1389, 25, 1),
 13: (0.28196147084236145, 571, 4, 1),
 14: (0.24947915971279144, 1920, 1, 1),
 15: (0.12890994548797607, 1055, 3, 1),
 18: (0.3288207948207855, 37525, 1399, 1),
 19: (0.1876543164253235, 13365, 1636, 1),
 20: (0.1830357164144516, 224, 8, 1),
 21: (0.17599999904632568, 375, 2, 1),
 23: (0.0833333358168602, 12, 4, 1),
 24: (0.19283238053321838, 17886, 10, 1),
 26: (0.21608413755893707, 181906, 35, 1),
 28: (0.14543642103672028, 18056, 61, 1),
 31: (0.266978919506073, 1281, 2, 1),
 32: (0.17142857611179352, 735, 46, 1),
 35: (0.12031907588243484, 9026, 16, 1),
 36: (0.1547406017780304, 7826, 17, 1),
 37: (0.30743280053138733, 177847,

In [191]:
publisher_popularity_broad = sc.broadcast(publisher_popularity)

In [192]:
get_percentiles(publisher_popularity_df, 'clicks')

{0.0: 0.0,
 0.1: 0.0,
 0.2: 4.0,
 0.30000000000000004: 33.0,
 0.4: 150.0,
 0.5: 511.0,
 0.6000000000000001: 1343.0,
 0.7000000000000001: 3250.0,
 0.8: 8368.0,
 0.9: 27395.0,
 1.0: 10383344.0}

In [193]:
get_percentiles(publisher_popularity_df, 'views')

{0.0: 1.0,
 0.1: 3.0,
 0.2: 30.0,
 0.30000000000000004: 249.0,
 0.4: 954.0,
 0.5: 2848.0,
 0.6000000000000001: 6654.0,
 0.7000000000000001: 15339.0,
 0.8: 34287.0,
 0.9: 104730.0,
 1.0: 53233366.0}

In [194]:
publisher_id_popularity = publisher_popularity_df.filter('views > 100 and publisher_id is not null').select('publisher_id', 'ctr').rdd.collectAsMap()
len(publisher_id_popularity)

654

In [195]:
publisher_id_popularity

{2: 0.21247687935829163,
 3: 0.20963317155838013,
 4: 0.21732792258262634,
 5: 0.46546033024787903,
 6: 0.1349703073501587,
 9: 0.2965448200702667,
 12: 0.1317494660615921,
 13: 0.28196147084236145,
 14: 0.24947915971279144,
 15: 0.12890994548797607,
 18: 0.3288207948207855,
 19: 0.1876543164253235,
 20: 0.1830357164144516,
 21: 0.17599999904632568,
 24: 0.19283238053321838,
 26: 0.21608413755893707,
 28: 0.14543642103672028,
 31: 0.266978919506073,
 32: 0.17142857611179352,
 35: 0.12031907588243484,
 36: 0.1547406017780304,
 37: 0.30743280053138733,
 38: 0.20494799315929413,
 41: 0.1428571492433548,
 42: 0.35568514466285706,
 46: 0.31025588512420654,
 47: 0.18914729356765747,
 48: 0.08266890794038773,
 49: 0.23726347088813782,
 51: 0.17144490778446198,
 52: 0.19441497325897217,
 54: 0.22184966504573822,
 56: 0.20899635553359985,
 57: 0.3255387246608734,
 58: 0.09147952497005463,
 59: 0.17025257647037506,
 60: 0.20916105806827545,
 61: 0.1538461595773697,
 62: 0.16959750652313232,
 63:

### Average CTR by advertiser_id

In [196]:
advertiser_id_popularity_df = train_set_df.select('clicked', 'advertiser_id', 'ad_id') \
                                            .groupby('advertiser_id').agg(F.sum('clicked').alias('clicks'), 
                                                                          F.count('*').alias('views'),
                                                                          F.countDistinct('ad_id').alias('distinct_ad_ids')) \
                                         .withColumn('ctr', ctr_udf('clicks','views'))

In [197]:
advertiser_id_popularity_df.printSchema()

root
 |-- advertiser_id: integer (nullable = true)
 |-- clicks: long (nullable = true)
 |-- views: long (nullable = false)
 |-- distinct_ad_ids: long (nullable = false)
 |-- ctr: float (nullable = true)



In [198]:
print((advertiser_id_popularity_df.count(), len(advertiser_id_popularity_df.columns)))

(4174, 5)


In [199]:
advertiser_id_popularity_df.show()

+-------------+------+-----+---------------+-----------+
|advertiser_id|clicks|views|distinct_ad_ids|        ctr|
+-------------+------+-----+---------------+-----------+
|         1591|  2626|11869|            103| 0.22124863|
|          471|   811| 6061|             85|  0.1338063|
|         1829| 21116|46569|              4| 0.45343468|
|         3997|   763| 2577|              6|  0.2960807|
|         1088|   304| 2093|              5| 0.14524606|
|         1959|  1156| 7589|             34| 0.15232573|
|         2366|   181|  666|              4| 0.27177176|
|          496|  2314|20272|             15|0.114147596|
|         1580|     6|   25|              2|       0.24|
|         2122|    85|  522|              4| 0.16283526|
|         4101|     8|   69|              2| 0.11594203|
|         3749|  4346|15971|              8|  0.2721182|
|         3918|    75|  625|              8|       0.12|
|         2866|    59|  293|              3| 0.20136519|
|         1645|    29|  182|   

In [200]:
advertiser_id_popularity = advertiser_id_popularity_df.filter('views > 10 and advertiser_id is not null').select('advertiser_id', 'ctr', 'views', 'distinct_ad_ids') \
                            .rdd.map(lambda x: (x['advertiser_id'], (x['ctr'], x['views'], x['distinct_ad_ids'], 1))).collectAsMap()
len(advertiser_id_popularity)

3731

In [201]:
advertiser_id_popularity

{2: (0.0833333358168602, 48, 2, 1),
 3: (0.17528611421585083, 6903, 12, 1),
 4: (0.25985532999038696, 26407, 164, 1),
 5: (0.1931409388780594, 211545, 44, 1),
 6: (0.15849056839942932, 265, 14, 1),
 7: (0.20297718048095703, 7188, 91, 1),
 8: (0.30201035737991333, 41684, 465, 1),
 9: (0.3255387246608734, 35965, 254, 1),
 10: (0.2234499752521515, 1629, 6, 1),
 12: (0.31865569949150085, 87480, 18, 1),
 13: (0.2915695607662201, 87540, 563, 1),
 14: (0.18626461923122406, 211221, 301, 1),
 15: (0.1818564236164093, 7132, 556, 1),
 16: (0.27595236897468567, 142829, 1644, 1),
 17: (0.17626526951789856, 212861, 59, 1),
 18: (0.13665175437927246, 5364, 30, 1),
 19: (0.11376555263996124, 425744, 589, 1),
 20: (0.5749469995498657, 77388, 225, 1),
 21: (0.29374560713768005, 35575, 39, 1),
 22: (0.31746935844421387, 21054, 50, 1),
 23: (0.3257988393306732, 23315, 703, 1),
 24: (0.23449349403381348, 204914, 630, 1),
 25: (0.18305084109306335, 590, 11, 1),
 26: (0.24836742877960205, 60334, 97, 1),
 27:

In [202]:
advertiser_id_popularity_broad = sc.broadcast(advertiser_id_popularity)

In [203]:
get_percentiles(advertiser_id_popularity_df, 'clicks')

{0.0: 0.0,
 0.1: 1.0,
 0.2: 9.0,
 0.30000000000000004: 38.0,
 0.4: 101.0,
 0.5: 220.0,
 0.6000000000000001: 545.0,
 0.7000000000000001: 1250.0,
 0.8: 3359.0,
 0.9: 10588.0,
 1.0: 52469.0}

In [204]:
get_percentiles(advertiser_id_popularity_df, 'views')

{0.0: 1.0,
 0.1: 9.0,
 0.2: 76.0,
 0.30000000000000004: 290.0,
 0.4: 678.0,
 0.5: 1397.0,
 0.6000000000000001: 3131.0,
 0.7000000000000001: 6869.0,
 0.8: 16661.0,
 0.9: 46624.0,
 1.0: 1364504.0}

In [205]:
advertiser_id_popularity = advertiser_id_popularity_df.filter('views > 100 and advertiser_id is not null').select('advertiser_id', 'ctr').rdd.collectAsMap()
len(advertiser_id_popularity)

3268

In [206]:
advertiser_id_popularity

{3: 0.17528611421585083,
 4: 0.25985532999038696,
 5: 0.1931409388780594,
 6: 0.15849056839942932,
 7: 0.20297718048095703,
 8: 0.30201035737991333,
 9: 0.3255387246608734,
 10: 0.2234499752521515,
 12: 0.31865569949150085,
 13: 0.2915695607662201,
 14: 0.18626461923122406,
 15: 0.1818564236164093,
 16: 0.27595236897468567,
 17: 0.17626526951789856,
 18: 0.13665175437927246,
 19: 0.11376555263996124,
 20: 0.5749469995498657,
 21: 0.29374560713768005,
 22: 0.31746935844421387,
 23: 0.3257988393306732,
 24: 0.23449349403381348,
 25: 0.18305084109306335,
 26: 0.24836742877960205,
 27: 0.3493138253688812,
 28: 0.2947576940059662,
 29: 0.3068811595439911,
 30: 0.4603879749774933,
 31: 0.10358566045761108,
 32: 0.22290685772895813,
 33: 0.22039473056793213,
 34: 0.21484927833080292,
 35: 0.1447107046842575,
 36: 0.3252851665019989,
 37: 0.1631205677986145,
 38: 0.20851029455661774,
 39: 0.10450919717550278,
 40: 0.1823529452085495,
 44: 0.36965376138687134,
 45: 0.19198617339134216,
 46: 0.2

### Average CTR by campaign_id

In [207]:
campaign_id_popularity_df = train_set_df.select('clicked', 'campaign_id', 'ad_id') \
                                            .groupby('campaign_id').agg(F.sum('clicked').alias('clicks'), 
                                                                        F.count('*').alias('views'),
                                                                        F.countDistinct('ad_id').alias('distinct_ad_ids')) \
                                         .withColumn('ctr', ctr_udf('clicks','views'))

In [208]:
campaign_id_popularity_df.printSchema()

root
 |-- campaign_id: integer (nullable = true)
 |-- clicks: long (nullable = true)
 |-- views: long (nullable = false)
 |-- distinct_ad_ids: long (nullable = false)
 |-- ctr: float (nullable = true)



In [209]:
print((campaign_id_popularity_df.count(), len(campaign_id_popularity_df.columns)))

(32676, 5)


In [210]:
campaign_id_popularity_df.show()

+-----------+------+-----+---------------+-----------+
|campaign_id|clicks|views|distinct_ad_ids|        ctr|
+-----------+------+-----+---------------+-----------+
|      30654|   433| 3519|             14| 0.12304632|
|      29894|  3005|15758|             11| 0.19069679|
|      26087|   392| 1343|             99| 0.29188386|
|      12799|   337| 4869|            296| 0.06921339|
|      31236|   807| 2422|             98| 0.33319572|
|      28664|   264|  908|             48|  0.2907489|
|      33602|     8|  423|             30| 0.01891253|
|      18800|   844| 3006|             24|  0.2807718|
|      33569|    10|   29|             21|  0.3448276|
|      24347|    16|   34|             21| 0.47058824|
|      30970|    95|  619|              9| 0.15347335|
|      13840|    12|  116|              5| 0.10344828|
|       9900|    20|   85|              7| 0.23529412|
|      31912|    17|  172|              9| 0.09883721|
|      33722|     3|   14|             10| 0.21428572|
|       48

In [211]:
campaign_id_popularity = campaign_id_popularity_df.filter('views > 10 and campaign_id is not null').select('campaign_id', 'ctr', 'views', 'distinct_ad_ids') \
                            .rdd.map(lambda x: (x['campaign_id'], (x['ctr'], x['views'], x['distinct_ad_ids'], 1))).collectAsMap()
len(campaign_id_popularity)

26965

In [212]:
campaign_id_popularity

{1: (0.24167196452617645, 4713, 30, 1),
 2: (0.126379132270813, 1994, 55, 1),
 3: (0.11918604373931885, 344, 2, 1),
 4: (0.26589906215667725, 9293, 251, 1),
 5: (0.03539822995662689, 113, 12, 1),
 6: (0.36313992738723755, 2930, 4, 1),
 7: (0.02711864374577999, 590, 10, 1),
 8: (0.09549795091152191, 733, 25, 1),
 10: (0.07453416287899017, 161, 19, 1),
 11: (0.19682539999485016, 315, 16, 1),
 12: (0.06106870248913765, 131, 16, 1),
 13: (0.19708029925823212, 137, 4, 1),
 16: (0.021922428160905838, 593, 20, 1),
 17: (0.0595238097012043, 168, 14, 1),
 19: (0.0, 13, 2, 1),
 20: (0.06299212574958801, 127, 5, 1),
 21: (0.036036036908626556, 111, 6, 1),
 22: (0.1136866956949234, 6773, 38, 1),
 23: (0.0416666679084301, 72, 33, 1),
 24: (0.04800000041723251, 125, 6, 1),
 25: (0.043478261679410934, 23, 6, 1),
 26: (0.09610763937234879, 2081, 38, 1),
 27: (0.1345565766096115, 654, 10, 1),
 28: (0.2202606499195099, 26396, 145, 1),
 29: (0.21179251372814178, 16502, 221, 1),
 30: (0.36330217123031616,

In [213]:
campaign_id_popularity_broad = sc.broadcast(campaign_id_popularity)

In [214]:
get_percentiles(campaign_id_popularity_df, 'clicks')

{0.0: 0.0,
 0.1: 0.0,
 0.2: 1.0,
 0.30000000000000004: 4.0,
 0.4: 9.0,
 0.5: 19.0,
 0.6000000000000001: 41.0,
 0.7000000000000001: 95.0,
 0.8: 244.0,
 0.9: 887.0,
 1.0: 51382.0}

In [None]:
get_percentiles(campaign_id_popularity_df, 'views')

{0.0: 1.0,
 0.1: 4.0,
 0.2: 14.0,
 0.30000000000000004: 35.0,
 0.4: 74.0,
 0.5: 145.0,
 0.6000000000000001: 289.0,
 0.7000000000000001: 632.0,
 0.8: 1503.0,
 0.9: 4839.0,
 1.0: 698626.0}

In [None]:
campaign_id_popularity = campaign_id_popularity_df.filter('views > 100 and campaign_id is not null').select('campaign_id', 'ctr').rdd.collectAsMap()
len(campaign_id_popularity)

18103

In [None]:
campaign_id_popularity

{1: 0.24167196452617645,
 2: 0.126379132270813,
 3: 0.11918604373931885,
 4: 0.26589906215667725,
 5: 0.03539822995662689,
 6: 0.36313992738723755,
 7: 0.02711864374577999,
 8: 0.09549795091152191,
 10: 0.07453416287899017,
 11: 0.19682539999485016,
 12: 0.06106870248913765,
 13: 0.19708029925823212,
 16: 0.021922428160905838,
 17: 0.0595238097012043,
 20: 0.06299212574958801,
 21: 0.036036036908626556,
 22: 0.1136866956949234,
 24: 0.04800000041723251,
 32794: 0.3256784975528717,
 27: 0.1345565766096115,
 28: 0.2202606499195099,
 29: 0.21179251372814178,
 30: 0.36330217123031616,
 31: 0.03717472031712532,
 32: 0.44258373975753784,
 33: 0.07500000298023224,
 34: 0.05487804859876633,
 36: 0.07821612060070038,
 37: 0.16061611473560333,
 39: 0.03781512752175331,
 40: 0.4150943458080292,
 43: 0.1708860695362091,
 44: 0.14569535851478577,
 45: 0.05627705529332161,
 32814: 0.07829181849956512,
 49: 0.05588993802666664,
 32818: 0.21613934636116028,
 32819: 0.1767088621854782,
 32820: 0.234375

### Average CTR by category

In [None]:
category_id_popularity_df = train_set_df.join(documents_categories_df.alias('cat_local'), on=F.col("document_id_promo") == F.col("cat_local.document_id_cat"), how='inner') \
                                        .select('clicked', 'category_id', 'confidence_level_cat', 'ad_id') \
                                        .groupby('category_id').agg(F.sum('clicked').alias('clicks'), 
                                                                    F.count('*').alias('views'),
                                                                    F.mean('confidence_level_cat').alias('avg_confidence_level_cat'),
                                                                    F.countDistinct('ad_id').alias('distinct_ad_ids')) \
                                         .withColumn('ctr', ctr_udf('clicks','views'))

In [None]:
category_id_popularity_df.printSchema()

root
 |-- category_id: integer (nullable = true)
 |-- clicks: long (nullable = true)
 |-- views: long (nullable = false)
 |-- avg_confidence_level_cat: double (nullable = true)
 |-- distinct_ad_ids: long (nullable = false)
 |-- ctr: float (nullable = true)



In [None]:
print((category_id_popularity_df.count(), len(category_id_popularity_df.columns)))

In [None]:
category_id_popularity_df.show()

In [None]:
category_id_popularity = category_id_popularity_df.filter('views > 10').select('category_id', 'ctr', 'views', 'avg_confidence_level_cat', 'distinct_ad_ids') \
                            .rdd.map(lambda x: (x['category_id'], (x['ctr'], x['views'], x['distinct_ad_ids'], x['avg_confidence_level_cat']))).collectAsMap()
len(category_id_popularity)

In [None]:
category_id_popularity

In [None]:
category_id_popularity_broad = sc.broadcast(category_id_popularity)

In [None]:
list(category_id_popularity.values())[:10]

In [None]:
np.median(np.array(list(map(lambda x: x[1], category_id_popularity.values()))))

In [None]:
sum(map(lambda x: x[1], category_id_popularity.values())) / float(len(category_id_popularity))

In [None]:
# 기본 적으로 범주에 계측 구조가 있음
category_id_popularity

### Average CTR by (country, category)

In [374]:
category_id_by_country_popularity_df = train_set_df.join(documents_categories_df.alias('cat_local'), on=F.col("document_id_promo") == F.col("cat_local.document_id_cat"), how='inner') \
                                        .select('clicked', 'category_id', 'confidence_level_cat', 'event_country', 'ad_id') \
                                        .groupby('event_country','category_id').agg(F.sum('clicked').alias('clicks'), 
                                                                                    F.count('*').alias('views'),
                                                                                    F.mean('confidence_level_cat').alias('avg_confidence_level_cat'),
                                                                                    F.countDistinct('ad_id').alias('distinct_ad_ids')) \
                                         .withColumn('ctr', ctr_udf('clicks','views'))

In [375]:
category_id_by_country_popularity_df.printSchema()

root
 |-- event_country: string (nullable = true)
 |-- category_id: integer (nullable = true)
 |-- clicks: long (nullable = true)
 |-- views: long (nullable = false)
 |-- avg_confidence_level_cat: double (nullable = true)
 |-- distinct_ad_ids: long (nullable = false)
 |-- ctr: float (nullable = true)



In [376]:
print((category_id_by_country_popularity_df.count(), len(category_id_by_country_popularity_df.columns)))

(16989, 7)


In [377]:
category_id_by_country_popularity_df.show()

+-------------+-----------+------+------+------------------------+---------------+----------+
|event_country|category_id|clicks| views|avg_confidence_level_cat|distinct_ad_ids|       ctr|
+-------------+-----------+------+------+------------------------+---------------+----------+
|           CA|       2006| 15901| 65097|     0.11925122044140551|            362|0.24426624|
|           ME|       2004|    18|    76|      0.5238837418391517|             40|0.23684211|
|           US|       1614| 26125|167145|     0.07809115396128045|           1045|0.15630141|
|           ZM|       1403|   539|  3261|      0.4250108736982168|            275|0.16528672|
|           PK|       1614|     8|    69|     0.05645706677350445|             10|0.11594203|
|           GM|       2100|     8|    63|     0.33396321951988195|             37|0.12698413|
|           ET|       1608|     6|    88|     0.48367606704546645|             30|0.06818182|
|           AE|       1808|   378|  1997|      0.52439140321

In [378]:
category_id_by_country_popularity = category_id_by_country_popularity_df.filter('views > 10 and event_country <> ""').select('event_country', 'category_id', 'ctr', 'views', 'avg_confidence_level_cat', 'distinct_ad_ids') \
                                     .rdd.map(lambda x: ((x['event_country'], x['category_id']), (x['ctr'], x['views'], x['distinct_ad_ids'], x['avg_confidence_level_cat']))).collectAsMap()
len(category_id_by_country_popularity)

11910

In [379]:
category_id_by_country_popularity

{(u'CN', 1406): (0.23770491778850555, 122, 39, 0.3471184579319641),
 (u'SZ', 1205): (0.10294117778539658, 204, 78, 0.5798742633276418),
 (u'VI', 1806): (0.42307692766189575, 26, 14, 0.3387056732406983),
 (u'LB', 1512): (0.28947368264198303, 38, 6, 0.17339473825536275),
 (u'NZ', 1208): (0.24856816232204437, 873, 9, 0.0824493313846173),
 (u'GY', 1209): (0.13131313025951385, 99, 30, 0.7053498483893245),
 (u'JO', 1902): (0.1818181872367859, 132, 4, 0.5836100066927347),
 (u'BD', 1612): (0.3095068037509918, 1399, 45, 0.19271252034305247),
 (u'VE', 1710): (0.40740740299224854, 27, 2, 0.07000000029802322),
 (u'AP', 1912): (0.13333334028720856, 15, 8, 0.29045941183964413),
 (u'GD', 1209): (0.1304347813129425, 46, 15, 0.4392626123583835),
 (u'MT', 1209): (0.13108614087104797, 267, 45, 0.585657180011384),
 (u'LV', 1606): (0.13513512909412384, 37, 18, 0.2984846726664015),
 (u'LB', 1407): (0.1428571492433548, 91, 24, 0.4725292758545378),
 (u'IS', 2002): (0.08527132123708725, 258, 22, 0.836863785635

In [380]:
category_id_by_country_popularity_broad = sc.broadcast(category_id_by_country_popularity)

### Average CTR by Topic

In [381]:
topic_id_popularity_df = train_set_df.join(documents_topics_df.alias('top_local'), on=F.col("document_id_promo") == F.col("top_local.document_id_top"), how='inner') \
                                        .select('clicked', 'topic_id', 'confidence_level_top', 'ad_id') \
                                        .groupby('topic_id').agg(F.sum('clicked').alias('clicks'), 
                                                                 F.count('*').alias('views'),
                                                                 F.mean('confidence_level_top').alias('avg_confidence_level_top'),
                                                                 F.countDistinct('ad_id').alias('distinct_ad_ids')) \
                                         .withColumn('ctr', ctr_udf('clicks','views'))

In [382]:
topic_id_popularity_df.printSchema()

root
 |-- topic_id: integer (nullable = true)
 |-- clicks: long (nullable = true)
 |-- views: long (nullable = false)
 |-- avg_confidence_level_top: double (nullable = true)
 |-- distinct_ad_ids: long (nullable = false)
 |-- ctr: float (nullable = true)



In [383]:
print((topic_id_popularity_df.count(), len(topic_id_popularity_df.columns)))

(300, 6)


In [384]:
topic_id_popularity_df.show()

+--------+------+-------+------------------------+---------------+----------+
|topic_id|clicks|  views|avg_confidence_level_top|distinct_ad_ids|       ctr|
+--------+------+-------+------------------------+---------------+----------+
|     148|331735|1363828|     0.13426192197494377|           7619|0.24323815|
|     243| 84521| 293363|    0.019790835068855633|           2678|0.28811064|
|      31| 21534|  92833|      0.2865985556782942|           7454|0.23196493|
|      85|488166|2239670|     0.10823763279640373|          17386|0.21796337|
|     251| 30832| 143973|     0.03564186165744235|            871|0.21415126|
|     137| 51233| 244190|      0.0775061593027229|           2976|0.20980793|
|      65|136858| 693871|     0.06298363105674712|           8780|0.19723839|
|      53| 25715| 101158|     0.11223356000747166|            858| 0.2542063|
|     255| 80075| 331512|      0.1907090215284508|           1065| 0.2415448|
|     296|518516|2475490|     0.18770395020878267|          1109

In [385]:
topic_id_popularity = topic_id_popularity_df.filter('views > 10').select('topic_id', 'ctr', 'views', 'avg_confidence_level_top', 'distinct_ad_ids') \
                            .rdd.map(lambda x: (x['topic_id'], (x['ctr'], x['views'], x['distinct_ad_ids'], x['avg_confidence_level_top']))).collectAsMap()
len(topic_id_popularity)

300

In [386]:
topic_id_popularity

{0: (0.22588878870010376, 676594, 2978, 0.12454212120266196),
 1: (0.18011969327926636, 743550, 3756, 0.07329653618804657),
 2: (0.16660380363464355, 795342, 6653, 0.08900207955325216),
 3: (0.2456766963005066, 160699, 1260, 0.10843043340940528),
 4: (0.2584708034992218, 139184, 1040, 0.05792622426544966),
 5: (0.12331098318099976, 406598, 2634, 0.013693360938958827),
 6: (0.23703773319721222, 256128, 1346, 0.16646229807952262),
 7: (0.15672895312309265, 95260, 616, 0.1281327989038726),
 8: (0.20744867622852325, 2159069, 22224, 0.05014412952315001),
 9: (0.1843664050102234, 343734, 3087, 0.054393204499710156),
 10: (0.19282761216163635, 1740591, 15773, 0.06371794086913858),
 11: (0.21087400615215302, 756115, 3399, 0.15042760267743474),
 12: (0.21754512190818787, 462157, 1409, 0.0939444048727272),
 13: (0.18837426602840424, 659485, 6295, 0.02453314905163204),
 14: (0.21031053364276886, 28049, 431, 0.0883687646427234),
 15: (0.16473616659641266, 944316, 6615, 0.10157626263389441),
 16: (

In [387]:
topic_id_popularity_broad = sc.broadcast(topic_id_popularity)

In [388]:
sum(map(lambda x: x[1], topic_id_popularity.values())) / float(len(topic_id_popularity))

768711.9333333333

In [389]:
sum(map(lambda x: x[2]*x[1], topic_id_popularity.values())) / float(len(topic_id_popularity))

11640508052.943333

### Average CTR by (country, topic)

In [390]:
topic_id_by_country_popularity_df = train_set_df.join(documents_topics_df.alias('top_local'), on=F.col("document_id_promo") == F.col("top_local.document_id_top"), how='inner') \
                                        .select('clicked', 'topic_id', 'confidence_level_top','event_country', 'ad_id') \
                                        .groupby('event_country','topic_id').agg(F.sum('clicked').alias('clicks'), 
                                                                             F.count('*').alias('views'),
                                                                             F.mean('confidence_level_top').alias('avg_confidence_level_top'),
                                                                             F.countDistinct('ad_id').alias('distinct_ad_ids')) \
                                         .withColumn('ctr', ctr_udf('clicks','views'))

In [391]:
topic_id_by_country_popularity_df.printSchema()

root
 |-- event_country: string (nullable = true)
 |-- topic_id: integer (nullable = true)
 |-- clicks: long (nullable = true)
 |-- views: long (nullable = false)
 |-- avg_confidence_level_top: double (nullable = true)
 |-- distinct_ad_ids: long (nullable = false)
 |-- ctr: float (nullable = true)



In [392]:
print((topic_id_by_country_popularity_df.count(), len(topic_id_by_country_popularity_df.columns)))

(56203, 7)


In [393]:
topic_id_by_country_popularity_df.show()

+-------------+--------+------+------+------------------------+---------------+----------+
|event_country|topic_id|clicks| views|avg_confidence_level_top|distinct_ad_ids|       ctr|
+-------------+--------+------+------+------------------------+---------------+----------+
|           CA|      92| 21662|139614|    0.029356814903765235|           2377|0.15515636|
|           CA|      24| 42212|230148|     0.11419139114405072|           2152|0.18341242|
|           AU|     224|   965|  5426|     0.03639321019934814|            878| 0.1778474|
|           CO|      20|   200|   723|     0.10841352173885946|            107| 0.2766252|
|           NZ|      83|    11|   131|     0.07516456949204661|             64|0.08396947|
|           FI|      16|   374|  2898|     0.05158171412265706|            185|0.12905452|
|           GT|     160|   117|   456|     0.28109774911335034|             56|0.25657895|
|           SG|     276|  4561| 14924|    0.029084066187222025|            548|0.30561513|

In [394]:
topic_id_id_by_country_popularity = topic_id_by_country_popularity_df.filter('views > 10 and event_country <> ""').select('event_country', 'topic_id', 'ctr', 'views', 'avg_confidence_level_top', 'distinct_ad_ids') \
                            .rdd.map(lambda x: ((x['event_country'], x['topic_id']), (x['ctr'], x['views'], x['distinct_ad_ids'], x['avg_confidence_level_top']))).collectAsMap()
len(topic_id_id_by_country_popularity)

36611

In [395]:
topic_id_id_by_country_popularity

{(u'GA', 249): (0.06666667014360428, 15, 10, 0.08688442669808864),
 (u'GH', 101): (0.07511737197637558, 426, 12, 0.21659394290446884),
 (u'DE', 291): (0.14864864945411682, 1702, 75, 0.05379584194713663),
 (u'CL', 133): (0.2777777910232544, 18, 6, 0.011680232723140054),
 (u'PA', 140): (0.19706840813159943, 614, 118, 0.1928344390109327),
 (u'UA', 3): (0.3333333432674408, 15, 5, 0.07004577070474624),
 (u'LR', 82): (0.22727273404598236, 22, 7, 0.014361084608191793),
 (u'BM', 198): (0.1764705926179886, 323, 59, 0.14440360544508918),
 (u'ZM', 150): (0.4615384638309479, 39, 13, 0.013496448453038167),
 (u'CL', 211): (0.4166666567325592, 12, 8, 0.02117649690868954),
 (u'SI', 79): (0.4615384638309479, 13, 3, 0.009371809088266812),
 (u'BY', 140): (0.28723403811454773, 94, 47, 0.1638008347276519),
 (u'CM', 86): (0.11999999731779099, 25, 11, 0.04738843351602554),
 (u'ID', 270): (0.15415821969509125, 493, 25, 0.025094599361034604),
 (u'DK', 35): (0.10884353518486023, 147, 17, 0.014556696917228147),


In [396]:
topic_id_id_by_country_popularity_broad = sc.broadcast(topic_id_id_by_country_popularity)

### Average CTR by Entity

In [397]:
entity_id_popularity_df = train_set_df.join(documents_entities_df.alias('ent_local'), on=F.col("document_id_promo") == F.col("ent_local.document_id_ent"), how='inner') \
                                        .select('clicked', 'entity_id', 'confidence_level_ent', 'ad_id') \
                                        .groupby('entity_id').agg(F.sum('clicked').alias('clicks'), 
                                                                  F.count('*').alias('views'),
                                                                  F.mean('confidence_level_ent').alias('avg_confidence_level_ent'),
                                                                  F.countDistinct('ad_id').alias('distinct_ad_ids')) \
                                         .withColumn('ctr', ctr_udf('clicks','views'))

In [398]:
entity_id_popularity_df.printSchema()

root
 |-- entity_id: string (nullable = true)
 |-- clicks: long (nullable = true)
 |-- views: long (nullable = false)
 |-- avg_confidence_level_ent: double (nullable = true)
 |-- distinct_ad_ids: long (nullable = false)
 |-- ctr: float (nullable = true)



In [399]:
print((entity_id_popularity_df.count(), len(entity_id_popularity_df.columns)))

(146351, 6)


In [400]:
entity_id_popularity_df.show()

+--------------------+------+-----+------------------------+---------------+-----------+
|           entity_id|clicks|views|avg_confidence_level_ent|distinct_ad_ids|        ctr|
+--------------------+------+-----+------------------------+---------------+-----------+
|4700853c94c999ca8...|  2373| 8392|      0.3403614094024533|            101|  0.2827693|
|0eb21b66eaa6a00fe...|   334| 2150|     0.39640515967857004|            221| 0.15534884|
|2be0010e2a40c1ece...|  2855|23496|      0.3197848407179371|             74|0.121510044|
|cae9ea88182855368...|     0|   10|      0.3920667827129364|              6|        0.0|
|44df7e630674aa8c3...|  3326|17158|     0.01942388064267405|              6| 0.19384544|
|693f7c2971614fad3...|   340| 3230|      0.2390390865850744|             41| 0.10526316|
|b0d8f60f7664f2ae1...|   606| 4149|      0.6945773475312071|             33| 0.14605929|
|616108818dda595f2...|     0|    2|      0.2946639955043793|              1|        0.0|
|982a6834b565749d9...

In [401]:
entity_id_popularity = entity_id_popularity_df.filter('views > 5').select('entity_id', 'ctr', 'views', 'avg_confidence_level_ent', 'distinct_ad_ids') \
                                     .rdd.map(lambda x: (x['entity_id'], (x['ctr'], x['views'], x['distinct_ad_ids'], x['avg_confidence_level_ent']))).collectAsMap()
len(entity_id_popularity)

89004

In [402]:
entity_id_popularity

{u'0b9f0fef5f202855d326451f57472cf1': (0.0, 6, 1, 0.3419959843158722),
 u'945cfbbb25dd0d3d9ffa685272bd10cd': (0.2295081913471222,
  61,
  4,
  0.708254518078976),
 u'd7da908ad6f7afb1f783429230b0cf7e': (0.23076923191547394,
  65,
  7,
  0.4389866787653703),
 u'b1139aedfdb8e2226e5a6c89253883f4': (0.13480663299560547,
  905,
  2,
  0.38677939772605896),
 u'7c006c07568c0c2f6e47a3df621f137a': (0.06896551698446274,
  29,
  4,
  0.5017787218093872),
 u'2bf9b1cd6346d4ec680c9da1e7416d82': (0.1428571492433548,
  7,
  3,
  0.40570440888404846),
 u'78de74e9f8b222d39887ab8bdaf4c828': (0.23589743673801422,
  3510,
  106,
  0.4096468263541871),
 u'0233417c3437cdbcf16450e1cb2c19cc': (0.1010332927107811,
  3484,
  77,
  0.2830689114814648),
 u'0578474d15a9531500933ad3bb5d8706': (0.16357605159282684,
  25022,
  25,
  0.401252938082379),
 u'd0140aa3579f194aa7a35f29f4b16893': (0.07488986849784851,
  227,
  3,
  0.3870576322078705),
 u'409ee03a271af97578fc17302a9a268c': (0.21005459129810333,
  27112,
  77,

In [403]:
entity_id_popularity_broad = sc.broadcast(entity_id_popularity)

In [404]:
np.median(np.array(list(map(lambda x: x[1], entity_id_popularity.values()))))

48.0

In [405]:
sum(map(lambda x: x[1], entity_id_popularity.values())) / float(len(entity_id_popularity))

2449.352096534987

### Average CTR by (country, entity)

In [None]:
entity_id_by_country_popularity_df = train_set_df.join(documents_entities_df.alias('ent_local'), on=F.col("document_id_promo") == F.col("ent_local.document_id_ent"), how='inner') \
                                        .select('clicked', 'entity_id', 'event_country', 'confidence_level_ent','ad_id') \
                                        .groupby('event_country','entity_id').agg(F.sum('clicked').alias('clicks'), 
                                                                             F.count('*').alias('views'),
                                                                             F.mean('confidence_level_ent').alias('avg_confidence_level_ent'),
                                                                             F.countDistinct('ad_id').alias('distinct_ad_ids')) \
                                         .withColumn('ctr', ctr_udf('clicks','views'))

In [407]:
entity_id_by_country_popularity_df.printSchema()

root
 |-- event_country: string (nullable = true)
 |-- entity_id: string (nullable = true)
 |-- clicks: long (nullable = true)
 |-- views: long (nullable = false)
 |-- avg_confidence_level_ent: double (nullable = true)
 |-- distinct_ad_ids: long (nullable = false)
 |-- ctr: float (nullable = true)



In [408]:
print((entity_id_by_country_popularity_df.count(), len(entity_id_by_country_popularity_df.columns)))

(629432, 7)


In [None]:
entity_id_by_country_popularity_df.show()

+-------------+--------------------+------+------+------------------------+---------------+-----------+
|event_country|           entity_id|clicks| views|avg_confidence_level_ent|distinct_ad_ids|        ctr|
+-------------+--------------------+------+------+------------------------+---------------+-----------+
|           EE|edff6d24609915ca2...|     0|     1|      0.7931986451148987|              1|        0.0|
|           HR|547ee2aeabd67dc74...|    13|    67|      0.5127823352813721|             12| 0.19402985|
|           US|916b51d124ff44480...|  4612| 18571|     0.28301924562892006|            171|  0.2483442|
|           CL|d4bae7e19df1cdcb1...|     2|    22|      0.6119086769494143|              4| 0.09090909|
|           BE|3a32ddea60e9a4f73...|   109|   350|      0.4005678594112396|             66| 0.31142858|
|           US|e7f1ca00a9fe11a52...|  7377| 26483|     0.24046145796762314|            255| 0.27855605|
|           US|4c39808057a299f87...| 71857|296130|     0.3227297

In [257]:
entity_id_by_country_popularity = entity_id_by_country_popularity_df.filter('views > 5 and event_country <> ""').select('event_country', 'entity_id', 'ctr', 'views', 'avg_confidence_level_ent', 'distinct_ad_ids') \
                .rdd.map(lambda x: ((x['event_country'], x['entity_id']), (x['ctr'], x['views'], x['distinct_ad_ids'], x['avg_confidence_level_ent']))).collectAsMap()
len(entity_id_by_country_popularity)

262920

In [None]:
entity_id_by_country_popularity

In [None]:
entity_id_by_country_popularity_broad = sc.broadcast(entity_id_by_country_popularity)

### Loading # docs by categories, topics, entities

In [202]:
import cPickle #python2
#import _pickle as cPickle #python3

# 참고 - http://pythonstudy.xyz/python/article/510-%EC%A7%81%EB%A0%AC%ED%99%94%EC%99%80-%EC%97%AD%EC%A7%81%EB%A0%AC%ED%99%94

In [203]:
with open('categories_docs_counts.pickle', 'rb') as input_file:
    categories_docs_counts = cPickle.load(input_file)    
len(categories_docs_counts)

97

In [204]:
categories_docs_counts

{1000: 5074,
 1100: 212249,
 1200: 7,
 1202: 3259,
 1203: 30511,
 1204: 8258,
 1205: 103539,
 1206: 7523,
 1207: 28540,
 1208: 3575,
 1209: 34390,
 1210: 51624,
 1211: 4871,
 1302: 29105,
 1303: 14198,
 1304: 6660,
 1305: 5426,
 1306: 13248,
 1307: 1506,
 1308: 473,
 1400: 1,
 1402: 54763,
 1403: 572107,
 1404: 30667,
 1405: 64063,
 1406: 54394,
 1407: 124783,
 1408: 155883,
 1500: 2,
 1502: 16262,
 1503: 57335,
 1504: 15084,
 1505: 51339,
 1506: 14846,
 1507: 2569,
 1509: 2941,
 1510: 83877,
 1511: 25254,
 1512: 18502,
 1513: 276203,
 1514: 60131,
 1515: 31992,
 1516: 2454,
 1600: 3,
 1602: 74315,
 1603: 77881,
 1604: 67342,
 1605: 2094,
 1606: 32408,
 1607: 25349,
 1608: 57479,
 1609: 63995,
 1610: 49192,
 1611: 50112,
 1612: 32503,
 1613: 44567,
 1614: 9141,
 1700: 1,
 1702: 408499,
 1703: 93883,
 1704: 2861,
 1705: 6553,
 1706: 105170,
 1707: 136830,
 1708: 142908,
 1709: 18922,
 1710: 13242,
 1711: 46729,
 1800: 8,
 1802: 7586,
 1804: 3632,
 1805: 40300,
 1806: 68248,
 1807: 43922

In [None]:
with open('topics_docs_counts.pickle', 'rb') as input_file:
    topics_docs_counts = cPickle.load(input_file)
len(topics_docs_counts)

In [None]:
topics_docs_counts

In [None]:
with open('entities_docs_counts.pickle', 'rb') as input_file:
    entities_docs_counts = cPickle.load(input_file)
len(entities_docs_counts)

In [None]:
entities_docs_counts

In [None]:
documents_total = documents_meta_df.count()
documents_total

### Exploring Publish Time

In [None]:
publish_times_df = train_set_df.filter('publish_time is not null').select('document_id_promo','publish_time').distinct().select(F.col('publish_time').cast(IntegerType()))

In [None]:
publish_times_df.printSchema()

In [None]:
print((publish_times_df.count(), len(publish_times_df.columns)))

In [None]:
publish_times_df.show()

In [None]:
publish_time_percentiles = get_percentiles(publish_times_df, 'publish_time', quantiles_levels=[0.5], max_error_rate=0.001)
publish_time_percentiles

In [None]:
publish_time_median = int(publish_time_percentiles[0.5])
publish_time_median

In [None]:
datetime.datetime.utcfromtimestamp(publish_time_median)

In [None]:
def get_days_diff(newer_timestamp, older_timestamp):
    sec_diff = newer_timestamp - older_timestamp
    days_diff = sec_diff / 60 / 60 / 24
    return days_diff

def get_time_decay_factor(timestamp, timestamp_ref=None, alpha=0.001):
    if timestamp_ref == None:
        timestamp_ref = time.time()
        
    days_diff = get_days_diff(timestamp_ref, timestamp) # get_days_diff : time.time() - timestamp
    denominator = math.pow(1+alpha, days_diff)
    if denominator != 0:
        return 1.0 / denominator
    else:
        return 0.0

In [None]:
def convert_odd_timestamp(timestamp_ms_relative):
    TIMESTAMP_DELTA=1465876799998
    return datetime.datetime.fromtimestamp((int(timestamp_ms_relative)+TIMESTAMP_DELTA)//1000)

In [None]:
TIME_DECAY_ALPHA = 0.0005

In [None]:
ref_dates = [
                1476714880, # 7 days
                1474727680, # 30 days
                1469370880, # 90 days
                1461508480,  # 180 days
                1445697280, # 1 year
                1414161280 # 2 years
]

for d in ref_dates:
    print(datetime.datetime.utcfromtimestamp(d), get_time_decay_factor(d, alpha=TIME_DECAY_ALPHA))

### Get local time

In [None]:
DEFAULT_TZ_EST = -4.0

In [None]:
def get_local_utc_bst_tz(event_country, event_country_state):
    local_tz = DEFAULT_TZ_EST
    if len(event_country) > 0:
        if event_country in countries_utc_dst_broad.value:
            local_tz = countries_utc_dst_broad.value[event_country]
            if len(event_country_state)>2:
                state = event_country_state[3:5]
                if event_country == 'US':  
                    if state in us_states_utc_dst_broad.value:
                        local_tz = us_states_utc_dst_broad.value[state]                
                elif event_country == 'CA':
                    if state in ca_countries_utc_dst_broad.value:
                        local_tz = ca_countries_utc_dst_broad.value[state] 
    return float(local_tz)

In [None]:
hour_bins_dict = {'EARLY_MORNING': 1,
             'MORNING': 2,
             'MIDDAY': 3,
             'AFTERNOON': 4,
             'EVENING': 5,
             'NIGHT': 6}

hour_bins_values = sorted(hour_bins_dict.values())

In [None]:
def get_hour_bin(hour):
    if hour >= 5 and hour < 8:
        hour_bin = hour_bins_dict['EARLY_MORNING']
    elif hour >= 8 and hour < 11:
        hour_bin = hour_bins_dict['MORNING']
    elif hour >= 11 and hour < 14:
        hour_bin = hour_bins_dict['MIDDAY']
    elif hour >= 14 and hour < 19:
        hour_bin = hour_bins_dict['AFTERNOON']
    elif hour >= 19 and hour < 22:
        hour_bin = hour_bins_dict['EVENING']
    else:
        hour_bin = hour_bins_dict['NIGHT']
    return hour_bin

In [None]:
def get_local_datetime(dt, event_country, event_country_state):
    local_tz = get_local_utc_bst_tz(event_country, event_country_state)  
    tz_delta = local_tz - DEFAULT_TZ_EST
    local_time = dt +  datetime.timedelta(hours=tz_delta)
    return local_time

In [None]:
get_local_datetime(datetime.datetime.now(), 'US', 'US>CA')

In [None]:
def is_weekend(dt):
    return dt.weekday() >= 5

In [None]:
is_weekend(datetime.datetime(2016, 6, 14))

### Average CTR functions

In [None]:
timestamp_ref = date_time_to_unix_epoch(datetime.datetime(2016, 6, 29, 3, 59, 59))
timestamp_ref

In [None]:
decay_factor_default = get_time_decay_factor(publish_time_median, timestamp_ref, alpha=TIME_DECAY_ALPHA)
print("decay_factor_default", decay_factor_default)

In [None]:
def get_confidence_sample_size(sample, max_for_reference=100000):
    #Avoiding overflow for large sample size
    if sample >= max_for_reference:
        return 1.0

    ref_log = math.log(1+max_for_reference, 2) #Curiosly reference in log  with base 2 gives a slightly higher score, so I will keep
    
    return math.log(1+sample) / float(ref_log)
    
for i in [0,0.5,1,2,3,4,5,10,20,30,100,200,300,1000,2000,3000,10000,20000,30000, 50000, 90000, 100000, 500000, 900000, 1000000, 2171607]:
    print(i, get_confidence_sample_size(i))

In [None]:
def get_popularity(an_id, a_dict):
    return (a_dict[an_id][0], get_confidence_sample_size(a_dict[an_id][1] / float(a_dict[an_id][2])) * a_dict[an_id][3]) if an_id in a_dict else (None, None)

In [None]:
def get_weighted_avg_popularity_from_list(ids_list, confidence_ids_list, pop_dict):
    pops = list(filter(lambda x: x[0][0]!=None, [(get_popularity(an_id, pop_dict), confidence) for an_id, confidence in zip(ids_list, confidence_ids_list)]))
    #print("pops",pops)
    if len(pops) > 0:
        weighted_avg = sum(map(lambda x: x[0][0]*x[0][1]*x[1], pops)) / float(sum(map(lambda x: x[0][1]*x[1], pops)))
        confidence = max(map(lambda x: x[0][1]*x[1], pops))
        return weighted_avg, confidence
    else:
        return None, None

In [None]:
def get_weighted_avg_country_popularity_from_list(event_country, ids_list, confidence_ids_list, pop_dict):
    pops = list(filter(lambda x: x[0][0]!=None, [(get_popularity((event_country, an_id), pop_dict), confidence) for an_id, confidence in zip(ids_list, confidence_ids_list)]))
    
    if len(pops) > 0:
        weighted_avg = sum(map(lambda x: x[0][0]*x[0][1]*x[1], pops)) / float(sum(map(lambda x: x[0][1]*x[1], pops)))
        confidence = max(map(lambda x: x[0][1]*x[1], pops))
        return weighted_avg, confidence
    else:
        return None, None

In [None]:
def get_popularity_score(event_country, ad_id, document_id, source_id, 
                         publisher_id, advertiser_id, campaign_id, document_id_event,
                            category_ids_by_doc, cat_confidence_level_by_doc, 
                            topic_ids_by_doc, top_confidence_level_by_doc,
                            entity_ids_by_doc, ent_confidence_level_by_doc,
                            output_detailed_list=False):
    probs = []
    
    avg_ctr, confidence = get_popularity(ad_id, ad_id_popularity_broad.value)    
    if avg_ctr != None:
        probs.append(('pop_ad_id', avg_ctr, confidence))
        
    avg_ctr, confidence = get_popularity(document_id, document_id_popularity_broad.value)
    if avg_ctr != None:
        probs.append(('pop_document_id', avg_ctr, confidence))  
        
    avg_ctr, confidence = get_popularity((document_id_event, document_id), doc_event_doc_ad_avg_ctr_broad.value)
    if avg_ctr != None:
        probs.append(('pop_doc_event_doc_ad', avg_ctr, confidence))
        
        
    if source_id != -1:
        avg_ctr = None
        if event_country != '':
            avg_ctr, confidence = get_popularity((event_country, source_id), source_id_by_country_popularity_broad.value)
        if avg_ctr != None:
            probs.append(('pop_source_id_country', avg_ctr, confidence))
            
        avg_ctr, confidence = get_popularity(source_id, source_id_popularity_broad.value)        
        if avg_ctr != None:
            probs.append(('pop_source_id', avg_ctr, confidence))
            
            
    if publisher_id != None:
        avg_ctr, confidence = get_popularity(publisher_id, publisher_popularity_broad.value)
        if avg_ctr != None:
            probs.append(('pop_publisher_id', avg_ctr, confidence)) 
            
    if advertiser_id != None:
        avg_ctr, confidence = get_popularity(advertiser_id, advertiser_id_popularity_broad.value)
        if avg_ctr != None:
            probs.append(('pop_advertiser_id', avg_ctr, confidence)) 
    
    if campaign_id != None:
        avg_ctr, confidence = get_popularity(campaign_id, campaign_id_popularity_broad.value)
        if avg_ctr != None:
            probs.append(('pop_campain_id', avg_ctr, confidence))  

    if len(entity_ids_by_doc) > 0: 
        avg_ctr = None
        if event_country != '':
            avg_ctr, confidence = get_weighted_avg_country_popularity_from_list(event_country, entity_ids_by_doc, ent_confidence_level_by_doc, 
                                        entity_id_by_country_popularity_broad.value)
        if avg_ctr != None:
            probs.append(('pop_entity_id_country', avg_ctr, confidence))
            
        avg_ctr, confidence = get_weighted_avg_popularity_from_list(entity_ids_by_doc, ent_confidence_level_by_doc, 
                                                                    entity_id_popularity_broad.value) 
        if avg_ctr != None:
            probs.append(('pop_entity_id', avg_ctr, confidence))
            
    
    
    if len(topic_ids_by_doc) > 0:  
        avg_ctr = None
        if event_country != '':
            avg_ctr, confidence = get_weighted_avg_country_popularity_from_list(event_country, topic_ids_by_doc, top_confidence_level_by_doc, 
                                        topic_id_id_by_country_popularity_broad.value)
        if avg_ctr != None:
            probs.append(('pop_topic_id_country', avg_ctr, confidence))
            
        avg_ctr, confidence = get_weighted_avg_popularity_from_list(topic_ids_by_doc, top_confidence_level_by_doc, 
                                                                    topic_id_popularity_broad.value)            
        if avg_ctr != None:
            probs.append(('pop_topic_id', avg_ctr, confidence))
    
    
    if len(category_ids_by_doc) > 0:  
        avg_ctr = None
        if event_country != '':
            avg_ctr, confidence = get_weighted_avg_country_popularity_from_list(event_country, category_ids_by_doc, cat_confidence_level_by_doc, 
                                        category_id_by_country_popularity_broad.value)
        if avg_ctr != None:
            probs.append(('pop_category_id_country', avg_ctr, confidence))
        
        avg_ctr, confidence = get_weighted_avg_popularity_from_list(category_ids_by_doc, cat_confidence_level_by_doc, 
                                                                    category_id_popularity_broad.value)
        if avg_ctr != None:
            probs.append(('pop_category_id', avg_ctr, confidence))
    
    #print("[get_popularity_score] probs", probs)
    if output_detailed_list:
        return probs
    
    else:    
        if len(probs) > 0:
            #weighted_avg_probs_by_confidence = sum(map(lambda x: x[1] *  math.log(1+x[2],2), probs)) / float(sum(map(lambda x: math.log(1+x[2],2), probs)))        
            weighted_avg_probs_by_confidence = sum(map(lambda x: x[1] * x[2], probs)) / float(sum(map(lambda x: x[2], probs)))                
            confidence = max(map(lambda x: x[2], probs))
            return weighted_avg_probs_by_confidence, confidence
        else:
            return None, None

### Content-Based similarity functions

In [None]:
def cosine_similarity_dicts(dict1, dict2):
    dict1_norm = math.sqrt(sum([v**2 for v in dict1.values()]))
    dict2_norm = math.sqrt(sum([v**2 for v in dict2.values()]))
    
    sum_common_aspects = 0.0
    intersections = 0
    for key in dict1:
        if key in dict2:
            sum_common_aspects += dict1[key] * dict2[key] 
            intersections += 1
        
    return sum_common_aspects / (dict1_norm * dict2_norm), intersections

In [None]:
def cosine_similarity_user_docs_aspects(user_aspect_profile, doc_aspect_ids, doc_aspects_confidence, aspect_docs_counts):
    if user_aspect_profile==None or len(user_aspect_profile) == 0 or doc_aspect_ids == None or len(doc_aspect_ids) == 0:
        return None, None
        
    doc_aspects = dict(zip(doc_aspect_ids, doc_aspects_confidence))
    doc_aspects_tfidf_confid = {}
    for key in doc_aspects:
        tf = 1.0
        idf = math.log(math.log(documents_total / float(aspect_docs_counts[key])))
        confidence = doc_aspects[key]
        doc_aspects_tfidf_confid[key] = tf*idf * confidence
        
    user_aspects_tfidf_confid = {}    
    for key in user_aspect_profile:
        tfidf = user_aspect_profile[key][0]
        confidence = user_aspect_profile[key][1]
        user_aspects_tfidf_confid[key] = tfidf * confidence
        
    similarity, intersections = cosine_similarity_dicts(doc_aspects_tfidf_confid, user_aspects_tfidf_confid)
    
    if intersections > 0:
        #P(A intersect B)_intersections = P(A)^intersections * P(B)^intersections
        random_error = math.pow(len(doc_aspects)         / float(len(aspect_docs_counts)), intersections) * \
                       math.pow(len(user_aspect_profile) / float(len(aspect_docs_counts)), intersections)
        confidence = 1.0 - random_error
    else:
        #P(A not intersect B) = 1 - P(A intersect B)
        random_error = 1 - ((len(doc_aspects) / float(len(aspect_docs_counts))) * \
                            (len(user_aspect_profile) / float(len(aspect_docs_counts))))
    
    confidence = 1.0 - random_error    
    
    return similarity, confidence

In [None]:
def cosine_similarity_doc_event_doc_ad_aspects(doc_event_aspect_ids, doc_event_aspects_confidence, 
                                               doc_ad_aspect_ids, doc_ad_aspects_confidence, 
                                               aspect_docs_counts):
    if doc_event_aspect_ids == None or len(doc_event_aspect_ids) == 0 or \
       doc_ad_aspect_ids == None or len(doc_ad_aspect_ids) == 0:
        return None, None
        
    doc_event_aspects = dict(zip(doc_event_aspect_ids, doc_event_aspects_confidence))
    doc_event_aspects_tfidf_confid = {}
    for key in doc_event_aspect_ids:
        tf = 1.0
        idf = math.log(math.log(documents_total / float(aspect_docs_counts[key])))
        confidence = doc_event_aspects[key]
        doc_event_aspects_tfidf_confid[key] = tf*idf * confidence
        
    doc_ad_aspects = dict(zip(doc_ad_aspect_ids, doc_ad_aspects_confidence))
    doc_ad_aspects_tfidf_confid = {}
    for key in doc_ad_aspect_ids:
        tf = 1.0
        idf = math.log(math.log(documents_total / float(aspect_docs_counts[key])))
        confidence = doc_ad_aspects[key]
        doc_ad_aspects_tfidf_confid[key] = tf*idf * confidence
        
    similarity, intersections = cosine_similarity_dicts(doc_event_aspects_tfidf_confid, doc_ad_aspects_tfidf_confid)
    
    if intersections > 0:
        #P(A intersect B)_intersections = P(A)^intersections * P(B)^intersections
        random_error = math.pow(len(doc_event_aspect_ids) / float(len(aspect_docs_counts)), intersections) * \
                       math.pow(len(doc_ad_aspect_ids) / float(len(aspect_docs_counts)), intersections)
        confidence = 1.0 - random_error
    else:
        #P(A not intersect B) = 1 - P(A intersect B)
        random_error = 1 - ((len(doc_event_aspect_ids) / float(len(aspect_docs_counts))) * \
                            (len(doc_ad_aspect_ids) / float(len(aspect_docs_counts))))
    
    confidence = 1.0 - random_error    
    
    return similarity, confidence

In [None]:
def get_user_cb_interest_score(user_views_count, user_categories, user_topics, user_entities, 
                            timestamp_event, category_ids_by_doc, cat_confidence_level_by_doc, 
                            topic_ids_by_doc, top_confidence_level_by_doc,
                            entity_ids_by_doc, ent_confidence_level_by_doc, 
                            output_detailed_list=False):

    #Content-Based
    
    sims = []
    
    categories_similarity, cat_sim_confidence = cosine_similarity_user_docs_aspects(user_categories, category_ids_by_doc, cat_confidence_level_by_doc, categories_docs_counts)
    if categories_similarity != None:
        sims.append(('user_doc_ad_sim_categories', categories_similarity, cat_sim_confidence))
    
    topics_similarity, top_sim_confidence = cosine_similarity_user_docs_aspects(user_topics, topic_ids_by_doc, top_confidence_level_by_doc, topics_docs_counts)
    if topics_similarity != None:
        sims.append(('user_doc_ad_sim_topics', topics_similarity, top_sim_confidence))
    
    entities_similarity, entity_sim_confid = cosine_similarity_user_docs_aspects(user_entities, entity_ids_by_doc, ent_confidence_level_by_doc, entities_docs_counts)
    if entities_similarity != None:
        sims.append(('user_doc_ad_sim_entities', entities_similarity, entity_sim_confid))
    
    if output_detailed_list:
        return sims
    else:
        if len(sims) > 0:
            weighted_avg_sim_by_confidence = sum(map(lambda x: x[1]*x[2], sims)) / float(sum(map(lambda x: x[2], sims)))
            confidence = sum(map(lambda x: x[2], sims)) / float(len(sims))

            #print("[get_user_cb_interest_score] sims: {} | Avg: {} - Confid: {}".format(sims, weighted_avg_sim_by_confidence, confidence))
            return weighted_avg_sim_by_confidence, confidence
        else:
            return None, None

In [None]:
def get_doc_event_doc_ad_cb_similarity_score(doc_event_category_ids, doc_event_cat_confidence_levels, 
                                             doc_event_topic_ids, doc_event_top_confidence_levels,
                                             doc_event_entity_ids, doc_event_ent_confidence_levels, 
                                             doc_ad_category_ids, doc_ad_cat_confidence_levels, 
                                             doc_ad_topic_ids, doc_ad_top_confidence_levels,
                                             doc_ad_entity_ids, doc_ad_ent_confidence_levels,
                            output_detailed_list=False):

    #Content-Based
    sims = []
    
    
    
    categories_similarity, cat_sim_confidence = cosine_similarity_doc_event_doc_ad_aspects(
                                                    doc_event_category_ids, doc_event_cat_confidence_levels, 
                                                    doc_ad_category_ids, doc_ad_cat_confidence_levels, 
                                                    categories_docs_counts)
    if categories_similarity != None:
        sims.append(('doc_event_doc_ad_sim_categories', categories_similarity, cat_sim_confidence))
    
    topics_similarity, top_sim_confidence = cosine_similarity_doc_event_doc_ad_aspects(
                                                    doc_event_topic_ids, doc_event_top_confidence_levels, 
                                                    doc_ad_topic_ids, doc_ad_top_confidence_levels, 
                                                    topics_docs_counts)
    
    if topics_similarity != None:
        sims.append(('doc_event_doc_ad_sim_topics', topics_similarity, top_sim_confidence))
        
    entities_similarity, entity_sim_confid = cosine_similarity_doc_event_doc_ad_aspects(
                                                    doc_event_entity_ids, doc_event_ent_confidence_levels, 
                                                    doc_ad_entity_ids, doc_ad_ent_confidence_levels, 
                                                    entities_docs_counts)
    
    if entities_similarity != None:
        sims.append(('doc_event_doc_ad_sim_entities', entities_similarity, entity_sim_confid))
    
    if output_detailed_list:
        return sims
    else:
        if len(sims) > 0:
            weighted_avg_sim_by_confidence = sum(map(lambda x: x[1]*x[2], sims)) / float(sum(map(lambda x: x[2], sims)))
            confidence = sum(map(lambda x: x[2], sims)) / float(len(sims))

            #print("[get_user_cb_interest_score] sims: {} | Avg: {} - Confid: {}".format(sims, weighted_avg_sim_by_confidence, confidence))
            return weighted_avg_sim_by_confidence, confidence
        else:
            return None, None

### Feature Vector export

In [9]:
bool_feature_names = ['event_weekend',
                      'user_has_already_viewed_doc']

In [38]:
int_feature_names = ['user_views',
                    'ad_views',
                    'doc_views',
                    'doc_event_days_since_published',
                    'doc_event_hour',
                    'doc_ad_days_since_published']

In [39]:
float_feature_names = ['pop_ad_id',       
                'pop_ad_id_conf',   
                'pop_ad_id_conf_multipl', 
                'pop_document_id',                
                'pop_document_id_conf',
                'pop_document_id_conf_multipl',
                'pop_publisher_id',
                'pop_publisher_id_conf',
                'pop_publisher_id_conf_multipl',
                'pop_advertiser_id',
                'pop_advertiser_id_conf',
                'pop_advertiser_id_conf_multipl',
                'pop_campain_id',
                'pop_campain_id_conf',
                'pop_campain_id_conf_multipl',
                'pop_doc_event_doc_ad',
                'pop_doc_event_doc_ad_conf',
                'pop_doc_event_doc_ad_conf_multipl',
                'pop_source_id',  
                'pop_source_id_conf',
                'pop_source_id_conf_multipl',
                'pop_source_id_country',
                'pop_source_id_country_conf',
                'pop_source_id_country_conf_multipl',
                'pop_entity_id',    
                'pop_entity_id_conf',
                'pop_entity_id_conf_multipl',
                'pop_entity_id_country',
                'pop_entity_id_country_conf',
                'pop_entity_id_country_conf_multipl',
                'pop_topic_id', 
                'pop_topic_id_conf',
                'pop_topic_id_conf_multipl',
                'pop_topic_id_country',
                'pop_topic_id_country_conf',
                'pop_topic_id_country_conf_multipl',
                'pop_category_id', 
                'pop_category_id_conf',
                'pop_category_id_conf_multipl',
                'pop_category_id_country',
                'pop_category_id_country_conf',
                'pop_category_id_country_conf_multipl',
                'user_doc_ad_sim_categories',    
                'user_doc_ad_sim_categories_conf',
                'user_doc_ad_sim_categories_conf_multipl',
                'user_doc_ad_sim_topics',    
                'user_doc_ad_sim_topics_conf',
                'user_doc_ad_sim_topics_conf_multipl',
                'user_doc_ad_sim_entities',                    
                'user_doc_ad_sim_entities_conf',
                'user_doc_ad_sim_entities_conf_multipl',
                'doc_event_doc_ad_sim_categories',    
                'doc_event_doc_ad_sim_categories_conf',
                'doc_event_doc_ad_sim_categories_conf_multipl',
                'doc_event_doc_ad_sim_topics',    
                'doc_event_doc_ad_sim_topics_conf',
                'doc_event_doc_ad_sim_topics_conf_multipl',
                'doc_event_doc_ad_sim_entities',                    
                'doc_event_doc_ad_sim_entities_conf',
                'doc_event_doc_ad_sim_entities_conf_multipl']

In [40]:
TRAFFIC_SOURCE_FV='traffic_source'
EVENT_HOUR_FV='event_hour'
EVENT_COUNTRY_FV = 'event_country'
EVENT_COUNTRY_STATE_FV = 'event_country_state'
EVENT_GEO_LOCATION_FV = 'event_geo_location'
EVENT_PLATFORM_FV = 'event_platform'
AD_ADVERTISER_FV = 'ad_advertiser'
DOC_AD_SOURCE_ID_FV='doc_ad_source_id'
DOC_AD_PUBLISHER_ID_FV='doc_ad_publisher_id'
DOC_EVENT_SOURCE_ID_FV='doc_event_source_id'
DOC_EVENT_PUBLISHER_ID_FV='doc_event_publisher_id'
DOC_AD_CATEGORY_ID_FV='doc_ad_category_id'
DOC_AD_TOPIC_ID_FV='doc_ad_topic_id'
DOC_AD_ENTITY_ID_FV='doc_ad_entity_id'
DOC_EVENT_CATEGORY_ID_FV='doc_event_category_id'
DOC_EVENT_TOPIC_ID_FV='doc_event_topic_id'
DOC_EVENT_ENTITY_ID_FV='doc_event_entity_id'

### Configuring feature vector

In [41]:
category_feature_names_integral = ['ad_advertiser',
 'doc_ad_category_id_1',
 'doc_ad_category_id_2',
 'doc_ad_category_id_3',
 'doc_ad_topic_id_1',
 'doc_ad_topic_id_2',
 'doc_ad_topic_id_3',
 'doc_ad_entity_id_1', 
 'doc_ad_entity_id_2', 
 'doc_ad_entity_id_3', 
 'doc_ad_entity_id_4', 
 'doc_ad_entity_id_5', 
 'doc_ad_entity_id_6', 
 'doc_ad_publisher_id',
 'doc_ad_source_id', 
 'doc_event_category_id_1',
 'doc_event_category_id_2',
 'doc_event_category_id_3',
 'doc_event_topic_id_1',
 'doc_event_topic_id_2',
 'doc_event_topic_id_3',
 'doc_event_entity_id_1',
 'doc_event_entity_id_2',
 'doc_event_entity_id_3',
 'doc_event_entity_id_4',
 'doc_event_entity_id_5',
 'doc_event_entity_id_6',
 'doc_event_publisher_id',
 'doc_event_source_id', 
 'event_country',
 'event_country_state',
 'event_geo_location',
 'event_hour',
 'event_platform',
 'traffic_source']

In [42]:
feature_vector_labels_integral = bool_feature_names + int_feature_names + float_feature_names + category_feature_names_integral

In [None]:
feature_vector_labels_integral_dict = dict([(key, idx) for idx, key in enumerate(feature_vector_labels_integral)])

In [None]:
with open('feature_vector_labels_integral.txt', 'w') as output:
    output.writelines('\n'.join(feature_vector_labels_integral))

In [None]:
def set_feature_vector_cat_value(field_name, field_value, feature_vector):
    if not is_null(field_value) and str(field_value) != '-1':
        feature_name = get_ohe_feature_name(field_name, field_value)
        if feature_name in feature_vector_labels_dict:
            feature_idx = feature_vector_labels_dict[feature_name]
        else:
            #Unpopular category value
            feature_idx = feature_vector_labels_dict[get_ohe_feature_name(field_name, LESS_SPECIAL_CAT_VALUE)]
            
        feature_vector[feature_idx] = float(1)
        
def set_feature_vector_cat_values(field_name, field_values, feature_vector):
    for field_value in field_values:
        set_feature_vector_cat_value(field_name, field_value, feature_vector)

In [None]:
def get_ad_feature_vector(user_doc_ids_viewed, user_views_count, user_categories, user_topics, user_entities, 
                            event_country, event_country_state,
                            ad_id, document_id, source_id, doc_ad_publish_time, timestamp_event, platform_event,
                            geo_location_event, 
                            doc_event_source_id, doc_event_publisher_id, doc_event_publish_time,
                            traffic_source_pv, advertiser_id, publisher_id,
                            campaign_id, document_id_event,
                            doc_ad_category_ids, doc_ad_cat_confidence_levels, 
                            doc_ad_topic_ids, doc_ad_top_confidence_levels,
                            doc_ad_entity_ids, doc_ad_ent_confidence_levels,
                            doc_event_category_ids, doc_event_cat_confidence_levels,
                            doc_event_topic_ids, doc_event_top_confidence_levels,
                            doc_event_entity_ids, doc_event_ent_confidence_levels):
             
    try:

        feature_vector = {}
        
        if user_views_count != None:
            feature_vector[feature_vector_labels_dict['user_views']] = float(user_views_count)
         
        if user_doc_ids_viewed != None:
            feature_vector[feature_vector_labels_dict['user_has_already_viewed_doc']] = float(document_id in user_doc_ids_viewed)               
          
        if ad_id in ad_id_popularity_broad.value:            
            feature_vector[feature_vector_labels_dict['ad_views']] = float(ad_id_popularity_broad.value[ad_id][1])
        
        if document_id in document_id_popularity_broad.value:
            feature_vector[feature_vector_labels_dict['doc_views']] = float(document_id_popularity_broad.value[document_id][1])            
            
        if timestamp_event > -1:
            dt_timestamp_event = convert_odd_timestamp(timestamp_event)
            if doc_ad_publish_time != None:
                delta_days = (dt_timestamp_event - doc_ad_publish_time).days
                if delta_days >= 0 and delta_days <= 365*10: #10 years
                    feature_vector[feature_vector_labels_dict['doc_ad_days_since_published']] = float(delta_days)
                        
            if doc_event_publish_time != None:
                delta_days = (dt_timestamp_event - doc_event_publish_time).days
                if delta_days >= 0 and delta_days <= 365*10: #10 years
                    feature_vector[feature_vector_labels_dict['doc_event_days_since_published']] = float(delta_days)
                    
            
            #Local period of the day (hours)
            dt_local_timestamp_event = get_local_datetime(dt_timestamp_event, event_country, event_country_state)    
            local_hour_bin = get_hour_bin(dt_local_timestamp_event.hour)            
            feature_vector[feature_vector_labels_dict['doc_event_hour']] = float(local_hour_bin) #Hour for Decision Trees
            set_feature_vector_cat_value(EVENT_HOUR_FV, local_hour_bin, feature_vector) #Period of day for FFM
            
            #Weekend
            weekend = int(is_weekend(dt_local_timestamp_event))
            feature_vector[feature_vector_labels_dict['event_weekend']] = float(weekend)                                                      
        
        conf_field_suffix = '_conf'
        conf_multiplied_field_suffix = '_conf_multipl'
        
        #Setting Popularity fields
        pop_scores = get_popularity_score(event_country, ad_id, document_id, source_id, 
                                publisher_id, advertiser_id, campaign_id, document_id_event,
                                doc_ad_category_ids, doc_ad_cat_confidence_levels, 
                                doc_ad_topic_ids, doc_ad_top_confidence_levels,
                                doc_ad_entity_ids, doc_ad_ent_confidence_levels,
                                output_detailed_list=True)
        
                                

        for score in pop_scores:
            feature_vector[feature_vector_labels_dict[score[0]]] = score[1]
            feature_vector[feature_vector_labels_dict[score[0]+conf_field_suffix]] = score[2]
            feature_vector[feature_vector_labels_dict[score[0]+conf_multiplied_field_suffix]] = score[1] * score[2]

        #Setting User-Doc_ad CB Similarity fields
        user_doc_ad_cb_sim_scores = get_user_cb_interest_score(user_views_count, user_categories, user_topics, user_entities, 
                                timestamp_event, 
                                 doc_ad_category_ids, doc_ad_cat_confidence_levels, 
                                 doc_ad_topic_ids, doc_ad_top_confidence_levels,
                                 doc_ad_entity_ids, doc_ad_ent_confidence_levels,
                                output_detailed_list=True)

        for score in user_doc_ad_cb_sim_scores:
            feature_vector[feature_vector_labels_dict[score[0]]] = score[1]
            feature_vector[feature_vector_labels_dict[score[0]+conf_field_suffix]] = score[2]
            feature_vector[feature_vector_labels_dict[score[0]+conf_multiplied_field_suffix]] = score[1] * score[2]
            
        #Setting Doc_event-doc_ad CB Similarity fields
        doc_event_doc_ad_cb_sim_scores = get_doc_event_doc_ad_cb_similarity_score(
                                            doc_event_category_ids, doc_event_cat_confidence_levels,
                                            doc_event_topic_ids, doc_event_top_confidence_levels,
                                            doc_event_entity_ids, doc_event_ent_confidence_levels,
                                            doc_ad_category_ids, doc_ad_cat_confidence_levels, 
                                            doc_ad_topic_ids, doc_ad_top_confidence_levels,
                                            doc_ad_entity_ids, doc_ad_ent_confidence_levels,
                                        output_detailed_list=True)
        
        for score in doc_event_doc_ad_cb_sim_scores:
            feature_vector[feature_vector_labels_dict[score[0]]] = score[1]
            feature_vector[feature_vector_labels_dict[score[0]+conf_field_suffix]] = score[2]
            feature_vector[feature_vector_labels_dict[score[0]+conf_multiplied_field_suffix]] = score[1] * score[2]
            
        set_feature_vector_cat_value(TRAFFIC_SOURCE_FV, traffic_source_pv, feature_vector)
        set_feature_vector_cat_value(EVENT_COUNTRY_FV, event_country, feature_vector)
        set_feature_vector_cat_value(EVENT_COUNTRY_STATE_FV, event_country_state, feature_vector)         
        set_feature_vector_cat_value(EVENT_GEO_LOCATION_FV, geo_location_event, feature_vector)
        set_feature_vector_cat_value(EVENT_PLATFORM_FV, platform_event, feature_vector)
        set_feature_vector_cat_value(AD_ADVERTISER_FV, advertiser_id, feature_vector)
        set_feature_vector_cat_value(DOC_AD_SOURCE_ID_FV, source_id, feature_vector)
        set_feature_vector_cat_value(DOC_AD_PUBLISHER_ID_FV, publisher_id, feature_vector)
        set_feature_vector_cat_value(DOC_EVENT_SOURCE_ID_FV, doc_event_source_id, feature_vector)
        set_feature_vector_cat_value(DOC_EVENT_PUBLISHER_ID_FV, doc_event_publisher_id, feature_vector)
        set_feature_vector_cat_values(DOC_AD_CATEGORY_ID_FV, doc_ad_category_ids, feature_vector)
        set_feature_vector_cat_values(DOC_AD_TOPIC_ID_FV, doc_ad_topic_ids, feature_vector)
        set_feature_vector_cat_values(DOC_AD_ENTITY_ID_FV, doc_ad_entity_ids, feature_vector)
        set_feature_vector_cat_values(DOC_EVENT_CATEGORY_ID_FV, doc_event_category_ids, feature_vector)
        set_feature_vector_cat_values(DOC_EVENT_TOPIC_ID_FV, doc_event_topic_ids, feature_vector)
        set_feature_vector_cat_values(DOC_EVENT_ENTITY_ID_FV, doc_event_entity_ids, feature_vector)
        
        #Creating dummy column as the last column because xgboost have a problem if the last column is undefined for all rows, 
        #saying that dimentions of data and feature_names do not match
        #feature_vector[feature_vector_labels_dict[DUMMY_FEATURE_COLUMN]] = float(0)
            
        #Ensuring that all elements are floats for compatibility with UDF output (ArrayType(FloatType()))
        #feature_vector = list([float(x) for x in feature_vector])
        
    except Exception as e:
        raise Exception("[get_ad_feature_vector] ERROR PROCESSING FEATURE VECTOR! Params: {}" \
                        .format([user_doc_ids_viewed, user_views_count, user_categories, user_topics, user_entities, 
                                 event_country, event_country_state,
                            ad_id, document_id, source_id, doc_ad_publish_time, timestamp_event, platform_event,
                            geo_location_event, 
                            doc_event_source_id, doc_event_publisher_id, doc_event_publish_time,
                            traffic_source_pv, advertiser_id, publisher_id,
                            campaign_id, document_id_event,
                            doc_ad_category_ids, doc_ad_cat_confidence_levels, 
                            doc_ad_topic_ids, doc_ad_top_confidence_levels,
                            doc_ad_entity_ids, doc_ad_ent_confidence_levels,
                            doc_event_category_ids, doc_event_cat_confidence_levels,
                            doc_event_topic_ids, doc_event_top_confidence_levels,
                            doc_event_entity_ids, doc_event_ent_confidence_levels]),
                        e)
    
    return SparseVector(len(feature_vector_labels_dict), feature_vector)

In [None]:
get_ad_feature_vector_udf = F.udf(lambda user_doc_ids_viewed, user_views_count, user_categories, user_topics, 
                                        user_entities, event_country, event_country_state, ad_id, document_id, source_id, 
                                        doc_ad_publish_time, timestamp_event, platform_event,
                                        geo_location_event, 
                                        doc_event_source_id, doc_event_publisher_id, doc_event_publish_time,
                                        traffic_source_pv, advertiser_id, publisher_id,
                                        campaign_id, document_id_event,
                                        category_ids_by_doc, cat_confidence_level_by_doc, 
                                        topic_ids_by_doc, top_confidence_level_by_doc,
                                        entity_ids_by_doc, ent_confidence_level_by_doc,
                                        doc_event_category_id_list, doc_event_confidence_level_cat_list,
                                        doc_event_topic_id_list, doc_event_confidence_level_top,
                                        doc_event_entity_id_list, doc_event_confidence_level_ent: \
                                         get_ad_feature_vector(user_doc_ids_viewed, user_views_count, user_categories, user_topics, user_entities, 
                                                            event_country, event_country_state, 
                                                            ad_id, document_id, source_id, doc_ad_publish_time, timestamp_event, platform_event,
                                                            geo_location_event, 
                                                            doc_event_source_id, doc_event_publisher_id, doc_event_publish_time,   
                                                            traffic_source_pv, advertiser_id, publisher_id,
                                                            campaign_id, document_id_event,
                                                            category_ids_by_doc, cat_confidence_level_by_doc, 
                                                            topic_ids_by_doc, top_confidence_level_by_doc,
                                                            entity_ids_by_doc, ent_confidence_level_by_doc,
                                                            doc_event_category_id_list, doc_event_confidence_level_cat_list,
                                                            doc_event_topic_id_list, doc_event_confidence_level_top,
                                                            doc_event_entity_id_list, doc_event_confidence_level_ent),    
                            VectorUDT())

### Building feature vectors

In [None]:
def set_feature_vector_cat_value_integral(field_name, field_value, feature_vector):
    if not is_null(field_value): #and str(field_value) != '-1':
        feature_vector[feature_vector_labels_integral_dict[field_name]] = float(field_value)
        
def set_feature_vector_cat_top_multi_values_integral(field_name, values, confidences, feature_vector, top=5):
    top_values = list(filter(lambda z: z != -1, map(lambda y: y[0], sorted(zip(values, confidences), key=lambda x: -x[1]))))[:top]
    for idx, field_value in list(enumerate(top_values)):
        set_feature_vector_cat_value_integral('{}_{}'.format(field_name, idx+1), field_value, feature_vector)

In [None]:
def get_ad_feature_vector_integral(user_doc_ids_viewed, user_views_count, user_categories, user_topics, user_entities, 
                            event_country, event_country_state,
                            ad_id, document_id, source_id, doc_ad_publish_time, timestamp_event, platform_event,
                            geo_location_event, 
                            doc_event_source_id, doc_event_publisher_id, doc_event_publish_time,
                            traffic_source_pv, advertiser_id, publisher_id,
                            campaign_id, document_id_event,
                            doc_ad_category_ids, doc_ad_cat_confidence_levels, 
                            doc_ad_topic_ids, doc_ad_top_confidence_levels,
                            doc_ad_entity_ids, doc_ad_ent_confidence_levels,
                            doc_event_category_ids, doc_event_cat_confidence_levels,
                            doc_event_topic_ids, doc_event_top_confidence_levels,
                            doc_event_entity_ids, doc_event_ent_confidence_levels):
             
    try:

        feature_vector = {}
        
        if user_views_count != None:
            feature_vector[feature_vector_labels_integral_dict['user_views']] = float(user_views_count)
         
        if user_doc_ids_viewed != None:
            feature_vector[feature_vector_labels_integral_dict['user_has_already_viewed_doc']] = float(document_id in user_doc_ids_viewed)               
          
        if ad_id in ad_id_popularity_broad.value:            
            feature_vector[feature_vector_labels_integral_dict['ad_views']] = float(ad_id_popularity_broad.value[ad_id][1])
        
        if document_id in document_id_popularity_broad.value:
            feature_vector[feature_vector_labels_integral_dict['doc_views']] = float(document_id_popularity_broad.value[document_id][1])            
            
        if timestamp_event > -1:
            dt_timestamp_event = convert_odd_timestamp(timestamp_event)
            if doc_ad_publish_time != None:
                delta_days = (dt_timestamp_event - doc_ad_publish_time).days
                if delta_days >= 0 and delta_days <= 365*10: #10 years
                    feature_vector[feature_vector_labels_integral_dict['doc_ad_days_since_published']] = float(delta_days)
                        
            if doc_event_publish_time != None:
                delta_days = (dt_timestamp_event - doc_event_publish_time).days
                if delta_days >= 0 and delta_days <= 365*10: #10 years
                    feature_vector[feature_vector_labels_integral_dict['doc_event_days_since_published']] = float(delta_days)
                    
            
            #Local period of the day (hours)
            dt_local_timestamp_event = get_local_datetime(dt_timestamp_event, event_country, event_country_state)    
            local_hour_bin = get_hour_bin(dt_local_timestamp_event.hour)            
            feature_vector[feature_vector_labels_integral_dict['doc_event_hour']] = float(local_hour_bin) #Hour for Decision Trees
            set_feature_vector_cat_value_integral(EVENT_HOUR_FV, local_hour_bin, feature_vector) #Period of day for FFM
            
            #Weekend
            weekend = int(is_weekend(dt_local_timestamp_event))
            feature_vector[feature_vector_labels_integral_dict['event_weekend']] = float(weekend)               
                                        
        
        conf_field_suffix = '_conf'
        conf_multiplied_field_suffix = '_conf_multipl'
        
        #Setting Popularity fields
        pop_scores = get_popularity_score(event_country, ad_id, document_id, source_id, 
                                publisher_id, advertiser_id, campaign_id, document_id_event,
                                doc_ad_category_ids, doc_ad_cat_confidence_levels, 
                                doc_ad_topic_ids, doc_ad_top_confidence_levels,
                                doc_ad_entity_ids, doc_ad_ent_confidence_levels,
                                output_detailed_list=True)
        
                                

        for score in pop_scores:
            feature_vector[feature_vector_labels_integral_dict[score[0]]] = score[1]
            feature_vector[feature_vector_labels_integral_dict[score[0]+conf_field_suffix]] = score[2]
            feature_vector[feature_vector_labels_integral_dict[score[0]+conf_multiplied_field_suffix]] = score[1] * score[2]

        #Setting User-Doc_ad CB Similarity fields
        user_doc_ad_cb_sim_scores = get_user_cb_interest_score(user_views_count, user_categories, user_topics, user_entities, 
                                timestamp_event, 
                                 doc_ad_category_ids, doc_ad_cat_confidence_levels, 
                                 doc_ad_topic_ids, doc_ad_top_confidence_levels,
                                 doc_ad_entity_ids, doc_ad_ent_confidence_levels,
                                output_detailed_list=True)

        for score in user_doc_ad_cb_sim_scores:
            feature_vector[feature_vector_labels_integral_dict[score[0]]] = score[1]
            feature_vector[feature_vector_labels_integral_dict[score[0]+conf_field_suffix]] = score[2]
            feature_vector[feature_vector_labels_integral_dict[score[0]+conf_multiplied_field_suffix]] = score[1] * score[2]
            
        #Setting Doc_event-doc_ad CB Similarity fields
        doc_event_doc_ad_cb_sim_scores = get_doc_event_doc_ad_cb_similarity_score(
                                            doc_event_category_ids, doc_event_cat_confidence_levels,
                                            doc_event_topic_ids, doc_event_top_confidence_levels,
                                            doc_event_entity_ids, doc_event_ent_confidence_levels,
                                            doc_ad_category_ids, doc_ad_cat_confidence_levels, 
                                            doc_ad_topic_ids, doc_ad_top_confidence_levels,
                                            doc_ad_entity_ids, doc_ad_ent_confidence_levels,
                                        output_detailed_list=True)
        
        for score in doc_event_doc_ad_cb_sim_scores:
            feature_vector[feature_vector_labels_integral_dict[score[0]]] = score[1]
            feature_vector[feature_vector_labels_integral_dict[score[0]+conf_field_suffix]] = score[2]
            feature_vector[feature_vector_labels_integral_dict[score[0]+conf_multiplied_field_suffix]] = score[1] * score[2]
        
        #Process code for event_country
        if event_country in event_country_values_counts:
            event_country_code = event_country_values_counts[event_country]
        else:
            event_country_code = event_country_values_counts[LESS_SPECIAL_CAT_VALUE]                        
        set_feature_vector_cat_value_integral(EVENT_COUNTRY_FV, event_country_code, feature_vector)
        
        #Process code for event_country_state
        if event_country_state in event_country_state_values_counts:
            event_country_state_code = event_country_state_values_counts[event_country_state]
        else:
            event_country_state_code = event_country_state_values_counts[LESS_SPECIAL_CAT_VALUE]         
        set_feature_vector_cat_value_integral(EVENT_COUNTRY_STATE_FV, event_country_state_code, feature_vector)
                
        #Process code for geo_location_event
        if geo_location_event in event_geo_location_values_counts:
            geo_location_event_code = event_geo_location_values_counts[geo_location_event]
        else:
            geo_location_event_code = event_geo_location_values_counts[LESS_SPECIAL_CAT_VALUE]
        set_feature_vector_cat_value_integral(EVENT_GEO_LOCATION_FV, geo_location_event_code, feature_vector)   
         
        set_feature_vector_cat_value_integral(TRAFFIC_SOURCE_FV, traffic_source_pv, feature_vector)        
        set_feature_vector_cat_value_integral(EVENT_PLATFORM_FV, platform_event, feature_vector)
        set_feature_vector_cat_value_integral(AD_ADVERTISER_FV, advertiser_id, feature_vector)
        set_feature_vector_cat_value_integral(DOC_AD_SOURCE_ID_FV, source_id, feature_vector)
        set_feature_vector_cat_value_integral(DOC_AD_PUBLISHER_ID_FV, publisher_id, feature_vector)
        set_feature_vector_cat_value_integral(DOC_EVENT_SOURCE_ID_FV, doc_event_source_id, feature_vector)
        set_feature_vector_cat_value_integral(DOC_EVENT_PUBLISHER_ID_FV, doc_event_publisher_id, feature_vector)
                
        set_feature_vector_cat_top_multi_values_integral(DOC_AD_CATEGORY_ID_FV, doc_ad_category_ids, doc_ad_cat_confidence_levels, feature_vector, top=3)
        set_feature_vector_cat_top_multi_values_integral(DOC_AD_TOPIC_ID_FV, doc_ad_topic_ids, doc_ad_top_confidence_levels, feature_vector, top=3)
        
        set_feature_vector_cat_top_multi_values_integral(DOC_EVENT_CATEGORY_ID_FV, doc_event_category_ids, doc_event_cat_confidence_levels, feature_vector, top=3)
        set_feature_vector_cat_top_multi_values_integral(DOC_EVENT_TOPIC_ID_FV, doc_event_topic_ids, doc_event_top_confidence_levels, feature_vector, top=3)                           
        
        #Process codes for doc_ad_entity_ids
        doc_ad_entity_ids_codes = [doc_entity_id_values_counts[x] if x in doc_entity_id_values_counts 
                                   else doc_entity_id_values_counts[LESS_SPECIAL_CAT_VALUE] 
                                   for x in doc_ad_entity_ids]
        set_feature_vector_cat_top_multi_values_integral(DOC_AD_ENTITY_ID_FV, doc_ad_entity_ids_codes, doc_ad_ent_confidence_levels, feature_vector, top=6)
        
        
        #Process codes for doc_event_entity_ids
        doc_event_entity_ids_codes = [doc_entity_id_values_counts[x] if x in doc_entity_id_values_counts 
                                   else doc_entity_id_values_counts[LESS_SPECIAL_CAT_VALUE] 
                                   for x in doc_event_entity_ids]
        set_feature_vector_cat_top_multi_values_integral(DOC_EVENT_ENTITY_ID_FV, doc_event_entity_ids_codes, doc_event_ent_confidence_levels, feature_vector, top=6)
        
        #Creating dummy column as the last column because xgboost have a problem if the last column is undefined for all rows, 
        #saying that dimentions of data and feature_names do not match
        #feature_vector[feature_vector_labels_dict[DUMMY_FEATURE_COLUMN]] = float(0)
            
        #Ensuring that all elements are floats for compatibility with UDF output (ArrayType(FloatType()))
        #feature_vector = list([float(x) for x in feature_vector])
        
    except Exception as e:
        raise Exception("[get_ad_feature_vector_integral] ERROR PROCESSING FEATURE VECTOR! Params: {}" \
                        .format([user_doc_ids_viewed, user_views_count, user_categories, user_topics, user_entities, 
                                 event_country, event_country_state,
                            ad_id, document_id, source_id, doc_ad_publish_time, timestamp_event, platform_event,
                            geo_location_event, 
                            doc_event_source_id, doc_event_publisher_id, doc_event_publish_time,
                            traffic_source_pv, advertiser_id, publisher_id,
                            campaign_id, document_id_event,
                            doc_ad_category_ids, doc_ad_cat_confidence_levels, 
                            doc_ad_topic_ids, doc_ad_top_confidence_levels,
                            doc_ad_entity_ids, doc_ad_ent_confidence_levels,
                            doc_event_category_ids, doc_event_cat_confidence_levels,
                            doc_event_topic_ids, doc_event_top_confidence_levels,
                            doc_event_entity_ids, doc_event_ent_confidence_levels]),
                        e)
    
    return SparseVector(len(feature_vector_labels_integral_dict), feature_vector)

In [None]:
get_ad_feature_vector_integral_udf = F.udf(lambda user_doc_ids_viewed, user_views_count, user_categories, user_topics, 
                                        user_entities, event_country, event_country_state, ad_id, document_id, source_id, 
                                        doc_ad_publish_time, timestamp_event, platform_event,
                                        geo_location_event, 
                                        doc_event_source_id, doc_event_publisher_id, doc_event_publish_time,
                                        traffic_source_pv, advertiser_id, publisher_id,
                                        campaign_id, document_id_event,
                                        category_ids_by_doc, cat_confidence_level_by_doc, 
                                        topic_ids_by_doc, top_confidence_level_by_doc,
                                        entity_ids_by_doc, ent_confidence_level_by_doc,
                                        doc_event_category_id_list, doc_event_confidence_level_cat_list,
                                        doc_event_topic_id_list, doc_event_confidence_level_top,
                                        doc_event_entity_id_list, doc_event_confidence_level_ent: \
                                         get_ad_feature_vector_integral(user_doc_ids_viewed, user_views_count, user_categories, user_topics, user_entities, 
                                                            event_country, event_country_state, 
                                                            ad_id, document_id, source_id, doc_ad_publish_time, timestamp_event, platform_event,
                                                            geo_location_event, 
                                                            doc_event_source_id, doc_event_publisher_id, doc_event_publish_time,   
                                                            traffic_source_pv, advertiser_id, publisher_id,
                                                            campaign_id, document_id_event,
                                                            category_ids_by_doc, cat_confidence_level_by_doc, 
                                                            topic_ids_by_doc, top_confidence_level_by_doc,
                                                            entity_ids_by_doc, ent_confidence_level_by_doc,
                                                            doc_event_category_id_list, doc_event_confidence_level_cat_list,
                                                            doc_event_topic_id_list, doc_event_confidence_level_top,
                                                            doc_event_entity_id_list, doc_event_confidence_level_ent),    
                            VectorUDT())
                             #StructField("features", VectorUDT()))
                             #MapType(IntegerType(), FloatType()))

### Export Train set feature vectors

In [137]:
train_set_enriched_df = train_set_df \
                             .join(documents_categories_grouped_df, on=F.col("document_id_promo") == F.col("documents_categories_grouped.document_id_cat"), how='left') \
                             .join(documents_topics_grouped_df, on=F.col("document_id_promo") == F.col("documents_topics_grouped.document_id_top"), how='left') \
                             .join(documents_entities_grouped_df, on=F.col("document_id_promo") == F.col("documents_entities_grouped.document_id_ent"), how='left') \
                             .join(documents_categories_grouped_df \
                                       .withColumnRenamed('category_id_list', 'doc_event_category_id_list')
                                       .withColumnRenamed('confidence_level_cat_list', 'doc_event_confidence_level_cat_list') \
                                       .alias('documents_event_categories_grouped'), 
                                   on=F.col("document_id_event") == F.col("documents_event_categories_grouped.document_id_cat"), 
                                   how='left') \
                             .join(documents_topics_grouped_df \
                                       .withColumnRenamed('topic_id_list', 'doc_event_topic_id_list')
                                       .withColumnRenamed('confidence_level_top_list', 'doc_event_confidence_level_top_list') \
                                       .alias('documents_event_topics_grouped'), 
                                   on=F.col("document_id_event") == F.col("documents_event_topics_grouped.document_id_top"), 
                                   how='left') \
                             .join(documents_entities_grouped_df \
                                       .withColumnRenamed('entity_id_list', 'doc_event_entity_id_list')
                                       .withColumnRenamed('confidence_level_ent_list', 'doc_event_confidence_level_ent_list') \
                                       .alias('documents_event_entities_grouped'), 
                                   on=F.col("document_id_event") == F.col("documents_event_entities_grouped.document_id_ent"), 
                                   how='left') \
                            .select('display_id','uuid_event','event_country','event_country_state','platform_event',
                                    'source_id_doc_event', 'publisher_doc_event','publish_time_doc_event',
                                            'publish_time', 'ad_id','document_id_promo','clicked',   
                                           'geo_location_event', 'advertiser_id', 'publisher_id',
                                            'campaign_id', 'document_id_event',
                                            'traffic_source_pv',                                          
                                        int_list_null_to_empty_list_udf('doc_event_category_id_list').alias('doc_event_category_id_list'),
                                        float_list_null_to_empty_list_udf('doc_event_confidence_level_cat_list').alias('doc_event_confidence_level_cat_list'),
                                        int_list_null_to_empty_list_udf('doc_event_topic_id_list').alias('doc_event_topic_id_list'),
                                        float_list_null_to_empty_list_udf('doc_event_confidence_level_top_list').alias('doc_event_confidence_level_top_list'),
                                        str_list_null_to_empty_list_udf('doc_event_entity_id_list').alias('doc_event_entity_id_list'),
                                        float_list_null_to_empty_list_udf('doc_event_confidence_level_ent_list').alias('doc_event_confidence_level_ent_list'),
                                       int_null_to_minus_one_udf('source_id').alias('source_id'),                                      
                                       int_null_to_minus_one_udf('timestamp_event').alias('timestamp_event'),
                                       int_list_null_to_empty_list_udf('category_id_list').alias('category_id_list'), 
                                       float_list_null_to_empty_list_udf('confidence_level_cat_list').alias('confidence_level_cat_list'), 
                                       int_list_null_to_empty_list_udf('topic_id_list').alias('topic_id_list'), 
                                       float_list_null_to_empty_list_udf('confidence_level_top_list').alias('confidence_level_top_list'), 
                                       str_list_null_to_empty_list_udf('entity_id_list').alias('entity_id_list'), 
                                       float_list_null_to_empty_list_udf('confidence_level_ent_list').alias('confidence_level_ent_list')                                                       
                                      ) \
                            .join(user_profiles_df, on=[F.col("user_profiles.uuid") == F.col("uuid_event")], how='left') \
                            .withColumnRenamed('categories', 'user_categories') \
                            .withColumnRenamed('topics', 'user_topics') \
                            .withColumnRenamed('entities', 'user_entities') \
                            .withColumnRenamed('doc_ids', 'user_doc_ids_viewed') \
                            .withColumnRenamed('views', 'user_views_count')

In [138]:
print((train_set_enriched_df.count(), len(train_set_enriched_df.columns)))

(87141731, 39)


In [139]:
train_set_enriched_df.take(3)

[Row(display_id=16757900, uuid_event=u'100013af048bbf', event_country=u'US', event_country_state=u'US>CA', platform_event=3, source_id_doc_event=9462, publisher_doc_event=240, publish_time_doc_event=datetime.datetime(2012, 4, 12, 6, 0), publish_time=datetime.datetime(2014, 9, 26, 0, 0), ad_id=7033, document_id_promo=393333, clicked=0, geo_location_event=u'US>CA>807', advertiser_id=386, publisher_id=None, campaign_id=530, document_id_event=38915, traffic_source_pv=2, doc_event_category_id_list=[1205, 2003], doc_event_confidence_level_cat_list=[0.8515539169311523, 0.06479214876890182], doc_event_topic_id_list=[200, 67, 72, 279, 177], doc_event_confidence_level_top_list=[0.12818573415279388, 0.025913242250680923, 0.02038034237921238, 0.016375485807657242, 0.008541254326701164], doc_event_entity_id_list=[], doc_event_confidence_level_ent_list=[], source_id=4773, timestamp_event=1116182031, category_id_list=[1206, 1208], confidence_level_cat_list=[0.8132187128067017, 0.06187533959746361], t

In [140]:
train_set_feature_vectors_df = train_set_enriched_df \
                                .withColumn('feature_vector', 
                                            #get_ad_feature_vector_udf(
                                            get_ad_feature_vector_integral_udf(
                                                                'user_doc_ids_viewed',
                                                                'user_views_count',
                                                                'user_categories', 
                                                                'user_topics', 
                                                                'user_entities', 
                                                                'event_country', 
                                                                'event_country_state',
                                                                'ad_id', 
                                                                'document_id_promo', 
                                                                'source_id', 
                                                                'publish_time', 
                                                                'timestamp_event', 
                                                                'platform_event',
                                                                'geo_location_event', 
                                                                'source_id_doc_event', 
                                                                'publisher_doc_event',
                                                                'publish_time_doc_event',
                                                                'traffic_source_pv',
                                                                'advertiser_id', 
                                                                'publisher_id',
                                                                'campaign_id',
                                                                'document_id_event',
                                                                'category_id_list', 
                                                                'confidence_level_cat_list', 
                                                                'topic_id_list', 
                                                                'confidence_level_top_list',
                                                                'entity_id_list', 
                                                                'confidence_level_ent_list',
                                                                'doc_event_category_id_list',
                                                                'doc_event_confidence_level_cat_list',
                                                                'doc_event_topic_id_list',
                                                                'doc_event_confidence_level_top_list',
                                                                'doc_event_entity_id_list',
                                                                'doc_event_confidence_level_ent_list')) \
                            .select(F.col('uuid_event').alias('uuid'),
                                    'display_id',
                                    'ad_id',
                                    'document_id_event',
                                    F.col('document_id_promo').alias('document_id'),
                                    F.col('clicked').alias('label'),
                                    'feature_vector') #\
                            #.orderBy('display_id','ad_id')

In [141]:
train_feature_vector_gcs_folder_name = 'train_feature_vectors_integral'

In [None]:
%time train_set_feature_vectors_df.write.parquet(OUTPUT_BUCKET_FOLDER+train_feature_vector_gcs_folder_name, mode='overwrite')

In [142]:
print((train_set_feature_vectors_df.count(), len(train_set_feature_vectors_df.columns)))

(87141731, 7)


### Exporting integral feature vectors to CSV

In [129]:
train_feature_vectors_exported_df = spark.read.parquet(OUTPUT_BUCKET_FOLDER+train_feature_vector_gcs_folder_name)

In [130]:
train_feature_vector_integral_csv_folder_name = 'train_feature_vectors_integral.csv'

In [131]:
integral_headers = ['label', 'display_id', 'ad_id', 'doc_id', 'doc_event_id', 'is_leak'] + feature_vector_labels_integral
    
with open(train_feature_vector_integral_csv_folder_name+".header", 'w') as output:
    output.writelines('\n'.join(integral_headers))

In [132]:
def sparse_vector_to_csv_with_nulls_row(additional_column_values, vec, num_columns):    
    return ','.join([str(value) for value in additional_column_values] + 
                     list([ '{:.5}'.format(vec[x]) if x in vec.indices else '' for x in range(vec.size) ])[:num_columns]) \
            .replace('.0,',',')

In [134]:
train_feature_vectors_integral_csv_rdd = train_feature_vectors_exported_df.select(
     'label', 'display_id', 'ad_id', 'document_id', 'document_id_event', 'feature_vector').withColumn('is_leak', F.lit(-1)) \
     .rdd.map(lambda x: sparse_vector_to_csv_with_nulls_row([x['label'], x['display_id'], x['ad_id'], x['document_id'], x['document_id_event'], x['is_leak']], 
                                                  x['feature_vector'], len(integral_headers)))

In [None]:
%time train_feature_vectors_integral_csv_rdd.saveAsTextFile(OUTPUT_BUCKET_FOLDER+train_feature_vector_integral_csv_folder_name)

In [133]:
train_feature_vectors_exported_df.show(3)

+--------------+----------+------+-----------------+-----------+-----+--------------------+
|          uuid|display_id| ad_id|document_id_event|document_id|label|      feature_vector|
+--------------+----------+------+-----------------+-----------+-----+--------------------+
|100013af048bbf|  16757900|  7033|            38915|     393333|    0|(103,[0,3,4,5,6,7...|
|100013af048bbf|  16757900|156270|            38915|    1388416|    1|(103,[0,3,4,5,6,7...|
|100013af048bbf|  16757900|147242|            38915|    1108162|    0|(103,[0,3,4,5,6,8...|
+--------------+----------+------+-----------------+-----------+-----+--------------------+
only showing top 3 rows



In [135]:
train_feature_vectors_integral_csv_rdd.take(3)

['0,16757900,7033,393333,38915,-1,1,,,8360,2.2043e+04,1536,6,640,0.12045,0.54374,0.065496,0.10988,0.53597,0.05889,,,,0.11217,0.5053,0.056681,0.11217,0.5053,0.056681,0.17075,0.37136,0.06341,0.11217,0.5053,0.056681,0.11217,0.5053,0.056681,0.16354,0.025459,0.0041637,0.16428,0.025611,0.0042074,0.15996,0.0095915,0.0015343,0.16398,0.010573,0.0017337,0.12309,0.23087,0.028419,0.12345,0.24642,0.03042,,,,,,,,,,0,0.00042512,0,0,5.5556e-05,0,,,,386,1206,1208,,26,,,448,,,,,,,4773,1205,2003,,200,67,72,,,,,,,240,9462,1.8595e+07,2.3953e+06,6.9579e+05,6,3,2.0',
 '1,16757900,156270,1388416,38915,-1,1,,,2.322e+04,2.4827e+04,1536,6,81,0.29742,0.60524,0.18001,0.29335,0.56754,0.16649,,,,0.20687,0.46172,0.095516,0.29335,0.56754,0.16649,0.56552,0.34157,0.19316,0.20687,0.46172,0.095516,0.25224,0.42026,0.10601,,,,,,,0.19955,0.0013906,0.00027749,0.20638,0.0014022,0.00028937,0.22773,0.078248,0.017819,0.23186,0.074581,0.017292,,,,,,,,,,0,0.00042512,0,0,0.00022222,0,,,,2623,1403,1405,,43,258,136,,,,,,,,1.077e+04,12

### Test set feature vectors

In [117]:
def is_leak(max_timestamp_pv_leak, timestamp_event):
    return max_timestamp_pv_leak >= 0 and max_timestamp_pv_leak >= timestamp_event

In [118]:
is_leak_udf = F.udf(lambda max_timestamp_pv_leak, timestamp_event: int(is_leak(max_timestamp_pv_leak, timestamp_event)), IntegerType())

In [123]:
test_validation_set_enriched_df = test_set_df.select('display_id','uuid_event','event_country','event_country_state','platform_event',
                                            'source_id_doc_event', 'publisher_doc_event','publish_time_doc_event',     
                                            'publish_time',
                                           'ad_id','document_id_promo','clicked',  
                                           'geo_location_event', 'advertiser_id', 'publisher_id',
                                           'campaign_id', 'document_id_event',
                                           'traffic_source_pv',                                           
                                        int_list_null_to_empty_list_udf('doc_event_category_id_list').alias('doc_event_category_id_list'),
                                        float_list_null_to_empty_list_udf('doc_event_confidence_level_cat_list').alias('doc_event_confidence_level_cat_list'),
                                        int_list_null_to_empty_list_udf('doc_event_topic_id_list').alias('doc_event_topic_id_list'),
                                        float_list_null_to_empty_list_udf('doc_event_confidence_level_top_list').alias('doc_event_confidence_level_top_list'),
                                        str_list_null_to_empty_list_udf('doc_event_entity_id_list').alias('doc_event_entity_id_list'),
                                        float_list_null_to_empty_list_udf('doc_event_confidence_level_ent_list').alias('doc_event_confidence_level_ent_list'),
                                       int_null_to_minus_one_udf('source_id').alias('source_id'),                                   
                                       int_null_to_minus_one_udf('timestamp_event').alias('timestamp_event'),
                                       int_list_null_to_empty_list_udf('category_id_list').alias('category_id_list'), 
                                       float_list_null_to_empty_list_udf('confidence_level_cat_list').alias('confidence_level_cat_list'), 
                                       int_list_null_to_empty_list_udf('topic_id_list').alias('topic_id_list'), 
                                       float_list_null_to_empty_list_udf('confidence_level_top_list').alias('confidence_level_top_list'), 
                                       str_list_null_to_empty_list_udf('entity_id_list').alias('entity_id_list'), 
                                       float_list_null_to_empty_list_udf('confidence_level_ent_list').alias('confidence_level_ent_list'),
                                       int_null_to_minus_one_udf('max_timestamp_pv').alias('max_timestamp_pv_leak')
                                      ) \
                            .join(user_profiles_df, on=[F.col("user_profiles.uuid") == F.col("uuid_event")], how='left') \
                            .withColumnRenamed('categories', 'user_categories') \
                            .withColumnRenamed('topics', 'user_topics') \
                            .withColumnRenamed('entities', 'user_entities') \
                            .withColumnRenamed('doc_ids', 'user_doc_ids_viewed') \
                            .withColumnRenamed('views', 'user_views_count')

In [124]:
print((test_validation_set_enriched_df.count(), len(test_validation_set_enriched_df.columns)))

(32225162, 40)


In [125]:
test_validation_set_enriched_df.take(3)

[Row(display_id=22991705, uuid_event=u'1000615e760786', event_country=u'US', event_country_state=u'US>IL', platform_event=3, source_id_doc_event=2722, publisher_doc_event=236, publish_time_doc_event=datetime.datetime(2016, 6, 28, 14, 0), publish_time=datetime.datetime(2015, 10, 17, 0, 0), ad_id=124564, document_id_promo=1201265, clicked=-999, geo_location_event=u'US>IL>602', advertiser_id=2146, publisher_id=None, campaign_id=15992, document_id_event=2959725, traffic_source_pv=1, doc_event_category_id_list=[1907, 1914], doc_event_confidence_level_cat_list=[0.8844558000564575, 0.06729555130004883], doc_event_topic_id_list=[77], doc_event_confidence_level_top_list=[0.2766975164413452], doc_event_entity_id_list=[u'753fa42329661c4eb3b1e99e63a7e46d', u'3b4a364141e7c25731a15cd4ef643d9e', u'ea7f7e8b98b3212620cea38b90d89321', u'246f2c584db092a36a14533b067ccb1b', u'87f8f9a6d35ec4f07d4fd425db84c998', u'8a74cb33e81530f941bcb99a067a6baa', u'504a2a4f3b97b8511737fc0b7d55dc46'], doc_event_confidence_l

In [126]:
test_validation_set_feature_vectors_df = test_validation_set_enriched_df \
                                .withColumn('feature_vector', 
                                            #get_ad_feature_vector_udf(
                                            get_ad_feature_vector_integral_udf(
                                                                'user_doc_ids_viewed', 
                                                                'user_views_count',
                                                                'user_categories', 
                                                                'user_topics', 
                                                                'user_entities', 
                                                                'event_country', 
                                                                'event_country_state',
                                                                'ad_id', 
                                                                'document_id_promo', 
                                                                'source_id', 
                                                                'publish_time', 
                                                                'timestamp_event', 
                                                                'platform_event',
                                                                'geo_location_event', 
                                                                'source_id_doc_event', 
                                                                'publisher_doc_event',
                                                                'publish_time_doc_event',
                                                                'traffic_source_pv',
                                                                'advertiser_id', 
                                                                'publisher_id',
                                                                'campaign_id',
                                                                'document_id_event',
                                                                'category_id_list', 
                                                                'confidence_level_cat_list', 
                                                                'topic_id_list', 
                                                                'confidence_level_top_list',
                                                                'entity_id_list', 
                                                                'confidence_level_ent_list',
                                                                'doc_event_category_id_list',
                                                                'doc_event_confidence_level_cat_list',
                                                                'doc_event_topic_id_list',
                                                                'doc_event_confidence_level_top_list',
                                                                'doc_event_entity_id_list',
                                                                'doc_event_confidence_level_ent_list')) \
                            .select(F.col('uuid').alias('uuid'),                                    
                                    'display_id',
                                    'ad_id',
                                    'document_id_event',
                                    F.col('document_id_promo').alias('document_id'),
                                    F.col('clicked').alias('label'),
                                    is_leak_udf('max_timestamp_pv_leak','timestamp_event').alias('is_leak'),
                                    'feature_vector') #\
                            #.orderBy('display_id','ad_id')

In [111]:
test_validation_feature_vector_gcs_folder_name = 'test_feature_vectors_integral'

In [None]:
%time test_validation_set_feature_vectors_df.write.parquet(OUTPUT_BUCKET_FOLDER+test_validation_feature_vector_gcs_folder_name, mode='overwrite')

In [127]:
print((test_validation_set_feature_vectors_df.count(), len(test_validation_set_feature_vectors_df.columns)))

(32225162, 8)


### Exporting integral feature vectors to CSV

In [112]:
test_validation_feature_vectors_exported_df = spark.read.parquet(OUTPUT_BUCKET_FOLDER+test_validation_feature_vector_gcs_folder_name)

In [113]:
test_validation_feature_vector_integral_csv_folder_name = 'test_feature_vectors_integral.csv'

In [114]:
integral_headers = ['label', 'display_id', 'ad_id', 'doc_id', 'doc_event_id', 'is_leak'] + feature_vector_labels_integral
    
with open(test_validation_feature_vector_integral_csv_folder_name+".header", 'w') as output:
    output.writelines('\n'.join(integral_headers))

In [48]:
test_validation_feature_vectors_integral_csv_rdd = test_validation_feature_vectors_exported_df.select(
     'label', 'display_id', 'ad_id', 'document_id', 'document_id_event', 'is_leak', 'feature_vector') \
     .rdd.map(lambda x: sparse_vector_to_csv_with_nulls_row([x['label'], x['display_id'], x['ad_id'], x['document_id'], x['document_id_event'], x['is_leak']], 
                                                  x['feature_vector'], len(integral_headers)))

In [51]:
%time test_validation_feature_vectors_integral_csv_rdd.saveAsTextFile(OUTPUT_BUCKET_FOLDER+test_validation_feature_vector_integral_csv_folder_name)

CPU times: user 372 ms, sys: 124 ms, total: 496 ms
Wall time: 1h 5min 46s


In [115]:
test_validation_feature_vectors_exported_df.show(3)

+--------------+----------+------+-----------------+-----------+-----+-------+--------------------+
|          uuid|display_id| ad_id|document_id_event|document_id|label|is_leak|      feature_vector|
+--------------+----------+------+-----------------+-----------+-----+-------+--------------------+
|10071b4472ff13|  18070318|429681|          2204038|    2077230| -999|      0|(103,[0,1,2,3,4,5...|
|10071b4472ff13|  18070318|331610|          2204038|    1771347| -999|      0|(103,[0,1,2,3,4,5...|
|10071b4472ff13|  18070318|173005|          2204038|    1031141| -999|      0|(103,[0,1,2,3,4,5...|
+--------------+----------+------+-----------------+-----------+-----+-------+--------------------+
only showing top 3 rows



In [49]:
test_validation_feature_vectors_integral_csv_rdd.take(3)

['-999,18070318,429681,2077230,2204038,0,1,0,4,1.1851e+04,1.3509e+04,0,3,5,0.13239,0.56475,0.074769,0.13184,0.5309,0.069993,,,,0.14352,0.43842,0.062924,0.13161,0.45999,0.060541,0.13043,0.19134,0.024957,0.14352,0.43842,0.062924,0.14352,0.43842,0.062924,0.13184,0.020509,0.0027038,0.13184,0.020509,0.0027038,0.19636,0.014723,0.0028909,0.19775,0.014715,0.00291,0.14696,0.12433,0.018272,0.15004,0.11577,0.01737,0.3228,0.99851,0.32232,0,8.8889e-05,0,0,7.9622e-12,0,0,0.00042512,0,0,4.4444e-05,0,0,5.6873e-12,0,805,1209,1611,,140,,,,,,,,,,6034,1708,1210,,97,252,290,29,414,1.2335e+04,566,228,1007,78,1574,1.8595e+07,2.1926e+05,1.5348e+05,3,1,1.0',
 '-999,18070318,331610,1771347,2204038,0,1,0,4,953,4061,0,3,,0.24239,0.41305,0.10012,0.23713,0.35083,0.083195,,,,0.19768,0.38172,0.075461,0.23713,0.35083,0.083195,0.41667,0.11716,0.048815,0.19768,0.38172,0.075461,0.20397,0.37274,0.076027,0.17512,0.13385,0.02344,0.18544,0.13421,0.024887,0.18473,0.015728,0.0029054,0.18716,0.017675,0.0033082,0.15073,0.14791,0