In [4]:
# Checklist:
# AWS emr-5.29.0
# MASTER r5d.8xlarge 1x, no EBS
# CORE r5d.8xlarge 4x, no EBS
# Custom bootstrap action: s3://ydatazian/bootstrap.sh
# Allow ssh in master node security group

In [101]:
import tqdm.notebook as tqdm
import numpy as np
import scipy
import sklearn
import matplotlib.pyplot as plt
import datetime

# SparkSession

https://spark.apache.org/docs/2.4.4/api/python/pyspark.html

https://spark.apache.org/docs/2.4.4/api/python/pyspark.sql.html

In [7]:
import findspark
findspark.init()

import spark_utils
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
sc = SparkContext("yarn", "My App", conf=spark_utils.get_spark_conf())
se = SparkSession(sc)
spark_utils.print_ui_links()

NameNode: http://ec2-18-188-140-254.us-east-2.compute.amazonaws.com:50070
YARN: http://ec2-18-188-140-254.us-east-2.compute.amazonaws.com:8088
Spark UI: http://ec2-18-188-140-254.us-east-2.compute.amazonaws.com:20888/proxy/application_1590906759072_0002


# Register all tables for sql queries

In [8]:
from IPython.display import display
tables = ["clicks_test", "clicks_train", 
          "documents_categories", "documents_entities", "documents_meta", "documents_topics", 
          "events", "page_views", "page_views_sample", "promoted_content"]
for name in tqdm.tqdm(tables):
    df = se.read.parquet("s3://ydatazian/{}.parquet".format(name))
    df.registerTempTable(name)
    print(name)
    display(df.limit(3).toPandas())

HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))

clicks_test


Unnamed: 0,display_id,ad_id
0,16874594,66758
1,16874594,150083
2,16874594,162754


clicks_train


Unnamed: 0,display_id,ad_id,clicked
0,1,42337,0
1,1,139684,0
2,1,144739,1


documents_categories


Unnamed: 0,document_id,category_id,confidence_level
0,1595802,1611,0.92
1,1595802,1610,0.07
2,1524246,1807,0.92


documents_entities


Unnamed: 0,document_id,entity_id,confidence_level
0,1524246,f9eec25663db4cd83183f5c805186f16,0.672865314504701
1,1524246,55ebcfbdaff1d6f60b3907151f38527a,0.399113728441297
2,1524246,839907a972930b17b125eb0247898412,0.392095749652966


documents_meta


Unnamed: 0,document_id,source_id,publisher_id,publish_time
0,1595802,1,603,2016-06-05 00:00:00
1,1524246,1,603,2016-05-26 11:00:00
2,1617787,1,603,2016-05-27 00:00:00


documents_topics


Unnamed: 0,document_id,topic_id,confidence_level
0,1595802,140,0.0731131601068925
1,1595802,16,0.0594164867373976
2,1595802,143,0.0454207537554526


events


Unnamed: 0,display_id,uuid,document_id,timestamp,platform,geo_location
0,1,cb8c55702adb93,379743,61,3,US>SC>519
1,2,79a85fa78311b9,1794259,81,2,US>CA>807
2,3,822932ce3d8757,1179111,182,2,US>MI>505


page_views


Unnamed: 0,uuid,document_id,timestamp,platform,geo_location,traffic_source
0,1fd5f051fba643,120,31905835,1,RS,2
1,8557aa9004be3b,120,32053104,1,VN>44,2
2,c351b277a358f0,120,54013023,1,KR>12,1


page_views_sample


Unnamed: 0,uuid,document_id,timestamp,platform,geo_location,traffic_source
0,1fd5f051fba643,120,31905835,1,RS,2
1,8557aa9004be3b,120,32053104,1,VN>44,2
2,c351b277a358f0,120,54013023,1,KR>12,1


promoted_content


Unnamed: 0,ad_id,document_id,campaign_id,advertiser_id
0,1,6614,1,7
1,2,471467,2,7
2,3,7692,3,7





# Prepare dataset for VW

We will predict a *click* based on:
- ad_id
- document_id
- campaign_id
- advertiser_id

In [9]:
%%time
se.sql("""
select 
    clicks_train.clicked,
    clicks_train.display_id,
    clicks_train.ad_id,
    promoted_content.document_id,
    promoted_content.campaign_id,
    promoted_content.advertiser_id
from clicks_train join promoted_content on clicks_train.ad_id = promoted_content.ad_id
""").write.parquet("/train_features.parquet", mode='overwrite')

CPU times: user 3.88 ms, sys: 741 µs, total: 4.62 ms
Wall time: 43.9 s


In [10]:
se.read.parquet("/train_features.parquet").show(5)

+-------+----------+------+-----------+-----------+-------------+
|clicked|display_id| ad_id|document_id|campaign_id|advertiser_id|
+-------+----------+------+-----------+-----------+-------------+
|      0|         1| 42337|     938164|       5969|         1499|
|      0|         1|139684|    1085937|      17527|         2563|
|      1|         1|144739|    1337362|      18488|         2909|
|      0|         1|156824|     992370|       7283|         1919|
|      0|         1|279295|    1670176|      27524|         1820|
+-------+----------+------+-----------+-----------+-------------+
only showing top 5 rows



In [11]:
# Format: [Label] [Importance] [Base] [Tag]|Namespace Features |Namespace Features ... |Namespace Features
# https://github.com/VowpalWabbit/vowpal_wabbit/wiki/Input-format
def vw_row_mapper(row):
    clicked = None
    features = []
    for k, v in row.asDict().items():
        if k == 'clicked':
            clicked = '1' if v == '1' else '-1'
        else:
            features.append(k + "_" + v)
    tag = row.display_id + "_" + row.ad_id
    return "{} {}| {}".format(clicked, tag, " ".join(features))

r = se.read.parquet("/train_features.parquet").take(1)[0]
print(r)
print(vw_row_mapper(r))

Row(clicked='0', display_id='1', ad_id='42337', document_id='938164', campaign_id='5969', advertiser_id='1499')
-1 1_42337| display_id_1 ad_id_42337 document_id_938164 campaign_id_5969 advertiser_id_1499


In [12]:
%%time
! hdfs dfs -rm -r /train_features.txt
(
    se.read.parquet("/train_features.parquet")
    .rdd
    .map(vw_row_mapper)
    .saveAsTextFile("/train_features.txt")
)

rm: `/train_features.txt': No such file or directory
CPU times: user 26.9 ms, sys: 20.9 ms, total: 47.8 ms
Wall time: 2min 29s


In [13]:
# copy file to local master node
! rm /mnt/train.txt
! hdfs dfs -getmerge /train_features.txt /mnt/train.txt
# preview local file
! head -n 5 /mnt/train.txt

rm: cannot remove '/mnt/train.txt': No such file or directory
-1 1_42337| display_id_1 ad_id_42337 document_id_938164 campaign_id_5969 advertiser_id_1499
-1 1_139684| display_id_1 ad_id_139684 document_id_1085937 campaign_id_17527 advertiser_id_2563
1 1_144739| display_id_1 ad_id_144739 document_id_1337362 campaign_id_18488 advertiser_id_2909
-1 1_156824| display_id_1 ad_id_156824 document_id_992370 campaign_id_7283 advertiser_id_1919
-1 1_279295| display_id_1 ad_id_279295 document_id_1670176 campaign_id_27524 advertiser_id_1820


# Train VW
https://vowpalwabbit.org/tutorials/getting_started.html

https://github.com/JohnLangford/vowpal_wabbit/wiki/Command-line-arguments

In [14]:
! ./vw -d /mnt/train.txt -b 24 -c -k --ftrl --passes 1 -f model --holdout_off --loss_function logistic --random_seed 42 --progress 8000000

final_regressor = model
Enabling FTRL based optimization
Algorithm used: Proximal-FTRL
ftrl_alpha = 0.005
ftrl_beta = 0.1
Num weight bits = 24
learning rate = 0.5
initial_t = 0
power_t = 0.5
creating cache_file = /mnt/train.txt.cache
Reading datafile = /mnt/train.txt
num sources = 1
average  since         example        example  current  current  current
loss     last          counter         weight    label  predict features
0.460142 0.460142      8000000      8000000.0  -1.0000  -1.2983        6
0.456159 0.452175     16000000     16000000.0  -1.0000  -1.4733        6
0.453326 0.447661     24000000     24000000.0  -1.0000  -2.3174        6
0.452628 0.450533     32000000     32000000.0   1.0000  -0.5719        6
0.452887 0.453924     40000000     40000000.0  -1.0000  -1.2648        6
0.452010 0.447625     48000000     48000000.0  -1.0000  -2.6227        6
0.451328 0.447232     56000000     56000000.0  -1.0000  -3.4451        6
0.450649 0.445902     64000000     64000000.0  -1.0000  -1.

In [15]:
# make prediction with VW
! echo "? tag1| ad_id_144739 document_id_1337362 campaign_id_18488 advertiser_id_2909" > /mnt/test.txt
! echo "? tag2| ad_id_156824 document_id_992370 campaign_id_7283 advertiser_id_1919" >> /mnt/test.txt
! ./vw -d /mnt/test.txt -i model -t -k -p /mnt/predictions.txt --progress 1000000 --link=logistic
# predicted probabilities of "1" class
! cat /mnt/predictions.txt

only testing
predictions = /mnt/predictions.txt
Enabling FTRL based optimization
Algorithm used: Proximal-FTRL
ftrl_alpha = 0.005
ftrl_beta = 0.1
Num weight bits = 24
learning rate = 0.5
initial_t = 0
power_t = 0.5
using no cache
Reading datafile = /mnt/test.txt
num sources = 1
average  since         example        example  current  current  current
loss     last          counter         weight    label  predict features

finished run
number of examples = 2
weighted example sum = 2.000000
weighted label sum = 0.000000
average loss = 5.674478
total feature number = 10
0.319262 tag1
0.036172 tag2


# Homework 2: Baseline VW model

Train a baseline model using the following features:
- **clicked**
- geo_location features (country, state, dma)
- day_of_week (from timestamp, use *date.isoweekday()*)
- ad_id
- campaign_id
- advertiser_id
- ad_document_id
- display_document_id
- platform

Make submission to Kaggle to know your leaderboard score

If you want to create a dev set, make a 90%/10% split of training data by display_id

# Submitting to Kaggle

Obtain Kaggle API token: https://github.com/Kaggle/kaggle-api#api-credentials

Making a submission: https://github.com/Kaggle/kaggle-api#submit-to-a-competition

In [52]:
! mkdir ~/.kaggle
! touch ~/.kaggle/kaggle.json
! echo '{"username":"acohn74","key":"5eee26e60a1c2cb609c4a7bdfd5f937d"}' > ~/.kaggle/kaggle.json
! cat ~/.kaggle/kaggle.json
! chmod 600 /home/hadoop/.kaggle/kaggle.json

mkdir: cannot create directory '/home/hadoop/.kaggle': File exists
{"username":"acohn74","key":"5eee26e60a1c2cb609c4a7bdfd5f937d"}


In [40]:
! aws s3 cp s3://ydatazian/sample_submission.csv .

download: s3://ydatazian/sample_submission.csv to ./sample_submission.csv


In [41]:
# https://www.kaggle.com/c/outbrain-click-prediction/overview/evaluation
# For each display_id in the test set, you must predict a space-delimited list of ad_ids, 
# ordered by decreasing likelihood of being clicked.
! head -n 5 ./sample_submission.csv

display_id,ad_id
16874594,66758 150083 162754 170392 172888 180797
16874595,8846 30609 143982
16874596,11430 57197 132820 153260 173005 288385 289122 289915
16874597,137858 143981 155945 180965 182039 285834 305790 308836


In [42]:
%%time
se.sql("""
select 
    "0" as clicked,
    clicks_test.display_id,
    clicks_test.ad_id,
    promoted_content.document_id,
    promoted_content.campaign_id,
    promoted_content.advertiser_id
from clicks_test join promoted_content on clicks_test.ad_id = promoted_content.ad_id
""").write.parquet("/test_features.parquet", mode='overwrite')

CPU times: user 3.01 ms, sys: 737 µs, total: 3.75 ms
Wall time: 43.8 s


In [43]:
%%time
! hdfs dfs -rm -r /test_features.txt
(
    se.read.parquet("/test_features.parquet")
    .rdd
    .map(vw_row_mapper)
    .saveAsTextFile("/test_features.txt")
)

rm: `/test_features.txt': No such file or directory
CPU times: user 22.7 ms, sys: 14.6 ms, total: 37.3 ms
Wall time: 2min 23s


In [44]:
# copy file to local master node
! rm /mnt/test.txt
! hdfs dfs -getmerge /test_features.txt /mnt/test.txt
# preview local file
! head -n 5 /mnt/test.txt

-1 16874594_66758| display_id_16874594 ad_id_66758 document_id_1051283 campaign_id_8949 advertiser_id_555
-1 16874594_150083| display_id_16874594 ad_id_150083 document_id_1358132 campaign_id_19045 advertiser_id_1913
-1 16874594_162754| display_id_16874594 ad_id_162754 document_id_1292723 campaign_id_17770 advertiser_id_2391
-1 16874594_170392| display_id_16874594 ad_id_170392 document_id_1083829 campaign_id_20943 advertiser_id_1731
-1 16874594_172888| display_id_16874594 ad_id_172888 document_id_1433954 campaign_id_1384 advertiser_id_16


In [45]:
! ./vw -d /mnt/test.txt -i model -t -k -p /mnt/predictions.txt --progress 1000000 --link=logistic
# predicted probabilities of "1" class
! head -n 5 /mnt/predictions.txt

only testing
predictions = /mnt/predictions.txt
Enabling FTRL based optimization
Algorithm used: Proximal-FTRL
ftrl_alpha = 0.005
ftrl_beta = 0.1
Num weight bits = 24
learning rate = 0.5
initial_t = 0
power_t = 0.5
using no cache
Reading datafile = /mnt/test.txt
num sources = 1
average  since         example        example  current  current  current
loss     last          counter         weight    label  predict features
0.882659 0.882659      1000000      1000000.0  -1.0000   0.0992        6
0.892980 0.903301      2000000      2000000.0  -1.0000   0.2038        6
0.910004 0.944050      3000000      3000000.0  -1.0000   0.1904        6
0.930789 0.993147      4000000      4000000.0  -1.0000   0.1299        6
0.940655 0.980117      5000000      5000000.0  -1.0000   0.2314        6
0.936158 0.913676      6000000      6000000.0  -1.0000   0.2526        6
0.931499 0.903542      7000000      7000000.0  -1.0000   0.1489        6
0.934748 0.957494      8000000      8000000.0  -1.0000   0.1199 

In [46]:
! wc -l /mnt/predictions.txt

32225162 /mnt/predictions.txt


In [47]:
from collections import defaultdict
scores_by_display_id = defaultdict(dict)
for line in tqdm.tqdm(open('/mnt/predictions.txt')):
    score, tag = line.strip().split(" ")
    score = float(score)
    display_id, ad_id = tag.split("_")
    scores_by_display_id[display_id][ad_id] = score

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [48]:
with open("submission.txt", "w") as f:
    f.write("display_id,ad_id\n")
    for k, vs in tqdm.tqdm_notebook(scores_by_display_id.items()):
        f.write("{},{}\n".format(
            k, 
            " ".join([v[0] for v in sorted(vs.items(), key=lambda x: -x[1])])
        ))

HBox(children=(FloatProgress(value=0.0, max=6245533.0), HTML(value='')))




In [53]:
! kaggle competitions submit -f submission.txt outbrain-click-prediction -m "baseline"

100%|########################################| 260M/260M [00:05<00:00, 49.0MB/s]
Successfully submitted to Outbrain Click Prediction

## My Submission

### Training data & Feature Engineering

In [213]:
%%time
se.sql("""
with base as (
    select 
            clicks_train.clicked,
            clicks_train.display_id,
            clicks_train.ad_id,
            cast(promoted_content.document_id as int) document_id,
            promoted_content.campaign_id,
            promoted_content.advertiser_id
    from    clicks_train join promoted_content on clicks_train.ad_id = promoted_content.ad_id
    )
, cats as (
    select  distinct 
            cast(page_views.document_id as int) document_id, 
            dc.top_category,
            dt.top_topic,
            dm.publisher_id,
            dm.source_id,
            dm.publish_time
    from    page_views
    left    join    (select distinct 
                            m.document_id, 
                            first_value(m.category_id) over (partition by m.document_id order by m.confidence_level desc) as top_category 
                    from    documents_categories m
                    ) dc on cast(page_views.document_id as int) = cast(dc.document_id as int)
    left    join    documents_meta as dm on cast(page_views.document_id as int) = cast(dm.document_id as int)
    left    join    (select distinct 
                            t.document_id, 
                            first_value(t.topic_id) over (partition by t.document_id order by t.confidence_level desc) as top_topic
                     from   documents_topics t
                     ) as dt on cast(page_views.document_id as int) = cast(dt.document_id as int)
    )

select  base.clicked, 
        base.display_id,
        base.ad_id,
        base.document_id,
        base.campaign_id,
        base.advertiser_id,
        events.uuid,
        events.geo_location,
        cast(events.timestamp as int) + 1465876799998 as timestamp,
        events.platform,
        Rank() over(partition by events.uuid order by timestamp asc) as click_rank,
        cats.top_category,
        cats.top_topic,
        cats.publisher_id,
        cats.source_id,
        datediff(cats.publish_time, from_unixtime(floor((cast(events.timestamp as int) + 1465876799998)/1000))) as days_since_published
from    base
join    events on events.display_id = base.display_id
left    join cats on base.document_id = cats.document_id
""").write.parquet("/train_features_exp.parquet", mode='overwrite')

CPU times: user 8.56 ms, sys: 93 µs, total: 8.65 ms
Wall time: 1min 52s


In [214]:
# %%time
# # TODO columns from events are all null, problem in join?
# se.sql("""
# with base as (
#     select 
#             clicks_train.clicked,
#             clicks_train.display_id,
#             clicks_train.ad_id,
#             promoted_content.document_id,
#             promoted_content.campaign_id,
#             promoted_content.advertiser_id
#     from    clicks_train join promoted_content on clicks_train.ad_id = promoted_content.ad_id
#     ) 

# """).write.parquet("/train_features_exp.parquet", mode='overwrite')

In [226]:
# Format: [Label] [Importance] [Base] [Tag]|Namespace Features |Namespace Features ... |Namespace Features
# https://github.com/VowpalWabbit/vowpal_wabbit/wiki/Input-format
def vw_row_mapper_exp(row):
    clicked = None
    country = None
    state = None
    dma = None
    year = None
    weekday = None
    month = None
    hour = None
    features = []
    for k, v in row.asDict().items():
        if k == 'clicked':
            clicked = '1' if v == '1' else '-1'
        elif k == 'geo_location':
            if v is None:
                pass
            else:
                geo = v.split('>')
                country = geo[0]
                if len(geo) > 1:
                    if geo[1].isnumeric():
                        dma = geo[1]
                    else:
                        state = geo[1]
                if len(geo) > 2:
                    if geo[2].isnumeric():
                        dma = geo[2]
        elif k == 'timestamp':
            if v is None:
                pass
            else:
                d = datetime.datetime.utcfromtimestamp(v/1000)
                month = d.month
                weekday = d.isoweekday()
                hour = d.hour
                year = d.year
        else:
            features.append(f'{k}_{v}')
    features.append(f'country_{country}')
    features.append(f'state_{state}')
    features.append(f'dma_{dma}')
    features.append(f'weekday_{weekday}')
    features.append(f'hour_{hour}')
    features.append(f'month_{month}')
    features.append(f'year_{year}')
    tag = f'{row.display_id}_{row.ad_id}'
    return f'{clicked} {tag}| {" ".join(features)}'

r = se.read.parquet("/train_features_exp.parquet").take(1)[0]
print(r)
print(vw_row_mapper_exp(r))

Row(clicked='1', display_id='16757900', ad_id='156270', document_id=1388416, campaign_id='18892', advertiser_id='2623', uuid='100013af048bbf', geo_location='US>CA>807', timestamp=1466992982029, platform='3', click_rank=1, top_category=None, top_topic=None, publisher_id=None, source_id=None, days_since_published=None)
1 16757900_156270| display_id_16757900 ad_id_156270 document_id_1388416 campaign_id_18892 advertiser_id_2623 uuid_100013af048bbf platform_3 click_rank_1 top_category_None top_topic_None publisher_id_None source_id_None days_since_published_None country_US state_CA dma_807 weekday_1 hour_2 month_6 year_2016


In [227]:
%%time
! hdfs dfs -rm -r /train_features_exp.txt
(
    se.read.parquet("/train_features_exp.parquet")
    .rdd
    .map(vw_row_mapper_exp)
    .saveAsTextFile("/train_features_exp.txt")
)
# copy file to local master node
! rm /mnt/train_exp.txt
! hdfs dfs -getmerge /train_features_exp.txt /mnt/train_exp.txt
# preview local file
! head -n 5 /mnt/train_exp.txt

Deleted /train_features_exp.txt
1 16757900_156270| display_id_16757900 ad_id_156270 document_id_1388416 campaign_id_18892 advertiser_id_2623 uuid_100013af048bbf platform_3 click_rank_1 top_category_None top_topic_None publisher_id_None source_id_None days_since_published_None country_US state_CA dma_807 weekday_1 hour_2 month_6 year_2016
-1 16757900_147242| display_id_16757900 ad_id_147242 document_id_1108162 campaign_id_15428 advertiser_id_2241 uuid_100013af048bbf platform_3 click_rank_1 top_category_None top_topic_None publisher_id_None source_id_None days_since_published_None country_US state_CA dma_807 weekday_1 hour_2 month_6 year_2016
-1 16757900_7033| display_id_16757900 ad_id_7033 document_id_393333 campaign_id_530 advertiser_id_386 uuid_100013af048bbf platform_3 click_rank_1 top_category_None top_topic_None publisher_id_None source_id_None days_since_published_None country_US state_CA dma_807 weekday_1 hour_2 month_6 year_2016
-1 8264198_232277| display_id_8264198 ad_id_232277

In [228]:
! ./vw -d /mnt/train_exp.txt -b 24 -c -k --ftrl --passes 1 -f model --holdout_off --loss_function logistic --random_seed 42 --progress 8000000

final_regressor = model
Enabling FTRL based optimization
Algorithm used: Proximal-FTRL
ftrl_alpha = 0.005
ftrl_beta = 0.1
Num weight bits = 24
learning rate = 0.5
initial_t = 0
power_t = 0.5
creating cache_file = /mnt/train_exp.txt.cache
Reading datafile = /mnt/train_exp.txt
num sources = 1
average  since         example        example  current  current  current
loss     last          counter         weight    label  predict features
0.460720 0.460720      8000000      8000000.0  -1.0000  -2.0834       21
0.456452 0.452184     16000000     16000000.0  -1.0000  -1.4302       21
0.454260 0.449877     24000000     24000000.0   1.0000  -2.0360       21
0.452856 0.448642     32000000     32000000.0  -1.0000  -1.3743       21
0.451806 0.447607     40000000     40000000.0   1.0000  -0.1992       21
0.451028 0.447136     48000000     48000000.0  -1.0000  -0.9607       21
0.450392 0.446574     56000000     56000000.0  -1.0000  -1.0457       21
0.449882 0.446314     64000000     64000000.0  -1.0

### Test Data and Submission

In [218]:
%%time
se.sql("""
with base as (
select 
    "0" as clicked,
    clicks_test.display_id,
    clicks_test.ad_id,
    promoted_content.document_id,
    promoted_content.campaign_id,
    promoted_content.advertiser_id
from clicks_test join promoted_content on clicks_test.ad_id = promoted_content.ad_id
    )
, cats as (
    select  distinct 
            cast(page_views.document_id as int) document_id, 
            dc.top_category,
            dt.top_topic,
            dm.publisher_id,
            dm.source_id,
            dm.publish_time
    from    page_views
    left    join    (select distinct 
                            m.document_id, 
                            first_value(m.category_id) over (partition by m.document_id order by m.confidence_level desc) as top_category 
                    from    documents_categories m
                    ) dc on cast(page_views.document_id as int) = cast(dc.document_id as int)
    left    join    documents_meta as dm on cast(page_views.document_id as int) = cast(dm.document_id as int)
    left    join    (select distinct 
                            t.document_id, 
                            first_value(t.topic_id) over (partition by t.document_id order by t.confidence_level desc) as top_topic
                     from   documents_topics t
                     ) as dt on cast(page_views.document_id as int) = cast(dt.document_id as int)
    )

select  base.clicked, 
        base.display_id,
        base.ad_id,
        base.document_id,
        base.campaign_id,
        base.advertiser_id,
        events.uuid,
        events.geo_location,
        cast(events.timestamp as int) + 1465876799998 as timestamp,
        events.platform,
        Rank() over(partition by events.uuid order by timestamp asc) as click_rank,
        cats.top_category,
        cats.top_topic,
        cats.publisher_id,
        cats.source_id,
        datediff(cats.publish_time, from_unixtime(floor((cast(events.timestamp as int) + 1465876799998)/1000))) as days_since_published
from    base
join    events on events.display_id = base.display_id
left    join cats on base.document_id = cats.document_id
""").write.parquet("/test_features_exp.parquet", mode='overwrite')

CPU times: user 4.46 ms, sys: 4.07 ms, total: 8.53 ms
Wall time: 1min 46s


In [229]:
%%time
! hdfs dfs -rm -r /test_features_exp.txt
(
    se.read.parquet("/test_features_exp.parquet")
    .rdd
    .map(vw_row_mapper_exp)
    .saveAsTextFile("/test_features_exp.txt")
)

Deleted /test_features_exp.txt
CPU times: user 19.5 ms, sys: 226 ms, total: 246 ms
Wall time: 34.6 s


In [230]:
# copy file to local master node
! rm /mnt/test_exp.txt
! hdfs dfs -getmerge /test_features_exp.txt /mnt/test_exp.txt
# preview local file
! head -n 5 /mnt/test_exp.txt

-1 17199427_189845| display_id_17199427 ad_id_189845 document_id_1452059 campaign_id_22575 advertiser_id_363 uuid_1003a7cb5ec2bc platform_2 click_rank_1 top_category_None top_topic_None publisher_id_None source_id_None days_since_published_None country_GB state_None dma_None weekday_3 hour_13 month_6 year_2016
-1 17199427_361989| display_id_17199427 ad_id_361989 document_id_1509901 campaign_id_26363 advertiser_id_3775 uuid_1003a7cb5ec2bc platform_2 click_rank_1 top_category_None top_topic_None publisher_id_None source_id_None days_since_published_None country_GB state_None dma_None weekday_3 hour_13 month_6 year_2016
-1 17199427_187178| display_id_17199427 ad_id_187178 document_id_1467000 campaign_id_1010 advertiser_id_56 uuid_1003a7cb5ec2bc platform_2 click_rank_1 top_category_None top_topic_None publisher_id_None source_id_None days_since_published_None country_GB state_None dma_None weekday_3 hour_13 month_6 year_2016
-1 17199427_186645| display_id_17199427 ad_id_186645 document_

In [231]:
! ./vw -d /mnt/test_exp.txt -i model -t -k -p /mnt/predictions_exp.txt --progress 1000000 --link=logistic
# predicted probabilities of "1" class
! head -n 5 /mnt/predictions_exp.txt

only testing
predictions = /mnt/predictions_exp.txt
Enabling FTRL based optimization
Algorithm used: Proximal-FTRL
ftrl_alpha = 0.005
ftrl_beta = 0.1
Num weight bits = 24
learning rate = 0.5
initial_t = 0
power_t = 0.5
using no cache
Reading datafile = /mnt/test_exp.txt
num sources = 1
average  since         example        example  current  current  current
loss     last          counter         weight    label  predict features
0.933210 0.933210      1000000      1000000.0  -1.0000   0.2608       21
0.931804 0.930397      2000000      2000000.0  -1.0000   0.1196       21
0.932733 0.934591      3000000      3000000.0  -1.0000   0.2330       21
0.932298 0.930995      4000000      4000000.0  -1.0000   0.2876       21
0.931688 0.929248      5000000      5000000.0  -1.0000   0.4692       21
0.931754 0.932085      6000000      6000000.0  -1.0000   0.3245       21
0.931845 0.932387      7000000      7000000.0  -1.0000   0.1586       21
0.932141 0.934212      8000000      8000000.0  -1.0000  

In [232]:
! wc -l /mnt/predictions_exp.txt

32225162 /mnt/predictions_exp.txt


In [233]:
from collections import defaultdict
scores_by_display_id = defaultdict(dict)
for line in tqdm.tqdm(open('/mnt/predictions_exp.txt')):
    score, tag = line.strip().split(" ")
    score = float(score)
    display_id, ad_id = tag.split("_")
    scores_by_display_id[display_id][ad_id] = score

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [234]:
with open("submission_exp.txt", "w") as f:
    f.write("display_id,ad_id\n")
    for k, vs in tqdm.tqdm_notebook(scores_by_display_id.items()):
        f.write("{},{}\n".format(
            k, 
            " ".join([v[0] for v in sorted(vs.items(), key=lambda x: -x[1])])
        ))

HBox(children=(FloatProgress(value=0.0, max=6245533.0), HTML(value='')))




In [235]:
! kaggle competitions submit -f submission_exp.txt outbrain-click-prediction -m "expanded-features8"

100%|########################################| 260M/260M [00:04<00:00, 59.6MB/s]
Successfully submitted to Outbrain Click Prediction