In [1]:
import numpy as np 
import pandas as pd

# The input data files live in the read-only "../input/" directory.
# Walk /kaggle/input and print the full path of every file found beneath it.

import os
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))


import warnings
# Blanket filter: no category or module is given, so every warning raised
# from this point on is suppressed.
warnings.filterwarnings("ignore")

from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as mse


RANDOM_SEED = 42  # shared seed: train_test_split `random_state` and CatBoost `random_seed`
# CatBoost parameters
ITERATIONS = 500  # passed to CatBoostRegressor as `iterations`
LR         = 0.25  # passed to CatBoostRegressor as `learning_rate`
DEPTH = 6  # passed to CatBoostRegressor as `depth`

/kaggle/input/real-time-advertisers-auction/Dataset.csv


In [2]:
# Load the raw auction dataset into the working frame `df`.
DATASET_PATH = '/kaggle/input/real-time-advertisers-auction/Dataset.csv'
df = pd.read_csv(DATASET_PATH)

In [3]:
# Count missing values per column.
df.isnull().sum()

date                       0
site_id                    0
ad_type_id                 0
geo_id                     0
device_category_id         0
advertiser_id              0
order_id                   0
line_item_type_id          0
os_id                      0
integration_type_id        0
monetization_channel_id    0
ad_unit_id                 0
total_impressions          0
total_revenue              0
viewable_impressions       0
measurable_impressions     0
revenue_share_percent      0
dtype: int64

In [4]:
# Count rows that repeat an earlier row exactly (the first occurrence is not counted).
df.duplicated(keep='first').sum()

23164

In [5]:
# Keep only the first occurrence of every row, then renumber the index from 0.
df = df.loc[~df.duplicated()].reset_index(drop=True)

В качестве train необходимо использовать данные до 21.06.2019 включительно, для test — все оставшиеся.

In [6]:
# Parse the `date` column into datetimes, then summarise the parsed values.
df['date'] = pd.to_datetime(df['date'])
df['date'].describe()

count                  544127
unique                     30
top       2019-06-24 00:00:00
freq                    22525
first     2019-06-01 00:00:00
last      2019-06-30 00:00:00
Name: date, dtype: object

Пометим train (1) и test (0) с помощью нового признака sample

In [7]:
# Flag train (1) vs test (0) rows. Train is everything dated up to and including
# 2019-06-21. Comparing full dates (rather than only the day of month) keeps the
# split correct even if rows from another month or year are present, and the
# vectorised comparison avoids a Python-level call per row.
TEST_START_DATE = pd.Timestamp('2019-06-22')  # first day that belongs to test
df['sample'] = (df['date'] < TEST_START_DATE).astype('int64')

# NB: attribute access `df.sample` still resolves to the DataFrame.sample method,
# not to the new column; the column is reached via df['sample'] or query().
df.sample(2)

Unnamed: 0,date,site_id,ad_type_id,geo_id,device_category_id,advertiser_id,order_id,line_item_type_id,os_id,integration_type_id,monetization_channel_id,ad_unit_id,total_impressions,total_revenue,viewable_impressions,measurable_impressions,revenue_share_percent,sample
383264,2019-06-10,345,10,142,1,79,3459,20,56,1,19,5102,2,0.0,0,2,1,1
108006,2019-06-24,350,10,187,2,79,3477,8,58,1,19,5168,640,0.0554,80,640,1,0


Рассчитаем CPM (для всех строк — и train, и test) по формуле: CPM = (((revenue of the publisher * 100) / revenue_share_percentage) / measurable_impressions) * 1000

In [8]:
def cpm(revenue_of_the_publisher, revenue_share_percentage, measurable_impressions):
    """Return the CPM for one record, or 0 when it is undefined.

    Formula:
        CPM = ((revenue_of_the_publisher * 100 / revenue_share_percentage)
               / measurable_impressions) * 1000

    Args:
        revenue_of_the_publisher: revenue value of the record.
        revenue_share_percentage: revenue share, in percent; used as a divisor.
        measurable_impressions: number of measurable impressions; used as a divisor.

    Returns:
        The value of the formula above, or 0 if either divisor is zero.
    """
    # Both arguments are divisors: a zero in either makes the ratio undefined
    # (ZeroDivisionError for Python numbers, inf/nan for NumPy scalars), so the
    # same "0 means no CPM" convention is applied to both.
    if measurable_impressions == 0 or revenue_share_percentage == 0:
        return 0
    return ((revenue_of_the_publisher * 100 / revenue_share_percentage) / measurable_impressions) * 1000

def _cpm_for_row(row):
    """Feed the three formula columns of a single row into cpm()."""
    return cpm(row['total_revenue'], row['revenue_share_percent'], row['measurable_impressions'])

# Row-wise CPM for the whole frame, followed by a summary of the new column.
df['CPM'] = df.apply(_cpm_for_row, axis=1)

df['CPM'].describe()

count    544127.000000
mean        120.211135
std         756.477535
min        -700.909091
25%           0.000000
50%           0.000000
75%         110.000000
max      283620.000000
Name: CPM, dtype: float64

In [9]:
# Remove the columns that enter the CPM formula directly.
df = df.drop(columns=['total_revenue', 'revenue_share_percent', 'measurable_impressions'])

In [10]:
# Note how many distinct values integration_type_id takes.
df['integration_type_id'].nunique()

1

In [11]:
# integration_type_id has a single distinct value, so it can be removed.
df = df.drop(columns=['integration_type_id'])

In [12]:
# Names of the columns handed to the model as categorical features
# (passed as `cat_features` when the model is fitted).
cat_features_ids = ['site_id',
                    'ad_type_id',
                    'geo_id',
                    'device_category_id',
                    'advertiser_id',
                    'line_item_type_id',
                    'os_id',
                    'monetization_channel_id',
                    'ad_unit_id',
                    'order_id']

In [13]:
# Training subset: rows flagged sample == 1, without the columns used only for the split.
train_rows = df.query('sample == 1')
train_data = train_rows.drop(columns=['date', 'sample'])

train_data['CPM'].describe()

count    379584.000000
mean        120.688944
std         814.365961
min           0.000000
25%           0.000000
50%           0.000000
75%         115.000000
max      283620.000000
Name: CPM, dtype: float64

In [14]:
# 95th percentile of the training CPM.
train_data['CPM'].quantile(q=0.95)

534.9999999999999

In [15]:
# Remove outliers: keep training rows whose CPM lies in [0, 95th percentile].
# The cap is computed from the unfiltered training rows of `df` instead of being
# typed in by hand, so it follows the data and re-running this cell gives the
# same result.
train_cpm_cap = df.loc[df['sample'] == 1, 'CPM'].quantile(0.95)
train_data = train_data[(train_data['CPM'] >= 0) & (train_data['CPM'] <= train_cpm_cap)]

In [16]:
# Split the training frame into the feature matrix and the CPM target.
X = train_data.drop(columns=['CPM'])
y = train_data['CPM'].values

# Hold out 30% of the training rows; the same hold-out serves as CatBoost's
# eval_set and as the data for the MSE printed at the end of the cell.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    random_state=RANDOM_SEED,
)

catboost_params = {
    'iterations': ITERATIONS,
    'learning_rate': LR,
    'depth': DEPTH,
    'eval_metric': 'RMSE',
    'random_seed': RANDOM_SEED,
}
model = CatBoostRegressor(**catboost_params)

model.fit(
    X_train, y_train,
    cat_features=cat_features_ids,
    eval_set=(X_test, y_test),
    verbose_eval=50,
    use_best_model=True,
    plot=False,
)

y_pred = model.predict(X_test)

print(f"MSE = {mse(y_test, y_pred)}")

0:	learn: 91.4900182	test: 91.0219647	best: 91.0219647 (0)	total: 566ms	remaining: 4m 42s
50:	learn: 51.4709533	test: 51.5649025	best: 51.5649025 (50)	total: 18.5s	remaining: 2m 43s
100:	learn: 50.1534343	test: 50.4038954	best: 50.4038954 (100)	total: 36s	remaining: 2m 22s
150:	learn: 49.5243241	test: 49.9107955	best: 49.9107955 (150)	total: 53.4s	remaining: 2m 3s
200:	learn: 49.1035273	test: 49.6362433	best: 49.6362433 (200)	total: 1m 11s	remaining: 1m 45s
250:	learn: 48.8130315	test: 49.4281319	best: 49.4281319 (250)	total: 1m 29s	remaining: 1m 28s
300:	learn: 48.5900421	test: 49.3078618	best: 49.3078618 (300)	total: 1m 46s	remaining: 1m 10s
350:	learn: 48.3970509	test: 49.2016135	best: 49.2016135 (350)	total: 2m 4s	remaining: 53s
400:	learn: 48.2035405	test: 49.1079331	best: 49.1078896 (399)	total: 2m 23s	remaining: 35.4s
450:	learn: 48.0713857	test: 49.0599400	best: 49.0589370 (448)	total: 2m 41s	remaining: 17.5s
499:	learn: 47.9329585	test: 49.0031242	best: 49.0022304 (497)	total:

In [17]:
# Test subset: rows flagged sample == 0, without the columns used only for the split.
test_rows = df.query('sample == 0')
test_data = test_rows.drop(columns=['date', 'sample'])
# 95th percentile of the test CPM.
test_data['CPM'].quantile(q=0.95)

545.0

In [18]:
# Remove outliers: keep test rows whose CPM lies in [0, 95th percentile].
# The cap is computed from the unfiltered test rows of `df` instead of being
# typed in by hand, so it follows the data.
test_cpm_cap = df.loc[df['sample'] == 0, 'CPM'].quantile(0.95)
test_data = test_data[(test_data['CPM'] >= 0) & (test_data['CPM'] <= test_cpm_cap)]


# Separate the true CPM values from the features before predicting.
test_values = test_data['CPM'].values
test_data = test_data.drop(columns=['CPM'])

# MSE of the fitted model on the test period.
mse(test_values, model.predict(test_data))

2850.744593734992