In [12]:
import catboost as cb
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold, train_test_split

## Training

In [13]:
# Load the training table; the training cells below take both features and targets from it.
df = pd.read_csv(filepath_or_buffer='training.csv')

In [14]:
# Preview the training frame; pandas abbreviates the display to the leading and
# trailing rows/columns.
df

Unnamed: 0,id,logins_current_month,country,traffic_type,platform,ads_shown_current_month,ads_revenue_current_month,revenue_current_month,games_pvp,wins_pvp,...,ads_revenue_next_month,revenue_next_month,log_ads_revenue_next_month,log_revenue_next_month,is_active_next_month,months_after_reg,win_rate,hard_quests_rate,log_revenue_current_month,log_ads_revenue_current_month
0,1,13,81,paid,iOS,100,21.373946,23.33529,265,205,...,,,0.0,0.0,False,1,0.770677,0.080214,3.191928,3.107897
1,1,1,81,paid,iOS,1,0.867439,8.87251,18,7,...,,,0.0,0.0,False,3,0.368421,0.030303,2.289754,0.624568
2,2,1,193,paid,Android,0,0.000000,0.00000,0,0,...,,,0.0,0.0,False,4,0.000000,0.000000,0.000000,0.000000
3,3,7,180,paid,Android,0,0.000000,0.00000,26,20,...,0.0,0.0,0.0,0.0,True,0,0.740741,0.052632,0.000000,0.000000
4,3,2,180,paid,Android,0,0.000000,0.00000,1,0,...,,,0.0,0.0,False,1,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
196877,133672,2,173,paid,Android,0,0.000000,0.00000,9,2,...,,,0.0,0.0,False,1,0.200000,0.054054,0.000000,0.000000
196878,133673,14,49,paid,Android,32,0.042859,0.00000,161,90,...,,,0.0,0.0,False,0,0.555556,0.055814,0.000000,0.041966
196879,133674,1,208,paid,Android,0,0.000000,0.00000,0,0,...,,,0.0,0.0,False,8,0.000000,0.250000,0.000000,0.000000
196880,133674,2,208,paid,Android,2,0.137498,0.00000,5,1,...,,,0.0,0.0,False,2,0.166667,0.000000,0.000000,0.128831


In [15]:
# Train the `ads_revenue_next_month` regressor on the `log_ads_revenue_next_month` column.
# (sklearn helpers are imported in the notebook's top import cell.)

# Identifier and next-month outcome columns are removed so they cannot be used as features.
non_feature_columns = [
    "id", "current_month", "is_active_next_month", "ads_revenue_next_month", "revenue_next_month", "log_ads_revenue_next_month", "log_revenue_next_month"
]
X = df.drop(columns=non_feature_columns)
y = df["log_ads_revenue_next_month"]

# Both lists are reused by later cells (`cat_features_names` in particular).
features_names = X.columns.tolist()
cat_features_names = ['country', 'traffic_type', 'platform']

# Fixed seed keeps the 80/20 split reproducible.
# NOTE(review): the frame holds several rows per `id`, so a row-level random split can
# place the same user in both train and validation; consider a group-aware split if the
# validation score looks optimistic.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, shuffle=True
)

train_pool = cb.Pool(X_train, y_train, cat_features=cat_features_names, feature_names=features_names)
val_pool   = cb.Pool(X_val, y_val, cat_features=cat_features_names, feature_names=features_names)

model_ads_revenue_next_month = cb.CatBoostRegressor(
    iterations=2500,
    learning_rate=0.01,
    loss_function='RMSE',
    l2_leaf_reg=3,
)

# Training stops once the validation RMSE has not improved for 100 iterations.
model_ads_revenue_next_month.fit(
    train_pool,
    eval_set=val_pool,
    early_stopping_rounds=100,
    verbose=100,
    plot=True
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.4513239	test: 0.4479694	best: 0.4479694 (0)	total: 10.8ms	remaining: 27s
100:	learn: 0.3102247	test: 0.3088068	best: 0.3088068 (100)	total: 925ms	remaining: 22s
200:	learn: 0.2796266	test: 0.2796399	best: 0.2796399 (200)	total: 1.81s	remaining: 20.8s
300:	learn: 0.2720391	test: 0.2730439	best: 0.2730439 (300)	total: 2.77s	remaining: 20.3s
400:	learn: 0.2690429	test: 0.2707932	best: 0.2707932 (400)	total: 3.67s	remaining: 19.2s
500:	learn: 0.2673831	test: 0.2697715	best: 0.2697715 (500)	total: 4.58s	remaining: 18.3s
600:	learn: 0.2662901	test: 0.2691357	best: 0.2691357 (600)	total: 5.4s	remaining: 17.1s
700:	learn: 0.2652303	test: 0.2686032	best: 0.2686032 (700)	total: 6.2s	remaining: 15.9s
800:	learn: 0.2642404	test: 0.2681973	best: 0.2681973 (800)	total: 7.05s	remaining: 14.9s
900:	learn: 0.2632773	test: 0.2678306	best: 0.2678282 (898)	total: 7.87s	remaining: 14s
1000:	learn: 0.2623812	test: 0.2675227	best: 0.2675227 (1000)	total: 8.72s	remaining: 13.1s
1100:	learn: 0.2615

<catboost.core.CatBoostRegressor at 0x7f65141b02c0>

In [16]:
# Rank the ads-revenue model's features by importance, most important first.
imp = sorted(
    zip(
        model_ads_revenue_next_month.feature_names_,
        model_ads_revenue_next_month.feature_importances_.tolist(),
    ),
    key=lambda name_and_score: name_and_score[1],
    reverse=True,
)

imp

[('currency_1', 19.928535998753517),
 ('log_ads_revenue_current_month', 18.645708495520555),
 ('ads_revenue_current_month', 18.19160740468562),
 ('currency_2', 15.2010779323769),
 ('current_passed_level', 6.205294715691868),
 ('logins_current_month', 4.906978077206729),
 ('months_after_reg', 1.851535180509704),
 ('revenue_current_month', 1.7178881707705367),
 ('log_revenue_current_month', 1.6249580242050568),
 ('offers', 1.576341419741076),
 ('ads_shown_current_month', 1.4425106823424905),
 ('quests', 1.3190604065930955),
 ('current_avg_ping', 1.0118891049206602),
 ('win_rate', 0.9662120797037619),
 ('country', 0.7571083365099603),
 ('currency_6', 0.6742614699223539),
 ('games_pvp', 0.659246792515884),
 ('hard_quests', 0.5339521959479208),
 ('hard_quests_rate', 0.509804283796495),
 ('currency_4', 0.509702466216371),
 ('wins_pvp', 0.4517484072251854),
 ('currency_3', 0.4403405626354297),
 ('currency_7', 0.3876944125889468),
 ('currency_5', 0.2284038906541824),
 ('platform', 0.2274595900

In [17]:
# Train the `revenue_next_month` regressor on the `log_revenue_next_month` column.
# (`train_test_split` is imported in the notebook's top import cell; `cat_features_names`
# comes from the ads-revenue training cell.)

# Identifier and next-month outcome columns are removed so they cannot be used as features.
X = df.drop(columns=[
    "id", "current_month", "is_active_next_month", "ads_revenue_next_month", "revenue_next_month", "log_ads_revenue_next_month", "log_revenue_next_month"
])
y = df["log_revenue_next_month"]

# A fixed seed makes the split -- and therefore early stopping and the reported
# validation scores -- reproducible across runs, matching the ads-revenue cell.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, shuffle=True
)

train_pool = cb.Pool(X_train, y_train, cat_features=cat_features_names, feature_names=X.columns.tolist())
val_pool   = cb.Pool(X_val,   y_val,   cat_features=cat_features_names, feature_names=X.columns.tolist())

model_revenue_next_month = cb.CatBoostRegressor(
    iterations=2000,
    learning_rate=0.01,
    loss_function='RMSE',
    l2_leaf_reg=3,
)

# Training stops once the validation RMSE has not improved for 100 iterations.
model_revenue_next_month.fit(
    train_pool,
    eval_set=val_pool,
    early_stopping_rounds=100,
    verbose=100,
    plot=True
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.6121912	test: 0.6058846	best: 0.6058846 (0)	total: 10.4ms	remaining: 20.9s
100:	learn: 0.4549128	test: 0.4529724	best: 0.4529724 (100)	total: 963ms	remaining: 18.1s
200:	learn: 0.4238474	test: 0.4255670	best: 0.4255670 (200)	total: 1.88s	remaining: 16.8s
300:	learn: 0.4158736	test: 0.4205950	best: 0.4205950 (300)	total: 2.73s	remaining: 15.4s
400:	learn: 0.4125163	test: 0.4193031	best: 0.4193031 (400)	total: 3.6s	remaining: 14.4s
500:	learn: 0.4104800	test: 0.4188336	best: 0.4188336 (500)	total: 4.47s	remaining: 13.4s
600:	learn: 0.4087735	test: 0.4185088	best: 0.4185088 (600)	total: 5.32s	remaining: 12.4s
700:	learn: 0.4071588	test: 0.4182890	best: 0.4182817 (689)	total: 6.26s	remaining: 11.6s
800:	learn: 0.4058225	test: 0.4181844	best: 0.4181831 (798)	total: 7.13s	remaining: 10.7s
900:	learn: 0.4046701	test: 0.4180224	best: 0.4180224 (900)	total: 7.96s	remaining: 9.7s
1000:	learn: 0.4033754	test: 0.4178974	best: 0.4178856 (995)	total: 8.77s	remaining: 8.76s
1100:	learn: 0

<catboost.core.CatBoostRegressor at 0x7f65138a7ec0>

In [18]:
# Rank the revenue model's features by importance, most important first.
imp = sorted(
    zip(
        model_revenue_next_month.feature_names_,
        model_revenue_next_month.feature_importances_.tolist(),
    ),
    key=lambda name_and_score: name_and_score[1],
    reverse=True,
)

imp

[('revenue_current_month', 21.722468955457412),
 ('log_revenue_current_month', 20.674748755171297),
 ('currency_2', 13.209162252431229),
 ('currency_1', 11.102784756181624),
 ('current_passed_level', 5.663412941763721),
 ('offers', 3.833562837003542),
 ('logins_current_month', 2.5536449159892913),
 ('log_ads_revenue_current_month', 2.5025872340435913),
 ('ads_revenue_current_month', 2.1761323528676244),
 ('ads_shown_current_month', 1.8340712756969322),
 ('win_rate', 1.6474963751980445),
 ('country', 1.6397367702697128),
 ('hard_quests_rate', 1.6304134571416404),
 ('current_avg_ping', 1.3447852838933947),
 ('currency_4', 1.1512021187463044),
 ('currency_6', 1.0155268089259208),
 ('quests', 0.9838074135722115),
 ('months_after_reg', 0.8736823530114608),
 ('wins_pvp', 0.8428674879119363),
 ('currency_7', 0.8210832963674688),
 ('hard_quests', 0.816778016478237),
 ('currency_3', 0.7387598917547483),
 ('currency_5', 0.5429195927641544),
 ('games_pvp', 0.4375164153146988),
 ('platform', 0.178

In [19]:
# Train `is_active_next_month` classifiers with 5-fold cross-validation and keep
# one fitted model per fold (`cat_features_names` comes from the ads-revenue training cell).

# Identifier and next-month outcome columns are removed so they cannot be used as features.
X = df.drop(columns=[
    "id", "current_month", "is_active_next_month", "ads_revenue_next_month", "revenue_next_month", "log_ads_revenue_next_month", "log_revenue_next_month"
])
y = df["is_active_next_month"]

# Early stopping is configured once, on the `cb.cv` call below, rather than being
# repeated inside `params` as well.
params = dict(
    iterations=2000,
    learning_rate=0.025,
    loss_function='Logloss',
    l2_leaf_reg=3,
    eval_metric="F1",
    auto_class_weights="Balanced"
)

pool = cb.Pool(X, y, cat_features=cat_features_names, feature_names=X.columns.tolist())

# `return_models=True` makes cv return the per-fold models alongside the metric
# history; the prediction section averages their probabilities.
stats, models_is_active_next_month = cb.cv(
    pool=pool,
    params=params,
    fold_count=5,
    shuffle=True,
    early_stopping_rounds=200,
    plot=True,
    verbose=False,
    return_models=True
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Training on fold [0/5]

bestTest = 0.8858889061
bestIteration = 1992

Training on fold [1/5]

bestTest = 0.8869321883
bestIteration = 1753

Training on fold [2/5]

bestTest = 0.8866415351
bestIteration = 1240

Training on fold [3/5]

bestTest = 0.8887112558
bestIteration = 1978

Training on fold [4/5]

bestTest = 0.8868295666
bestIteration = 1845



In [20]:
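# NOTE: disabled on purpose -- `model_is_active_next_month` is never defined in this
# notebook. The cross-validation cell above produces a *list* of fold models
# (`models_is_active_next_month`), so this snippet would have to loop over or average
# those models before it can be re-enabled.
# imp = list(zip(model_is_active_next_month.feature_names_, model_is_active_next_month.feature_importances_.tolist()))
#
# imp.sort(key=lambda x: x[1], reverse=True)
#
# imp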

## Predict

In [21]:
# Load the rows to score; the fitted models pick their own feature columns from this frame.
prediction = pd.read_csv(filepath_or_buffer='predict.csv')

In [22]:
# Preview the frame to be scored; pandas abbreviates the display to the leading and
# trailing rows/columns.
prediction

Unnamed: 0.1,Unnamed: 0,id,logins_current_month,country,traffic_type,platform,ads_shown_current_month,ads_revenue_current_month,revenue_current_month,games_pvp,...,ads_revenue_next_month,revenue_next_month,log_ads_revenue_next_month,log_revenue_next_month,is_active_next_month,months_after_reg,win_rate,hard_quests_rate,log_revenue_current_month,log_ads_revenue_current_month
0,11,6,29,133,organic,Android,66,0.040797,0.000000,221,...,,,0.0,0.0,False,3,0.490991,0.084158,0.000000,0.039987
1,17,10,30,164,paid,iOS,781,26.512756,121.358247,401,...,,,0.0,0.0,False,1,0.582090,0.058824,4.806953,3.314650
2,19,11,7,93,organic,Android,3,0.021389,0.000000,20,...,,,0.0,0.0,False,0,0.523810,0.021739,0.000000,0.021164
3,27,18,9,112,organic,iOS,67,2.316753,5.058957,97,...,,,0.0,0.0,False,0,0.642857,0.056818,1.801538,1.198986
4,36,23,13,67,paid,Android,19,0.145450,0.000000,114,...,,,0.0,0.0,False,1,0.504348,0.057592,0.000000,0.135797
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27245,224080,133642,1,101,paid,Android,0,0.000000,0.000000,1,...,,,0.0,0.0,False,5,0.500000,0.000000,0.000000,0.000000
27246,224084,133644,1,161,paid,Android,0,0.000000,0.000000,1,...,,,0.0,0.0,False,2,0.500000,0.000000,0.000000,0.000000
27247,224098,133651,1,8,paid,Android,0,0.000000,0.000000,0,...,,,0.0,0.0,False,1,0.000000,0.000000,0.000000,0.000000
27248,224114,133662,2,164,paid,Android,2,0.002376,0.000000,1,...,,,0.0,0.0,False,4,0.000000,0.000000,0.000000,0.002374


In [23]:
def predict_average(models, input):
    """Average the positive-class probability predicted by several models.

    Parameters
    ----------
    models : iterable
        Fitted classifiers. Each must expose ``feature_names_`` and a
        ``predict(data, prediction_type="Probability")`` method returning a
        2-D array whose column 1 holds the positive-class probability.
    input : pandas.DataFrame
        Frame containing (at least) every column listed in each model's
        ``feature_names_``; other columns are ignored. The parameter name
        shadows the builtin ``input`` and is kept only so that existing
        keyword callers keep working.

    Returns
    -------
    numpy.ndarray
        1-D array with one averaged probability per row of ``input``.

    Raises
    ------
    ValueError
        If ``models`` is empty. Averaging nothing would yield NaN, which a
        later ``> 0.5`` threshold silently turns into "inactive" for every row.
    """
    models = list(models)
    if not models:
        raise ValueError("predict_average() requires at least one model")

    # One row per model, one column per sample.
    per_model = np.array([
        model.predict(input[model.feature_names_], prediction_type="Probability")[:, 1]
        for model in models
    ])

    return per_model.mean(axis=0)

# Activity flag: average the fold classifiers' probabilities and threshold at 0.5.
active_probability = predict_average(models_is_active_next_month, prediction)
prediction['is_active'] = (active_probability > 0.5).astype(np.int8)

# Both regressors predict a log-scale target, so expm1 maps their output back.
# Revenue is written first, then ads revenue.
for target_column, regressor in (
    ('revenue_next_month', model_revenue_next_month),
    ('ads_revenue_next_month', model_ads_revenue_next_month),
):
    log_scale_prediction = regressor.predict(prediction[regressor.feature_names_])
    prediction[target_column] = np.expm1(log_scale_prediction)

# Independent copy so later edits to `submission` do not touch `prediction`.
submission_columns = ['id', 'is_active', 'revenue_next_month', 'ads_revenue_next_month']
submission = prediction[submission_columns].copy()

submission

Unnamed: 0,id,is_active,revenue_next_month,ads_revenue_next_month
0,6,1,0.015074,0.058856
1,10,1,61.225416,15.655669
2,11,0,0.004484,0.012347
3,18,0,0.234950,0.176519
4,23,0,0.008136,0.019339
...,...,...,...,...
27245,133642,0,0.009948,0.007801
27246,133644,0,-0.000461,-0.000875
27247,133651,0,0.000379,0.001027
27248,133662,0,0.009067,0.013235


In [24]:
# expm1 of a slightly negative log-scale prediction is negative, so each revenue
# component is floored at zero before the two are summed.
revenue_floored = submission['revenue_next_month'].clip(lower=0)
ads_revenue_floored = submission['ads_revenue_next_month'].clip(lower=0)
submission['next_month_revenue'] = revenue_floored + ads_revenue_floored

# Keep only the columns that go into the final file.
submission = submission[['id', 'is_active', 'next_month_revenue']]


In [25]:
# Inspect the reduced submission frame (id, is_active, next_month_revenue).
submission

Unnamed: 0,id,is_active,next_month_revenue
0,6,1,0.073930
1,10,1,76.881084
2,11,0,0.016831
3,18,0,0.411469
4,23,0,0.027475
...,...,...,...
27245,133642,0,0.017749
27246,133644,0,0.000000
27247,133651,0,0.001406
27248,133662,0,0.022302


In [26]:
# Build the submission skeleton: one row per distinct `id` found in history.csv,
# ordered by id. The steps are chained and non-mutating, which avoids sorting a
# derived column slice in place (a pattern that can raise SettingWithCopyWarning).
sub = (
    pd.read_csv('history.csv')
    .drop_duplicates(subset='id', keep='first')
    .loc[:, ['id']]
    .sort_values('id')
)

In [27]:
# Default every id to "inactive, zero revenue", then index the frame by id while
# keeping `id` available as a regular column too.
sub = (
    sub
    .assign(is_active=0, next_month_revenue=0.0)
    .set_index('id', drop=False)
)

In [28]:
# Overwrite the defaults with model predictions, aligned on id; ids that have no
# prediction keep their default values.
# NOTE(review): confirm `is_active` is still an integer column afterwards -- some
# pandas versions upcast during `update`.
predictions_by_id = submission.set_index("id")
sub.update(predictions_by_id)

In [29]:
# Arrange the columns in the order used for the exported file, then preview.
export_columns = ['id', 'next_month_revenue', 'is_active']
sub = sub.loc[:, export_columns]

sub

Unnamed: 0_level_0,id,next_month_revenue,is_active
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1,0.0,0
2,2,0.0,0
3,3,0.0,0
4,4,0.0,0
5,5,0.0,0
...,...,...,...
133670,133670,0.0,0
133671,133671,0.0,0
133672,133672,0.0,0
133673,133673,0.0,0


In [36]:
# Write the final file; the index is omitted because `id` is already a regular column.
sub.to_csv(path_or_buf='submission.csv', index=False)