In [None]:
import pandas as pd
import numpy as np
import catboost as cb
from scipy.optimize import minimize

## Training

In [None]:
df = pd.read_csv('training.csv')

In [None]:
df

In [None]:
def fit_blend_weights(P, y, lam=1e-4, w0=None):
    """
    P: (n, M) OOF-предсказания, y: (n,)
    Возвращает w: (M,) на симплексе (w>=0, sum w = 1).
    """
    n, M = P.shape
    y = y.reshape(-1)

    # Целевая функция
    def obj(w):
        r = y - P.dot(w)                 # остатки (n,)
        return (r @ r) / n + lam * (w @ w)

    # Градиент (ускоряет и стабилизирует оптимизацию)
    def grad(w):
        r = y - P.dot(w)                 # (n,)
        return -(2.0/n) * P.T.dot(r) + 2.0*lam*w  # (M,)

    # Ограничения: сумма весов = 1
    cons = ({'type': 'eq', 'fun': lambda w: np.sum(w) - 1.0,
                      'jac': lambda w: np.ones_like(w)})

    # Границы: w_m >= 0
    bounds = [(0.0, 1.0)] * P.shape[1]

    # Старт: равные веса или слегка «притянутые» к лучшим столбцам
    if w0 is None:
        w0 = np.full(P.shape[1], 1.0 / P.shape[1])

    res = minimize(obj, w0, method='SLSQP', jac=grad,
                   bounds=bounds, constraints=cons,
                   options={'maxiter': 1000, 'ftol': 1e-12})
    if not res.success:
        # На практике SLSQP обычно сходится; если нет — вернём нормализованный срез
        w = np.clip(res.x, 0, None)
        s = w.sum()
        return w / s if s > 0 else np.full(P.shape[1], 1.0/P.shape[1])
    return res.x

### Training `ads_revenue_next_month` model family

In [104]:
import random

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

target = "log_ads_revenue_next_month"

X = df.drop(columns=[
    "id", "current_month", "is_active_next_month", "ads_revenue_next_month", "revenue_next_month", "log_ads_revenue_next_month", "log_revenue_next_month"
])
y = df[target]

features_names = X.columns.tolist()
cat_features_names = ['country', 'traffic_type', 'platform']

model_params = [
    dict(
        iterations=2000,
        learning_rate=0.0075,
        loss_function='RMSE',
        depth=6,
        subsample=1,
        l2_leaf_reg=1,
        rsm = 0.6,
        feature_names=features_names,
        cat_features=cat_features_names,
    ),
    dict(
        iterations=2000,
        learning_rate=0.01,
        loss_function='RMSE',
        depth=8,
        subsample=1,
        l2_leaf_reg=2,
        rsm = 0.8,
        feature_names=features_names,
        cat_features=cat_features_names,
    ),
    dict(
        iterations=2000,
        learning_rate=0.0125,
        loss_function='RMSE',
        depth=10,
        subsample=1,
        l2_leaf_reg=3,
        rsm = 1,
        feature_names=features_names,
        cat_features=cat_features_names,
    )
]

M = len(model_params)
K = 3
N = X.shape[0]

skf = KFold(n_splits=K, shuffle=True)

models_ads_revenue = [[None] * K for _ in range(M)]

P = np.zeros((N, M), dtype=float)

for k, (tr_idx, val_idx) in enumerate(skf.split(X, y)):
    X_tr, X_va = X.iloc[tr_idx], X.iloc[val_idx]
    y_tr, y_va = y.iloc[tr_idx], y.iloc[val_idx]

    for m in range(M):
        params = dict(model_params[m])

        feature_cols = params['feature_names']
        cat_features = params['cat_features']

        del params['feature_names']
        del params['cat_features']

        train_pool = cb.Pool(X_tr, y_tr, cat_features=cat_features, feature_names=feature_cols)
        valid_pool = cb.Pool(X_va, y_va, cat_features=cat_features, feature_names=feature_cols)

        model = cb.CatBoostRegressor(**params, eval_metric="RMSE")

        model.fit(train_pool, eval_set=valid_pool, use_best_model=True, early_stopping_rounds=200, plot=True, verbose=False)

        P[val_idx, m] = model.predict(X_va)

        models_ads_revenue[m][k] = model

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

In [105]:
resid = P - y.to_numpy()[:, None]

In [106]:
corr = np.corrcoef(resid, rowvar=False)
corr

array([[1.        , 0.99684483, 0.99232744],
       [0.99684483, 1.        , 0.99717505],
       [0.99232744, 0.99717505, 1.        ]])

In [107]:
w_ads_revenue = fit_blend_weights(P, y.to_numpy(), lam=1e-4)


In [108]:
w_ads_revenue

array([2.10285404e-17, 4.18693874e-01, 5.81306126e-01])

In [109]:
def predict_ads_revenue(X, w, b=0):
    sum = np.zeros(X.shape[0])

    for idx, family in enumerate(models_ads_revenue):
        predict = np.mean([ model.predict(X[model.feature_names_]) for model in family ], axis=0)

        sum += w[idx] * predict

    return sum + b


In [110]:
from sklearn.linear_model import RidgeCV

alphas = np.logspace(-6, 2, 30)
ridge = RidgeCV(alphas=alphas, fit_intercept=True)
ridge.fit(P, y)
w_ridge = ridge.coef_     # shape (M,)
b_ridge = ridge.intercept_

In [111]:
from sklearn.metrics import mean_squared_error

rmse1 = mean_squared_error(predict_ads_revenue(X, w_ads_revenue), y)
print(rmse1)

rmse2 = mean_squared_error(predict_ads_revenue(X, w_ridge, b_ridge), y)
print(rmse2)

# pred1 = predict(X.iloc[[0]], w)
# print(pred1)
#
# pred2 = predict(X.iloc[[0]], w_ridge, b_ridge)
# print(pred2)

0.05691283010702683
0.05600088419162987


### Training `revenue_next_month` model

In [112]:
# trainig `revenue_next_month`

from sklearn.model_selection import train_test_split

X = df.drop(columns=[
    "id", "current_month", "is_active_next_month", "ads_revenue_next_month", "revenue_next_month", "log_ads_revenue_next_month", "log_revenue_next_month"
])
y = df["log_revenue_next_month"]


X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, shuffle=True
)

train_pool = cb.Pool(X_train, y_train, cat_features=cat_features_names, feature_names=X.columns.tolist())
val_pool   = cb.Pool(X_val,   y_val,   cat_features=cat_features_names, feature_names=X.columns.tolist())

model_revenue_next_month = cb.CatBoostRegressor(
    iterations=2000,
    learning_rate=0.01,
    loss_function='RMSE',
#     subsample=0.9,
    l2_leaf_reg=3,
)

model_revenue_next_month.fit(
    train_pool,
    eval_set=val_pool,
    early_stopping_rounds=100,
    verbose=100,
    plot=True
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.6127287	test: 0.6039063	best: 0.6039063 (0)	total: 9.02ms	remaining: 18s
100:	learn: 0.4578296	test: 0.4442003	best: 0.4442003 (100)	total: 896ms	remaining: 16.8s
200:	learn: 0.4271172	test: 0.4133894	best: 0.4133894 (200)	total: 1.85s	remaining: 16.6s
300:	learn: 0.4193582	test: 0.4067884	best: 0.4067884 (300)	total: 2.74s	remaining: 15.4s
400:	learn: 0.4162872	test: 0.4051388	best: 0.4051388 (400)	total: 3.58s	remaining: 14.3s
500:	learn: 0.4144943	test: 0.4046355	best: 0.4046334 (498)	total: 4.42s	remaining: 13.2s
600:	learn: 0.4130560	test: 0.4043012	best: 0.4042955 (587)	total: 5.21s	remaining: 12.1s
700:	learn: 0.4116900	test: 0.4041602	best: 0.4041602 (700)	total: 6s	remaining: 11.1s
800:	learn: 0.4103379	test: 0.4040325	best: 0.4040325 (800)	total: 6.82s	remaining: 10.2s
900:	learn: 0.4090093	test: 0.4038235	best: 0.4038112 (890)	total: 7.65s	remaining: 9.33s
1000:	learn: 0.4078168	test: 0.4037617	best: 0.4037529 (996)	total: 8.49s	remaining: 8.47s
1100:	learn: 0.40

<catboost.core.CatBoostRegressor at 0x7f6513f55e50>

In [113]:
imp = list(zip(model_revenue_next_month.feature_names_, model_revenue_next_month.feature_importances_.tolist()))

imp.sort(key=lambda x: x[1], reverse=True)

imp

[('log_revenue_current_month', 22.40109499971068),
 ('revenue_current_month', 21.943563348127597),
 ('currency_1', 13.896211622857246),
 ('currency_2', 11.228591108578916),
 ('current_passed_level', 5.672406194018433),
 ('offers', 3.8422657323904574),
 ('ads_revenue_current_month', 2.4354568653120294),
 ('log_ads_revenue_current_month', 2.300683622160358),
 ('logins_current_month', 1.7316808101731769),
 ('hard_quests_rate', 1.5314447123558836),
 ('win_rate', 1.497422572924007),
 ('ads_shown_current_month', 1.4244712671348336),
 ('country', 1.3023357160450462),
 ('months_after_reg', 1.1242471454278256),
 ('current_avg_ping', 1.021598019216955),
 ('currency_4', 0.9470002716399792),
 ('quests', 0.8755798946618566),
 ('currency_3', 0.8566884058997387),
 ('currency_6', 0.8088461388467234),
 ('currency_7', 0.7166870949832157),
 ('wins_pvp', 0.6454945929741989),
 ('hard_quests', 0.642264062588056),
 ('games_pvp', 0.5136816621536945),
 ('currency_5', 0.38887525875004453),
 ('platform', 0.20067

### Training `is_active_next_month` models

In [114]:
# trainig `is_active_next_month`

from sklearn.model_selection import train_test_split

X = df.drop(columns=[
    "id", "current_month", "is_active_next_month", "ads_revenue_next_month", "revenue_next_month", "log_ads_revenue_next_month", "log_revenue_next_month"
])
y = df["is_active_next_month"]

params = dict(
    iterations=2000,
    early_stopping_rounds=200,
    learning_rate=0.0125,
    loss_function='Logloss',
#     subsample=0.9,
    l2_leaf_reg=3,
    eval_metric="F1",
    auto_class_weights="Balanced"
)

pool = cb.Pool(X, y, cat_features=cat_features_names, feature_names=X.columns.tolist())

stats, models_is_active_next_month = cb.cv(
    pool=pool,
    params=params,
    fold_count=5,
    shuffle=True,
    early_stopping_rounds=200,
    plot=True,
    verbose=False,
    return_models=True
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Training on fold [0/5]

bestTest = 0.7925105486
bestIteration = 1098

Training on fold [1/5]

bestTest = 0.7916633517
bestIteration = 620

Training on fold [2/5]

bestTest = 0.7908162755
bestIteration = 1

Training on fold [3/5]

bestTest = 0.7934739444
bestIteration = 36

Training on fold [4/5]

bestTest = 0.7970577183
bestIteration = 1938



In [115]:
# imp = list(zip(model_is_active_next_month.feature_names_, model_is_active_next_month.feature_importances_.tolist()))
#
# imp.sort(key=lambda x: x[1], reverse=True)
#
# imp

# Predict

In [116]:
prediction = pd.read_csv('predict.csv')

In [117]:
prediction

Unnamed: 0.1,Unnamed: 0,id,logins_current_month,country,traffic_type,platform,ads_shown_current_month,ads_revenue_current_month,revenue_current_month,games_pvp,...,ads_revenue_next_month,revenue_next_month,log_ads_revenue_next_month,log_revenue_next_month,is_active_next_month,months_after_reg,win_rate,hard_quests_rate,log_revenue_current_month,log_ads_revenue_current_month
0,11,6,29,133,organic,Android,66,0.040797,0.000000,221,...,,,0.0,0.0,False,3,0.490991,0.084158,0.000000,0.039987
1,17,10,30,164,paid,iOS,781,26.512756,121.358247,401,...,,,0.0,0.0,False,1,0.582090,0.058824,4.806953,3.314650
2,19,11,7,93,organic,Android,3,0.021389,0.000000,20,...,,,0.0,0.0,False,0,0.523810,0.021739,0.000000,0.021164
3,27,18,9,112,organic,iOS,67,2.316753,5.058957,97,...,,,0.0,0.0,False,0,0.642857,0.056818,1.801538,1.198986
4,36,23,13,67,paid,Android,19,0.145450,0.000000,114,...,,,0.0,0.0,False,1,0.504348,0.057592,0.000000,0.135797
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27245,224080,133642,1,101,paid,Android,0,0.000000,0.000000,1,...,,,0.0,0.0,False,5,0.500000,0.000000,0.000000,0.000000
27246,224084,133644,1,161,paid,Android,0,0.000000,0.000000,1,...,,,0.0,0.0,False,2,0.500000,0.000000,0.000000,0.000000
27247,224098,133651,1,8,paid,Android,0,0.000000,0.000000,0,...,,,0.0,0.0,False,1,0.000000,0.000000,0.000000,0.000000
27248,224114,133662,2,164,paid,Android,2,0.002376,0.000000,1,...,,,0.0,0.0,False,4,0.000000,0.000000,0.000000,0.002374


In [118]:
def predict_average(models, input):
    predictions = np.array([model.predict(input[model.feature_names_], prediction_type="Probability")[:, 1] for model in models])

    return predictions.mean(axis=0)

prediction['is_active'] = np.int8(predict_average(models_is_active_next_month, prediction) > 0.5)
prediction['revenue_next_month'] = np.expm1(model_revenue_next_month.predict(prediction[model_revenue_next_month.feature_names_]))
prediction['ads_revenue_next_month'] = np.expm1(predict_ads_revenue(prediction, w_ads_revenue))

submission = prediction[ ['id', 'is_active', 'revenue_next_month', 'ads_revenue_next_month'] ].copy()

submission

Unnamed: 0,id,is_active,revenue_next_month,ads_revenue_next_month
0,6,1,0.019996,0.056172
1,10,1,47.218777,17.407986
2,11,0,0.003983,0.012787
3,18,0,0.294593,0.173837
4,23,0,0.006306,0.022704
...,...,...,...,...
27245,133642,0,0.008703,0.008448
27246,133644,0,-0.001649,0.001269
27247,133651,0,-0.000442,0.002043
27248,133662,0,0.008422,0.011823


In [119]:
submission[ 'next_month_revenue' ] = np.maximum(submission['revenue_next_month'], 0) + np.maximum(submission['ads_revenue_next_month'], 0)

submission = submission[ ['id', 'is_active', 'next_month_revenue'] ]


In [120]:
submission

Unnamed: 0,id,is_active,next_month_revenue
0,6,1,0.076168
1,10,1,64.626763
2,11,0,0.016769
3,18,0,0.468430
4,23,0,0.029010
...,...,...,...
27245,133642,0,0.017151
27246,133644,0,0.001269
27247,133651,0,0.002043
27248,133662,0,0.020245


In [121]:
sub = pd.read_csv('history.csv')
sub = sub.drop_duplicates(subset='id', keep='first')
sub = sub[ ['id'] ]

sub.sort_values('id', inplace=True)

In [122]:
sub['is_active'] = 0
sub['next_month_revenue'] = 0.0

sub = sub.set_index('id', drop=False)

In [123]:
sub.update(submission.set_index("id"))

In [124]:
sub = sub[ ['id', 'next_month_revenue', 'is_active'] ]

sub

Unnamed: 0_level_0,id,next_month_revenue,is_active
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1,0.0,0
2,2,0.0,0
3,3,0.0,0
4,4,0.0,0
5,5,0.0,0
...,...,...,...
133670,133670,0.0,0
133671,133671,0.0,0
133672,133672,0.0,0
133673,133673,0.0,0


In [125]:
sub.to_csv('submission.csv', index=False)